Page MenuHomeFreeBSD

D25872.diff
No OneTemporary

D25872.diff

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: head/Makefile.inc1
===================================================================
--- head/Makefile.inc1
+++ head/Makefile.inc1
@@ -2442,7 +2442,7 @@
# Rebuild ctfconvert and ctfmerge to avoid difficult-to-diagnose failures
# resulting from missing bug fixes or ELF Toolchain updates.
.if ${MK_CDDL} != "no"
-_dtrace_tools= cddl/lib/libctf cddl/usr.bin/ctfconvert \
+_dtrace_tools= cddl/lib/libctf cddl/lib/libspl cddl/usr.bin/ctfconvert \
cddl/usr.bin/ctfmerge
.endif
@@ -2756,7 +2756,12 @@
${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \
${_cddl_lib_libuutil} \
${_cddl_lib_libavl} \
+ ${_cddl_lib_libicp} \
+ ${_cddl_lib_libicp_rescue} \
+ ${_cddl_lib_libspl} \
+ ${_cddl_lib_libtpool} \
${_cddl_lib_libzfs_core} ${_cddl_lib_libzfs} \
+ ${_cddl_lib_libzutil} \
${_cddl_lib_libctf} \
lib/libufs \
lib/libutil lib/libpjdlog ${_lib_libypclnt} lib/libz lib/msun \
@@ -2826,21 +2831,34 @@
_cddl_lib_libnvpair= cddl/lib/libnvpair
_cddl_lib_libavl= cddl/lib/libavl
_cddl_lib_libuutil= cddl/lib/libuutil
+_cddl_lib_libspl= cddl/lib/libspl
+
+cddl/lib/libuutil__L: cddl/lib/libavl__L cddl/lib/libspl__L
+
.if ${MK_ZFS} != "no"
+_cddl_lib_libicp= cddl/lib/libicp
+_cddl_lib_libicp_rescue= cddl/lib/libicp_rescue
+_cddl_lib_libtpool= cddl/lib/libtpool
+_cddl_lib_libzutil= cddl/lib/libzutil
_cddl_lib_libzfs_core= cddl/lib/libzfs_core
_cddl_lib_libzfs= cddl/lib/libzfs
+cddl/lib/libtpool__L: cddl/lib/libspl__L
+
+cddl/lib/libzutil__L: cddl/lib/libavl__L cddl/lib/libtpool__L
+
cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L
cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L
cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L cddl/lib/libumem__L
cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L
+cddl/lib/libzfs__L: cddl/lib/libnvpair__L cddl/lib/libzutil__L
lib/libbe__L: cddl/lib/libzfs__L
.endif
_cddl_lib_libctf= cddl/lib/libctf
_cddl_lib= cddl/lib
-cddl/lib/libctf__L: lib/libz__L
+cddl/lib/libctf__L: lib/libz__L cddl/lib/libspl__L
.endif
# cddl/lib/libdtrace requires lib/libproc and lib/librtld_db
_prebuild_libs+= lib/libprocstat lib/libproc lib/librtld_db
Index: head/cddl/compat/opensolaris/include/fcntl.h
===================================================================
--- head/cddl/compat/opensolaris/include/fcntl.h
+++ head/cddl/compat/opensolaris/include/fcntl.h
@@ -32,7 +32,9 @@
#include_next <fcntl.h>
+#ifndef open64
#define open64(...) open(__VA_ARGS__)
+#endif
#define openat64(...) openat(__VA_ARGS__)
#endif
Index: head/cddl/compat/opensolaris/include/mnttab.h
===================================================================
--- head/cddl/compat/opensolaris/include/mnttab.h
+++ head/cddl/compat/opensolaris/include/mnttab.h
@@ -1,35 +0,0 @@
-/* $FreeBSD$ */
-
-#ifndef _OPENSOLARIS_MNTTAB_H_
-#define _OPENSOLARIS_MNTTAB_H_
-
-#include <sys/param.h>
-#include <sys/mount.h>
-
-#include <stdio.h>
-#include <paths.h>
-
-#define MNTTAB _PATH_DEVZERO
-#define MNT_LINE_MAX 1024
-
-#define MS_OVERLAY 0x0
-#define MS_NOMNTTAB 0x0
-#define MS_RDONLY 0x1
-
-#define umount2(p, f) unmount(p, f)
-
-struct mnttab {
- char *mnt_special;
- char *mnt_mountp;
- char *mnt_fstype;
- char *mnt_mntopts;
-};
-#define extmnttab mnttab
-
-int getmntany(FILE *fd, struct mnttab *mgetp, struct mnttab *mrefp);
-int getmntent(FILE *fp, struct mnttab *mp);
-char *hasmntopt(struct mnttab *mnt, char *opt);
-
-void statfs2mnttab(struct statfs *sfs, struct mnttab *mp);
-
-#endif /* !_OPENSOLARIS_MNTTAB_H_ */
Index: head/cddl/contrib/opensolaris/cmd/lockstat/sym.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/lockstat/sym.c
+++ head/cddl/contrib/opensolaris/cmd/lockstat/sym.c
@@ -54,6 +54,7 @@
#endif
#include <sys/cpuvar.h>
+
typedef struct syment {
uintptr_t addr;
char *name;
@@ -71,6 +72,11 @@
#define elf_getshdr elf32_getshdr
#endif
#endif
+
+#define __sElfN(x) typedef __CONCAT(__CONCAT(__CONCAT(Elf,__ELF_WORD_SIZE),_),x) x
+__sElfN(Sym);
+__sElfN(Shdr);
+#define elf_getshdr __elfN(getshdr)
static void
add_symbol(char *name, uintptr_t addr, size_t size)
Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.h
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.h
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.h
@@ -1,33 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2017 Spectra Logic Corp Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-
-#ifndef _ZDB_H
-#define _ZDB_H
-
-void dump_intent_log(zilog_t *);
-extern uint8_t dump_opt[256];
-
-#endif /* _ZDB_H */
Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.8
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.8
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.8
@@ -1,414 +0,0 @@
-.\"
-.\" This file and its contents are supplied under the terms of the
-.\" Common Development and Distribution License ("CDDL"), version 1.0.
-.\" You may only use this file in accordance with the terms of version
-.\" 1.0 of the CDDL.
-.\"
-.\" A full copy of the text of the CDDL should have accompanied this
-.\" source. A copy of the CDDL is also available via the Internet at
-.\" http://www.illumos.org/license/CDDL.
-.\"
-.\"
-.\" Copyright 2012, Richard Lowe.
-.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
-.\" Copyright 2017 Nexenta Systems, Inc.
-.\"
-.Dd February 25, 2020
-.Dt ZDB 8
-.Os
-.Sh NAME
-.Nm zdb
-.Nd display zpool debugging and consistency information
-.Sh SYNOPSIS
-.Nm
-.Op Fl AbcdDFGhikLMPsvX
-.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
-.Op Fl I Ar inflight I/Os
-.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ...
-.Op Fl t Ar txg
-.Op Fl U Ar cache
-.Op Fl x Ar dumpdir
-.Op Ar poolname Op Ar object ...
-.Nm
-.Op Fl AdiPv
-.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
-.Op Fl U Ar cache
-.Ar dataset Op Ar object ...
-.Nm
-.Fl C
-.Op Fl A
-.Op Fl U Ar cache
-.Nm
-.Fl E
-.Op Fl A
-.Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15
-.Nm
-.Fl l
-.Op Fl Aqu
-.Ar device
-.Nm
-.Fl m
-.Op Fl AFLPX
-.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
-.Op Fl t Ar txg
-.Op Fl U Ar cache
-.Ar poolname Op Ar vdev Op Ar metaslab ...
-.Nm
-.Fl O
-.Ar dataset path
-.Nm
-.Fl R
-.Op Fl A
-.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
-.Op Fl U Ar cache
-.Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags
-.Nm
-.Fl S
-.Op Fl AP
-.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
-.Op Fl U Ar cache
-.Ar poolname
-.Sh DESCRIPTION
-The
-.Nm
-utility displays information about a ZFS pool useful for debugging and performs
-some amount of consistency checking.
-It is a not a general purpose tool and options
-.Pq and facilities
-may change.
-This is neither a
-.Xr fsck 8
-nor an
-.Xr fsdb 8
-utility.
-.Pp
-The output of this command in general reflects the on-disk structure of a ZFS
-pool, and is inherently unstable.
-The precise output of most invocations is not documented, a knowledge of ZFS
-internals is assumed.
-.Pp
-If the
-.Ar dataset
-argument does not contain any
-.Qq Sy /
-or
-.Qq Sy @
-characters, it is interpreted as a pool name.
-The root dataset can be specified as
-.Ar pool Ns /
-.Pq pool name followed by a slash .
-.Pp
-When operating on an imported and active pool it is possible, though unlikely,
-that zdb may interpret inconsistent pool data and behave erratically.
-.Sh OPTIONS
-Display options:
-.Bl -tag -width Ds
-.It Fl b
-Display statistics regarding the number, size
-.Pq logical, physical and allocated
-and deduplication of blocks.
-.It Fl c
-Verify the checksum of all metadata blocks while printing block statistics
-.Po see
-.Fl b
-.Pc .
-.Pp
-If specified multiple times, verify the checksums of all blocks.
-.It Fl C
-Display information about the configuration.
-If specified with no other options, instead display information about the cache
-file
-.Pq Pa /boot/zfs/zpool.cache .
-To specify the cache file to display, see
-.Fl U .
-.Pp
-If specified multiple times, and a pool name is also specified display both the
-cached configuration and the on-disk configuration.
-If specified multiple times with
-.Fl e
-also display the configuration that would be used were the pool to be imported.
-.It Fl d
-Display information about datasets.
-Specified once, displays basic dataset information: ID, create transaction,
-size, and object count.
-.Pp
-If specified multiple times provides greater and greater verbosity.
-.Pp
-If object IDs are specified, display information about those specific objects
-only.
-.It Fl D
-Display deduplication statistics, including the deduplication ratio
-.Pq Sy dedup ,
-compression ratio
-.Pq Sy compress ,
-inflation due to the zfs copies property
-.Pq Sy copies ,
-and an overall effective ratio
-.Pq Sy dedup No * Sy compress No / Sy copies .
-.It Fl DD
-Display a histogram of deduplication statistics, showing the allocated
-.Pq physically present on disk
-and referenced
-.Pq logically referenced in the pool
-block counts and sizes by reference count.
-.It Fl DDD
-Display the statistics independently for each deduplication table.
-.It Fl DDDD
-Dump the contents of the deduplication tables describing duplicate blocks.
-.It Fl DDDDD
-Also dump the contents of the deduplication tables describing unique blocks.
-.It Fl E Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15
-Decode and display block from an embedded block pointer specified by the
-.Ar word
-arguments.
-.It Fl h
-Display pool history similar to
-.Nm zpool Cm history ,
-but include internal changes, transaction, and dataset information.
-.It Fl i
-Display information about intent log
-.Pq ZIL
-entries relating to each dataset.
-If specified multiple times, display counts of each intent log transaction type.
-.It Fl k
-Examine the checkpointed state of the pool.
-Note, the on disk format of the pool is not reverted to the checkpointed state.
-.It Fl l Ar device
-Read the vdev labels from the specified device.
-.Nm Fl l
-will return 0 if valid label was found, 1 if error occurred, and 2 if no valid
-labels were found.
-.Pp
-If the
-.Fl q
-option is also specified, don't print the labels.
-.Pp
-If the
-.Fl u
-option is also specified, also display the uberblocks on this device.
-.It Fl L
-Disable leak detection and the loading of space maps.
-By default,
-.Nm
-verifies that all non-free blocks are referenced, which can be very expensive.
-.It Fl m
-Display the offset, spacemap, and free space of each metaslab.
-.It Fl mm
-Also display information about the on-disk free space histogram associated with
-each metaslab.
-.It Fl mmm
-Display the maximum contiguous free space, the in-core free space histogram, and
-the percentage of free space in each space map.
-.It Fl mmmm
-Display every spacemap record.
-.It Fl M
-Display the offset, spacemap, and free space of each metaslab.
-.It Fl MM
-Also display information about the maximum contiguous free space and the
-percentage of free space in each space map.
-.It Fl MMM
-Display every spacemap record.
-.It Fl O Ar dataset path
-Look up the specified
-.Ar path
-inside of the
-.Ar dataset
-and display its metadata and indirect blocks.
-Specified
-.Ar path
-must be relative to the root of
-.Ar dataset .
-This option can be combined with
-.Fl v
-for increasing verbosity.
-.It Xo
-.Fl R Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags
-.Xc
-Read and display a block from the specified device.
-By default the block is displayed as a hex dump, but see the description of the
-.Sy r
-flag, below.
-.Pp
-The block is specified in terms of a colon-separated tuple
-.Ar vdev
-.Pq an integer vdev identifier
-.Ar offset
-.Pq the offset within the vdev
-.Ar size
-.Pq the size of the block to read
-and, optionally,
-.Ar flags
-.Pq a set of flags, described below .
-.Pp
-.Bl -tag -compact -width "b offset"
-.It Sy b Ar offset
-Print block pointer
-.It Sy d
-Decompress the block
-.It Sy e
-Byte swap the block
-.It Sy g
-Dump gang block header
-.It Sy i
-Dump indirect block
-.It Sy r
-Dump raw uninterpreted block data
-.El
-.It Fl s
-Report statistics on
-.Nm zdb
-I/O.
-Display operation counts, bandwidth, and error counts of I/O to the pool from
-.Nm .
-.It Fl S
-Simulate the effects of deduplication, constructing a DDT and then display
-that DDT as with
-.Fl DD .
-.It Fl u
-Display the current uberblock.
-.El
-.Pp
-Other options:
-.Bl -tag -width Ds
-.It Fl A
-Do not abort should any assertion fail.
-.It Fl AA
-Enable panic recovery, certain errors which would otherwise be fatal are
-demoted to warnings.
-.It Fl AAA
-Do not abort if asserts fail and also enable panic recovery.
-.It Fl e Op Fl p Ar path ...
-Operate on an exported pool, not present in
-.Pa /boot/zfs/zpool.cache .
-The
-.Fl p
-flag specifies the path under which devices are to be searched.
-.It Fl x Ar dumpdir
-All blocks accessed will be copied to files in the specified directory.
-The blocks will be placed in sparse files whose name is the same as
-that of the file or device read.
-.Nm
-can be then run on the generated files.
-Note that the
-.Fl bbc
-flags are sufficient to access
-.Pq and thus copy
-all metadata on the pool.
-.It Fl F
-Attempt to make an unreadable pool readable by trying progressively older
-transactions.
-.It Fl G
-Dump the contents of the zfs_dbgmsg buffer before exiting
-.Nm .
-zfs_dbgmsg is a buffer used by ZFS to dump advanced debug information.
-.It Fl I Ar inflight I/Os
-Limit the number of outstanding checksum I/Os to the specified value.
-The default value is 200.
-This option affects the performance of the
-.Fl c
-option.
-.It Fl o Ar var Ns = Ns Ar value ...
-Set the given global libzpool variable to the provided value.
-The value must be an unsigned 32-bit integer.
-Currently only little-endian systems are supported to avoid accidentally setting
-the high 32 bits of 64-bit variables.
-.It Fl P
-Print numbers in an unscaled form more amenable to parsing, eg. 1000000 rather
-than 1M.
-.It Fl t Ar transaction
-Specify the highest transaction to use when searching for uberblocks.
-See also the
-.Fl u
-and
-.Fl l
-options for a means to see the available uberblocks and their associated
-transaction numbers.
-.It Fl U Ar cachefile
-Use a cache file other than
-.Pa /boot/zfs/zpool.cache .
-.It Fl v
-Enable verbosity.
-Specify multiple times for increased verbosity.
-.It Fl V
-Attempt verbatim import.
-This mimics the behavior of the kernel when loading a pool from a cachefile.
-Only usable with
-.Fl e .
-.It Fl X
-Attempt
-.Qq extreme
-transaction rewind, that is attempt the same recovery as
-.Fl F
-but read transactions otherwise deemed too old.
-.El
-.Pp
-Specifying a display option more than once enables verbosity for only that
-option, with more occurrences enabling more verbosity.
-.Pp
-If no options are specified, all information about the named pool will be
-displayed at default verbosity.
-.Sh EXAMPLES
-.Bl -tag -width Ds
-.It Xo
-.Sy Example 1
-Display the configuration of imported pool
-.Pa rpool
-.Xc
-.Bd -literal
-# zdb -C rpool
-
-MOS Configuration:
- version: 28
- name: 'rpool'
- ...
-.Ed
-.It Xo
-.Sy Example 2
-Display basic dataset information about
-.Pa rpool
-.Xc
-.Bd -literal
-# zdb -d rpool
-Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects
-Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects
- ...
-.Ed
-.It Xo
-.Sy Example 3
-Display basic information about object 0 in
-.Pa rpool/export/home
-.Xc
-.Bd -literal
-# zdb -d rpool/export/home 0
-Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects
-
- Object lvl iblk dblk dsize lsize %full type
- 0 7 16K 16K 15.0K 16K 25.00 DMU dnode
-.Ed
-.It Xo
-.Sy Example 4
-Display the predicted effect of enabling deduplication on
-.Pa rpool
-.Xc
-.Bd -literal
-# zdb -S rpool
-Simulated DDT histogram:
-
-bucket allocated referenced
-______ ______________________________ ______________________________
-refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE
------- ------ ----- ----- ----- ------ ----- ----- -----
- 1 694K 27.1G 15.0G 15.0G 694K 27.1G 15.0G 15.0G
- 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G
- ...
-dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00
-.Ed
-.El
-.Sh SEE ALSO
-.Xr zfs 8 ,
-.Xr zpool 8
-.Sh HISTORY
-The
-.Nm
-utility first appeared in
-.Fx 7.0 .
Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -1,5749 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Nexenta Systems, Inc.
- * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
- * Copyright 2017 RackTop Systems.
- */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <stdio_ext.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_sa.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/metaslab_impl.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_pool.h>
-#include <sys/dbuf.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/stat.h>
-#include <sys/resource.h>
-#include <sys/dmu_traverse.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/zfs_fuid.h>
-#include <sys/arc.h>
-#include <sys/ddt.h>
-#include <sys/zfeature.h>
-#include <sys/abd.h>
-#include <sys/blkptr.h>
-#include <sys/dsl_scan.h>
-#include <zfs_comutil.h>
-#include <libcmdutils.h>
-#undef verify
-#include <libzfs.h>
-
-#include "zdb.h"
-
-#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
- zio_compress_table[(idx)].ci_name : "UNKNOWN")
-#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
- zio_checksum_table[(idx)].ci_name : "UNKNOWN")
-#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
- dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \
- dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
-#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \
- (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \
- DMU_OT_ZAP_OTHER : \
- (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
- DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
-
-#ifndef lint
-extern int reference_tracking_enable;
-extern boolean_t zfs_recover;
-extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
-extern int zfs_vdev_async_read_max_active;
-extern boolean_t spa_load_verify_dryrun;
-extern int aok;
-#else
-int reference_tracking_enable;
-boolean_t zfs_recover;
-uint64_t zfs_arc_max, zfs_arc_meta_limit;
-int zfs_vdev_async_read_max_active;
-boolean_t spa_load_verify_dryrun;
-int aok;
-#endif
-
-static const char cmdname[] = "zdb";
-uint8_t dump_opt[256];
-
-typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
-
-static uint64_t *zopt_object = NULL;
-static unsigned zopt_objects = 0;
-static libzfs_handle_t *g_zfs;
-static uint64_t max_inflight = 1000;
-static int leaked_objects = 0;
-
-static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
-static void mos_obj_refd(uint64_t);
-
-/*
- * These libumem hooks provide a reasonable set of defaults for the allocator's
- * debugging facilities.
- */
-const char *
-_umem_debug_init()
-{
- return ("default,verbose"); /* $UMEM_DEBUG setting */
-}
-
-const char *
-_umem_logging_init(void)
-{
- return ("fail,contents"); /* $UMEM_LOGGING setting */
-}
-
-static void
-usage(void)
-{
- (void) fprintf(stderr,
- "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
- "[-I <inflight I/Os>]\n"
- "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
- "\t\t[<poolname> [<object> ...]]\n"
- "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> "
- "[<object> ...]\n"
- "\t%s -C [-A] [-U <cache>]\n"
- "\t%s -l [-Aqu] <device>\n"
- "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
- "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
- "\t%s -O <dataset> <path>\n"
- "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
- "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
- "\t%s -E [-A] word0:word1:...:word15\n"
- "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
- "<poolname>\n\n",
- cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
- cmdname, cmdname);
-
- (void) fprintf(stderr, " Dataset name must include at least one "
- "separator character '/' or '@'\n");
- (void) fprintf(stderr, " If dataset name is specified, only that "
- "dataset is dumped\n");
- (void) fprintf(stderr, " If object numbers are specified, only "
- "those objects are dumped\n\n");
- (void) fprintf(stderr, " Options to control amount of output:\n");
- (void) fprintf(stderr, " -b block statistics\n");
- (void) fprintf(stderr, " -c checksum all metadata (twice for "
- "all data) blocks\n");
- (void) fprintf(stderr, " -C config (or cachefile if alone)\n");
- (void) fprintf(stderr, " -d dataset(s)\n");
- (void) fprintf(stderr, " -D dedup statistics\n");
- (void) fprintf(stderr, " -E decode and display block from an "
- "embedded block pointer\n");
- (void) fprintf(stderr, " -h pool history\n");
- (void) fprintf(stderr, " -i intent logs\n");
- (void) fprintf(stderr, " -l read label contents\n");
- (void) fprintf(stderr, " -k examine the checkpointed state "
- "of the pool\n");
- (void) fprintf(stderr, " -L disable leak tracking (do not "
- "load spacemaps)\n");
- (void) fprintf(stderr, " -m metaslabs\n");
- (void) fprintf(stderr, " -M metaslab groups\n");
- (void) fprintf(stderr, " -O perform object lookups by path\n");
- (void) fprintf(stderr, " -R read and display block from a "
- "device\n");
- (void) fprintf(stderr, " -s report stats on zdb's I/O\n");
- (void) fprintf(stderr, " -S simulate dedup to measure effect\n");
- (void) fprintf(stderr, " -v verbose (applies to all "
- "others)\n\n");
- (void) fprintf(stderr, " Below options are intended for use "
- "with other options:\n");
- (void) fprintf(stderr, " -A ignore assertions (-A), enable "
- "panic recovery (-AA) or both (-AAA)\n");
- (void) fprintf(stderr, " -e pool is exported/destroyed/"
- "has altroot/not in a cachefile\n");
- (void) fprintf(stderr, " -F attempt automatic rewind within "
- "safe range of transaction groups\n");
- (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before "
- "exiting\n");
- (void) fprintf(stderr, " -I <number of inflight I/Os> -- "
- "specify the maximum number of "
- "checksumming I/Os [default is 200]\n");
- (void) fprintf(stderr, " -o <variable>=<value> set global "
- "variable to an unsigned 32-bit integer value\n");
- (void) fprintf(stderr, " -p <path> -- use one or more with "
- "-e to specify path to vdev dir\n");
- (void) fprintf(stderr, " -P print numbers in parseable form\n");
- (void) fprintf(stderr, " -q don't print label contents\n");
- (void) fprintf(stderr, " -t <txg> -- highest txg to use when "
- "searching for uberblocks\n");
- (void) fprintf(stderr, " -u uberblock\n");
- (void) fprintf(stderr, " -U <cachefile_path> -- use alternate "
- "cachefile\n");
- (void) fprintf(stderr, " -V do verbatim import\n");
- (void) fprintf(stderr, " -x <dumpdir> -- "
- "dump all read blocks into specified directory\n");
- (void) fprintf(stderr, " -X attempt extreme rewind (does not "
- "work with dataset)\n\n");
- (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
- "to make only that option verbose\n");
- (void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
- exit(1);
-}
-
-static void
-dump_debug_buffer()
-{
- if (dump_opt['G']) {
- (void) printf("\n");
- zfs_dbgmsg_print("zdb");
- }
-}
-
-/*
- * Called for usage errors that are discovered after a call to spa_open(),
- * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
- */
-
-static void
-fatal(const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- (void) fprintf(stderr, "%s: ", cmdname);
- (void) vfprintf(stderr, fmt, ap);
- va_end(ap);
- (void) fprintf(stderr, "\n");
-
- dump_debug_buffer();
-
- exit(1);
-}
-
-/* ARGSUSED */
-static void
-dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
-{
- nvlist_t *nv;
- size_t nvsize = *(uint64_t *)data;
- char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
-
- VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
-
- VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
-
- umem_free(packed, nvsize);
-
- dump_nvlist(nv, 8);
-
- nvlist_free(nv);
-}
-
-/* ARGSUSED */
-static void
-dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
-{
- spa_history_phys_t *shp = data;
-
- if (shp == NULL)
- return;
-
- (void) printf("\t\tpool_create_len = %llu\n",
- (u_longlong_t)shp->sh_pool_create_len);
- (void) printf("\t\tphys_max_off = %llu\n",
- (u_longlong_t)shp->sh_phys_max_off);
- (void) printf("\t\tbof = %llu\n",
- (u_longlong_t)shp->sh_bof);
- (void) printf("\t\teof = %llu\n",
- (u_longlong_t)shp->sh_eof);
- (void) printf("\t\trecords_lost = %llu\n",
- (u_longlong_t)shp->sh_records_lost);
-}
-
-static void
-zdb_nicenum(uint64_t num, char *buf, size_t buflen)
-{
- if (dump_opt['P'])
- (void) snprintf(buf, buflen, "%llu", (longlong_t)num);
- else
- nicenum(num, buf, sizeof (buf));
-}
-
-static const char histo_stars[] = "****************************************";
-static const uint64_t histo_width = sizeof (histo_stars) - 1;
-
-static void
-dump_histogram(const uint64_t *histo, int size, int offset)
-{
- int i;
- int minidx = size - 1;
- int maxidx = 0;
- uint64_t max = 0;
-
- for (i = 0; i < size; i++) {
- if (histo[i] > max)
- max = histo[i];
- if (histo[i] > 0 && i > maxidx)
- maxidx = i;
- if (histo[i] > 0 && i < minidx)
- minidx = i;
- }
-
- if (max < histo_width)
- max = histo_width;
-
- for (i = minidx; i <= maxidx; i++) {
- (void) printf("\t\t\t%3u: %6llu %s\n",
- i + offset, (u_longlong_t)histo[i],
- &histo_stars[(max - histo[i]) * histo_width / max]);
- }
-}
-
-static void
-dump_zap_stats(objset_t *os, uint64_t object)
-{
- int error;
- zap_stats_t zs;
-
- error = zap_get_stats(os, object, &zs);
- if (error)
- return;
-
- if (zs.zs_ptrtbl_len == 0) {
- ASSERT(zs.zs_num_blocks == 1);
- (void) printf("\tmicrozap: %llu bytes, %llu entries\n",
- (u_longlong_t)zs.zs_blocksize,
- (u_longlong_t)zs.zs_num_entries);
- return;
- }
-
- (void) printf("\tFat ZAP stats:\n");
-
- (void) printf("\t\tPointer table:\n");
- (void) printf("\t\t\t%llu elements\n",
- (u_longlong_t)zs.zs_ptrtbl_len);
- (void) printf("\t\t\tzt_blk: %llu\n",
- (u_longlong_t)zs.zs_ptrtbl_zt_blk);
- (void) printf("\t\t\tzt_numblks: %llu\n",
- (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
- (void) printf("\t\t\tzt_shift: %llu\n",
- (u_longlong_t)zs.zs_ptrtbl_zt_shift);
- (void) printf("\t\t\tzt_blks_copied: %llu\n",
- (u_longlong_t)zs.zs_ptrtbl_blks_copied);
- (void) printf("\t\t\tzt_nextblk: %llu\n",
- (u_longlong_t)zs.zs_ptrtbl_nextblk);
-
- (void) printf("\t\tZAP entries: %llu\n",
- (u_longlong_t)zs.zs_num_entries);
- (void) printf("\t\tLeaf blocks: %llu\n",
- (u_longlong_t)zs.zs_num_leafs);
- (void) printf("\t\tTotal blocks: %llu\n",
- (u_longlong_t)zs.zs_num_blocks);
- (void) printf("\t\tzap_block_type: 0x%llx\n",
- (u_longlong_t)zs.zs_block_type);
- (void) printf("\t\tzap_magic: 0x%llx\n",
- (u_longlong_t)zs.zs_magic);
- (void) printf("\t\tzap_salt: 0x%llx\n",
- (u_longlong_t)zs.zs_salt);
-
- (void) printf("\t\tLeafs with 2^n pointers:\n");
- dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
-
- (void) printf("\t\tBlocks with n*5 entries:\n");
- dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
-
- (void) printf("\t\tBlocks n/10 full:\n");
- dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
-
- (void) printf("\t\tEntries with n chunks:\n");
- dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
-
- (void) printf("\t\tBuckets with n entries:\n");
- dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
-}
-
-/*ARGSUSED*/
-static void
-dump_none(objset_t *os, uint64_t object, void *data, size_t size)
-{
-}
-
-/*ARGSUSED*/
-static void
-dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
-{
- (void) printf("\tUNKNOWN OBJECT TYPE\n");
-}
-
-/*ARGSUSED*/
-static void
-dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
-{
-}
-
-/*ARGSUSED*/
-static void
-dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
-{
-}
-
-/*ARGSUSED*/
-static void
-dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
-{
- zap_cursor_t zc;
- zap_attribute_t attr;
- void *prop;
- unsigned i;
-
- dump_zap_stats(os, object);
- (void) printf("\n");
-
- for (zap_cursor_init(&zc, os, object);
- zap_cursor_retrieve(&zc, &attr) == 0;
- zap_cursor_advance(&zc)) {
- (void) printf("\t\t%s = ", attr.za_name);
- if (attr.za_num_integers == 0) {
- (void) printf("\n");
- continue;
- }
- prop = umem_zalloc(attr.za_num_integers *
- attr.za_integer_length, UMEM_NOFAIL);
- (void) zap_lookup(os, object, attr.za_name,
- attr.za_integer_length, attr.za_num_integers, prop);
- if (attr.za_integer_length == 1) {
- (void) printf("%s", (char *)prop);
- } else {
- for (i = 0; i < attr.za_num_integers; i++) {
- switch (attr.za_integer_length) {
- case 2:
- (void) printf("%u ",
- ((uint16_t *)prop)[i]);
- break;
- case 4:
- (void) printf("%u ",
- ((uint32_t *)prop)[i]);
- break;
- case 8:
- (void) printf("%lld ",
- (u_longlong_t)((int64_t *)prop)[i]);
- break;
- }
- }
- }
- (void) printf("\n");
- umem_free(prop, attr.za_num_integers * attr.za_integer_length);
- }
- zap_cursor_fini(&zc);
-}
-
-static void
-dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
-{
- bpobj_phys_t *bpop = data;
- char bytes[32], comp[32], uncomp[32];
-
- /* make sure the output won't get truncated */
- CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
-
- if (bpop == NULL)
- return;
-
- zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
- zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
- zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
-
- (void) printf("\t\tnum_blkptrs = %llu\n",
- (u_longlong_t)bpop->bpo_num_blkptrs);
- (void) printf("\t\tbytes = %s\n", bytes);
- if (size >= BPOBJ_SIZE_V1) {
- (void) printf("\t\tcomp = %s\n", comp);
- (void) printf("\t\tuncomp = %s\n", uncomp);
- }
- if (size >= sizeof (*bpop)) {
- (void) printf("\t\tsubobjs = %llu\n",
- (u_longlong_t)bpop->bpo_subobjs);
- (void) printf("\t\tnum_subobjs = %llu\n",
- (u_longlong_t)bpop->bpo_num_subobjs);
- }
-
- if (dump_opt['d'] < 5)
- return;
-
- for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
- char blkbuf[BP_SPRINTF_LEN];
- blkptr_t bp;
-
- int err = dmu_read(os, object,
- i * sizeof (bp), sizeof (bp), &bp, 0);
- if (err != 0) {
- (void) printf("got error %u from dmu_read\n", err);
- break;
- }
- snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
- (void) printf("\t%s\n", blkbuf);
- }
-}
-
-/* ARGSUSED */
-static void
-dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
-{
- dmu_object_info_t doi;
-
- VERIFY0(dmu_object_info(os, object, &doi));
- uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
-
- int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
- if (err != 0) {
- (void) printf("got error %u from dmu_read\n", err);
- kmem_free(subobjs, doi.doi_max_offset);
- return;
- }
-
- int64_t last_nonzero = -1;
- for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) {
- if (subobjs[i] != 0)
- last_nonzero = i;
- }
-
- for (int64_t i = 0; i <= last_nonzero; i++) {
- (void) printf("\t%llu\n", (longlong_t)subobjs[i]);
- }
- kmem_free(subobjs, doi.doi_max_offset);
-}
-
-/*ARGSUSED*/
-static void
-dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
-{
- dump_zap_stats(os, object);
- /* contents are printed elsewhere, properly decoded */
-}
-
-/*ARGSUSED*/
-static void
-dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
-{
- zap_cursor_t zc;
- zap_attribute_t attr;
-
- dump_zap_stats(os, object);
- (void) printf("\n");
-
- for (zap_cursor_init(&zc, os, object);
- zap_cursor_retrieve(&zc, &attr) == 0;
- zap_cursor_advance(&zc)) {
- (void) printf("\t\t%s = ", attr.za_name);
- if (attr.za_num_integers == 0) {
- (void) printf("\n");
- continue;
- }
- (void) printf(" %llx : [%d:%d:%d]\n",
- (u_longlong_t)attr.za_first_integer,
- (int)ATTR_LENGTH(attr.za_first_integer),
- (int)ATTR_BSWAP(attr.za_first_integer),
- (int)ATTR_NUM(attr.za_first_integer));
- }
- zap_cursor_fini(&zc);
-}
-
-/*ARGSUSED*/
-static void
-dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
-{
- zap_cursor_t zc;
- zap_attribute_t attr;
- uint16_t *layout_attrs;
- unsigned i;
-
- dump_zap_stats(os, object);
- (void) printf("\n");
-
- for (zap_cursor_init(&zc, os, object);
- zap_cursor_retrieve(&zc, &attr) == 0;
- zap_cursor_advance(&zc)) {
- (void) printf("\t\t%s = [", attr.za_name);
- if (attr.za_num_integers == 0) {
- (void) printf("\n");
- continue;
- }
-
- VERIFY(attr.za_integer_length == 2);
- layout_attrs = umem_zalloc(attr.za_num_integers *
- attr.za_integer_length, UMEM_NOFAIL);
-
- VERIFY(zap_lookup(os, object, attr.za_name,
- attr.za_integer_length,
- attr.za_num_integers, layout_attrs) == 0);
-
- for (i = 0; i != attr.za_num_integers; i++)
- (void) printf(" %d ", (int)layout_attrs[i]);
- (void) printf("]\n");
- umem_free(layout_attrs,
- attr.za_num_integers * attr.za_integer_length);
- }
- zap_cursor_fini(&zc);
-}
-
-/*ARGSUSED*/
-static void
-dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
-{
- zap_cursor_t zc;
- zap_attribute_t attr;
- const char *typenames[] = {
- /* 0 */ "not specified",
- /* 1 */ "FIFO",
- /* 2 */ "Character Device",
- /* 3 */ "3 (invalid)",
- /* 4 */ "Directory",
- /* 5 */ "5 (invalid)",
- /* 6 */ "Block Device",
- /* 7 */ "7 (invalid)",
- /* 8 */ "Regular File",
- /* 9 */ "9 (invalid)",
- /* 10 */ "Symbolic Link",
- /* 11 */ "11 (invalid)",
- /* 12 */ "Socket",
- /* 13 */ "Door",
- /* 14 */ "Event Port",
- /* 15 */ "15 (invalid)",
- };
-
- dump_zap_stats(os, object);
- (void) printf("\n");
-
- for (zap_cursor_init(&zc, os, object);
- zap_cursor_retrieve(&zc, &attr) == 0;
- zap_cursor_advance(&zc)) {
- (void) printf("\t\t%s = %lld (type: %s)\n",
- attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
- typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
- }
- zap_cursor_fini(&zc);
-}
-
-static int
-get_dtl_refcount(vdev_t *vd)
-{
- int refcount = 0;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- space_map_t *sm = vd->vdev_dtl_sm;
-
- if (sm != NULL &&
- sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
- return (1);
- return (0);
- }
-
- for (unsigned c = 0; c < vd->vdev_children; c++)
- refcount += get_dtl_refcount(vd->vdev_child[c]);
- return (refcount);
-}
-
-static int
-get_metaslab_refcount(vdev_t *vd)
-{
- int refcount = 0;
-
- if (vd->vdev_top == vd) {
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- space_map_t *sm = vd->vdev_ms[m]->ms_sm;
-
- if (sm != NULL &&
- sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
- refcount++;
- }
- }
- for (unsigned c = 0; c < vd->vdev_children; c++)
- refcount += get_metaslab_refcount(vd->vdev_child[c]);
-
- return (refcount);
-}
-
-static int
-get_obsolete_refcount(vdev_t *vd)
-{
- int refcount = 0;
-
- uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
- if (vd->vdev_top == vd && obsolete_sm_obj != 0) {
- dmu_object_info_t doi;
- VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
- obsolete_sm_obj, &doi));
- if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
- refcount++;
- }
- } else {
- ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
- ASSERT3U(obsolete_sm_obj, ==, 0);
- }
- for (unsigned c = 0; c < vd->vdev_children; c++) {
- refcount += get_obsolete_refcount(vd->vdev_child[c]);
- }
-
- return (refcount);
-}
-
-static int
-get_prev_obsolete_spacemap_refcount(spa_t *spa)
-{
- uint64_t prev_obj =
- spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
- if (prev_obj != 0) {
- dmu_object_info_t doi;
- VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
- if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
- return (1);
- }
- }
- return (0);
-}
-
-static int
-get_checkpoint_refcount(vdev_t *vd)
-{
- int refcount = 0;
-
- if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
- zap_contains(spa_meta_objset(vd->vdev_spa),
- vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
- refcount++;
-
- for (uint64_t c = 0; c < vd->vdev_children; c++)
- refcount += get_checkpoint_refcount(vd->vdev_child[c]);
-
- return (refcount);
-}
-
-static int
-verify_spacemap_refcounts(spa_t *spa)
-{
- uint64_t expected_refcount = 0;
- uint64_t actual_refcount;
-
- (void) feature_get_refcount(spa,
- &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
- &expected_refcount);
- actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
- actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
- actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
- actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
- actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
-
- if (expected_refcount != actual_refcount) {
- (void) printf("space map refcount mismatch: expected %lld != "
- "actual %lld\n",
- (longlong_t)expected_refcount,
- (longlong_t)actual_refcount);
- return (2);
- }
- return (0);
-}
-
-static void
-dump_spacemap(objset_t *os, space_map_t *sm)
-{
- char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
- "INVALID", "INVALID", "INVALID", "INVALID" };
-
- if (sm == NULL)
- return;
-
- (void) printf("space map object %llu:\n",
- (longlong_t)sm->sm_object);
- (void) printf(" smp_length = 0x%llx\n",
- (longlong_t)sm->sm_phys->smp_length);
- (void) printf(" smp_alloc = 0x%llx\n",
- (longlong_t)sm->sm_phys->smp_alloc);
-
- if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
- return;
-
- /*
- * Print out the freelist entries in both encoded and decoded form.
- */
- uint8_t mapshift = sm->sm_shift;
- int64_t alloc = 0;
- uint64_t word, entry_id = 0;
- for (uint64_t offset = 0; offset < space_map_length(sm);
- offset += sizeof (word)) {
-
- VERIFY0(dmu_read(os, space_map_object(sm), offset,
- sizeof (word), &word, DMU_READ_PREFETCH));
-
- if (sm_entry_is_debug(word)) {
- (void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
- (u_longlong_t)entry_id,
- ddata[SM_DEBUG_ACTION_DECODE(word)],
- (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
- (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
- entry_id++;
- continue;
- }
-
- uint8_t words;
- char entry_type;
- uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
-
- if (sm_entry_is_single_word(word)) {
- entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
- 'A' : 'F';
- entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
- sm->sm_start;
- entry_run = SM_RUN_DECODE(word) << mapshift;
- words = 1;
- } else {
- /* it is a two-word entry so we read another word */
- ASSERT(sm_entry_is_double_word(word));
-
- uint64_t extra_word;
- offset += sizeof (extra_word);
- VERIFY0(dmu_read(os, space_map_object(sm), offset,
- sizeof (extra_word), &extra_word,
- DMU_READ_PREFETCH));
-
- ASSERT3U(offset, <=, space_map_length(sm));
-
- entry_run = SM2_RUN_DECODE(word) << mapshift;
- entry_vdev = SM2_VDEV_DECODE(word);
- entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
- 'A' : 'F';
- entry_off = (SM2_OFFSET_DECODE(extra_word) <<
- mapshift) + sm->sm_start;
- words = 2;
- }
-
- (void) printf("\t [%6llu] %c range:"
- " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
- (u_longlong_t)entry_id,
- entry_type, (u_longlong_t)entry_off,
- (u_longlong_t)(entry_off + entry_run),
- (u_longlong_t)entry_run,
- (u_longlong_t)entry_vdev, words);
-
- if (entry_type == 'A')
- alloc += entry_run;
- else
- alloc -= entry_run;
- entry_id++;
- }
- if (alloc != space_map_allocated(sm)) {
- (void) printf("space_map_object alloc (%lld) INCONSISTENT "
- "with space map summary (%lld)\n",
- (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
- }
-}
-
-static void
-dump_metaslab_stats(metaslab_t *msp)
-{
- char maxbuf[32];
- range_tree_t *rt = msp->ms_allocatable;
- avl_tree_t *t = &msp->ms_allocatable_by_size;
- int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
-
- /* max sure nicenum has enough space */
- CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
-
- zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf));
-
- (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
- "segments", avl_numnodes(t), "maxsize", maxbuf,
- "freepct", free_pct);
- (void) printf("\tIn-memory histogram:\n");
- dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
-}
-
-static void
-dump_metaslab(metaslab_t *msp)
-{
- vdev_t *vd = msp->ms_group->mg_vd;
- spa_t *spa = vd->vdev_spa;
- space_map_t *sm = msp->ms_sm;
- char freebuf[32];
-
- zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
- sizeof (freebuf));
-
- (void) printf(
- "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
- (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
- (u_longlong_t)space_map_object(sm), freebuf);
-
- if (dump_opt['m'] > 2 && !dump_opt['L']) {
- mutex_enter(&msp->ms_lock);
- VERIFY0(metaslab_load(msp));
- range_tree_stat_verify(msp->ms_allocatable);
- dump_metaslab_stats(msp);
- metaslab_unload(msp);
- mutex_exit(&msp->ms_lock);
- }
-
- if (dump_opt['m'] > 1 && sm != NULL &&
- spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
- /*
- * The space map histogram represents free space in chunks
- * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
- */
- (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
- (u_longlong_t)msp->ms_fragmentation);
- dump_histogram(sm->sm_phys->smp_histogram,
- SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
- }
-
- ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
- dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
-}
-
-static void
-print_vdev_metaslab_header(vdev_t *vd)
-{
- vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
- const char *bias_str;
-
- bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ?
- VDEV_ALLOC_BIAS_LOG :
- (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
- (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP :
- vd->vdev_islog ? "log" : "";
-
- (void) printf("\tvdev %10llu %s\n"
- "\t%-10s%5llu %-19s %-15s %-12s\n",
- (u_longlong_t)vd->vdev_id, bias_str,
- "metaslabs", (u_longlong_t)vd->vdev_ms_count,
- "offset", "spacemap", "free");
- (void) printf("\t%15s %19s %15s %12s\n",
- "---------------", "-------------------",
- "---------------", "------------");
-}
-
-static void
-dump_metaslab_groups(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- metaslab_class_t *mc = spa_normal_class(spa);
- uint64_t fragmentation;
-
- metaslab_class_histogram_verify(mc);
-
- for (unsigned c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
-
- if (mg == NULL || mg->mg_class != mc)
- continue;
-
- metaslab_group_histogram_verify(mg);
- mg->mg_fragmentation = metaslab_group_fragmentation(mg);
-
- (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
- "fragmentation",
- (u_longlong_t)tvd->vdev_id,
- (u_longlong_t)tvd->vdev_ms_count);
- if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
- (void) printf("%3s\n", "-");
- } else {
- (void) printf("%3llu%%\n",
- (u_longlong_t)mg->mg_fragmentation);
- }
- dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
- }
-
- (void) printf("\tpool %s\tfragmentation", spa_name(spa));
- fragmentation = metaslab_class_fragmentation(mc);
- if (fragmentation == ZFS_FRAG_INVALID)
- (void) printf("\t%3s\n", "-");
- else
- (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
- dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
-}
-
-static void
-print_vdev_indirect(vdev_t *vd)
-{
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- vdev_indirect_births_t *vib = vd->vdev_indirect_births;
-
- if (vim == NULL) {
- ASSERT3P(vib, ==, NULL);
- return;
- }
-
- ASSERT3U(vdev_indirect_mapping_object(vim), ==,
- vic->vic_mapping_object);
- ASSERT3U(vdev_indirect_births_object(vib), ==,
- vic->vic_births_object);
-
- (void) printf("indirect births obj %llu:\n",
- (longlong_t)vic->vic_births_object);
- (void) printf(" vib_count = %llu\n",
- (longlong_t)vdev_indirect_births_count(vib));
- for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
- vdev_indirect_birth_entry_phys_t *cur_vibe =
- &vib->vib_entries[i];
- (void) printf("\toffset %llx -> txg %llu\n",
- (longlong_t)cur_vibe->vibe_offset,
- (longlong_t)cur_vibe->vibe_phys_birth_txg);
- }
- (void) printf("\n");
-
- (void) printf("indirect mapping obj %llu:\n",
- (longlong_t)vic->vic_mapping_object);
- (void) printf(" vim_max_offset = 0x%llx\n",
- (longlong_t)vdev_indirect_mapping_max_offset(vim));
- (void) printf(" vim_bytes_mapped = 0x%llx\n",
- (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
- (void) printf(" vim_count = %llu\n",
- (longlong_t)vdev_indirect_mapping_num_entries(vim));
-
- if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
- return;
-
- uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
-
- for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
- vdev_indirect_mapping_entry_phys_t *vimep =
- &vim->vim_entries[i];
- (void) printf("\t<%llx:%llx:%llx> -> "
- "<%llx:%llx:%llx> (%x obsolete)\n",
- (longlong_t)vd->vdev_id,
- (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
- (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
- (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
- (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
- (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
- counts[i]);
- }
- (void) printf("\n");
-
- uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
- if (obsolete_sm_object != 0) {
- objset_t *mos = vd->vdev_spa->spa_meta_objset;
- (void) printf("obsolete space map object %llu:\n",
- (u_longlong_t)obsolete_sm_object);
- ASSERT(vd->vdev_obsolete_sm != NULL);
- ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
- obsolete_sm_object);
- dump_spacemap(mos, vd->vdev_obsolete_sm);
- (void) printf("\n");
- }
-}
-
-static void
-dump_metaslabs(spa_t *spa)
-{
- vdev_t *vd, *rvd = spa->spa_root_vdev;
- uint64_t m, c = 0, children = rvd->vdev_children;
-
- (void) printf("\nMetaslabs:\n");
-
- if (!dump_opt['d'] && zopt_objects > 0) {
- c = zopt_object[0];
-
- if (c >= children)
- (void) fatal("bad vdev id: %llu", (u_longlong_t)c);
-
- if (zopt_objects > 1) {
- vd = rvd->vdev_child[c];
- print_vdev_metaslab_header(vd);
-
- for (m = 1; m < zopt_objects; m++) {
- if (zopt_object[m] < vd->vdev_ms_count)
- dump_metaslab(
- vd->vdev_ms[zopt_object[m]]);
- else
- (void) fprintf(stderr, "bad metaslab "
- "number %llu\n",
- (u_longlong_t)zopt_object[m]);
- }
- (void) printf("\n");
- return;
- }
- children = c + 1;
- }
- for (; c < children; c++) {
- vd = rvd->vdev_child[c];
- print_vdev_metaslab_header(vd);
-
- print_vdev_indirect(vd);
-
- for (m = 0; m < vd->vdev_ms_count; m++)
- dump_metaslab(vd->vdev_ms[m]);
- (void) printf("\n");
- }
-}
-
-static void
-dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
-{
- const ddt_phys_t *ddp = dde->dde_phys;
- const ddt_key_t *ddk = &dde->dde_key;
- const char *types[4] = { "ditto", "single", "double", "triple" };
- char blkbuf[BP_SPRINTF_LEN];
- blkptr_t blk;
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (ddp->ddp_phys_birth == 0)
- continue;
- ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
- snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
- (void) printf("index %llx refcnt %llu %s %s\n",
- (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
- types[p], blkbuf);
- }
-}
-
-static void
-dump_dedup_ratio(const ddt_stat_t *dds)
-{
- double rL, rP, rD, D, dedup, compress, copies;
-
- if (dds->dds_blocks == 0)
- return;
-
- rL = (double)dds->dds_ref_lsize;
- rP = (double)dds->dds_ref_psize;
- rD = (double)dds->dds_ref_dsize;
- D = (double)dds->dds_dsize;
-
- dedup = rD / D;
- compress = rL / rP;
- copies = rD / rP;
-
- (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
- "dedup * compress / copies = %.2f\n\n",
- dedup, compress, copies, dedup * compress / copies);
-}
-
-static void
-dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
-{
- char name[DDT_NAMELEN];
- ddt_entry_t dde;
- uint64_t walk = 0;
- dmu_object_info_t doi;
- uint64_t count, dspace, mspace;
- int error;
-
- error = ddt_object_info(ddt, type, class, &doi);
-
- if (error == ENOENT)
- return;
- ASSERT(error == 0);
-
- error = ddt_object_count(ddt, type, class, &count);
- ASSERT(error == 0);
- if (count == 0)
- return;
-
- dspace = doi.doi_physical_blocks_512 << 9;
- mspace = doi.doi_fill_count * doi.doi_data_block_size;
-
- ddt_object_name(ddt, type, class, name);
-
- (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
- name,
- (u_longlong_t)count,
- (u_longlong_t)(dspace / count),
- (u_longlong_t)(mspace / count));
-
- if (dump_opt['D'] < 3)
- return;
-
- zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
-
- if (dump_opt['D'] < 4)
- return;
-
- if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
- return;
-
- (void) printf("%s contents:\n\n", name);
-
- while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
- dump_dde(ddt, &dde, walk);
-
- ASSERT3U(error, ==, ENOENT);
-
- (void) printf("\n");
-}
-
-static void
-dump_all_ddts(spa_t *spa)
-{
- ddt_histogram_t ddh_total;
- ddt_stat_t dds_total;
-
- bzero(&ddh_total, sizeof (ddh_total));
- bzero(&dds_total, sizeof (dds_total));
-
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- dump_ddt(ddt, type, class);
- }
- }
- }
-
- ddt_get_dedup_stats(spa, &dds_total);
-
- if (dds_total.dds_blocks == 0) {
- (void) printf("All DDTs are empty\n");
- return;
- }
-
- (void) printf("\n");
-
- if (dump_opt['D'] > 1) {
- (void) printf("DDT histogram (aggregated over all DDTs):\n");
- ddt_get_dedup_histogram(spa, &ddh_total);
- zpool_dump_ddt(&dds_total, &ddh_total);
- }
-
- dump_dedup_ratio(&dds_total);
-}
-
-static void
-dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
-{
- char *prefix = arg;
-
- (void) printf("%s [%llu,%llu) length %llu\n",
- prefix,
- (u_longlong_t)start,
- (u_longlong_t)(start + size),
- (u_longlong_t)(size));
-}
-
-static void
-dump_dtl(vdev_t *vd, int indent)
-{
- spa_t *spa = vd->vdev_spa;
- boolean_t required;
- const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
- "outage" };
- char prefix[256];
-
- spa_vdev_state_enter(spa, SCL_NONE);
- required = vdev_dtl_required(vd);
- (void) spa_vdev_state_exit(spa, NULL, 0);
-
- if (indent == 0)
- (void) printf("\nDirty time logs:\n\n");
-
- (void) printf("\t%*s%s [%s]\n", indent, "",
- vd->vdev_path ? vd->vdev_path :
- vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
- required ? "DTL-required" : "DTL-expendable");
-
- for (int t = 0; t < DTL_TYPES; t++) {
- range_tree_t *rt = vd->vdev_dtl[t];
- if (range_tree_space(rt) == 0)
- continue;
- (void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
- indent + 2, "", name[t]);
- range_tree_walk(rt, dump_dtl_seg, prefix);
- if (dump_opt['d'] > 5 && vd->vdev_children == 0)
- dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
- }
-
- for (unsigned c = 0; c < vd->vdev_children; c++)
- dump_dtl(vd->vdev_child[c], indent + 4);
-}
-
-/* from spa_history.c: spa_history_create_obj() */
-#define HIS_BUF_LEN_DEF (128 << 10)
-#define HIS_BUF_LEN_MAX (1 << 30)
-
-static void
-dump_history(spa_t *spa)
-{
- nvlist_t **events = NULL;
- char *buf = NULL;
- uint64_t bufsize = HIS_BUF_LEN_DEF;
- uint64_t resid, len, off = 0;
- uint_t num = 0;
- int error;
- time_t tsec;
- struct tm t;
- char tbuf[30];
- char internalstr[MAXPATHLEN];
-
- if ((buf = malloc(bufsize)) == NULL)
- (void) fprintf(stderr, "Unable to read history: "
- "out of memory\n");
- do {
- len = bufsize;
-
- if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
- (void) fprintf(stderr, "Unable to read history: "
- "error %d\n", error);
- return;
- }
-
- if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
- break;
- off -= resid;
-
- /*
- * If the history block is too big, double the buffer
- * size and try again.
- */
- if (resid == len) {
- free(buf);
- buf = NULL;
-
- bufsize <<= 1;
- if ((bufsize >= HIS_BUF_LEN_MAX) ||
- ((buf = malloc(bufsize)) == NULL)) {
- (void) fprintf(stderr, "Unable to read history: "
- "out of memory\n");
- return;
- }
- }
- } while (len != 0);
- free(buf);
-
- (void) printf("\nHistory:\n");
- for (unsigned i = 0; i < num; i++) {
- uint64_t time, txg, ievent;
- char *cmd, *intstr;
- boolean_t printed = B_FALSE;
-
- if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
- &time) != 0)
- goto next;
- if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
- &cmd) != 0) {
- if (nvlist_lookup_uint64(events[i],
- ZPOOL_HIST_INT_EVENT, &ievent) != 0)
- goto next;
- verify(nvlist_lookup_uint64(events[i],
- ZPOOL_HIST_TXG, &txg) == 0);
- verify(nvlist_lookup_string(events[i],
- ZPOOL_HIST_INT_STR, &intstr) == 0);
- if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
- goto next;
-
- (void) snprintf(internalstr,
- sizeof (internalstr),
- "[internal %s txg:%ju] %s",
- zfs_history_event_names[ievent], (uintmax_t)txg,
- intstr);
- cmd = internalstr;
- }
- tsec = time;
- (void) localtime_r(&tsec, &t);
- (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
- (void) printf("%s %s\n", tbuf, cmd);
- printed = B_TRUE;
-
-next:
- if (dump_opt['h'] > 1) {
- if (!printed)
- (void) printf("unrecognized record:\n");
- dump_nvlist(events[i], 2);
- }
- }
-}
-
-/*ARGSUSED*/
-static void
-dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
-{
-}
-
-static uint64_t
-blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
- const zbookmark_phys_t *zb)
-{
- if (dnp == NULL) {
- ASSERT(zb->zb_level < 0);
- if (zb->zb_object == 0)
- return (zb->zb_blkid);
- return (zb->zb_blkid * BP_GET_LSIZE(bp));
- }
-
- ASSERT(zb->zb_level >= 0);
-
- return ((zb->zb_blkid <<
- (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-}
-
-static void
-snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
-
- if (dump_opt['b'] >= 6) {
- snprintf_blkptr(blkbuf, buflen, bp);
- return;
- }
-
- if (BP_IS_EMBEDDED(bp)) {
- (void) sprintf(blkbuf,
- "EMBEDDED et=%u %llxL/%llxP B=%llu",
- (int)BPE_GET_ETYPE(bp),
- (u_longlong_t)BPE_GET_LSIZE(bp),
- (u_longlong_t)BPE_GET_PSIZE(bp),
- (u_longlong_t)bp->blk_birth);
- return;
- }
-
- blkbuf[0] = '\0';
- for (int i = 0; i < ndvas; i++)
- (void) snprintf(blkbuf + strlen(blkbuf),
- buflen - strlen(blkbuf), "%llu:%llx:%llx ",
- (u_longlong_t)DVA_GET_VDEV(&dva[i]),
- (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
- (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
-
- if (BP_IS_HOLE(bp)) {
- (void) snprintf(blkbuf + strlen(blkbuf),
- buflen - strlen(blkbuf),
- "%llxL B=%llu",
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)bp->blk_birth);
- } else {
- (void) snprintf(blkbuf + strlen(blkbuf),
- buflen - strlen(blkbuf),
- "%llxL/%llxP F=%llu B=%llu/%llu",
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp),
- (u_longlong_t)BP_GET_FILL(bp),
- (u_longlong_t)bp->blk_birth,
- (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
- }
-}
-
-static void
-print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
- const dnode_phys_t *dnp)
-{
- char blkbuf[BP_SPRINTF_LEN];
- int l;
-
- if (!BP_IS_EMBEDDED(bp)) {
- ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
- ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
- }
-
- (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
-
- ASSERT(zb->zb_level >= 0);
-
- for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
- if (l == zb->zb_level) {
- (void) printf("L%llx", (u_longlong_t)zb->zb_level);
- } else {
- (void) printf(" ");
- }
- }
-
- snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
- (void) printf("%s\n", blkbuf);
-}
-
-static int
-visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
- blkptr_t *bp, const zbookmark_phys_t *zb)
-{
- int err = 0;
-
- if (bp->blk_birth == 0)
- return (0);
-
- print_indirect(bp, zb, dnp);
-
- if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
- arc_flags_t flags = ARC_FLAG_WAIT;
- int i;
- blkptr_t *cbp;
- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
- arc_buf_t *buf;
- uint64_t fill = 0;
-
- err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err)
- return (err);
- ASSERT(buf->b_data);
-
- /* recursively visit blocks below this */
- cbp = buf->b_data;
- for (i = 0; i < epb; i++, cbp++) {
- zbookmark_phys_t czb;
-
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1,
- zb->zb_blkid * epb + i);
- err = visit_indirect(spa, dnp, cbp, &czb);
- if (err)
- break;
- fill += BP_GET_FILL(cbp);
- }
- if (!err)
- ASSERT3U(fill, ==, BP_GET_FILL(bp));
- arc_buf_destroy(buf, &buf);
- }
-
- return (err);
-}
-
-/*ARGSUSED*/
-static void
-dump_indirect(dnode_t *dn)
-{
- dnode_phys_t *dnp = dn->dn_phys;
- int j;
- zbookmark_phys_t czb;
-
- (void) printf("Indirect blocks:\n");
-
- SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
- dn->dn_object, dnp->dn_nlevels - 1, 0);
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- czb.zb_blkid = j;
- (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
- &dnp->dn_blkptr[j], &czb);
- }
-
- (void) printf("\n");
-}
-
-/*ARGSUSED*/
-static void
-dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
-{
- dsl_dir_phys_t *dd = data;
- time_t crtime;
- char nice[32];
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
-
- if (dd == NULL)
- return;
-
- ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
-
- crtime = dd->dd_creation_time;
- (void) printf("\t\tcreation_time = %s", ctime(&crtime));
- (void) printf("\t\thead_dataset_obj = %llu\n",
- (u_longlong_t)dd->dd_head_dataset_obj);
- (void) printf("\t\tparent_dir_obj = %llu\n",
- (u_longlong_t)dd->dd_parent_obj);
- (void) printf("\t\torigin_obj = %llu\n",
- (u_longlong_t)dd->dd_origin_obj);
- (void) printf("\t\tchild_dir_zapobj = %llu\n",
- (u_longlong_t)dd->dd_child_dir_zapobj);
- zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
- (void) printf("\t\tused_bytes = %s\n", nice);
- zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
- (void) printf("\t\tcompressed_bytes = %s\n", nice);
- zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
- (void) printf("\t\tuncompressed_bytes = %s\n", nice);
- zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
- (void) printf("\t\tquota = %s\n", nice);
- zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
- (void) printf("\t\treserved = %s\n", nice);
- (void) printf("\t\tprops_zapobj = %llu\n",
- (u_longlong_t)dd->dd_props_zapobj);
- (void) printf("\t\tdeleg_zapobj = %llu\n",
- (u_longlong_t)dd->dd_deleg_zapobj);
- (void) printf("\t\tflags = %llx\n",
- (u_longlong_t)dd->dd_flags);
-
-#define DO(which) \
- zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
- sizeof (nice)); \
- (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
- DO(HEAD);
- DO(SNAP);
- DO(CHILD);
- DO(CHILD_RSRV);
- DO(REFRSRV);
-#undef DO
- (void) printf("\t\tclones = %llu\n",
- (u_longlong_t)dd->dd_clones);
-}
-
-/*ARGSUSED*/
-static void
-dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
-{
- dsl_dataset_phys_t *ds = data;
- time_t crtime;
- char used[32], compressed[32], uncompressed[32], unique[32];
- char blkbuf[BP_SPRINTF_LEN];
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
-
- if (ds == NULL)
- return;
-
- ASSERT(size == sizeof (*ds));
- crtime = ds->ds_creation_time;
- zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
- zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
- zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
- sizeof (uncompressed));
- zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
- snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
-
- (void) printf("\t\tdir_obj = %llu\n",
- (u_longlong_t)ds->ds_dir_obj);
- (void) printf("\t\tprev_snap_obj = %llu\n",
- (u_longlong_t)ds->ds_prev_snap_obj);
- (void) printf("\t\tprev_snap_txg = %llu\n",
- (u_longlong_t)ds->ds_prev_snap_txg);
- (void) printf("\t\tnext_snap_obj = %llu\n",
- (u_longlong_t)ds->ds_next_snap_obj);
- (void) printf("\t\tsnapnames_zapobj = %llu\n",
- (u_longlong_t)ds->ds_snapnames_zapobj);
- (void) printf("\t\tnum_children = %llu\n",
- (u_longlong_t)ds->ds_num_children);
- (void) printf("\t\tuserrefs_obj = %llu\n",
- (u_longlong_t)ds->ds_userrefs_obj);
- (void) printf("\t\tcreation_time = %s", ctime(&crtime));
- (void) printf("\t\tcreation_txg = %llu\n",
- (u_longlong_t)ds->ds_creation_txg);
- (void) printf("\t\tdeadlist_obj = %llu\n",
- (u_longlong_t)ds->ds_deadlist_obj);
- (void) printf("\t\tused_bytes = %s\n", used);
- (void) printf("\t\tcompressed_bytes = %s\n", compressed);
- (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
- (void) printf("\t\tunique = %s\n", unique);
- (void) printf("\t\tfsid_guid = %llu\n",
- (u_longlong_t)ds->ds_fsid_guid);
- (void) printf("\t\tguid = %llu\n",
- (u_longlong_t)ds->ds_guid);
- (void) printf("\t\tflags = %llx\n",
- (u_longlong_t)ds->ds_flags);
- (void) printf("\t\tnext_clones_obj = %llu\n",
- (u_longlong_t)ds->ds_next_clones_obj);
- (void) printf("\t\tprops_obj = %llu\n",
- (u_longlong_t)ds->ds_props_obj);
- (void) printf("\t\tbp = %s\n", blkbuf);
-}
-
-/* ARGSUSED */
-static int
-dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- char blkbuf[BP_SPRINTF_LEN];
-
- if (bp->blk_birth != 0) {
- snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
- (void) printf("\t%s\n", blkbuf);
- }
- return (0);
-}
-
-static void
-dump_bptree(objset_t *os, uint64_t obj, const char *name)
-{
- char bytes[32];
- bptree_phys_t *bt;
- dmu_buf_t *db;
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
-
- if (dump_opt['d'] < 3)
- return;
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
- zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
- (void) printf("\n %s: %llu datasets, %s\n",
- name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
- dmu_buf_rele(db, FTAG);
-
- if (dump_opt['d'] < 5)
- return;
-
- (void) printf("\n");
-
- (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
-}
-
-/* ARGSUSED */
-static int
-dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- char blkbuf[BP_SPRINTF_LEN];
-
- ASSERT(bp->blk_birth != 0);
- snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
- (void) printf("\t%s\n", blkbuf);
- return (0);
-}
-
-static void
-dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
-{
- char bytes[32];
- char comp[32];
- char uncomp[32];
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
-
- if (dump_opt['d'] < 3)
- return;
-
- zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
- if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
- zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
- zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
- (void) printf(" %*s: object %llu, %llu local blkptrs, "
- "%llu subobjs in object %llu, %s (%s/%s comp)\n",
- indent * 8, name,
- (u_longlong_t)bpo->bpo_object,
- (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
- (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
- (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
- bytes, comp, uncomp);
-
- for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
- uint64_t subobj;
- bpobj_t subbpo;
- int error;
- VERIFY0(dmu_read(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs,
- i * sizeof (subobj), sizeof (subobj), &subobj, 0));
- error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
- if (error != 0) {
- (void) printf("ERROR %u while trying to open "
- "subobj id %llu\n",
- error, (u_longlong_t)subobj);
- continue;
- }
- dump_full_bpobj(&subbpo, "subobj", indent + 1);
- bpobj_close(&subbpo);
- }
- } else {
- (void) printf(" %*s: object %llu, %llu blkptrs, %s\n",
- indent * 8, name,
- (u_longlong_t)bpo->bpo_object,
- (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
- bytes);
- }
-
- if (dump_opt['d'] < 5)
- return;
-
-
- if (indent == 0) {
- (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
- (void) printf("\n");
- }
-}
-
-static void
-bpobj_count_refd(bpobj_t *bpo)
-{
- mos_obj_refd(bpo->bpo_object);
-
- if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
- mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
- for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
- uint64_t subobj;
- bpobj_t subbpo;
- int error;
- VERIFY0(dmu_read(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs,
- i * sizeof (subobj), sizeof (subobj), &subobj, 0));
- error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
- if (error != 0) {
- (void) printf("ERROR %u while trying to open "
- "subobj id %llu\n",
- error, (u_longlong_t)subobj);
- continue;
- }
- bpobj_count_refd(&subbpo);
- bpobj_close(&subbpo);
- }
- }
-}
-
-static void
-dump_deadlist(dsl_deadlist_t *dl)
-{
- dsl_deadlist_entry_t *dle;
- uint64_t unused;
- char bytes[32];
- char comp[32];
- char uncomp[32];
- uint64_t empty_bpobj =
- dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj;
-
- /* force the tree to be loaded */
- dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
-
- if (dl->dl_oldfmt) {
- if (dl->dl_bpobj.bpo_object != empty_bpobj)
- bpobj_count_refd(&dl->dl_bpobj);
- } else {
- mos_obj_refd(dl->dl_object);
- for (dle = avl_first(&dl->dl_tree); dle;
- dle = AVL_NEXT(&dl->dl_tree, dle)) {
- if (dle->dle_bpobj.bpo_object != empty_bpobj)
- bpobj_count_refd(&dle->dle_bpobj);
- }
- }
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
-
- if (dump_opt['d'] < 3)
- return;
-
- if (dl->dl_oldfmt) {
- dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
- return;
- }
-
- zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
- zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
- zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
- (void) printf("\n Deadlist: %s (%s/%s comp)\n",
- bytes, comp, uncomp);
-
- if (dump_opt['d'] < 4)
- return;
-
- (void) printf("\n");
-
- for (dle = avl_first(&dl->dl_tree); dle;
- dle = AVL_NEXT(&dl->dl_tree, dle)) {
- if (dump_opt['d'] >= 5) {
- char buf[128];
- (void) snprintf(buf, sizeof (buf),
- "mintxg %llu -> obj %llu",
- (longlong_t)dle->dle_mintxg,
- (longlong_t)dle->dle_bpobj.bpo_object);
- dump_full_bpobj(&dle->dle_bpobj, buf, 0);
- } else {
- (void) printf("mintxg %llu -> obj %llu\n",
- (longlong_t)dle->dle_mintxg,
- (longlong_t)dle->dle_bpobj.bpo_object);
- }
- }
-}
-
-static avl_tree_t idx_tree;
-static avl_tree_t domain_tree;
-static boolean_t fuid_table_loaded;
-static objset_t *sa_os = NULL;
-static sa_attr_type_t *sa_attr_table = NULL;
-
-static int
-open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp)
-{
- int err;
- uint64_t sa_attrs = 0;
- uint64_t version = 0;
-
- VERIFY3P(sa_os, ==, NULL);
- err = dmu_objset_own(path, type, B_TRUE, tag, osp);
- if (err != 0) {
- (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path,
- strerror(err));
- return (err);
- }
-
- if (dmu_objset_type(*osp) == DMU_OST_ZFS) {
- (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
- 8, 1, &version);
- if (version >= ZPL_VERSION_SA) {
- (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
- 8, 1, &sa_attrs);
- }
- err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
- &sa_attr_table);
- if (err != 0) {
- (void) fprintf(stderr, "sa_setup failed: %s\n",
- strerror(err));
- dmu_objset_disown(*osp, tag);
- *osp = NULL;
- }
- }
- sa_os = *osp;
-
- return (0);
-}
-
-static void
-close_objset(objset_t *os, void *tag)
-{
- VERIFY3P(os, ==, sa_os);
- if (os->os_sa != NULL)
- sa_tear_down(os);
- dmu_objset_disown(os, tag);
- sa_attr_table = NULL;
- sa_os = NULL;
-}
-
-static void
-fuid_table_destroy()
-{
- if (fuid_table_loaded) {
- zfs_fuid_table_destroy(&idx_tree, &domain_tree);
- fuid_table_loaded = B_FALSE;
- }
-}
-
-/*
- * print uid or gid information.
- * For normal POSIX id just the id is printed in decimal format.
- * For CIFS files with FUID the fuid is printed in hex followed by
- * the domain-rid string.
- */
-static void
-print_idstr(uint64_t id, const char *id_type)
-{
- if (FUID_INDEX(id)) {
- char *domain;
-
- domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
- (void) printf("\t%s %llx [%s-%d]\n", id_type,
- (u_longlong_t)id, domain, (int)FUID_RID(id));
- } else {
- (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
- }
-
-}
-
-static void
-dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
-{
- uint32_t uid_idx, gid_idx;
-
- uid_idx = FUID_INDEX(uid);
- gid_idx = FUID_INDEX(gid);
-
- /* Load domain table, if not already loaded */
- if (!fuid_table_loaded && (uid_idx || gid_idx)) {
- uint64_t fuid_obj;
-
- /* first find the fuid object. It lives in the master node */
- VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
- 8, 1, &fuid_obj) == 0);
- zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
- (void) zfs_fuid_table_load(os, fuid_obj,
- &idx_tree, &domain_tree);
- fuid_table_loaded = B_TRUE;
- }
-
- print_idstr(uid, "uid");
- print_idstr(gid, "gid");
-}
-
-/*ARGSUSED*/
-static void
-dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
-{
- char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
- sa_handle_t *hdl;
- uint64_t xattr, rdev, gen;
- uint64_t uid, gid, mode, fsize, parent, links;
- uint64_t pflags;
- uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
- time_t z_crtime, z_atime, z_mtime, z_ctime;
- sa_bulk_attr_t bulk[12];
- int idx = 0;
- int error;
-
- VERIFY3P(os, ==, sa_os);
- if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
- (void) printf("Failed to get handle for SA znode\n");
- return;
- }
-
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
- &links, 8);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
- &mode, 8);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
- NULL, &parent, 8);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
- &fsize, 8);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
- acctm, 16);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
- modtm, 16);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
- crtm, 16);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
- chgtm, 16);
- SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
- &pflags, 8);
-
- if (sa_bulk_lookup(hdl, bulk, idx)) {
- (void) sa_handle_destroy(hdl);
- return;
- }
-
- z_crtime = (time_t)crtm[0];
- z_atime = (time_t)acctm[0];
- z_mtime = (time_t)modtm[0];
- z_ctime = (time_t)chgtm[0];
-
- if (dump_opt['d'] > 4) {
- error = zfs_obj_to_path(os, object, path, sizeof (path));
- if (error == ESTALE) {
- (void) snprintf(path, sizeof (path), "on delete queue");
- } else if (error != 0) {
- leaked_objects++;
- (void) snprintf(path, sizeof (path),
- "path not found, possibly leaked");
- }
- (void) printf("\tpath %s\n", path);
- }
- dump_uidgid(os, uid, gid);
- (void) printf("\tatime %s", ctime(&z_atime));
- (void) printf("\tmtime %s", ctime(&z_mtime));
- (void) printf("\tctime %s", ctime(&z_ctime));
- (void) printf("\tcrtime %s", ctime(&z_crtime));
- (void) printf("\tgen %llu\n", (u_longlong_t)gen);
- (void) printf("\tmode %llo\n", (u_longlong_t)mode);
- (void) printf("\tsize %llu\n", (u_longlong_t)fsize);
- (void) printf("\tparent %llu\n", (u_longlong_t)parent);
- (void) printf("\tlinks %llu\n", (u_longlong_t)links);
- (void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
- if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
- sizeof (uint64_t)) == 0)
- (void) printf("\txattr %llu\n", (u_longlong_t)xattr);
- if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
- sizeof (uint64_t)) == 0)
- (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
- sa_handle_destroy(hdl);
-}
-
-/*ARGSUSED*/
-static void
-dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
-{
-}
-
-/*ARGSUSED*/
-static void
-dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
-{
-}
-
-static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
- dump_none, /* unallocated */
- dump_zap, /* object directory */
- dump_uint64, /* object array */
- dump_none, /* packed nvlist */
- dump_packed_nvlist, /* packed nvlist size */
- dump_none, /* bpobj */
- dump_bpobj, /* bpobj header */
- dump_none, /* SPA space map header */
- dump_none, /* SPA space map */
- dump_none, /* ZIL intent log */
- dump_dnode, /* DMU dnode */
- dump_dmu_objset, /* DMU objset */
- dump_dsl_dir, /* DSL directory */
- dump_zap, /* DSL directory child map */
- dump_zap, /* DSL dataset snap map */
- dump_zap, /* DSL props */
- dump_dsl_dataset, /* DSL dataset */
- dump_znode, /* ZFS znode */
- dump_acl, /* ZFS V0 ACL */
- dump_uint8, /* ZFS plain file */
- dump_zpldir, /* ZFS directory */
- dump_zap, /* ZFS master node */
- dump_zap, /* ZFS delete queue */
- dump_uint8, /* zvol object */
- dump_zap, /* zvol prop */
- dump_uint8, /* other uint8[] */
- dump_uint64, /* other uint64[] */
- dump_zap, /* other ZAP */
- dump_zap, /* persistent error log */
- dump_uint8, /* SPA history */
- dump_history_offsets, /* SPA history offsets */
- dump_zap, /* Pool properties */
- dump_zap, /* DSL permissions */
- dump_acl, /* ZFS ACL */
- dump_uint8, /* ZFS SYSACL */
- dump_none, /* FUID nvlist */
- dump_packed_nvlist, /* FUID nvlist size */
- dump_zap, /* DSL dataset next clones */
- dump_zap, /* DSL scrub queue */
- dump_zap, /* ZFS user/group used */
- dump_zap, /* ZFS user/group quota */
- dump_zap, /* snapshot refcount tags */
- dump_ddt_zap, /* DDT ZAP object */
- dump_zap, /* DDT statistics */
- dump_znode, /* SA object */
- dump_zap, /* SA Master Node */
- dump_sa_attrs, /* SA attribute registration */
- dump_sa_layouts, /* SA attribute layouts */
- dump_zap, /* DSL scrub translations */
- dump_none, /* fake dedup BP */
- dump_zap, /* deadlist */
- dump_none, /* deadlist hdr */
- dump_zap, /* dsl clones */
- dump_bpobj_subobjs, /* bpobj subobjs */
- dump_unknown, /* Unknown type, must be last */
-};
-
-static void
-dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
- uint64_t *dnode_slots_used)
-{
- dmu_buf_t *db = NULL;
- dmu_object_info_t doi;
- dnode_t *dn;
- void *bonus = NULL;
- size_t bsize = 0;
- char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
- char bonus_size[32];
- char aux[50];
- int error;
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
-
- if (*print_header) {
- (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
- "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
- "lsize", "%full", "type");
- *print_header = 0;
- }
-
- if (object == 0) {
- dn = DMU_META_DNODE(os);
- } else {
- error = dmu_bonus_hold(os, object, FTAG, &db);
- if (error)
- fatal("dmu_bonus_hold(%llu) failed, errno %u",
- object, error);
- bonus = db->db_data;
- bsize = db->db_size;
- dn = DB_DNODE((dmu_buf_impl_t *)db);
- }
- dmu_object_info_from_dnode(dn, &doi);
-
- if (dnode_slots_used != NULL)
- *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
-
- zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
- zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
- zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
- zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
- zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
- zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
- (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
- doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
- doi.doi_max_offset);
-
- aux[0] = '\0';
-
- if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
- (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
- ZDB_CHECKSUM_NAME(doi.doi_checksum));
- }
-
- if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
- (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
- ZDB_COMPRESS_NAME(doi.doi_compress));
- }
-
- (void) printf("%10" PRIu64
- " %3u %5s %5s %5s %5s %5s %6s %s%s\n",
- object, doi.doi_indirection, iblk, dblk,
- asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
-
- if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
- (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
- "", "", "", "", "", "", bonus_size, "bonus",
- ZDB_OT_NAME(doi.doi_bonus_type));
- }
-
- if (verbosity >= 4) {
- (void) printf("\tdnode flags: %s%s%s\n",
- (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
- "USED_BYTES " : "",
- (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
- "USERUSED_ACCOUNTED " : "",
- (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
- "SPILL_BLKPTR" : "");
- (void) printf("\tdnode maxblkid: %llu\n",
- (longlong_t)dn->dn_phys->dn_maxblkid);
-
- object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
- bonus, bsize);
- object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
- *print_header = 1;
- }
-
- if (verbosity >= 5)
- dump_indirect(dn);
-
- if (verbosity >= 5) {
- /*
- * Report the list of segments that comprise the object.
- */
- uint64_t start = 0;
- uint64_t end;
- uint64_t blkfill = 1;
- int minlvl = 1;
-
- if (dn->dn_type == DMU_OT_DNODE) {
- minlvl = 0;
- blkfill = DNODES_PER_BLOCK;
- }
-
- for (;;) {
- char segsize[32];
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
- error = dnode_next_offset(dn,
- 0, &start, minlvl, blkfill, 0);
- if (error)
- break;
- end = start;
- error = dnode_next_offset(dn,
- DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
- zdb_nicenum(end - start, segsize, sizeof (segsize));
- (void) printf("\t\tsegment [%016llx, %016llx)"
- " size %5s\n", (u_longlong_t)start,
- (u_longlong_t)end, segsize);
- if (error)
- break;
- start = end;
- }
- }
-
- if (db != NULL)
- dmu_buf_rele(db, FTAG);
-}
-
-static void
-count_dir_mos_objects(dsl_dir_t *dd)
-{
- mos_obj_refd(dd->dd_object);
- mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
- mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
- mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
- mos_obj_refd(dsl_dir_phys(dd)->dd_clones);
-}
-
-static void
-count_ds_mos_objects(dsl_dataset_t *ds)
-{
- mos_obj_refd(ds->ds_object);
- mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
- mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
- mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
- mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
-
- if (!dsl_dataset_is_snapshot(ds)) {
- count_dir_mos_objects(ds->ds_dir);
- }
-}
-
-static const char *objset_types[DMU_OST_NUMTYPES] = {
- "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
-
-static void
-dump_dir(objset_t *os)
-{
- dmu_objset_stats_t dds;
- uint64_t object, object_count;
- uint64_t refdbytes, usedobjs, scratch;
- char numbuf[32];
- char blkbuf[BP_SPRINTF_LEN + 20];
- char osname[ZFS_MAX_DATASET_NAME_LEN];
- const char *type = "UNKNOWN";
- int verbosity = dump_opt['d'];
- int print_header = 1;
- unsigned i;
- int error;
- uint64_t total_slots_used = 0;
- uint64_t max_slot_used = 0;
- uint64_t dnode_slots;
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
-
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
- dmu_objset_fast_stat(os, &dds);
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
-
- if (dds.dds_type < DMU_OST_NUMTYPES)
- type = objset_types[dds.dds_type];
-
- if (dds.dds_type == DMU_OST_META) {
- dds.dds_creation_txg = TXG_INITIAL;
- usedobjs = BP_GET_FILL(os->os_rootbp);
- refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
- dd_used_bytes;
- } else {
- dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
- }
-
- ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
-
- zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
-
- if (verbosity >= 4) {
- (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
- (void) snprintf_blkptr(blkbuf + strlen(blkbuf),
- sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
- } else {
- blkbuf[0] = '\0';
- }
-
- dmu_objset_name(os, osname);
-
- (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
- "%s, %llu objects%s%s\n",
- osname, type, (u_longlong_t)dmu_objset_id(os),
- (u_longlong_t)dds.dds_creation_txg,
- numbuf, (u_longlong_t)usedobjs, blkbuf,
- (dds.dds_inconsistent) ? " (inconsistent)" : "");
-
- if (zopt_objects != 0) {
- for (i = 0; i < zopt_objects; i++)
- dump_object(os, zopt_object[i], verbosity,
- &print_header, NULL);
- (void) printf("\n");
- return;
- }
-
- if (dump_opt['i'] != 0 || verbosity >= 2)
- dump_intent_log(dmu_objset_zil(os));
-
- if (dmu_objset_ds(os) != NULL) {
- dsl_dataset_t *ds = dmu_objset_ds(os);
- dump_deadlist(&ds->ds_deadlist);
-
- if (dsl_dataset_remap_deadlist_exists(ds)) {
- (void) printf("ds_remap_deadlist:\n");
- dump_deadlist(&ds->ds_remap_deadlist);
- }
- count_ds_mos_objects(ds);
- }
-
- if (verbosity < 2)
- return;
-
- if (BP_IS_HOLE(os->os_rootbp))
- return;
-
- dump_object(os, 0, verbosity, &print_header, NULL);
- object_count = 0;
- if (DMU_USERUSED_DNODE(os) != NULL &&
- DMU_USERUSED_DNODE(os)->dn_type != 0) {
- dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
- NULL);
- dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
- NULL);
- }
-
- object = 0;
- while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
- dump_object(os, object, verbosity, &print_header, &dnode_slots);
- object_count++;
- total_slots_used += dnode_slots;
- max_slot_used = object + dnode_slots - 1;
- }
-
- (void) printf("\n");
-
- (void) printf(" Dnode slots:\n");
- (void) printf("\tTotal used: %10llu\n",
- (u_longlong_t)total_slots_used);
- (void) printf("\tMax used: %10llu\n",
- (u_longlong_t)max_slot_used);
- (void) printf("\tPercent empty: %10lf\n",
- (double)(max_slot_used - total_slots_used)*100 /
- (double)max_slot_used);
-
- (void) printf("\n");
-
- if (error != ESRCH) {
- (void) fprintf(stderr, "dmu_object_next() = %d\n", error);
- abort();
- }
-
- ASSERT3U(object_count, ==, usedobjs);
-
- if (leaked_objects != 0) {
- (void) printf("%d potentially leaked objects detected\n",
- leaked_objects);
- leaked_objects = 0;
- }
-}
-
-static void
-dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
-{
- time_t timestamp = ub->ub_timestamp;
-
- (void) printf("%s", header ? header : "");
- (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
- (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
- (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
- (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
- (void) printf("\ttimestamp = %llu UTC = %s",
- (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
-
- (void) printf("\tmmp_magic = %016llx\n",
- (u_longlong_t)ub->ub_mmp_magic);
- if (MMP_VALID(ub)) {
- (void) printf("\tmmp_delay = %0llu\n",
- (u_longlong_t)ub->ub_mmp_delay);
- if (MMP_SEQ_VALID(ub))
- (void) printf("\tmmp_seq = %u\n",
- (unsigned int) MMP_SEQ(ub));
- if (MMP_FAIL_INT_VALID(ub))
- (void) printf("\tmmp_fail = %u\n",
- (unsigned int) MMP_FAIL_INT(ub));
- if (MMP_INTERVAL_VALID(ub))
- (void) printf("\tmmp_write = %u\n",
- (unsigned int) MMP_INTERVAL(ub));
- /* After MMP_* to make summarize_uberblock_mmp cleaner */
- (void) printf("\tmmp_valid = %x\n",
- (unsigned int) ub->ub_mmp_config & 0xFF);
- }
-
- if (dump_opt['u'] >= 3) {
- char blkbuf[BP_SPRINTF_LEN];
- snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
- (void) printf("\trootbp = %s\n", blkbuf);
- }
- (void) printf("\tcheckpoint_txg = %llu\n",
- (u_longlong_t)ub->ub_checkpoint_txg);
- (void) printf("%s", footer ? footer : "");
-}
-
-static void
-dump_config(spa_t *spa)
-{
- dmu_buf_t *db;
- size_t nvsize = 0;
- int error = 0;
-
-
- error = dmu_bonus_hold(spa->spa_meta_objset,
- spa->spa_config_object, FTAG, &db);
-
- if (error == 0) {
- nvsize = *(uint64_t *)db->db_data;
- dmu_buf_rele(db, FTAG);
-
- (void) printf("\nMOS Configuration:\n");
- dump_packed_nvlist(spa->spa_meta_objset,
- spa->spa_config_object, (void *)&nvsize, 1);
- } else {
- (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
- (u_longlong_t)spa->spa_config_object, error);
- }
-}
-
-static void
-dump_cachefile(const char *cachefile)
-{
- int fd;
- struct stat64 statbuf;
- char *buf;
- nvlist_t *config;
-
- if ((fd = open64(cachefile, O_RDONLY)) < 0) {
- (void) fprintf(stderr, "cannot open '%s': %s\n", cachefile,
- strerror(errno));
- exit(1);
- }
-
- if (fstat64(fd, &statbuf) != 0) {
- (void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile,
- strerror(errno));
- exit(1);
- }
-
- if ((buf = malloc(statbuf.st_size)) == NULL) {
- (void) fprintf(stderr, "failed to allocate %llu bytes\n",
- (u_longlong_t)statbuf.st_size);
- exit(1);
- }
-
- if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
- (void) fprintf(stderr, "failed to read %llu bytes\n",
- (u_longlong_t)statbuf.st_size);
- exit(1);
- }
-
- (void) close(fd);
-
- if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
- (void) fprintf(stderr, "failed to unpack nvlist\n");
- exit(1);
- }
-
- free(buf);
-
- dump_nvlist(config, 0);
-
- nvlist_free(config);
-}
-
-#define ZDB_MAX_UB_HEADER_SIZE 32
-
-static void
-dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
-{
- vdev_t vd;
- vdev_t *vdp = &vd;
- char header[ZDB_MAX_UB_HEADER_SIZE];
-
- vd.vdev_ashift = ashift;
- vdp->vdev_top = vdp;
-
- for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
- uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
- uberblock_t *ub = (void *)((char *)lbl + uoff);
-
- if (uberblock_verify(ub))
- continue;
-
- if ((dump_opt['u'] < 4) &&
- (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
- (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
- continue;
-
- (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
- "Uberblock[%d]\n", i);
- dump_uberblock(ub, header, "");
- }
-}
-
-static char curpath[PATH_MAX];
-
-/*
- * Iterate through the path components, recursively passing
- * current one's obj and remaining path until we find the obj
- * for the last one.
- */
-static int
-dump_path_impl(objset_t *os, uint64_t obj, char *name)
-{
- int err;
- int header = 1;
- uint64_t child_obj;
- char *s;
- dmu_buf_t *db;
- dmu_object_info_t doi;
-
- if ((s = strchr(name, '/')) != NULL)
- *s = '\0';
- err = zap_lookup(os, obj, name, 8, 1, &child_obj);
-
- (void) strlcat(curpath, name, sizeof (curpath));
-
- if (err != 0) {
- (void) fprintf(stderr, "failed to lookup %s: %s\n",
- curpath, strerror(err));
- return (err);
- }
-
- child_obj = ZFS_DIRENT_OBJ(child_obj);
- err = sa_buf_hold(os, child_obj, FTAG, &db);
- if (err != 0) {
- (void) fprintf(stderr,
- "failed to get SA dbuf for obj %llu: %s\n",
- (u_longlong_t)child_obj, strerror(err));
- return (EINVAL);
- }
- dmu_object_info_from_db(db, &doi);
- sa_buf_rele(db, FTAG);
-
- if (doi.doi_bonus_type != DMU_OT_SA &&
- doi.doi_bonus_type != DMU_OT_ZNODE) {
- (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
- doi.doi_bonus_type, (u_longlong_t)child_obj);
- return (EINVAL);
- }
-
- if (dump_opt['v'] > 6) {
- (void) printf("obj=%llu %s type=%d bonustype=%d\n",
- (u_longlong_t)child_obj, curpath, doi.doi_type,
- doi.doi_bonus_type);
- }
-
- (void) strlcat(curpath, "/", sizeof (curpath));
-
- switch (doi.doi_type) {
- case DMU_OT_DIRECTORY_CONTENTS:
- if (s != NULL && *(s + 1) != '\0')
- return (dump_path_impl(os, child_obj, s + 1));
- /*FALLTHROUGH*/
- case DMU_OT_PLAIN_FILE_CONTENTS:
- dump_object(os, child_obj, dump_opt['v'], &header, NULL);
- return (0);
- default:
- (void) fprintf(stderr, "object %llu has non-file/directory "
- "type %d\n", (u_longlong_t)obj, doi.doi_type);
- break;
- }
-
- return (EINVAL);
-}
-
-/*
- * Dump the blocks for the object specified by path inside the dataset.
- */
-static int
-dump_path(char *ds, char *path)
-{
- int err;
- objset_t *os;
- uint64_t root_obj;
-
- err = open_objset(ds, DMU_OST_ZFS, FTAG, &os);
- if (err != 0)
- return (err);
-
- err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
- if (err != 0) {
- (void) fprintf(stderr, "can't lookup root znode: %s\n",
- strerror(err));
- dmu_objset_disown(os, FTAG);
- return (EINVAL);
- }
-
- (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
-
- err = dump_path_impl(os, root_obj, path);
-
- close_objset(os, FTAG);
- return (err);
-}
-
-static int
-dump_label(const char *dev)
-{
- int fd;
- vdev_label_t label;
- char path[MAXPATHLEN];
- char *buf = label.vl_vdev_phys.vp_nvlist;
- size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
- struct stat64 statbuf;
- uint64_t psize, ashift;
- boolean_t label_found = B_FALSE;
-
- (void) strlcpy(path, dev, sizeof (path));
- if (dev[0] == '/') {
- if (strncmp(dev, ZFS_DISK_ROOTD,
- strlen(ZFS_DISK_ROOTD)) == 0) {
- (void) snprintf(path, sizeof (path), "%s%s",
- ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD));
- }
- } else if (stat64(path, &statbuf) != 0) {
- char *s;
-
- (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
- dev);
- if (((s = strrchr(dev, 's')) == NULL &&
- (s = strchr(dev, 'p')) == NULL) ||
- !isdigit(*(s + 1)))
- (void) strlcat(path, "s0", sizeof (path));
- }
-
- if ((fd = open64(path, O_RDONLY)) < 0) {
- (void) fprintf(stderr, "cannot open '%s': %s\n", path,
- strerror(errno));
- exit(1);
- }
-
- if (fstat64(fd, &statbuf) != 0) {
- (void) fprintf(stderr, "failed to stat '%s': %s\n", path,
- strerror(errno));
- (void) close(fd);
- exit(1);
- }
-
- if (S_ISBLK(statbuf.st_mode)) {
- (void) fprintf(stderr,
- "cannot use '%s': character device required\n", path);
- (void) close(fd);
- exit(1);
- }
-
- psize = statbuf.st_size;
- psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
-
- for (int l = 0; l < VDEV_LABELS; l++) {
- nvlist_t *config = NULL;
-
- if (!dump_opt['q']) {
- (void) printf("------------------------------------\n");
- (void) printf("LABEL %d\n", l);
- (void) printf("------------------------------------\n");
- }
-
- if (pread64(fd, &label, sizeof (label),
- vdev_label_offset(psize, l, 0)) != sizeof (label)) {
- if (!dump_opt['q'])
- (void) printf("failed to read label %d\n", l);
- continue;
- }
-
- if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
- if (!dump_opt['q'])
- (void) printf("failed to unpack label %d\n", l);
- ashift = SPA_MINBLOCKSHIFT;
- } else {
- nvlist_t *vdev_tree = NULL;
-
- if (!dump_opt['q'])
- dump_nvlist(config, 4);
- if ((nvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
- (nvlist_lookup_uint64(vdev_tree,
- ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
- ashift = SPA_MINBLOCKSHIFT;
- nvlist_free(config);
- label_found = B_TRUE;
- }
- if (dump_opt['u'])
- dump_label_uberblocks(&label, ashift);
- }
-
- (void) close(fd);
-
- return (label_found ? 0 : 2);
-}
-
-static uint64_t dataset_feature_count[SPA_FEATURES];
-static uint64_t remap_deadlist_count = 0;
-
-/*ARGSUSED*/
-static int
-dump_one_dir(const char *dsname, void *arg)
-{
- int error;
- objset_t *os;
-
- error = open_objset(dsname, DMU_OST_ANY, FTAG, &os);
- if (error != 0)
- return (0);
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (!dmu_objset_ds(os)->ds_feature_inuse[f])
- continue;
- ASSERT(spa_feature_table[f].fi_flags &
- ZFEATURE_FLAG_PER_DATASET);
- dataset_feature_count[f]++;
- }
-
- if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
- remap_deadlist_count++;
- }
-
- dump_dir(os);
- close_objset(os, FTAG);
- fuid_table_destroy();
- return (0);
-}
-
-/*
- * Block statistics.
- */
-#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
-typedef struct zdb_blkstats {
- uint64_t zb_asize;
- uint64_t zb_lsize;
- uint64_t zb_psize;
- uint64_t zb_count;
- uint64_t zb_gangs;
- uint64_t zb_ditto_samevdev;
- uint64_t zb_ditto_same_ms;
- uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
-} zdb_blkstats_t;
-
-/*
- * Extended object types to report deferred frees and dedup auto-ditto blocks.
- */
-#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
-#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
-#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
-#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)
-
-static const char *zdb_ot_extname[] = {
- "deferred free",
- "dedup ditto",
- "other",
- "Total",
-};
-
-#define ZB_TOTAL DN_MAX_LEVELS
-
-typedef struct zdb_cb {
- zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
- uint64_t zcb_removing_size;
- uint64_t zcb_checkpoint_size;
- uint64_t zcb_dedup_asize;
- uint64_t zcb_dedup_blocks;
- uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
- uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
- [BPE_PAYLOAD_SIZE];
- uint64_t zcb_start;
- hrtime_t zcb_lastprint;
- uint64_t zcb_totalasize;
- uint64_t zcb_errors[256];
- int zcb_readfails;
- int zcb_haderrors;
- spa_t *zcb_spa;
- uint32_t **zcb_vd_obsolete_counts;
-} zdb_cb_t;
-
-/* test if two DVA offsets from same vdev are within the same metaslab */
-static boolean_t
-same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
-{
- vdev_t *vd = vdev_lookup_top(spa, vdev);
- uint64_t ms_shift = vd->vdev_ms_shift;
-
- return ((off1 >> ms_shift) == (off2 >> ms_shift));
-}
-
-static void
-zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
- dmu_object_type_t type)
-{
- uint64_t refcnt = 0;
-
- ASSERT(type < ZDB_OT_TOTAL);
-
- if (zilog && zil_bp_tree_add(zilog, bp) != 0)
- return;
-
- spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
-
- for (int i = 0; i < 4; i++) {
- int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
- int t = (i & 1) ? type : ZDB_OT_TOTAL;
- int equal;
- zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
-
- zb->zb_asize += BP_GET_ASIZE(bp);
- zb->zb_lsize += BP_GET_LSIZE(bp);
- zb->zb_psize += BP_GET_PSIZE(bp);
- zb->zb_count++;
-
- /*
- * The histogram is only big enough to record blocks up to
- * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
- * "other", bucket.
- */
- unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
- idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
- zb->zb_psize_histogram[idx]++;
-
- zb->zb_gangs += BP_COUNT_GANG(bp);
-
- switch (BP_GET_NDVAS(bp)) {
- case 2:
- if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1])) {
- zb->zb_ditto_samevdev++;
-
- if (same_metaslab(zcb->zcb_spa,
- DVA_GET_VDEV(&bp->blk_dva[0]),
- DVA_GET_OFFSET(&bp->blk_dva[0]),
- DVA_GET_OFFSET(&bp->blk_dva[1])))
- zb->zb_ditto_same_ms++;
- }
- break;
- case 3:
- equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1])) +
- (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[2])) +
- (DVA_GET_VDEV(&bp->blk_dva[1]) ==
- DVA_GET_VDEV(&bp->blk_dva[2]));
- if (equal != 0) {
- zb->zb_ditto_samevdev++;
-
- if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1]) &&
- same_metaslab(zcb->zcb_spa,
- DVA_GET_VDEV(&bp->blk_dva[0]),
- DVA_GET_OFFSET(&bp->blk_dva[0]),
- DVA_GET_OFFSET(&bp->blk_dva[1])))
- zb->zb_ditto_same_ms++;
- else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[2]) &&
- same_metaslab(zcb->zcb_spa,
- DVA_GET_VDEV(&bp->blk_dva[0]),
- DVA_GET_OFFSET(&bp->blk_dva[0]),
- DVA_GET_OFFSET(&bp->blk_dva[2])))
- zb->zb_ditto_same_ms++;
- else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
- DVA_GET_VDEV(&bp->blk_dva[2]) &&
- same_metaslab(zcb->zcb_spa,
- DVA_GET_VDEV(&bp->blk_dva[1]),
- DVA_GET_OFFSET(&bp->blk_dva[1]),
- DVA_GET_OFFSET(&bp->blk_dva[2])))
- zb->zb_ditto_same_ms++;
- }
- break;
- }
- }
-
- spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
-
- if (BP_IS_EMBEDDED(bp)) {
- zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
- zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
- [BPE_GET_PSIZE(bp)]++;
- return;
- }
-
- if (dump_opt['L'])
- return;
-
- if (BP_GET_DEDUP(bp)) {
- ddt_t *ddt;
- ddt_entry_t *dde;
-
- ddt = ddt_select(zcb->zcb_spa, bp);
- ddt_enter(ddt);
- dde = ddt_lookup(ddt, bp, B_FALSE);
-
- if (dde == NULL) {
- refcnt = 0;
- } else {
- ddt_phys_t *ddp = ddt_phys_select(dde, bp);
- ddt_phys_decref(ddp);
- refcnt = ddp->ddp_refcnt;
- if (ddt_phys_total_refcnt(dde) == 0)
- ddt_remove(ddt, dde);
- }
- ddt_exit(ddt);
- }
-
- VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
- refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
- bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
-}
-
-/* ARGSUSED */
-static void
-zdb_blkptr_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- blkptr_t *bp = zio->io_bp;
- int ioerr = zio->io_error;
- zdb_cb_t *zcb = zio->io_private;
- zbookmark_phys_t *zb = &zio->io_bookmark;
-
- abd_free(zio->io_abd);
-
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_inflight--;
- spa->spa_load_verify_ios--;
- cv_broadcast(&spa->spa_scrub_io_cv);
-
- if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- char blkbuf[BP_SPRINTF_LEN];
-
- zcb->zcb_haderrors = 1;
- zcb->zcb_errors[ioerr]++;
-
- if (dump_opt['b'] >= 2)
- snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
- else
- blkbuf[0] = '\0';
-
- (void) printf("zdb_blkptr_cb: "
- "Got error %d reading "
- "<%llu, %llu, %lld, %llx> %s -- skipping\n",
- ioerr,
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)zb->zb_object,
- (u_longlong_t)zb->zb_level,
- (u_longlong_t)zb->zb_blkid,
- blkbuf);
- }
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-/* ARGSUSED */
-static int
-zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- zdb_cb_t *zcb = arg;
- dmu_object_type_t type;
- boolean_t is_metadata;
-
- if (bp == NULL)
- return (0);
-
- if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
- char blkbuf[BP_SPRINTF_LEN];
- snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
- (void) printf("objset %llu object %llu "
- "level %lld offset 0x%llx %s\n",
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level,
- (u_longlong_t)blkid2offset(dnp, bp, zb),
- blkbuf);
- }
-
- if (BP_IS_HOLE(bp))
- return (0);
-
- type = BP_GET_TYPE(bp);
-
- zdb_count_block(zcb, zilog, bp,
- (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
-
- is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
-
- if (!BP_IS_EMBEDDED(bp) &&
- (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
- size_t size = BP_GET_PSIZE(bp);
- abd_t *abd = abd_alloc(size, B_FALSE);
- int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
-
- /* If it's an intent log block, failure is expected. */
- if (zb->zb_level == ZB_ZIL_LEVEL)
- flags |= ZIO_FLAG_SPECULATIVE;
-
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_load_verify_ios > max_inflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_inflight++;
- spa->spa_load_verify_ios++;
- mutex_exit(&spa->spa_scrub_lock);
-
- zio_nowait(zio_read(NULL, spa, bp, abd, size,
- zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
- }
-
- zcb->zcb_readfails = 0;
-
- /* only call gethrtime() every 100 blocks */
- static int iters;
- if (++iters > 100)
- iters = 0;
- else
- return (0);
-
- if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
- uint64_t now = gethrtime();
- char buf[10];
- uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
- int kb_per_sec =
- 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
- int sec_remaining =
- (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
-
- zfs_nicenum(bytes, buf, sizeof (buf));
- (void) fprintf(stderr,
- "\r%5s completed (%4dMB/s) "
- "estimated time remaining: %uhr %02umin %02usec ",
- buf, kb_per_sec / 1024,
- sec_remaining / 60 / 60,
- sec_remaining / 60 % 60,
- sec_remaining % 60);
-
- zcb->zcb_lastprint = now;
- }
-
- return (0);
-}
-
-static void
-zdb_leak(void *arg, uint64_t start, uint64_t size)
-{
- vdev_t *vd = arg;
-
- (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
- (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
-}
-
-static metaslab_ops_t zdb_metaslab_ops = {
- NULL /* alloc */
-};
-
-static void
-zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
-{
- ddt_bookmark_t ddb;
- ddt_entry_t dde;
- int error;
-
- ASSERT(!dump_opt['L']);
-
- bzero(&ddb, sizeof (ddb));
- while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
- blkptr_t blk;
- ddt_phys_t *ddp = dde.dde_phys;
-
- if (ddb.ddb_class == DDT_CLASS_UNIQUE)
- return;
-
- ASSERT(ddt_phys_total_refcnt(&dde) > 1);
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (ddp->ddp_phys_birth == 0)
- continue;
- ddt_bp_create(ddb.ddb_checksum,
- &dde.dde_key, ddp, &blk);
- if (p == DDT_PHYS_DITTO) {
- zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
- } else {
- zcb->zcb_dedup_asize +=
- BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
- zcb->zcb_dedup_blocks++;
- }
- }
- ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
- ddt_enter(ddt);
- VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
- ddt_exit(ddt);
- }
-
- ASSERT(error == ENOENT);
-}
-
-/* ARGSUSED */
-static void
-claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
- uint64_t size, void *arg)
-{
- /*
- * This callback was called through a remap from
- * a device being removed. Therefore, the vdev that
- * this callback is applied to is a concrete
- * vdev.
- */
- ASSERT(vdev_is_concrete(vd));
-
- VERIFY0(metaslab_claim_impl(vd, offset, size,
- spa_min_claim_txg(vd->vdev_spa)));
-}
-
-static void
-claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
-{
- vdev_t *vd = arg;
-
- vdev_indirect_ops.vdev_op_remap(vd, offset, size,
- claim_segment_impl_cb, NULL);
-}
-
-/*
- * After accounting for all allocated blocks that are directly referenced,
- * we might have missed a reference to a block from a partially complete
- * (and thus unused) indirect mapping object. We perform a secondary pass
- * through the metaslabs we have already mapped and claim the destination
- * blocks.
- */
-static void
-zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
-{
- if (dump_opt['L'])
- return;
-
- if (spa->spa_vdev_removal == NULL)
- return;
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
- metaslab_t *msp = vd->vdev_ms[msi];
-
- if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
- break;
-
- ASSERT0(range_tree_space(svr->svr_allocd_segs));
-
- if (msp->ms_sm != NULL) {
- VERIFY0(space_map_load(msp->ms_sm,
- svr->svr_allocd_segs, SM_ALLOC));
-
- /*
- * Clear everything past what has been synced unless
- * it's past the spacemap, because we have not allocated
- * mappings for it yet.
- */
- uint64_t vim_max_offset =
- vdev_indirect_mapping_max_offset(vim);
- uint64_t sm_end = msp->ms_sm->sm_start +
- msp->ms_sm->sm_size;
- if (sm_end > vim_max_offset)
- range_tree_clear(svr->svr_allocd_segs,
- vim_max_offset, sm_end - vim_max_offset);
- }
-
- zcb->zcb_removing_size +=
- range_tree_space(svr->svr_allocd_segs);
- range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
- }
-
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-}
-
-/* ARGSUSED */
-static int
-increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- zdb_cb_t *zcb = arg;
- spa_t *spa = zcb->zcb_spa;
- vdev_t *vd;
- const dva_t *dva = &bp->blk_dva[0];
-
- ASSERT(!dump_opt['L']);
- ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
- ASSERT3P(vd, !=, NULL);
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
-
- vdev_indirect_mapping_increment_obsolete_count(
- vd->vdev_indirect_mapping,
- DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
- zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
-
- return (0);
-}
-
-static uint32_t *
-zdb_load_obsolete_counts(vdev_t *vd)
-{
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- spa_t *spa = vd->vdev_spa;
- spa_condensing_indirect_phys_t *scip =
- &spa->spa_condensing_indirect_phys;
- uint32_t *counts;
-
- EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
- counts = vdev_indirect_mapping_load_obsolete_counts(vim);
- if (vd->vdev_obsolete_sm != NULL) {
- vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
- vd->vdev_obsolete_sm);
- }
- if (scip->scip_vdev == vd->vdev_id &&
- scip->scip_prev_obsolete_sm_object != 0) {
- space_map_t *prev_obsolete_sm = NULL;
- VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
- scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
- vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
- prev_obsolete_sm);
- space_map_close(prev_obsolete_sm);
- }
- return (counts);
-}
-
-typedef struct checkpoint_sm_exclude_entry_arg {
- vdev_t *cseea_vd;
- uint64_t cseea_checkpoint_size;
-} checkpoint_sm_exclude_entry_arg_t;
-
-static int
-checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
-{
- checkpoint_sm_exclude_entry_arg_t *cseea = arg;
- vdev_t *vd = cseea->cseea_vd;
- metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
- uint64_t end = sme->sme_offset + sme->sme_run;
-
- ASSERT(sme->sme_type == SM_FREE);
-
- /*
- * Since the vdev_checkpoint_sm exists in the vdev level
- * and the ms_sm space maps exist in the metaslab level,
- * an entry in the checkpoint space map could theoretically
- * cross the boundaries of the metaslab that it belongs.
- *
- * In reality, because of the way that we populate and
- * manipulate the checkpoint's space maps currently,
- * there shouldn't be any entries that cross metaslabs.
- * Hence the assertion below.
- *
- * That said, there is no fundamental requirement that
- * the checkpoint's space map entries should not cross
- * metaslab boundaries. So if needed we could add code
- * that handles metaslab-crossing segments in the future.
- */
- VERIFY3U(sme->sme_offset, >=, ms->ms_start);
- VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
-
- /*
- * By removing the entry from the allocated segments we
- * also verify that the entry is there to begin with.
- */
- mutex_enter(&ms->ms_lock);
- range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
- mutex_exit(&ms->ms_lock);
-
- cseea->cseea_checkpoint_size += sme->sme_run;
- return (0);
-}
-
-static void
-zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
-{
- spa_t *spa = vd->vdev_spa;
- space_map_t *checkpoint_sm = NULL;
- uint64_t checkpoint_sm_obj;
-
- /*
- * If there is no vdev_top_zap, we are in a pool whose
- * version predates the pool checkpoint feature.
- */
- if (vd->vdev_top_zap == 0)
- return;
-
- /*
- * If there is no reference of the vdev_checkpoint_sm in
- * the vdev_top_zap, then one of the following scenarios
- * is true:
- *
- * 1] There is no checkpoint
- * 2] There is a checkpoint, but no checkpointed blocks
- * have been freed yet
- * 3] The current vdev is indirect
- *
- * In these cases we return immediately.
- */
- if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
- VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
- return;
-
- VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
- VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
- &checkpoint_sm_obj));
-
- checkpoint_sm_exclude_entry_arg_t cseea;
- cseea.cseea_vd = vd;
- cseea.cseea_checkpoint_size = 0;
-
- VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
- checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
-
- VERIFY0(space_map_iterate(checkpoint_sm,
- space_map_length(checkpoint_sm),
- checkpoint_sm_exclude_entry_cb, &cseea));
- space_map_close(checkpoint_sm);
-
- zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
-}
-
-static void
-zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
-{
- ASSERT(!dump_opt['L']);
-
- vdev_t *rvd = spa->spa_root_vdev;
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
- zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
- }
-}
-
-static void
-load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- for (uint64_t i = 0; i < rvd->vdev_children; i++) {
- vdev_t *vd = rvd->vdev_child[i];
-
- ASSERT3U(i, ==, vd->vdev_id);
-
- if (vd->vdev_ops == &vdev_indirect_ops)
- continue;
-
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- (void) fprintf(stderr,
- "\rloading concrete vdev %llu, "
- "metaslab %llu of %llu ...",
- (longlong_t)vd->vdev_id,
- (longlong_t)msp->ms_id,
- (longlong_t)vd->vdev_ms_count);
-
- mutex_enter(&msp->ms_lock);
- metaslab_unload(msp);
-
- /*
- * We don't want to spend the CPU manipulating the
- * size-ordered tree, so clear the range_tree ops.
- */
- msp->ms_allocatable->rt_ops = NULL;
-
- if (msp->ms_sm != NULL) {
- VERIFY0(space_map_load(msp->ms_sm,
- msp->ms_allocatable, maptype));
- }
- if (!msp->ms_loaded)
- msp->ms_loaded = B_TRUE;
- mutex_exit(&msp->ms_lock);
- }
- }
-}
-
-/*
- * vm_idxp is an in-out parameter which (for indirect vdevs) is the
- * index in vim_entries that has the first entry in this metaslab.
- * On return, it will be set to the first entry after this metaslab.
- */
-static void
-load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
- uint64_t *vim_idxp)
-{
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- mutex_enter(&msp->ms_lock);
- metaslab_unload(msp);
-
- /*
- * We don't want to spend the CPU manipulating the
- * size-ordered tree, so clear the range_tree ops.
- */
- msp->ms_allocatable->rt_ops = NULL;
-
- for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
- (*vim_idxp)++) {
- vdev_indirect_mapping_entry_phys_t *vimep =
- &vim->vim_entries[*vim_idxp];
- uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
- uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
- ASSERT3U(ent_offset, >=, msp->ms_start);
- if (ent_offset >= msp->ms_start + msp->ms_size)
- break;
-
- /*
- * Mappings do not cross metaslab boundaries,
- * because we create them by walking the metaslabs.
- */
- ASSERT3U(ent_offset + ent_len, <=,
- msp->ms_start + msp->ms_size);
- range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
- }
-
- if (!msp->ms_loaded)
- msp->ms_loaded = B_TRUE;
- mutex_exit(&msp->ms_lock);
-}
-
-static void
-zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
-{
- ASSERT(!dump_opt['L']);
-
- vdev_t *rvd = spa->spa_root_vdev;
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
-
- ASSERT3U(c, ==, vd->vdev_id);
-
- if (vd->vdev_ops != &vdev_indirect_ops)
- continue;
-
- /*
- * Note: we don't check for mapping leaks on
- * removing vdevs because their ms_allocatable's
- * are used to look for leaks in allocated space.
- */
- zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
-
- /*
- * Normally, indirect vdevs don't have any
- * metaslabs. We want to set them up for
- * zio_claim().
- */
- VERIFY0(vdev_metaslab_init(vd, 0));
-
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- uint64_t vim_idx = 0;
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
-
- (void) fprintf(stderr,
- "\rloading indirect vdev %llu, "
- "metaslab %llu of %llu ...",
- (longlong_t)vd->vdev_id,
- (longlong_t)vd->vdev_ms[m]->ms_id,
- (longlong_t)vd->vdev_ms_count);
-
- load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
- &vim_idx);
- }
- ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
- }
-}
-
-static void
-zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
-{
- zcb->zcb_spa = spa;
-
- if (dump_opt['L'])
- return;
-
- dsl_pool_t *dp = spa->spa_dsl_pool;
- vdev_t *rvd = spa->spa_root_vdev;
-
- /*
- * We are going to be changing the meaning of the metaslab's
- * ms_allocatable. Ensure that the allocator doesn't try to
- * use the tree.
- */
- spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
- spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
-
- zcb->zcb_vd_obsolete_counts =
- umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
- UMEM_NOFAIL);
-
- /*
- * For leak detection, we overload the ms_allocatable trees
- * to contain allocated segments instead of free segments.
- * As a result, we can't use the normal metaslab_load/unload
- * interfaces.
- */
- zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
- load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
-
- /*
- * On load_concrete_ms_allocatable_trees() we loaded all the
- * allocated entries from the ms_sm to the ms_allocatable for
- * each metaslab. If the pool has a checkpoint or is in the
- * middle of discarding a checkpoint, some of these blocks
- * may have been freed but their ms_sm may not have been
- * updated because they are referenced by the checkpoint. In
- * order to avoid false-positives during leak-detection, we
- * go through the vdev's checkpoint space map and exclude all
- * its entries from their relevant ms_allocatable.
- *
- * We also aggregate the space held by the checkpoint and add
- * it to zcb_checkpoint_size.
- *
- * Note that at this point we are also verifying that all the
- * entries on the checkpoint_sm are marked as allocated in
- * the ms_sm of their relevant metaslab.
- * [see comment in checkpoint_sm_exclude_entry_cb()]
- */
- zdb_leak_init_exclude_checkpoint(spa, zcb);
- ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
-
- /* for cleaner progress output */
- (void) fprintf(stderr, "\n");
-
- if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
- ASSERT(spa_feature_is_enabled(spa,
- SPA_FEATURE_DEVICE_REMOVAL));
- (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
- increment_indirect_mapping_cb, zcb, NULL);
- }
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- zdb_ddt_leak_init(spa, zcb);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-}
-
-static boolean_t
-zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
-{
- boolean_t leaks = B_FALSE;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- uint64_t total_leaked = 0;
-
- ASSERT(vim != NULL);
-
- for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
- vdev_indirect_mapping_entry_phys_t *vimep =
- &vim->vim_entries[i];
- uint64_t obsolete_bytes = 0;
- uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
- metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- /*
- * This is not very efficient but it's easy to
- * verify correctness.
- */
- for (uint64_t inner_offset = 0;
- inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
- inner_offset += 1 << vd->vdev_ashift) {
- if (range_tree_contains(msp->ms_allocatable,
- offset + inner_offset, 1 << vd->vdev_ashift)) {
- obsolete_bytes += 1 << vd->vdev_ashift;
- }
- }
-
- int64_t bytes_leaked = obsolete_bytes -
- zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
- ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
- zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
- if (bytes_leaked != 0 &&
- (vdev_obsolete_counts_are_precise(vd) ||
- dump_opt['d'] >= 5)) {
- (void) printf("obsolete indirect mapping count "
- "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
- (u_longlong_t)vd->vdev_id,
- (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
- (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
- (u_longlong_t)bytes_leaked);
- }
- total_leaked += ABS(bytes_leaked);
- }
-
- if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) {
- int pct_leaked = total_leaked * 100 /
- vdev_indirect_mapping_bytes_mapped(vim);
- (void) printf("cannot verify obsolete indirect mapping "
- "counts of vdev %llu because precise feature was not "
- "enabled when it was removed: %d%% (%llx bytes) of mapping"
- "unreferenced\n",
- (u_longlong_t)vd->vdev_id, pct_leaked,
- (u_longlong_t)total_leaked);
- } else if (total_leaked > 0) {
- (void) printf("obsolete indirect mapping count mismatch "
- "for vdev %llu -- %llx total bytes mismatched\n",
- (u_longlong_t)vd->vdev_id,
- (u_longlong_t)total_leaked);
- leaks |= B_TRUE;
- }
-
- vdev_indirect_mapping_free_obsolete_counts(vim,
- zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
- zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
-
- return (leaks);
-}
-
-static boolean_t
-zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
-{
- if (dump_opt['L'])
- return (B_FALSE);
-
- boolean_t leaks = B_FALSE;
-
- vdev_t *rvd = spa->spa_root_vdev;
- for (unsigned c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
-#if DEBUG
- metaslab_group_t *mg = vd->vdev_mg;
-#endif
-
- if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
- leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
- }
-
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- ASSERT3P(mg, ==, msp->ms_group);
-
- /*
- * ms_allocatable has been overloaded
- * to contain allocated segments. Now that
- * we finished traversing all blocks, any
- * block that remains in the ms_allocatable
- * represents an allocated block that we
- * did not claim during the traversal.
- * Claimed blocks would have been removed
- * from the ms_allocatable. For indirect
- * vdevs, space remaining in the tree
- * represents parts of the mapping that are
- * not referenced, which is not a bug.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- range_tree_vacate(msp->ms_allocatable,
- NULL, NULL);
- } else {
- range_tree_vacate(msp->ms_allocatable,
- zdb_leak, vd);
- }
-
- if (msp->ms_loaded) {
- msp->ms_loaded = B_FALSE;
- }
- }
-
- }
-
- umem_free(zcb->zcb_vd_obsolete_counts,
- rvd->vdev_children * sizeof (uint32_t *));
- zcb->zcb_vd_obsolete_counts = NULL;
-
- return (leaks);
-}
-
-/* ARGSUSED */
-static int
-count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- zdb_cb_t *zcb = arg;
-
- if (dump_opt['b'] >= 5) {
- char blkbuf[BP_SPRINTF_LEN];
- snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
- (void) printf("[%s] %s\n",
- "deferred free", blkbuf);
- }
- zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
- return (0);
-}
-
-static int
-dump_block_stats(spa_t *spa)
-{
- zdb_cb_t zcb;
- zdb_blkstats_t *zb, *tzb;
- uint64_t norm_alloc, norm_space, total_alloc, total_found;
- int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
- boolean_t leaks = B_FALSE;
- int err;
-
- bzero(&zcb, sizeof (zcb));
- (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
- (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
- (dump_opt['c'] == 1) ? "metadata " : "",
- dump_opt['c'] ? "checksums " : "",
- (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
- !dump_opt['L'] ? "nothing leaked " : "");
-
- /*
- * When leak detection is enabled we load all space maps as SM_ALLOC
- * maps, then traverse the pool claiming each block we discover. If
- * the pool is perfectly consistent, the segment trees will be empty
- * when we're done. Anything left over is a leak; any block we can't
- * claim (because it's not part of any space map) is a double
- * allocation, reference to a freed block, or an unclaimed log block.
- *
- * When leak detection is disabled (-L option) we still traverse the
- * pool claiming each block we discover, but we skip opening any space
- * maps.
- */
- bzero(&zcb, sizeof (zdb_cb_t));
- zdb_leak_init(spa, &zcb);
-
- /*
- * If there's a deferred-free bplist, process that first.
- */
- (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
- count_block_cb, &zcb, NULL);
-
- if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
- (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
- count_block_cb, &zcb, NULL);
- }
-
- zdb_claim_removing(spa, &zcb);
-
- if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
- VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
- spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
- &zcb, NULL));
- }
-
- if (dump_opt['c'] > 1)
- flags |= TRAVERSE_PREFETCH_DATA;
-
- zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
- zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
- zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
- zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
- err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
-
- /*
- * If we've traversed the data blocks then we need to wait for those
- * I/Os to complete. We leverage "The Godfather" zio to wait on
- * all async I/Os to complete.
- */
- if (dump_opt['c']) {
- for (int i = 0; i < max_ncpus; i++) {
- (void) zio_wait(spa->spa_async_zio_root[i]);
- spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
- ZIO_FLAG_GODFATHER);
- }
- }
-
- /*
- * Done after zio_wait() since zcb_haderrors is modified in
- * zdb_blkptr_done()
- */
- zcb.zcb_haderrors |= err;
-
- if (zcb.zcb_haderrors) {
- (void) printf("\nError counts:\n\n");
- (void) printf("\t%5s %s\n", "errno", "count");
- for (int e = 0; e < 256; e++) {
- if (zcb.zcb_errors[e] != 0) {
- (void) printf("\t%5d %llu\n",
- e, (u_longlong_t)zcb.zcb_errors[e]);
- }
- }
- }
-
- /*
- * Report any leaked segments.
- */
- leaks |= zdb_leak_fini(spa, &zcb);
-
- tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
-
- norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
- norm_space = metaslab_class_get_space(spa_normal_class(spa));
-
- total_alloc = norm_alloc +
- metaslab_class_get_alloc(spa_log_class(spa)) +
- metaslab_class_get_alloc(spa_special_class(spa)) +
- metaslab_class_get_alloc(spa_dedup_class(spa));
- total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
- zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
-
- if (total_found == total_alloc && !dump_opt['L']) {
- (void) printf("\n\tNo leaks (block sum matches space"
- " maps exactly)\n");
- } else if (!dump_opt['L']) {
- (void) printf("block traversal size %llu != alloc %llu "
- "(%s %lld)\n",
- (u_longlong_t)total_found,
- (u_longlong_t)total_alloc,
- (dump_opt['L']) ? "unreachable" : "leaked",
- (longlong_t)(total_alloc - total_found));
- leaks = B_TRUE;
- }
-
- if (tzb->zb_count == 0)
- return (2);
-
- (void) printf("\n");
- (void) printf("\t%-16s %14llu\n", "bp count:",
- (u_longlong_t)tzb->zb_count);
- (void) printf("\t%-16s %14llu\n", "ganged count:",
- (longlong_t)tzb->zb_gangs);
- (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",
- (u_longlong_t)tzb->zb_lsize,
- (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
- (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
- "bp physical:", (u_longlong_t)tzb->zb_psize,
- (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
- (double)tzb->zb_lsize / tzb->zb_psize);
- (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
- "bp allocated:", (u_longlong_t)tzb->zb_asize,
- (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
- (double)tzb->zb_lsize / tzb->zb_asize);
- (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",
- "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
- (u_longlong_t)zcb.zcb_dedup_blocks,
- (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
- (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
- (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
-
- if (spa_special_class(spa)->mc_rotor != NULL) {
- uint64_t alloc = metaslab_class_get_alloc(
- spa_special_class(spa));
- uint64_t space = metaslab_class_get_space(
- spa_special_class(spa));
-
- (void) printf("\t%-16s %14llu used: %5.2f%%\n",
- "Special class", (u_longlong_t)alloc,
- 100.0 * alloc / space);
- }
-
- if (spa_dedup_class(spa)->mc_rotor != NULL) {
- uint64_t alloc = metaslab_class_get_alloc(
- spa_dedup_class(spa));
- uint64_t space = metaslab_class_get_space(
- spa_dedup_class(spa));
-
- (void) printf("\t%-16s %14llu used: %5.2f%%\n",
- "Dedup class", (u_longlong_t)alloc,
- 100.0 * alloc / space);
- }
-
- for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
- if (zcb.zcb_embedded_blocks[i] == 0)
- continue;
- (void) printf("\n");
- (void) printf("\tadditional, non-pointer bps of type %u: "
- "%10llu\n",
- i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
-
- if (dump_opt['b'] >= 3) {
- (void) printf("\t number of (compressed) bytes: "
- "number of bps\n");
- dump_histogram(zcb.zcb_embedded_histogram[i],
- sizeof (zcb.zcb_embedded_histogram[i]) /
- sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
- }
- }
-
- if (tzb->zb_ditto_samevdev != 0) {
- (void) printf("\tDittoed blocks on same vdev: %llu\n",
- (longlong_t)tzb->zb_ditto_samevdev);
- }
- if (tzb->zb_ditto_same_ms != 0) {
- (void) printf("\tDittoed blocks in same metaslab: %llu\n",
- (longlong_t)tzb->zb_ditto_same_ms);
- }
-
- for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
- vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- if (vim == NULL) {
- continue;
- }
-
- char mem[32];
- zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
- mem, vdev_indirect_mapping_size(vim));
-
- (void) printf("\tindirect vdev id %llu has %llu segments "
- "(%s in memory)\n",
- (longlong_t)vd->vdev_id,
- (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
- }
-
- if (dump_opt['b'] >= 2) {
- int l, t, level;
- (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
- "\t avg\t comp\t%%Total\tType\n");
-
- for (t = 0; t <= ZDB_OT_TOTAL; t++) {
- char csize[32], lsize[32], psize[32], asize[32];
- char avg[32], gang[32];
- const char *typename;
-
- /* make sure nicenum has enough space */
- CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
- CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
-
- if (t < DMU_OT_NUMTYPES)
- typename = dmu_ot[t].ot_name;
- else
- typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
-
- if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
- (void) printf("%6s\t%5s\t%5s\t%5s"
- "\t%5s\t%5s\t%6s\t%s\n",
- "-",
- "-",
- "-",
- "-",
- "-",
- "-",
- "-",
- typename);
- continue;
- }
-
- for (l = ZB_TOTAL - 1; l >= -1; l--) {
- level = (l == -1 ? ZB_TOTAL : l);
- zb = &zcb.zcb_type[level][t];
-
- if (zb->zb_asize == 0)
- continue;
-
- if (dump_opt['b'] < 3 && level != ZB_TOTAL)
- continue;
-
- if (level == 0 && zb->zb_asize ==
- zcb.zcb_type[ZB_TOTAL][t].zb_asize)
- continue;
-
- zdb_nicenum(zb->zb_count, csize,
- sizeof (csize));
- zdb_nicenum(zb->zb_lsize, lsize,
- sizeof (lsize));
- zdb_nicenum(zb->zb_psize, psize,
- sizeof (psize));
- zdb_nicenum(zb->zb_asize, asize,
- sizeof (asize));
- zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
- sizeof (avg));
- zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
-
- (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
- "\t%5.2f\t%6.2f\t",
- csize, lsize, psize, asize, avg,
- (double)zb->zb_lsize / zb->zb_psize,
- 100.0 * zb->zb_asize / tzb->zb_asize);
-
- if (level == ZB_TOTAL)
- (void) printf("%s\n", typename);
- else
- (void) printf(" L%d %s\n",
- level, typename);
-
- if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
- (void) printf("\t number of ganged "
- "blocks: %s\n", gang);
- }
-
- if (dump_opt['b'] >= 4) {
- (void) printf("psize "
- "(in 512-byte sectors): "
- "number of blocks\n");
- dump_histogram(zb->zb_psize_histogram,
- PSIZE_HISTO_SIZE, 0);
- }
- }
- }
- }
-
- (void) printf("\n");
-
- if (leaks)
- return (2);
-
- if (zcb.zcb_haderrors)
- return (3);
-
- return (0);
-}
-
-typedef struct zdb_ddt_entry {
- ddt_key_t zdde_key;
- uint64_t zdde_ref_blocks;
- uint64_t zdde_ref_lsize;
- uint64_t zdde_ref_psize;
- uint64_t zdde_ref_dsize;
- avl_node_t zdde_node;
-} zdb_ddt_entry_t;
-
-/* ARGSUSED */
-static int
-zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- avl_tree_t *t = arg;
- avl_index_t where;
- zdb_ddt_entry_t *zdde, zdde_search;
-
- if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
- return (0);
-
- if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
- (void) printf("traversing objset %llu, %llu objects, "
- "%lu blocks so far\n",
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)BP_GET_FILL(bp),
- avl_numnodes(t));
- }
-
- if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
- BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
- return (0);
-
- ddt_key_fill(&zdde_search.zdde_key, bp);
-
- zdde = avl_find(t, &zdde_search, &where);
-
- if (zdde == NULL) {
- zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
- zdde->zdde_key = zdde_search.zdde_key;
- avl_insert(t, zdde, where);
- }
-
- zdde->zdde_ref_blocks += 1;
- zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
- zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
- zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
-
- return (0);
-}
-
-static void
-dump_simulated_ddt(spa_t *spa)
-{
- avl_tree_t t;
- void *cookie = NULL;
- zdb_ddt_entry_t *zdde;
- ddt_histogram_t ddh_total;
- ddt_stat_t dds_total;
-
- bzero(&ddh_total, sizeof (ddh_total));
- bzero(&dds_total, sizeof (dds_total));
- avl_create(&t, ddt_entry_compare,
- sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
- zdb_ddt_add_cb, &t);
-
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-
- while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
- ddt_stat_t dds;
- uint64_t refcnt = zdde->zdde_ref_blocks;
- ASSERT(refcnt != 0);
-
- dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
- dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
- dds.dds_psize = zdde->zdde_ref_psize / refcnt;
- dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
-
- dds.dds_ref_blocks = zdde->zdde_ref_blocks;
- dds.dds_ref_lsize = zdde->zdde_ref_lsize;
- dds.dds_ref_psize = zdde->zdde_ref_psize;
- dds.dds_ref_dsize = zdde->zdde_ref_dsize;
-
- ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
- &dds, 0);
-
- umem_free(zdde, sizeof (*zdde));
- }
-
- avl_destroy(&t);
-
- ddt_histogram_stat(&dds_total, &ddh_total);
-
- (void) printf("Simulated DDT histogram:\n");
-
- zpool_dump_ddt(&dds_total, &ddh_total);
-
- dump_dedup_ratio(&dds_total);
-}
-
-static int
-verify_device_removal_feature_counts(spa_t *spa)
-{
- uint64_t dr_feature_refcount = 0;
- uint64_t oc_feature_refcount = 0;
- uint64_t indirect_vdev_count = 0;
- uint64_t precise_vdev_count = 0;
- uint64_t obsolete_counts_object_count = 0;
- uint64_t obsolete_sm_count = 0;
- uint64_t obsolete_counts_count = 0;
- uint64_t scip_count = 0;
- uint64_t obsolete_bpobj_count = 0;
- int ret = 0;
-
- spa_condensing_indirect_phys_t *scip =
- &spa->spa_condensing_indirect_phys;
- if (scip->scip_next_mapping_object != 0) {
- vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
- ASSERT(scip->scip_prev_obsolete_sm_object != 0);
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
-
- (void) printf("Condensing indirect vdev %llu: new mapping "
- "object %llu, prev obsolete sm %llu\n",
- (u_longlong_t)scip->scip_vdev,
- (u_longlong_t)scip->scip_next_mapping_object,
- (u_longlong_t)scip->scip_prev_obsolete_sm_object);
- if (scip->scip_prev_obsolete_sm_object != 0) {
- space_map_t *prev_obsolete_sm = NULL;
- VERIFY0(space_map_open(&prev_obsolete_sm,
- spa->spa_meta_objset,
- scip->scip_prev_obsolete_sm_object,
- 0, vd->vdev_asize, 0));
- dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
- (void) printf("\n");
- space_map_close(prev_obsolete_sm);
- }
-
- scip_count += 2;
- }
-
- for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
- vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-
- if (vic->vic_mapping_object != 0) {
- ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
- vd->vdev_removing);
- indirect_vdev_count++;
-
- if (vd->vdev_indirect_mapping->vim_havecounts) {
- obsolete_counts_count++;
- }
- }
- if (vdev_obsolete_counts_are_precise(vd)) {
- ASSERT(vic->vic_mapping_object != 0);
- precise_vdev_count++;
- }
- if (vdev_obsolete_sm_object(vd) != 0) {
- ASSERT(vic->vic_mapping_object != 0);
- obsolete_sm_count++;
- }
- }
-
- (void) feature_get_refcount(spa,
- &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
- &dr_feature_refcount);
- (void) feature_get_refcount(spa,
- &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
- &oc_feature_refcount);
-
- if (dr_feature_refcount != indirect_vdev_count) {
- ret = 1;
- (void) printf("Number of indirect vdevs (%llu) " \
- "does not match feature count (%llu)\n",
- (u_longlong_t)indirect_vdev_count,
- (u_longlong_t)dr_feature_refcount);
- } else {
- (void) printf("Verified device_removal feature refcount " \
- "of %llu is correct\n",
- (u_longlong_t)dr_feature_refcount);
- }
-
- if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_OBSOLETE_BPOBJ) == 0) {
- obsolete_bpobj_count++;
- }
-
-
- obsolete_counts_object_count = precise_vdev_count;
- obsolete_counts_object_count += obsolete_sm_count;
- obsolete_counts_object_count += obsolete_counts_count;
- obsolete_counts_object_count += scip_count;
- obsolete_counts_object_count += obsolete_bpobj_count;
- obsolete_counts_object_count += remap_deadlist_count;
-
- if (oc_feature_refcount != obsolete_counts_object_count) {
- ret = 1;
- (void) printf("Number of obsolete counts objects (%llu) " \
- "does not match feature count (%llu)\n",
- (u_longlong_t)obsolete_counts_object_count,
- (u_longlong_t)oc_feature_refcount);
- (void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
- "ob:%llu rd:%llu\n",
- (u_longlong_t)precise_vdev_count,
- (u_longlong_t)obsolete_sm_count,
- (u_longlong_t)obsolete_counts_count,
- (u_longlong_t)scip_count,
- (u_longlong_t)obsolete_bpobj_count,
- (u_longlong_t)remap_deadlist_count);
- } else {
- (void) printf("Verified indirect_refcount feature refcount " \
- "of %llu is correct\n",
- (u_longlong_t)oc_feature_refcount);
- }
- return (ret);
-}
-
-static void
-zdb_set_skip_mmp(char *target)
-{
- spa_t *spa;
-
- /*
- * Disable the activity check to allow examination of
- * active pools.
- */
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(target)) != NULL) {
- spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
- }
- mutex_exit(&spa_namespace_lock);
-}
-
-#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
-/*
- * Import the checkpointed state of the pool specified by the target
- * parameter as readonly. The function also accepts a pool config
- * as an optional parameter, else it attempts to infer the config by
- * the name of the target pool.
- *
- * Note that the checkpointed state's pool name will be the name of
- * the original pool with the above suffix appened to it. In addition,
- * if the target is not a pool name (e.g. a path to a dataset) then
- * the new_path parameter is populated with the updated path to
- * reflect the fact that we are looking into the checkpointed state.
- *
- * The function returns a newly-allocated copy of the name of the
- * pool containing the checkpointed state. When this copy is no
- * longer needed it should be freed with free(3C). Same thing
- * applies to the new_path parameter if allocated.
- */
-static char *
-import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
-{
- int error = 0;
- char *poolname, *bogus_name;
-
- /* If the target is not a pool, the extract the pool name */
- char *path_start = strchr(target, '/');
- if (path_start != NULL) {
- size_t poolname_len = path_start - target;
- poolname = strndup(target, poolname_len);
- } else {
- poolname = target;
- }
-
- if (cfg == NULL) {
- zdb_set_skip_mmp(poolname);
- error = spa_get_stats(poolname, &cfg, NULL, 0);
- if (error != 0) {
- fatal("Tried to read config of pool \"%s\" but "
- "spa_get_stats() failed with error %d\n",
- poolname, error);
- }
- }
-
- (void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX);
- fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
-
- error = spa_import(bogus_name, cfg, NULL,
- ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
- ZFS_IMPORT_SKIP_MMP);
- if (error != 0) {
- fatal("Tried to import pool \"%s\" but spa_import() failed "
- "with error %d\n", bogus_name, error);
- }
-
- if (new_path != NULL && path_start != NULL)
- (void) asprintf(new_path, "%s%s", bogus_name, path_start);
-
- if (target != poolname)
- free(poolname);
-
- return (bogus_name);
-}
-
-typedef struct verify_checkpoint_sm_entry_cb_arg {
- vdev_t *vcsec_vd;
-
- /* the following fields are only used for printing progress */
- uint64_t vcsec_entryid;
- uint64_t vcsec_num_entries;
-} verify_checkpoint_sm_entry_cb_arg_t;
-
-#define ENTRIES_PER_PROGRESS_UPDATE 10000
-
-static int
-verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
-{
- verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
- vdev_t *vd = vcsec->vcsec_vd;
- metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
- uint64_t end = sme->sme_offset + sme->sme_run;
-
- ASSERT(sme->sme_type == SM_FREE);
-
- if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
- (void) fprintf(stderr,
- "\rverifying vdev %llu, space map entry %llu of %llu ...",
- (longlong_t)vd->vdev_id,
- (longlong_t)vcsec->vcsec_entryid,
- (longlong_t)vcsec->vcsec_num_entries);
- }
- vcsec->vcsec_entryid++;
-
- /*
- * See comment in checkpoint_sm_exclude_entry_cb()
- */
- VERIFY3U(sme->sme_offset, >=, ms->ms_start);
- VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
-
- /*
- * The entries in the vdev_checkpoint_sm should be marked as
- * allocated in the checkpointed state of the pool, therefore
- * their respective ms_allocateable trees should not contain them.
- */
- mutex_enter(&ms->ms_lock);
- range_tree_verify_not_present(ms->ms_allocatable,
- sme->sme_offset, sme->sme_run);
- mutex_exit(&ms->ms_lock);
-
- return (0);
-}
-
-/*
- * Verify that all segments in the vdev_checkpoint_sm are allocated
- * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
- * ms_allocatable).
- *
- * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
- * each vdev in the current state of the pool to the metaslab space maps
- * (ms_sm) of the checkpointed state of the pool.
- *
- * Note that the function changes the state of the ms_allocatable
- * trees of the current spa_t. The entries of these ms_allocatable
- * trees are cleared out and then repopulated from with the free
- * entries of their respective ms_sm space maps.
- */
-static void
-verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
-{
- vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
- vdev_t *current_rvd = current->spa_root_vdev;
-
- load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
-
- for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
- vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
- vdev_t *current_vd = current_rvd->vdev_child[c];
-
- space_map_t *checkpoint_sm = NULL;
- uint64_t checkpoint_sm_obj;
-
- if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
- /*
- * Since we don't allow device removal in a pool
- * that has a checkpoint, we expect that all removed
- * vdevs were removed from the pool before the
- * checkpoint.
- */
- ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
- continue;
- }
-
- /*
- * If the checkpoint space map doesn't exist, then nothing
- * here is checkpointed so there's nothing to verify.
- */
- if (current_vd->vdev_top_zap == 0 ||
- zap_contains(spa_meta_objset(current),
- current_vd->vdev_top_zap,
- VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
- continue;
-
- VERIFY0(zap_lookup(spa_meta_objset(current),
- current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
- sizeof (uint64_t), 1, &checkpoint_sm_obj));
-
- VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
- checkpoint_sm_obj, 0, current_vd->vdev_asize,
- current_vd->vdev_ashift));
-
- verify_checkpoint_sm_entry_cb_arg_t vcsec;
- vcsec.vcsec_vd = ckpoint_vd;
- vcsec.vcsec_entryid = 0;
- vcsec.vcsec_num_entries =
- space_map_length(checkpoint_sm) / sizeof (uint64_t);
- VERIFY0(space_map_iterate(checkpoint_sm,
- space_map_length(checkpoint_sm),
- verify_checkpoint_sm_entry_cb, &vcsec));
- dump_spacemap(current->spa_meta_objset, checkpoint_sm);
- space_map_close(checkpoint_sm);
- }
-
- /*
- * If we've added vdevs since we took the checkpoint, ensure
- * that their checkpoint space maps are empty.
- */
- if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
- for (uint64_t c = ckpoint_rvd->vdev_children;
- c < current_rvd->vdev_children; c++) {
- vdev_t *current_vd = current_rvd->vdev_child[c];
- ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
- }
- }
-
- /* for cleaner progress output */
- (void) fprintf(stderr, "\n");
-}
-
-/*
- * Verifies that all space that's allocated in the checkpoint is
- * still allocated in the current version, by checking that everything
- * in checkpoint's ms_allocatable (which is actually allocated, not
- * allocatable/free) is not present in current's ms_allocatable.
- *
- * Note that the function changes the state of the ms_allocatable
- * trees of both spas when called. The entries of all ms_allocatable
- * trees are cleared out and then repopulated from their respective
- * ms_sm space maps. In the checkpointed state we load the allocated
- * entries, and in the current state we load the free entries.
- */
-static void
-verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
-{
- vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
- vdev_t *current_rvd = current->spa_root_vdev;
-
- load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
- load_concrete_ms_allocatable_trees(current, SM_FREE);
-
- for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
- vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
- vdev_t *current_vd = current_rvd->vdev_child[i];
-
- if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
- /*
- * See comment in verify_checkpoint_vdev_spacemaps()
- */
- ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
- continue;
- }
-
- for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
- metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
- metaslab_t *current_msp = current_vd->vdev_ms[m];
-
- (void) fprintf(stderr,
- "\rverifying vdev %llu of %llu, "
- "metaslab %llu of %llu ...",
- (longlong_t)current_vd->vdev_id,
- (longlong_t)current_rvd->vdev_children,
- (longlong_t)current_vd->vdev_ms[m]->ms_id,
- (longlong_t)current_vd->vdev_ms_count);
-
- /*
- * We walk through the ms_allocatable trees that
- * are loaded with the allocated blocks from the
- * ms_sm spacemaps of the checkpoint. For each
- * one of these ranges we ensure that none of them
- * exists in the ms_allocatable trees of the
- * current state which are loaded with the ranges
- * that are currently free.
- *
- * This way we ensure that none of the blocks that
- * are part of the checkpoint were freed by mistake.
- */
- range_tree_walk(ckpoint_msp->ms_allocatable,
- (range_tree_func_t *)range_tree_verify_not_present,
- current_msp->ms_allocatable);
- }
- }
-
- /* for cleaner progress output */
- (void) fprintf(stderr, "\n");
-}
-
-static void
-verify_checkpoint_blocks(spa_t *spa)
-{
- ASSERT(!dump_opt['L']);
-
- spa_t *checkpoint_spa;
- char *checkpoint_pool;
- nvlist_t *config = NULL;
- int error = 0;
-
- /*
- * We import the checkpointed state of the pool (under a different
- * name) so we can do verification on it against the current state
- * of the pool.
- */
- checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
- NULL);
- ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
-
- error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
- if (error != 0) {
- fatal("Tried to open pool \"%s\" but spa_open() failed with "
- "error %d\n", checkpoint_pool, error);
- }
-
- /*
- * Ensure that ranges in the checkpoint space maps of each vdev
- * are allocated according to the checkpointed state's metaslab
- * space maps.
- */
- verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
-
- /*
- * Ensure that allocated ranges in the checkpoint's metaslab
- * space maps remain allocated in the metaslab space maps of
- * the current state.
- */
- verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
-
- /*
- * Once we are done, we get rid of the checkpointed state.
- */
- spa_close(checkpoint_spa, FTAG);
- free(checkpoint_pool);
-}
-
-static void
-dump_leftover_checkpoint_blocks(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- for (uint64_t i = 0; i < rvd->vdev_children; i++) {
- vdev_t *vd = rvd->vdev_child[i];
-
- space_map_t *checkpoint_sm = NULL;
- uint64_t checkpoint_sm_obj;
-
- if (vd->vdev_top_zap == 0)
- continue;
-
- if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
- VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
- continue;
-
- VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
- VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
- sizeof (uint64_t), 1, &checkpoint_sm_obj));
-
- VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
- checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
- dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
- space_map_close(checkpoint_sm);
- }
-}
-
-static int
-verify_checkpoint(spa_t *spa)
-{
- uberblock_t checkpoint;
- int error;
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
- return (0);
-
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
- sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
-
- if (error == ENOENT && !dump_opt['L']) {
- /*
- * If the feature is active but the uberblock is missing
- * then we must be in the middle of discarding the
- * checkpoint.
- */
- (void) printf("\nPartially discarded checkpoint "
- "state found:\n");
- dump_leftover_checkpoint_blocks(spa);
- return (0);
- } else if (error != 0) {
- (void) printf("lookup error %d when looking for "
- "checkpointed uberblock in MOS\n", error);
- return (error);
- }
- dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
-
- if (checkpoint.ub_checkpoint_txg == 0) {
- (void) printf("\nub_checkpoint_txg not set in checkpointed "
- "uberblock\n");
- error = 3;
- }
-
- if (error == 0 && !dump_opt['L'])
- verify_checkpoint_blocks(spa);
-
- return (error);
-}
-
-/* ARGSUSED */
-static void
-mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
-{
- for (uint64_t i = start; i < size; i++) {
- (void) printf("MOS object %llu referenced but not allocated\n",
- (u_longlong_t)i);
- }
-}
-
-static range_tree_t *mos_refd_objs;
-
-static void
-mos_obj_refd(uint64_t obj)
-{
- if (obj != 0 && mos_refd_objs != NULL)
- range_tree_add(mos_refd_objs, obj, 1);
-}
-
-static void
-mos_leak_vdev(vdev_t *vd)
-{
- mos_obj_refd(vd->vdev_dtl_object);
- mos_obj_refd(vd->vdev_ms_array);
- mos_obj_refd(vd->vdev_top_zap);
- mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
- mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
- mos_obj_refd(vd->vdev_leaf_zap);
- if (vd->vdev_checkpoint_sm != NULL)
- mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
- if (vd->vdev_indirect_mapping != NULL) {
- mos_obj_refd(vd->vdev_indirect_mapping->
- vim_phys->vimp_counts_object);
- }
- if (vd->vdev_obsolete_sm != NULL)
- mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
-
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *ms = vd->vdev_ms[m];
- mos_obj_refd(space_map_object(ms->ms_sm));
- }
-
- for (uint64_t c = 0; c < vd->vdev_children; c++) {
- mos_leak_vdev(vd->vdev_child[c]);
- }
-}
-
-static int
-dump_mos_leaks(spa_t *spa)
-{
- int rv = 0;
- objset_t *mos = spa->spa_meta_objset;
- dsl_pool_t *dp = spa->spa_dsl_pool;
-
- /* Visit and mark all referenced objects in the MOS */
-
- mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
- mos_obj_refd(spa->spa_pool_props_object);
- mos_obj_refd(spa->spa_config_object);
- mos_obj_refd(spa->spa_ddt_stat_object);
- mos_obj_refd(spa->spa_feat_desc_obj);
- mos_obj_refd(spa->spa_feat_enabled_txg_obj);
- mos_obj_refd(spa->spa_feat_for_read_obj);
- mos_obj_refd(spa->spa_feat_for_write_obj);
- mos_obj_refd(spa->spa_history);
- mos_obj_refd(spa->spa_errlog_last);
- mos_obj_refd(spa->spa_errlog_scrub);
- mos_obj_refd(spa->spa_all_vdev_zaps);
- mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
- mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
- mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
- bpobj_count_refd(&spa->spa_deferred_bpobj);
- mos_obj_refd(dp->dp_empty_bpobj);
- bpobj_count_refd(&dp->dp_obsolete_bpobj);
- bpobj_count_refd(&dp->dp_free_bpobj);
- mos_obj_refd(spa->spa_l2cache.sav_object);
- mos_obj_refd(spa->spa_spares.sav_object);
-
- mos_obj_refd(spa->spa_condensing_indirect_phys.
- scip_next_mapping_object);
- mos_obj_refd(spa->spa_condensing_indirect_phys.
- scip_prev_obsolete_sm_object);
- if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
- vdev_indirect_mapping_t *vim =
- vdev_indirect_mapping_open(mos,
- spa->spa_condensing_indirect_phys.scip_next_mapping_object);
- mos_obj_refd(vim->vim_phys->vimp_counts_object);
- vdev_indirect_mapping_close(vim);
- }
-
- if (dp->dp_origin_snap != NULL) {
- dsl_dataset_t *ds;
-
- dsl_pool_config_enter(dp, FTAG);
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
- FTAG, &ds));
- count_ds_mos_objects(ds);
- dump_deadlist(&ds->ds_deadlist);
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_config_exit(dp, FTAG);
-
- count_ds_mos_objects(dp->dp_origin_snap);
- dump_deadlist(&dp->dp_origin_snap->ds_deadlist);
- }
- count_dir_mos_objects(dp->dp_mos_dir);
- if (dp->dp_free_dir != NULL)
- count_dir_mos_objects(dp->dp_free_dir);
- if (dp->dp_leak_dir != NULL)
- count_dir_mos_objects(dp->dp_leak_dir);
-
- mos_leak_vdev(spa->spa_root_vdev);
-
- for (uint64_t class = 0; class < DDT_CLASSES; class++) {
- for (uint64_t type = 0; type < DDT_TYPES; type++) {
- for (uint64_t cksum = 0;
- cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
- ddt_t *ddt = spa->spa_ddt[cksum];
- mos_obj_refd(ddt->ddt_object[type][class]);
- }
- }
- }
-
- /*
- * Visit all allocated objects and make sure they are referenced.
- */
- uint64_t object = 0;
- while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
- if (range_tree_contains(mos_refd_objs, object, 1)) {
- range_tree_remove(mos_refd_objs, object, 1);
- } else {
- dmu_object_info_t doi;
- const char *name;
- dmu_object_info(mos, object, &doi);
- if (doi.doi_type & DMU_OT_NEWTYPE) {
- dmu_object_byteswap_t bswap =
- DMU_OT_BYTESWAP(doi.doi_type);
- name = dmu_ot_byteswap[bswap].ob_name;
- } else {
- name = dmu_ot[doi.doi_type].ot_name;
- }
-
- (void) printf("MOS object %llu (%s) leaked\n",
- (u_longlong_t)object, name);
- rv = 2;
- }
- }
- (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
- if (!range_tree_is_empty(mos_refd_objs))
- rv = 2;
- range_tree_vacate(mos_refd_objs, NULL, NULL);
- range_tree_destroy(mos_refd_objs);
- return (rv);
-}
-
-static void
-dump_zpool(spa_t *spa)
-{
- dsl_pool_t *dp = spa_get_dsl(spa);
- int rc = 0;
-
- if (dump_opt['S']) {
- dump_simulated_ddt(spa);
- return;
- }
-
- if (!dump_opt['e'] && dump_opt['C'] > 1) {
- (void) printf("\nCached configuration:\n");
- dump_nvlist(spa->spa_config, 8);
- }
-
- if (dump_opt['C'])
- dump_config(spa);
-
- if (dump_opt['u'])
- dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
-
- if (dump_opt['D'])
- dump_all_ddts(spa);
-
- if (dump_opt['d'] > 2 || dump_opt['m'])
- dump_metaslabs(spa);
- if (dump_opt['M'])
- dump_metaslab_groups(spa);
-
- if (dump_opt['d'] || dump_opt['i']) {
- mos_refd_objs = range_tree_create(NULL, NULL);
- dump_dir(dp->dp_meta_objset);
-
- if (dump_opt['d'] >= 3) {
- dsl_pool_t *dp = spa->spa_dsl_pool;
- dump_full_bpobj(&spa->spa_deferred_bpobj,
- "Deferred frees", 0);
- if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
- dump_full_bpobj(&dp->dp_free_bpobj,
- "Pool snapshot frees", 0);
- }
- if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
- ASSERT(spa_feature_is_enabled(spa,
- SPA_FEATURE_DEVICE_REMOVAL));
- dump_full_bpobj(&dp->dp_obsolete_bpobj,
- "Pool obsolete blocks", 0);
- }
-
- if (spa_feature_is_active(spa,
- SPA_FEATURE_ASYNC_DESTROY)) {
- dump_bptree(spa->spa_meta_objset,
- dp->dp_bptree_obj,
- "Pool dataset frees");
- }
- dump_dtl(spa->spa_root_vdev, 0);
- }
- (void) dmu_objset_find(spa_name(spa), dump_one_dir,
- NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
-
- if (rc == 0 && !dump_opt['L'])
- rc = dump_mos_leaks(spa);
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- uint64_t refcount;
-
- if (!(spa_feature_table[f].fi_flags &
- ZFEATURE_FLAG_PER_DATASET)) {
- ASSERT0(dataset_feature_count[f]);
- continue;
- }
- (void) feature_get_refcount(spa,
- &spa_feature_table[f], &refcount);
- if (dataset_feature_count[f] != refcount) {
- (void) printf("%s feature refcount mismatch: "
- "%lld datasets != %lld refcount\n",
- spa_feature_table[f].fi_uname,
- (longlong_t)dataset_feature_count[f],
- (longlong_t)refcount);
- rc = 2;
- } else {
- (void) printf("Verified %s feature refcount "
- "of %llu is correct\n",
- spa_feature_table[f].fi_uname,
- (longlong_t)refcount);
- }
- }
-
- if (rc == 0) {
- rc = verify_device_removal_feature_counts(spa);
- }
- }
-
- if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
- rc = dump_block_stats(spa);
-
- if (rc == 0)
- rc = verify_spacemap_refcounts(spa);
-
- if (dump_opt['s'])
- show_pool_stats(spa);
-
- if (dump_opt['h'])
- dump_history(spa);
-
- if (rc == 0)
- rc = verify_checkpoint(spa);
-
- if (rc != 0) {
- dump_debug_buffer();
- exit(rc);
- }
-}
-
-#define ZDB_FLAG_CHECKSUM 0x0001
-#define ZDB_FLAG_DECOMPRESS 0x0002
-#define ZDB_FLAG_BSWAP 0x0004
-#define ZDB_FLAG_GBH 0x0008
-#define ZDB_FLAG_INDIRECT 0x0010
-#define ZDB_FLAG_PHYS 0x0020
-#define ZDB_FLAG_RAW 0x0040
-#define ZDB_FLAG_PRINT_BLKPTR 0x0080
-
-static int flagbits[256];
-
-static void
-zdb_print_blkptr(blkptr_t *bp, int flags)
-{
- char blkbuf[BP_SPRINTF_LEN];
-
- if (flags & ZDB_FLAG_BSWAP)
- byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
-
- snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
- (void) printf("%s\n", blkbuf);
-}
-
-static void
-zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
-{
- int i;
-
- for (i = 0; i < nbps; i++)
- zdb_print_blkptr(&bp[i], flags);
-}
-
-static void
-zdb_dump_gbh(void *buf, int flags)
-{
- zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
-}
-
-static void
-zdb_dump_block_raw(void *buf, uint64_t size, int flags)
-{
- if (flags & ZDB_FLAG_BSWAP)
- byteswap_uint64_array(buf, size);
- (void) write(1, buf, size);
-}
-
-static void
-zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
-{
- uint64_t *d = (uint64_t *)buf;
- unsigned nwords = size / sizeof (uint64_t);
- int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
- unsigned i, j;
- const char *hdr;
- char *c;
-
-
- if (do_bswap)
- hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
- else
- hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
-
- (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
-
- for (i = 0; i < nwords; i += 2) {
- (void) printf("%06llx: %016llx %016llx ",
- (u_longlong_t)(i * sizeof (uint64_t)),
- (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
- (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
-
- c = (char *)&d[i];
- for (j = 0; j < 2 * sizeof (uint64_t); j++)
- (void) printf("%c", isprint(c[j]) ? c[j] : '.');
- (void) printf("\n");
- }
-}
-
-/*
- * There are two acceptable formats:
- * leaf_name - For example: c1t0d0 or /tmp/ztest.0a
- * child[.child]* - For example: 0.1.1
- *
- * The second form can be used to specify arbitrary vdevs anywhere
- * in the heirarchy. For example, in a pool with a mirror of
- * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
- */
-static vdev_t *
-zdb_vdev_lookup(vdev_t *vdev, const char *path)
-{
- char *s, *p, *q;
- unsigned i;
-
- if (vdev == NULL)
- return (NULL);
-
- /* First, assume the x.x.x.x format */
- i = strtoul(path, &s, 10);
- if (s == path || (s && *s != '.' && *s != '\0'))
- goto name;
- if (i >= vdev->vdev_children)
- return (NULL);
-
- vdev = vdev->vdev_child[i];
- if (*s == '\0')
- return (vdev);
- return (zdb_vdev_lookup(vdev, s+1));
-
-name:
- for (i = 0; i < vdev->vdev_children; i++) {
- vdev_t *vc = vdev->vdev_child[i];
-
- if (vc->vdev_path == NULL) {
- vc = zdb_vdev_lookup(vc, path);
- if (vc == NULL)
- continue;
- else
- return (vc);
- }
-
- p = strrchr(vc->vdev_path, '/');
- p = p ? p + 1 : vc->vdev_path;
- q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
-
- if (strcmp(vc->vdev_path, path) == 0)
- return (vc);
- if (strcmp(p, path) == 0)
- return (vc);
- if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
- return (vc);
- }
-
- return (NULL);
-}
-
-/* ARGSUSED */
-static int
-random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
-{
- return (random_get_pseudo_bytes(buf, len));
-}
-
-/*
- * Read a block from a pool and print it out. The syntax of the
- * block descriptor is:
- *
- * pool:vdev_specifier:offset:size[:flags]
- *
- * pool - The name of the pool you wish to read from
- * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
- * offset - offset, in hex, in bytes
- * size - Amount of data to read, in hex, in bytes
- * flags - A string of characters specifying options
- * b: Decode a blkptr at given offset within block
- * *c: Calculate and display checksums
- * d: Decompress data before dumping
- * e: Byteswap data before dumping
- * g: Display data as a gang block header
- * i: Display as an indirect block
- * p: Do I/O to physical offset
- * r: Dump raw data to stdout
- *
- * * = not yet implemented
- */
-static void
-zdb_read_block(char *thing, spa_t *spa)
-{
- blkptr_t blk, *bp = &blk;
- dva_t *dva = bp->blk_dva;
- int flags = 0;
- uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
- zio_t *zio;
- vdev_t *vd;
- abd_t *pabd;
- void *lbuf, *buf;
- const char *s, *vdev;
- char *p, *dup, *flagstr;
- int i, error;
-
- dup = strdup(thing);
- s = strtok(dup, ":");
- vdev = s ? s : "";
- s = strtok(NULL, ":");
- offset = strtoull(s ? s : "", NULL, 16);
- s = strtok(NULL, ":");
- size = strtoull(s ? s : "", NULL, 16);
- s = strtok(NULL, ":");
- if (s)
- flagstr = strdup(s);
- else
- flagstr = strdup("");
-
- s = NULL;
- if (size == 0)
- s = "size must not be zero";
- if (!IS_P2ALIGNED(size, DEV_BSIZE))
- s = "size must be a multiple of sector size";
- if (!IS_P2ALIGNED(offset, DEV_BSIZE))
- s = "offset must be a multiple of sector size";
- if (s) {
- (void) printf("Invalid block specifier: %s - %s\n", thing, s);
- free(flagstr);
- free(dup);
- return;
- }
-
- for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
- for (i = 0; flagstr[i]; i++) {
- int bit = flagbits[(uchar_t)flagstr[i]];
-
- if (bit == 0) {
- (void) printf("***Invalid flag: %c\n",
- flagstr[i]);
- continue;
- }
- flags |= bit;
-
- /* If it's not something with an argument, keep going */
- if ((bit & (ZDB_FLAG_CHECKSUM |
- ZDB_FLAG_PRINT_BLKPTR)) == 0)
- continue;
-
- p = &flagstr[i + 1];
- if (bit == ZDB_FLAG_PRINT_BLKPTR)
- blkptr_offset = strtoull(p, &p, 16);
- if (*p != ':' && *p != '\0') {
- (void) printf("***Invalid flag arg: '%s'\n", s);
- free(flagstr);
- free(dup);
- return;
- }
- i += p - &flagstr[i + 1]; /* skip over the number */
- }
- }
- free(flagstr);
-
- vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
- if (vd == NULL) {
- (void) printf("***Invalid vdev: %s\n", vdev);
- free(dup);
- return;
- } else {
- if (vd->vdev_path)
- (void) fprintf(stderr, "Found vdev: %s\n",
- vd->vdev_path);
- else
- (void) fprintf(stderr, "Found vdev type: %s\n",
- vd->vdev_ops->vdev_op_type);
- }
-
- psize = size;
- lsize = size;
-
- pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
- lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
-
- BP_ZERO(bp);
-
- DVA_SET_VDEV(&dva[0], vd->vdev_id);
- DVA_SET_OFFSET(&dva[0], offset);
- DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
- DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
-
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, lsize);
- BP_SET_PSIZE(bp, psize);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- zio = zio_root(spa, NULL, NULL, 0);
-
- if (vd == vd->vdev_top) {
- /*
- * Treat this as a normal block read.
- */
- zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
- ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
- } else {
- /*
- * Treat this as a vdev child I/O.
- */
- zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
- psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
- ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
- NULL, NULL));
- }
-
- error = zio_wait(zio);
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- if (error) {
- (void) printf("Read of %s failed, error: %d\n", thing, error);
- goto out;
- }
-
- if (flags & ZDB_FLAG_DECOMPRESS) {
- /*
- * We don't know how the data was compressed, so just try
- * every decompress function at every inflated blocksize.
- */
- enum zio_compress c;
- void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
- void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
-
- abd_copy_to_buf(pbuf2, pabd, psize);
-
- VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
- random_get_pseudo_bytes_cb, NULL));
-
- VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
- SPA_MAXBLOCKSIZE - psize));
-
- for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
- lsize -= SPA_MINBLOCKSIZE) {
- for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
- if (zio_decompress_data(c, pabd,
- lbuf, psize, lsize) == 0 &&
- zio_decompress_data_buf(c, pbuf2,
- lbuf2, psize, lsize) == 0 &&
- bcmp(lbuf, lbuf2, lsize) == 0)
- break;
- }
- if (c != ZIO_COMPRESS_FUNCTIONS)
- break;
- lsize -= SPA_MINBLOCKSIZE;
- }
-
- umem_free(pbuf2, SPA_MAXBLOCKSIZE);
- umem_free(lbuf2, SPA_MAXBLOCKSIZE);
-
- if (lsize <= psize) {
- (void) printf("Decompress of %s failed\n", thing);
- goto out;
- }
- buf = lbuf;
- size = lsize;
- } else {
- buf = abd_to_buf(pabd);
- size = psize;
- }
-
- if (flags & ZDB_FLAG_PRINT_BLKPTR)
- zdb_print_blkptr((blkptr_t *)(void *)
- ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
- else if (flags & ZDB_FLAG_RAW)
- zdb_dump_block_raw(buf, size, flags);
- else if (flags & ZDB_FLAG_INDIRECT)
- zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
- flags);
- else if (flags & ZDB_FLAG_GBH)
- zdb_dump_gbh(buf, flags);
- else
- zdb_dump_block(thing, buf, size, flags);
-
-out:
- abd_free(pabd);
- umem_free(lbuf, SPA_MAXBLOCKSIZE);
- free(dup);
-}
-
-static void
-zdb_embedded_block(char *thing)
-{
- blkptr_t bp;
- unsigned long long *words = (void *)&bp;
- char *buf;
- int err;
-
- bzero(&bp, sizeof (bp));
- err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
- "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
- words + 0, words + 1, words + 2, words + 3,
- words + 4, words + 5, words + 6, words + 7,
- words + 8, words + 9, words + 10, words + 11,
- words + 12, words + 13, words + 14, words + 15);
- if (err != 16) {
- (void) fprintf(stderr, "invalid input format\n");
- exit(1);
- }
- ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
- buf = malloc(SPA_MAXBLOCKSIZE);
- if (buf == NULL) {
- (void) fprintf(stderr, "out of memory\n");
- exit(1);
- }
- err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
- if (err != 0) {
- (void) fprintf(stderr, "decode failed: %u\n", err);
- free(buf);
- exit(1);
- }
- zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
- free(buf);
-}
-
-int
-main(int argc, char **argv)
-{
- int c;
- struct rlimit rl = { 1024, 1024 };
- spa_t *spa = NULL;
- objset_t *os = NULL;
- int dump_all = 1;
- int verbose = 0;
- int error = 0;
- char **searchdirs = NULL;
- int nsearch = 0;
- char *target, *target_pool;
- nvlist_t *policy = NULL;
- uint64_t max_txg = UINT64_MAX;
- int flags = ZFS_IMPORT_MISSING_LOG;
- int rewind = ZPOOL_NEVER_REWIND;
- char *spa_config_path_env;
- boolean_t target_is_spa = B_TRUE;
- nvlist_t *cfg = NULL;
-
- (void) setrlimit(RLIMIT_NOFILE, &rl);
- (void) enable_extended_FILE_stdio(-1, -1);
-
- dprintf_setup(&argc, argv);
-
- /*
- * If there is an environment variable SPA_CONFIG_PATH it overrides
- * default spa_config_path setting. If -U flag is specified it will
- * override this environment variable settings once again.
- */
- spa_config_path_env = getenv("SPA_CONFIG_PATH");
- if (spa_config_path_env != NULL)
- spa_config_path = spa_config_path_env;
-
- while ((c = getopt(argc, argv,
- "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
- switch (c) {
- case 'b':
- case 'c':
- case 'C':
- case 'd':
- case 'D':
- case 'E':
- case 'G':
- case 'h':
- case 'i':
- case 'l':
- case 'm':
- case 'M':
- case 'O':
- case 'R':
- case 's':
- case 'S':
- case 'u':
- dump_opt[c]++;
- dump_all = 0;
- break;
- case 'A':
- case 'e':
- case 'F':
- case 'k':
- case 'L':
- case 'P':
- case 'q':
- case 'X':
- dump_opt[c]++;
- break;
- /* NB: Sort single match options below. */
- case 'I':
- max_inflight = strtoull(optarg, NULL, 0);
- if (max_inflight == 0) {
- (void) fprintf(stderr, "maximum number "
- "of inflight I/Os must be greater "
- "than 0\n");
- usage();
- }
- break;
- case 'o':
- error = set_global_var(optarg);
- if (error != 0)
- usage();
- break;
- case 'p':
- if (searchdirs == NULL) {
- searchdirs = umem_alloc(sizeof (char *),
- UMEM_NOFAIL);
- } else {
- char **tmp = umem_alloc((nsearch + 1) *
- sizeof (char *), UMEM_NOFAIL);
- bcopy(searchdirs, tmp, nsearch *
- sizeof (char *));
- umem_free(searchdirs,
- nsearch * sizeof (char *));
- searchdirs = tmp;
- }
- searchdirs[nsearch++] = optarg;
- break;
- case 't':
- max_txg = strtoull(optarg, NULL, 0);
- if (max_txg < TXG_INITIAL) {
- (void) fprintf(stderr, "incorrect txg "
- "specified: %s\n", optarg);
- usage();
- }
- break;
- case 'U':
- spa_config_path = optarg;
- if (spa_config_path[0] != '/') {
- (void) fprintf(stderr,
- "cachefile must be an absolute path "
- "(i.e. start with a slash)\n");
- usage();
- }
- break;
- case 'v':
- verbose++;
- break;
- case 'V':
- flags = ZFS_IMPORT_VERBATIM;
- break;
- case 'x':
- vn_dumpdir = optarg;
- break;
- default:
- usage();
- break;
- }
- }
-
- if (!dump_opt['e'] && searchdirs != NULL) {
- (void) fprintf(stderr, "-p option requires use of -e\n");
- usage();
- }
-
- /*
- * ZDB does not typically re-read blocks; therefore limit the ARC
- * to 256 MB, which can be used entirely for metadata.
- */
- zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
-
- /*
- * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
- * "zdb -b" uses traversal prefetch which uses async reads.
- * For good performance, let several of them be active at once.
- */
- zfs_vdev_async_read_max_active = 10;
-
- /*
- * Disable reference tracking for better performance.
- */
- reference_tracking_enable = B_FALSE;
-
- /*
- * Do not fail spa_load when spa_load_verify fails. This is needed
- * to load non-idle pools.
- */
- spa_load_verify_dryrun = B_TRUE;
-
- kernel_init(FREAD);
- g_zfs = libzfs_init();
- if (g_zfs == NULL)
- fatal("Fail to initialize zfs");
-
- if (dump_all)
- verbose = MAX(verbose, 1);
-
- for (c = 0; c < 256; c++) {
- if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
- dump_opt[c] = 1;
- if (dump_opt[c])
- dump_opt[c] += verbose;
- }
-
- aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
- zfs_recover = (dump_opt['A'] > 1);
-
- argc -= optind;
- argv += optind;
-
- if (argc < 2 && dump_opt['R'])
- usage();
-
- if (dump_opt['E']) {
- if (argc != 1)
- usage();
- zdb_embedded_block(argv[0]);
- return (0);
- }
-
- if (argc < 1) {
- if (!dump_opt['e'] && dump_opt['C']) {
- dump_cachefile(spa_config_path);
- return (0);
- }
- usage();
- }
-
- if (dump_opt['l'])
- return (dump_label(argv[0]));
-
- if (dump_opt['O']) {
- if (argc != 2)
- usage();
- dump_opt['v'] = verbose + 3;
- return (dump_path(argv[0], argv[1]));
- }
-
- if (dump_opt['X'] || dump_opt['F'])
- rewind = ZPOOL_DO_REWIND |
- (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
-
- if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
- nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
- nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
- fatal("internal error: %s", strerror(ENOMEM));
-
- error = 0;
- target = argv[0];
-
- if (strpbrk(target, "/@") != NULL) {
- size_t targetlen;
-
- target_pool = strdup(target);
- *strpbrk(target_pool, "/@") = '\0';
-
- target_is_spa = B_FALSE;
- targetlen = strlen(target);
- if (targetlen && target[targetlen - 1] == '/')
- target[targetlen - 1] = '\0';
- } else {
- target_pool = target;
- }
-
- if (dump_opt['e']) {
- importargs_t args = { 0 };
-
- args.paths = nsearch;
- args.path = searchdirs;
- args.can_be_active = B_TRUE;
-
- error = zpool_tryimport(g_zfs, target_pool, &cfg, &args);
-
- if (error == 0) {
-
- if (nvlist_add_nvlist(cfg,
- ZPOOL_LOAD_POLICY, policy) != 0) {
- fatal("can't open '%s': %s",
- target, strerror(ENOMEM));
- }
-
- if (dump_opt['C'] > 1) {
- (void) printf("\nConfiguration for import:\n");
- dump_nvlist(cfg, 8);
- }
-
- /*
- * Disable the activity check to allow examination of
- * active pools.
- */
- error = spa_import(target_pool, cfg, NULL,
- flags | ZFS_IMPORT_SKIP_MMP);
- }
- }
-
- char *checkpoint_pool = NULL;
- char *checkpoint_target = NULL;
- if (dump_opt['k']) {
- checkpoint_pool = import_checkpointed_state(target, cfg,
- &checkpoint_target);
-
- if (checkpoint_target != NULL)
- target = checkpoint_target;
-
- }
-
- if (error == 0) {
- if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
- ASSERT(checkpoint_pool != NULL);
- ASSERT(checkpoint_target == NULL);
-
- error = spa_open(checkpoint_pool, &spa, FTAG);
- if (error != 0) {
- fatal("Tried to open pool \"%s\" but "
- "spa_open() failed with error %d\n",
- checkpoint_pool, error);
- }
-
- } else if (target_is_spa || dump_opt['R']) {
- zdb_set_skip_mmp(target);
- error = spa_open_rewind(target, &spa, FTAG, policy,
- NULL);
- if (error) {
- /*
- * If we're missing the log device then
- * try opening the pool after clearing the
- * log state.
- */
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(target)) != NULL &&
- spa->spa_log_state == SPA_LOG_MISSING) {
- spa->spa_log_state = SPA_LOG_CLEAR;
- error = 0;
- }
- mutex_exit(&spa_namespace_lock);
-
- if (!error) {
- error = spa_open_rewind(target, &spa,
- FTAG, policy, NULL);
- }
- }
- } else {
- zdb_set_skip_mmp(target);
- error = open_objset(target, DMU_OST_ANY, FTAG, &os);
- }
- }
- nvlist_free(policy);
-
- if (error)
- fatal("can't open '%s': %s", target, strerror(error));
-
- argv++;
- argc--;
- if (!dump_opt['R']) {
- if (argc > 0) {
- zopt_objects = argc;
- zopt_object = calloc(zopt_objects, sizeof (uint64_t));
- for (unsigned i = 0; i < zopt_objects; i++) {
- errno = 0;
- zopt_object[i] = strtoull(argv[i], NULL, 0);
- if (zopt_object[i] == 0 && errno != 0)
- fatal("bad number %s: %s",
- argv[i], strerror(errno));
- }
- }
- if (os != NULL) {
- dump_dir(os);
- } else if (zopt_objects > 0 && !dump_opt['m']) {
- dump_dir(spa->spa_meta_objset);
- } else {
- dump_zpool(spa);
- }
- } else {
- flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
- flagbits['c'] = ZDB_FLAG_CHECKSUM;
- flagbits['d'] = ZDB_FLAG_DECOMPRESS;
- flagbits['e'] = ZDB_FLAG_BSWAP;
- flagbits['g'] = ZDB_FLAG_GBH;
- flagbits['i'] = ZDB_FLAG_INDIRECT;
- flagbits['p'] = ZDB_FLAG_PHYS;
- flagbits['r'] = ZDB_FLAG_RAW;
-
- for (int i = 0; i < argc; i++)
- zdb_read_block(argv[i], spa);
- }
-
- if (dump_opt['k']) {
- free(checkpoint_pool);
- if (!target_is_spa)
- free(checkpoint_target);
- }
-
- if (os != NULL)
- close_objset(os, FTAG);
- else
- spa_close(spa, FTAG);
-
- fuid_table_destroy();
-
- dump_debug_buffer();
-
- libzfs_fini(g_zfs);
- kernel_fini();
-
- return (error);
-}
Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
@@ -1,424 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-/*
- * Print intent log header and statistics.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/stat.h>
-#include <sys/resource.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/spa_impl.h>
-#include <sys/abd.h>
-
-#include "zdb.h"
-
-extern uint8_t dump_opt[256];
-
-static char tab_prefix[4] = "\t\t\t";
-
-static void
-print_log_bp(const blkptr_t *bp, const char *prefix)
-{
- char blkbuf[BP_SPRINTF_LEN];
-
- snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
- (void) printf("%s%s\n", prefix, blkbuf);
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg)
-{
- lr_create_t *lr = arg;
- time_t crtime = lr->lr_crtime[0];
- char *name, *link;
- lr_attr_t *lrattr;
-
- name = (char *)(lr + 1);
-
- if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
- lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
- lrattr = (lr_attr_t *)(lr + 1);
- name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- }
-
- if (txtype == TX_SYMLINK) {
- link = name + strlen(name) + 1;
- (void) printf("%s%s -> %s\n", tab_prefix, name, link);
- } else if (txtype != TX_MKXATTR) {
- (void) printf("%s%s\n", tab_prefix, name);
- }
-
- (void) printf("%s%s", tab_prefix, ctime(&crtime));
- (void) printf("%sdoid %" PRIu64 ", foid %" PRIu64 ", slots %" PRIu64
- ", mode %" PRIo64 "\n",
- tab_prefix, lr->lr_doid,
- (uint64_t)LR_FOID_GET_OBJ(lr->lr_foid),
- (uint64_t)LR_FOID_GET_SLOTS(lr->lr_foid),
- lr->lr_mode);
- (void) printf("%suid %" PRIu64 ", gid %" PRIu64 ", gen %" PRIu64
- ", rdev %#" PRIx64 "\n",
- tab_prefix, lr->lr_uid, lr->lr_gid, lr->lr_gen, lr->lr_rdev);
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg)
-{
- lr_remove_t *lr = arg;
-
- (void) printf("%sdoid %llu, name %s\n", tab_prefix,
- (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg)
-{
- lr_link_t *lr = arg;
-
- (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix,
- (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
- (char *)(lr + 1));
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg)
-{
- lr_rename_t *lr = arg;
- char *snm = (char *)(lr + 1);
- char *tnm = snm + strlen(snm) + 1;
-
- (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix,
- (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
- (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm);
-}
-
-/* ARGSUSED */
-static int
-zil_prt_rec_write_cb(void *data, size_t len, void *unused)
-{
- char *cdata = data;
- for (size_t i = 0; i < len; i++) {
- if (isprint(*cdata))
- (void) printf("%c ", *cdata);
- else
- (void) printf("%2X", *cdata);
- cdata++;
- }
- return (0);
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg)
-{
- lr_write_t *lr = arg;
- abd_t *data;
- blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_phys_t zb;
- int verbose = MAX(dump_opt['d'], dump_opt['i']);
- int error;
-
- (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix,
- (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
- (u_longlong_t)lr->lr_length);
-
- if (txtype == TX_WRITE2 || verbose < 5)
- return;
-
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
- (void) printf("%shas blkptr, %s\n", tab_prefix,
- !BP_IS_HOLE(bp) &&
- bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
- "will claim" : "won't claim");
- print_log_bp(bp, tab_prefix);
-
- if (BP_IS_HOLE(bp)) {
- (void) printf("\t\t\tLSIZE 0x%llx\n",
- (u_longlong_t)BP_GET_LSIZE(bp));
- (void) printf("%s<hole>\n", tab_prefix);
- return;
- }
- if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
- (void) printf("%s<block already committed>\n",
- tab_prefix);
- return;
- }
-
- SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
- lr->lr_foid, ZB_ZIL_LEVEL,
- lr->lr_offset / BP_GET_LSIZE(bp));
-
- data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
- error = zio_wait(zio_read(NULL, zilog->zl_spa,
- bp, data, BP_GET_LSIZE(bp), NULL, NULL,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
- if (error)
- goto out;
- } else {
- /* data is stored after the end of the lr_write record */
- data = abd_alloc(lr->lr_length, B_FALSE);
- abd_copy_from_buf(data, lr + 1, lr->lr_length);
- }
-
- (void) printf("%s", tab_prefix);
- (void) abd_iterate_func(data,
- 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
- zil_prt_rec_write_cb, NULL);
- (void) printf("\n");
-
-out:
- abd_free(data);
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg)
-{
- lr_truncate_t *lr = arg;
-
- (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix,
- (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
- (u_longlong_t)lr->lr_length);
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg)
-{
- lr_setattr_t *lr = arg;
- time_t atime = (time_t)lr->lr_atime[0];
- time_t mtime = (time_t)lr->lr_mtime[0];
-
- (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix,
- (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
-
- if (lr->lr_mask & AT_MODE) {
- (void) printf("%sAT_MODE %llo\n", tab_prefix,
- (longlong_t)lr->lr_mode);
- }
-
- if (lr->lr_mask & AT_UID) {
- (void) printf("%sAT_UID %llu\n", tab_prefix,
- (u_longlong_t)lr->lr_uid);
- }
-
- if (lr->lr_mask & AT_GID) {
- (void) printf("%sAT_GID %llu\n", tab_prefix,
- (u_longlong_t)lr->lr_gid);
- }
-
- if (lr->lr_mask & AT_SIZE) {
- (void) printf("%sAT_SIZE %llu\n", tab_prefix,
- (u_longlong_t)lr->lr_size);
- }
-
- if (lr->lr_mask & AT_ATIME) {
- (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix,
- (u_longlong_t)lr->lr_atime[0],
- (u_longlong_t)lr->lr_atime[1],
- ctime(&atime));
- }
-
- if (lr->lr_mask & AT_MTIME) {
- (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix,
- (u_longlong_t)lr->lr_mtime[0],
- (u_longlong_t)lr->lr_mtime[1],
- ctime(&mtime));
- }
-}
-
-/* ARGSUSED */
-static void
-zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg)
-{
- lr_acl_t *lr = arg;
-
- (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix,
- (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
-}
-
-typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *);
-typedef struct zil_rec_info {
- zil_prt_rec_func_t zri_print;
- const char *zri_name;
- uint64_t zri_count;
-} zil_rec_info_t;
-
-static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
- {.zri_print = NULL, .zri_name = "Total "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "},
- {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "},
- {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "},
- {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "},
- {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "},
- {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "},
- {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "},
- {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "},
- {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "},
- {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL_ATTR "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "},
- {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "},
- {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "},
-};
-
-/* ARGSUSED */
-static int
-print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
-{
- int txtype;
- int verbose = MAX(dump_opt['d'], dump_opt['i']);
-
- /* reduce size of txtype to strip off TX_CI bit */
- txtype = lr->lrc_txtype;
-
- ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
- ASSERT(lr->lrc_txg);
-
- (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n",
- (lr->lrc_txtype & TX_CI) ? "CI-" : "",
- zil_rec_info[txtype].zri_name,
- (u_longlong_t)lr->lrc_reclen,
- (u_longlong_t)lr->lrc_txg,
- (u_longlong_t)lr->lrc_seq);
-
- if (txtype && verbose >= 3)
- zil_rec_info[txtype].zri_print(zilog, txtype, lr);
-
- zil_rec_info[txtype].zri_count++;
- zil_rec_info[0].zri_count++;
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- char blkbuf[BP_SPRINTF_LEN + 10];
- int verbose = MAX(dump_opt['d'], dump_opt['i']);
- const char *claim;
-
- if (verbose <= 3)
- return (0);
-
- if (verbose >= 5) {
- (void) strcpy(blkbuf, ", ");
- snprintf_blkptr(blkbuf + strlen(blkbuf),
- sizeof (blkbuf) - strlen(blkbuf), bp);
- } else {
- blkbuf[0] = '\0';
- }
-
- if (claim_txg != 0)
- claim = "already claimed";
- else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
- claim = "will claim";
- else
- claim = "won't claim";
-
- (void) printf("\tBlock seqno %llu, %s%s\n",
- (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
-
- return (0);
-}
-
-static void
-print_log_stats(int verbose)
-{
- unsigned i, w, p10;
-
- if (verbose > 3)
- (void) printf("\n");
-
- if (zil_rec_info[0].zri_count == 0)
- return;
-
- for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10)
- w++;
-
- for (i = 0; i < TX_MAX_TYPE; i++)
- if (zil_rec_info[i].zri_count || verbose >= 3)
- (void) printf("\t\t%s %*llu\n",
- zil_rec_info[i].zri_name, w,
- (u_longlong_t)zil_rec_info[i].zri_count);
- (void) printf("\n");
-}
-
-/* ARGSUSED */
-void
-dump_intent_log(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- int verbose = MAX(dump_opt['d'], dump_opt['i']);
- int i;
-
- if (BP_IS_HOLE(&zh->zh_log) || verbose < 1)
- return;
-
- (void) printf("\n ZIL header: claim_txg %llu, "
- "claim_blk_seq %llu, claim_lr_seq %llu",
- (u_longlong_t)zh->zh_claim_txg,
- (u_longlong_t)zh->zh_claim_blk_seq,
- (u_longlong_t)zh->zh_claim_lr_seq);
- (void) printf(" replay_seq %llu, flags 0x%llx\n",
- (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
-
- for (i = 0; i < TX_MAX_TYPE; i++)
- zil_rec_info[i].zri_count = 0;
-
- /* see comment in zil_claim() or zil_check_log_chain() */
- if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
- zh->zh_claim_txg == 0)
- return;
-
- if (verbose >= 2) {
- (void) printf("\n");
- (void) zil_parse(zilog, print_log_block, print_log_record, NULL,
- zh->zh_claim_txg);
- print_log_stats(verbose);
- }
-}
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs-program.8
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs-program.8
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs-program.8
@@ -1,551 +0,0 @@
-.\" This file and its contents are supplied under the terms of the
-.\" Common Development and Distribution License ("CDDL"), version 1.0.
-.\" You may only use this file in accordance with the terms of version
-.\" 1.0 of the CDDL.
-.\"
-.\" A full copy of the text of the CDDL should have accompanied this
-.\" source. A copy of the CDDL is also available via the Internet at
-.\" http://www.illumos.org/license/CDDL.
-.\"
-.\"
-.\" Copyright (c) 2016, 2017 by Delphix. All rights reserved.
-.\" Copyright (c) 2018 Datto Inc.
-.\"
-.Dd April 18, 2020
-.Dt ZFS-PROGRAM 8
-.Os
-.Sh NAME
-.Nm zfs program
-.Nd executes ZFS channel programs
-.Sh SYNOPSIS
-.Cm zfs program
-.Op Fl jn
-.Op Fl t Ar instruction-limit
-.Op Fl m Ar memory-limit
-.Ar pool
-.Ar script
-.\".Op Ar optional arguments to channel program
-.Sh DESCRIPTION
-The ZFS channel program interface allows ZFS administrative operations to be
-run programmatically as a Lua script.
-The entire script is executed atomically, with no other administrative
-operations taking effect concurrently.
-A library of ZFS calls is made available to channel program scripts.
-Channel programs may only be run with root privileges.
-.Pp
-A modified version of the Lua 5.2 interpreter is used to run channel program
-scripts.
-The Lua 5.2 manual can be found at:
-.Bd -centered -offset indent
-.Lk http://www.lua.org/manual/5.2/
-.Ed
-.Pp
-The channel program given by
-.Ar script
-will be run on
-.Ar pool ,
-and any attempts to access or modify other pools will cause an error.
-.Sh OPTIONS
-.Bl -tag -width "-t"
-.It Fl j
-Display channel program output in JSON format.
-When this flag is specified and standard output is empty -
-channel program encountered an error.
-The details of such an error will be printed to standard error in plain text.
-.It Fl n
-Executes a read-only channel program, which runs faster.
-The program cannot change on-disk state by calling functions from the
-zfs.sync submodule.
-The program can be used to gather information such as properties and
-determining if changes would succeed (zfs.check.*).
-Without this flag, all pending changes must be synced to disk before a
-channel program can complete.
-.It Fl t Ar instruction-limit
-Execution time limit, in number of Lua instructions to execute.
-If a channel program executes more than the specified number of instructions,
-it will be stopped and an error will be returned.
-The default limit is 10 million instructions, and it can be set to a maximum of
-100 million instructions.
-.It Fl m Ar memory-limit
-Memory limit, in bytes.
-If a channel program attempts to allocate more memory than the given limit, it
-will be stopped and an error returned.
-The default memory limit is 10 MB, and can be set to a maximum of 100 MB.
-.El
-.Pp
-All remaining argument strings will be passed directly to the Lua script as
-described in the
-.Sx LUA INTERFACE
-section below.
-.Sh LUA INTERFACE
-A channel program can be invoked either from the command line, or via a library
-call to
-.Fn lzc_channel_program .
-.Ss Arguments
-Arguments passed to the channel program are converted to a Lua table.
-If invoked from the command line, extra arguments to the Lua script will be
-accessible as an array stored in the argument table with the key 'argv':
-.Bd -literal -offset indent
-args = ...
-argv = args["argv"]
--- argv == {1="arg1", 2="arg2", ...}
-.Ed
-.Pp
-If invoked from the libZFS interface, an arbitrary argument list can be
-passed to the channel program, which is accessible via the same
-"..." syntax in Lua:
-.Bd -literal -offset indent
-args = ...
--- args == {"foo"="bar", "baz"={...}, ...}
-.Ed
-.Pp
-Note that because Lua arrays are 1-indexed, arrays passed to Lua from the
-libZFS interface will have their indices incremented by 1.
-That is, the element
-in
-.Va arr[0]
-in a C array passed to a channel program will be stored in
-.Va arr[1]
-when accessed from Lua.
-.Ss Return Values
-Lua return statements take the form:
-.Bd -literal -offset indent
-return ret0, ret1, ret2, ...
-.Ed
-.Pp
-Return statements returning multiple values are permitted internally in a
-channel program script, but attempting to return more than one value from the
-top level of the channel program is not permitted and will throw an error.
-However, tables containing multiple values can still be returned.
-If invoked from the command line, a return statement:
-.Bd -literal -offset indent
-a = {foo="bar", baz=2}
-return a
-.Ed
-.Pp
-Will be output formatted as:
-.Bd -literal -offset indent
-Channel program fully executed with return value:
- return:
- baz: 2
- foo: 'bar'
-.Ed
-.Ss Fatal Errors
-If the channel program encounters a fatal error while running, a non-zero exit
-status will be returned.
-If more information about the error is available, a singleton list will be
-returned detailing the error:
-.Bd -literal -offset indent
-error: "error string, including Lua stack trace"
-.Ed
-.Pp
-If a fatal error is returned, the channel program may have not executed at all,
-may have partially executed, or may have fully executed but failed to pass a
-return value back to userland.
-.Pp
-If the channel program exhausts an instruction or memory limit, a fatal error
-will be generated and the program will be stopped, leaving the program partially
-executed.
-No attempt is made to reverse or undo any operations already performed.
-Note that because both the instruction count and amount of memory used by a
-channel program are deterministic when run against the same inputs and
-filesystem state, as long as a channel program has run successfully once, you
-can guarantee that it will finish successfully against a similar size system.
-.Pp
-If a channel program attempts to return too large a value, the program will
-fully execute but exit with a nonzero status code and no return value.
-.Pp
-.Em Note:
-ZFS API functions do not generate Fatal Errors when correctly invoked, they
-return an error code and the channel program continues executing.
-See the
-.Sx ZFS API
-section below for function-specific details on error return codes.
-.Ss Lua to C Value Conversion
-When invoking a channel program via the libZFS interface, it is necessary to
-translate arguments and return values from Lua values to their C equivalents,
-and vice-versa.
-.Pp
-There is a correspondence between nvlist values in C and Lua tables.
-A Lua table which is returned from the channel program will be recursively
-converted to an nvlist, with table values converted to their natural
-equivalents:
-.Bd -literal -offset indent
-string -> string
-number -> int64
-boolean -> boolean_value
-nil -> boolean (no value)
-table -> nvlist
-.Ed
-.Pp
-Likewise, table keys are replaced by string equivalents as follows:
-.Bd -literal -offset indent
-string -> no change
-number -> signed decimal string ("%lld")
-boolean -> "true" | "false"
-.Ed
-.Pp
-Any collision of table key strings (for example, the string "true" and a
-true boolean value) will cause a fatal error.
-.Pp
-Lua numbers are represented internally as signed 64-bit integers.
-.Sh LUA STANDARD LIBRARY
-The following Lua built-in base library functions are available:
-.Bd -literal -offset indent
-assert rawlen
-collectgarbage rawget
-error rawset
-getmetatable select
-ipairs setmetatable
-next tonumber
-pairs tostring
-rawequal type
-.Ed
-.Pp
-All functions in the
-.Em coroutine ,
-.Em string ,
-and
-.Em table
-built-in submodules are also available.
-A complete list and documentation of these modules is available in the Lua
-manual.
-.Pp
-The following functions base library functions have been disabled and are
-not available for use in channel programs:
-.Bd -literal -offset indent
-dofile
-loadfile
-load
-pcall
-print
-xpcall
-.Ed
-.Sh ZFS API
-.Ss Function Arguments
-Each API function takes a fixed set of required positional arguments and
-optional keyword arguments.
-For example, the destroy function takes a single positional string argument
-(the name of the dataset to destroy) and an optional "defer" keyword boolean
-argument.
-When using parentheses to specify the arguments to a Lua function, only
-positional arguments can be used:
-.Bd -literal -offset indent
-zfs.sync.destroy("rpool@snap")
-.Ed
-.Pp
-To use keyword arguments, functions must be called with a single argument that
-is a Lua table containing entries mapping integers to positional arguments and
-strings to keyword arguments:
-.Bd -literal -offset indent
-zfs.sync.destroy({1="rpool@snap", defer=true})
-.Ed
-.Pp
-The Lua language allows curly braces to be used in place of parenthesis as
-syntactic sugar for this calling convention:
-.Bd -literal -offset indent
-zfs.sync.snapshot{"rpool@snap", defer=true}
-.Ed
-.Ss Function Return Values
-If an API function succeeds, it returns 0.
-If it fails, it returns an error code and the channel program continues
-executing.
-API functions do not generate Fatal Errors except in the case of an
-unrecoverable internal file system error.
-.Pp
-In addition to returning an error code, some functions also return extra
-details describing what caused the error.
-This extra description is given as a second return value, and will always be a
-Lua table, or Nil if no error details were returned.
-Different keys will exist in the error details table depending on the function
-and error case.
-Any such function may be called expecting a single return value:
-.Bd -literal -offset indent
-errno = zfs.sync.promote(dataset)
-.Ed
-.Pp
-Or, the error details can be retrieved:
-.Bd -literal -offset indent
-errno, details = zfs.sync.promote(dataset)
-if (errno == EEXIST) then
- assert(details ~= Nil)
- list_of_conflicting_snapshots = details
-end
-.Ed
-.Pp
-The following global aliases for API function error return codes are defined
-for use in channel programs:
-.Bd -literal -offset indent
-EPERM ECHILD ENODEV ENOSPC
-ENOENT EAGAIN ENOTDIR ESPIPE
-ESRCH ENOMEM EISDIR EROFS
-EINTR EACCES EINVAL EMLINK
-EIO EFAULT ENFILE EPIPE
-ENXIO ENOTBLK EMFILE EDOM
-E2BIG EBUSY ENOTTY ERANGE
-ENOEXEC EEXIST ETXTBSY EDQUOT
-EBADF EXDEV EFBIG
-.Ed
-.Ss API Functions
-For detailed descriptions of the exact behavior of any zfs administrative
-operations, see the main
-.Xr zfs 8
-manual page.
-.Bl -tag -width "xx"
-.It Em zfs.debug(msg)
-Record a debug message in the zfs_dbgmsg log.
-A log of these messages can be printed via mdb's "::zfs_dbgmsg" command, or
-can be monitored live by running:
-.Bd -literal -offset indent
- dtrace -n 'zfs-dbgmsg{trace(stringof(arg0))}'
-.Ed
-.Pp
-msg (string)
-.Bd -ragged -compact -offset "xxxx"
-Debug message to be printed.
-.Ed
-.It Em zfs.exists(dataset)
-Returns true if the given dataset exists, or false if it doesn't.
-A fatal error will be thrown if the dataset is not in the target pool.
-That is, in a channel program running on rpool,
-zfs.exists("rpool/nonexistent_fs") returns false, but
-zfs.exists("somepool/fs_that_may_exist") will error.
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Dataset to check for existence.
-Must be in the target pool.
-.Ed
-.It Em zfs.get_prop(dataset, property)
-Returns two values.
-First, a string, number or table containing the property value for the given
-dataset.
-Second, a string containing the source of the property (i.e. the name of the
-dataset in which it was set or nil if it is readonly).
-Throws a Lua error if the dataset is invalid or the property doesn't exist.
-Note that Lua only supports int64 number types whereas ZFS number properties
-are uint64.
-This means very large values (like guid) may wrap around and appear negative.
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Filesystem or snapshot path to retrieve properties from.
-.Ed
-.Pp
-property (string)
-.Bd -ragged -compact -offset "xxxx"
-Name of property to retrieve.
-All filesystem, snapshot and volume properties are supported except
-for 'mounted' and 'iscsioptions.'
-Also supports the 'written@snap' and 'written#bookmark' properties and
-the '<user|group><quota|used>@id' properties, though the id must be in numeric
-form.
-.Ed
-.El
-.Bl -tag -width "xx"
-.It Sy zfs.sync submodule
-The sync submodule contains functions that modify the on-disk state.
-They are executed in "syncing context".
-.Pp
-The available sync submodule functions are as follows:
-.Bl -tag -width "xx"
-.It Em zfs.sync.destroy(dataset, [defer=true|false])
-Destroy the given dataset.
-Returns 0 on successful destroy, or a nonzero error code if the dataset could
-not be destroyed (for example, if the dataset has any active children or
-clones).
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Filesystem or snapshot to be destroyed.
-.Ed
-.Pp
-[optional] defer (boolean)
-.Bd -ragged -compact -offset "xxxx"
-Valid only for destroying snapshots.
-If set to true, and the snapshot has holds or clones, allows the snapshot to be
-marked for deferred deletion rather than failing.
-.Ed
-.It Em zfs.sync.promote(dataset)
-Promote the given clone to a filesystem.
-Returns 0 on successful promotion, or a nonzero error code otherwise.
-If EEXIST is returned, the second return value will be an array of the clone's
-snapshots whose names collide with snapshots of the parent filesystem.
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Clone to be promoted.
-.Ed
-.It Em zfs.sync.rollback(filesystem)
-Rollback to the previous snapshot for a dataset.
-Returns 0 on successful rollback, or a nonzero error code otherwise.
-Rollbacks can be performed on filesystems or zvols, but not on snapshots
-or mounted datasets.
-EBUSY is returned in the case where the filesystem is mounted.
-.Pp
-filesystem (string)
-.Bd -ragged -compact -offset "xxxx"
-Filesystem to rollback.
-.Ed
-.It Em zfs.sync.snapshot(dataset)
-Create a snapshot of a filesystem.
-Returns 0 if the snapshot was successfully created,
-and a nonzero error code otherwise.
-.Pp
-Note: Taking a snapshot will fail on any pool older than legacy version 27.
-To enable taking snapshots from ZCP scripts, the pool must be upgraded.
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Name of snapshot to create.
-.Ed
-.El
-.It Sy zfs.check submodule
-For each function in the zfs.sync submodule, there is a corresponding zfs.check
-function which performs a "dry run" of the same operation.
-Each takes the same arguments as its zfs.sync counterpart and returns 0 if the
-operation would succeed, or a non-zero error code if it would fail, along with
-any other error details.
-That is, each has the same behavior as the corresponding sync function except
-for actually executing the requested change.
-For example,
-.Em zfs.check.destroy("fs")
-returns 0 if
-.Em zfs.sync.destroy("fs")
-would successfully destroy the dataset.
-.Pp
-The available zfs.check functions are:
-.Bl -tag -width "xx"
-.It Em zfs.check.destroy(dataset, [defer=true|false])
-.It Em zfs.check.promote(dataset)
-.It Em zfs.check.rollback(filesystem)
-.It Em zfs.check.snapshot(dataset)
-.El
-.It Sy zfs.list submodule
-The zfs.list submodule provides functions for iterating over datasets and
-properties.
-Rather than returning tables, these functions act as Lua iterators, and are
-generally used as follows:
-.Bd -literal -offset indent
-for child in zfs.list.children("rpool") do
- ...
-end
-.Ed
-.Pp
-The available zfs.list functions are:
-.Bl -tag -width "xx"
-.It Em zfs.list.clones(snapshot)
-Iterate through all clones of the given snapshot.
-.Pp
-snapshot (string)
-.Bd -ragged -compact -offset "xxxx"
-Must be a valid snapshot path in the current pool.
-.Ed
-.It Em zfs.list.snapshots(dataset)
-Iterate through all snapshots of the given dataset.
-Each snapshot is returned as a string containing the full dataset name, e.g.
-"pool/fs@snap".
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Must be a valid filesystem or volume.
-.Ed
-.It Em zfs.list.children(dataset)
-Iterate through all direct children of the given dataset.
-Each child is returned as a string containing the full dataset name, e.g.
-"pool/fs/child".
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Must be a valid filesystem or volume.
-.Ed
-.It Em zfs.list.properties(dataset)
-Iterate through all user properties for the given dataset.
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Must be a valid filesystem, snapshot, or volume.
-.Ed
-.It Em zfs.list.system_properties(dataset)
-Returns an array of strings, the names of the valid system (non-user defined)
-properties for the given dataset.
-Throws a Lua error if the dataset is invalid.
-.Pp
-dataset (string)
-.Bd -ragged -compact -offset "xxxx"
-Must be a valid filesystem, snapshot or volume.
-.Ed
-.El
-.El
-.Sh EXAMPLES
-.Ss Example 1
-The following channel program recursively destroys a filesystem and all its
-snapshots and children in a naive manner.
-Note that this does not involve any error handling or reporting.
-.Bd -literal -offset indent
-function destroy_recursive(root)
- for child in zfs.list.children(root) do
- destroy_recursive(child)
- end
- for snap in zfs.list.snapshots(root) do
- zfs.sync.destroy(snap)
- end
- zfs.sync.destroy(root)
-end
-destroy_recursive("pool/somefs")
-.Ed
-.Ss Example 2
-A more verbose and robust version of the same channel program, which
-properly detects and reports errors, and also takes the dataset to destroy
-as a command line argument, would be as follows:
-.Bd -literal -offset indent
-succeeded = {}
-failed = {}
-
-function destroy_recursive(root)
- for child in zfs.list.children(root) do
- destroy_recursive(child)
- end
- for snap in zfs.list.snapshots(root) do
- err = zfs.sync.destroy(snap)
- if (err ~= 0) then
- failed[snap] = err
- else
- succeeded[snap] = err
- end
- end
- err = zfs.sync.destroy(root)
- if (err ~= 0) then
- failed[root] = err
- else
- succeeded[root] = err
- end
-end
-
-args = ...
-argv = args["argv"]
-
-destroy_recursive(argv[1])
-
-results = {}
-results["succeeded"] = succeeded
-results["failed"] = failed
-return results
-.Ed
-.Ss Example 3
-The following function performs a forced promote operation by attempting to
-promote the given clone and destroying any conflicting snapshots.
-.Bd -literal -offset indent
-function force_promote(ds)
- errno, details = zfs.check.promote(ds)
- if (errno == EEXIST) then
- assert(details ~= Nil)
- for i, snap in ipairs(details) do
- zfs.sync.destroy(ds .. "@" .. snap)
- end
- elseif (errno ~= 0) then
- return errno
- end
- return zfs.sync.promote(ds)
-end
-.Ed
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
@@ -1,3973 +0,0 @@
-'\" te
-.\" Copyright (c) 2013, Martin Matuska <mm@FreeBSD.org>.
-.\" All Rights Reserved.
-.\"
-.\" The contents of this file are subject to the terms of the
-.\" Common Development and Distribution License (the "License").
-.\" You may not use this file except in compliance with the License.
-.\"
-.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-.\" or http://www.opensolaris.org/os/licensing.
-.\" See the License for the specific language governing permissions
-.\" and limitations under the License.
-.\"
-.\" When distributing Covered Code, include this CDDL HEADER in each
-.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-.\" If applicable, add the following below this CDDL HEADER, with the
-.\" fields enclosed by brackets "[]" replaced with your own identifying
-.\" information: Portions Copyright [yyyy] [name of copyright owner]
-.\"
-.\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved.
-.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved.
-.\" Copyright (c) 2011, Pawel Jakub Dawidek <pjd@FreeBSD.org>
-.\" Copyright (c) 2012, Glen Barber <gjb@FreeBSD.org>
-.\" Copyright (c) 2012, Bryan Drewery <bdrewery@FreeBSD.org>
-.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
-.\" Copyright (c) 2013, Steven Hartland <smh@FreeBSD.org>
-.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved.
-.\" Copyright (c) 2014, Xin LI <delphij@FreeBSD.org>
-.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved.
-.\" Copyright 2019 Joyent, Inc.
-.\" Copyright (c) 2018 Datto Inc.
-.\"
-.\" $FreeBSD$
-.\"
-.Dd February 16, 2020
-.Dt ZFS 8
-.Os
-.Sh NAME
-.Nm zfs
-.Nd configures ZFS file systems
-.Sh SYNOPSIS
-.Nm
-.Op Fl \&?
-.Nm
-.Cm create
-.Op Fl pu
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... Ar filesystem
-.Nm
-.Cm create
-.Op Fl ps
-.Op Fl b Ar blocksize
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
-.Fl V
-.Ar size volume
-.Nm
-.Cm destroy
-.Op Fl fnpRrv
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm destroy
-.Op Fl dnpRrv
-.Sm off
-.Ar filesystem Ns | Ns volume
-.Ns @snap
-.Op % Ns Ar snap
-.Op , Ns Ar snap Op % Ns Ar snap
-.Op , Ns ...
-.Sm on
-.Nm
-.Cm destroy
-.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark
-.Nm
-.Cm snapshot Ns | Ns Cm snap
-.Op Fl r
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
-.Ar filesystem@snapname Ns | Ns Ar volume@snapname
-.Ar filesystem@snapname Ns | Ns Ar volume@snapname Ns ...
-.Nm
-.Cm rollback
-.Op Fl rRf
-.Ar snapshot
-.Nm
-.Cm clone
-.Op Fl p
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
-.Ar snapshot filesystem Ns | Ns Ar volume
-.Nm
-.Cm promote
-.Ar clone-filesystem
-.Nm
-.Cm rename
-.Op Fl f
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Nm
-.Cm rename
-.Op Fl f
-.Fl p
-.Ar filesystem Ns | Ns Ar volume
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm rename
-.Fl r
-.Ar snapshot snapshot
-.Nm
-.Cm rename
-.Ar bookmark bookmark
-.Nm
-.Cm rename
-.Fl u
-.Op Fl p
-.Ar filesystem filesystem
-.Nm
-.Cm list
-.Op Fl r Ns | Ns Fl d Ar depth
-.Op Fl Hp
-.Op Fl o Ar property Ns Oo , Ns property Ns Oc Ns ...
-.Op Fl t Ar type Ns Oo , Ns type Ns Oc Ns ...
-.Oo Fl s Ar property Oc Ns ...
-.Oo Fl S Ar property Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot | Ns Ar bookmark Ns ...
-.Nm
-.Cm remap
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm set
-.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ...
-.Nm
-.Cm get
-.Op Fl r Ns | Ns Fl d Ar depth
-.Op Fl Hp
-.Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ...
-.Op Fl t Ar type Ns Oo Ns , Ar type Oc Ns ...
-.Op Fl s Ar source Ns Oo Ns , Ns Ar source Oc Ns ...
-.Ar all | property Ns Oo Ns , Ns Ar property Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ...
-.Nm
-.Cm inherit
-.Op Fl rS
-.Ar property
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ...
-.Nm
-.Cm upgrade
-.Op Fl v
-.Nm
-.Cm upgrade
-.Op Fl r
-.Op Fl V Ar version
-.Fl a | Ar filesystem
-.Nm
-.Cm userspace
-.Op Fl Hinp
-.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
-.Oo Fl s Ar field Oc Ns ...
-.Oo Fl S Ar field Oc Ns ...
-.Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ...
-.Ar filesystem Ns | Ns Ar snapshot
-.Nm
-.Cm groupspace
-.Op Fl Hinp
-.Op Fl o Ar field Ns Oo , Ns field Oc Ns ...
-.Oo Fl s Ar field Oc Ns ...
-.Oo Fl S Ar field Oc Ns ...
-.Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ...
-.Ar filesystem Ns | Ns Ar snapshot
-.Nm
-.Cm mount
-.Nm
-.Cm mount
-.Op Fl vO
-.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ...
-.Fl a | Ar filesystem
-.Nm
-.Cm unmount Ns | Ns Cm umount
-.Op Fl f
-.Fl a | Ar filesystem Ns | Ns Ar mountpoint
-.Nm
-.Cm share
-.Fl a | Ar filesystem
-.Nm
-.Cm unshare
-.Fl a | Ar filesystem Ns | Ns Ar mountpoint
-.Nm
-.Cm bookmark
-.Ar snapshot
-.Ar bookmark
-.Nm
-.Cm send
-.Op Fl DLPRVcenpv
-.Op Fl i Ar snapshot | Fl I Ar snapshot
-.Ar snapshot
-.Nm
-.Cm send
-.Op Fl LPcenv
-.Op Fl i Ar snapshot Ns | Ns Ar bookmark
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Nm
-.Cm send
-.Op Fl PVenv
-.Fl t Ar receive_resume_token
-.Nm
-.Cm receive Ns | Ns Cm recv
-.Op Fl vnsFMu
-.Op Fl o Sy origin Ns = Ns Ar snapshot
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Nm
-.Cm receive Ns | Ns Cm recv
-.Op Fl vnsFMu
-.Op Fl d | e
-.Op Fl o Sy origin Ns = Ns Ar snapshot
-.Ar filesystem
-.Nm
-.Cm receive Ns | Ns Cm recv
-.Fl A
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm allow
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm allow
-.Op Fl ldug
-.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ...
-.Ar perm Ns | Ns Ar @setname Ns
-.Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm allow
-.Op Fl ld
-.Fl e Ns | Ns Cm everyone
-.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm allow
-.Fl c
-.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm allow
-.Fl s
-.Ar @setname
-.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm unallow
-.Op Fl rldug
-.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ...
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm unallow
-.Op Fl rld
-.Fl e Ns | Ns Cm everyone
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm unallow
-.Op Fl r
-.Fl c
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm unallow
-.Op Fl r
-.Fl s
-.Ar @setname
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Nm
-.Cm hold
-.Op Fl r
-.Ar tag snapshot Ns ...
-.Nm
-.Cm holds
-.Op Fl Hp
-.Op Fl r Ns | Ns Fl d Ar depth
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns
-.Ns ...
-.Nm
-.Cm release
-.Op Fl r
-.Ar tag snapshot Ns ...
-.Nm
-.Cm diff
-.Op Fl FHt
-.Ar snapshot
-.Op Ar snapshot Ns | Ns Ar filesystem
-.Nm
-.Cm program
-.Op Fl jn
-.Op Fl t Ar timeout
-.Op Fl m Ar memory_limit
-.Ar pool script
-.Op Ar arg1 No ...
-.Nm
-.Cm jail
-.Ar jailid Ns | Ns Ar jailname filesystem
-.Nm
-.Cm unjail
-.Ar jailid Ns | Ns Ar jailname filesystem
-.Sh DESCRIPTION
-The
-.Nm
-command configures
-.Tn ZFS
-datasets within a
-.Tn ZFS
-storage pool, as described in
-.Xr zpool 8 .
-A dataset is identified by a unique path within the
-.Tn ZFS
-namespace. For example:
-.Bd -ragged -offset 4n
-.No pool/ Ns Brq filesystem,volume,snapshot
-.Ed
-.Pp
-where the maximum length of a dataset name is
-.Dv MAXNAMELEN
-(256 bytes)
-and the maximum amount of nesting allowed in a path is 50 levels deep.
-.Pp
-A dataset can be one of the following:
-.Bl -hang -width 12n
-.It Sy file system
-A
-.Tn ZFS
-dataset of type
-.Em filesystem
-can be mounted within the standard system namespace and behaves like other file
-systems. While
-.Tn ZFS
-file systems are designed to be
-.Tn POSIX
-compliant, known issues exist that prevent compliance in some cases.
-Applications that depend on standards conformance might fail due to nonstandard
-behavior when checking file system free space.
-.It Sy volume
-A logical volume exported as a raw or block device. This type of dataset should
-only be used under special circumstances. File systems are typically used in
-most environments.
-.It Sy snapshot
-A read-only version of a file system or volume at a given point in time. It is
-specified as
-.Em filesystem@name
-or
-.Em volume@name .
-.El
-.Ss ZFS File System Hierarchy
-A
-.Tn ZFS
-storage pool is a logical collection of devices that provide space for
-datasets. A storage pool is also the root of the
-.Tn ZFS
-file system hierarchy.
-.Pp
-The root of the pool can be accessed as a file system, such as mounting and
-unmounting, taking snapshots, and setting properties. The physical storage
-characteristics, however, are managed by the
-.Xr zpool 8
-command.
-.Pp
-See
-.Xr zpool 8
-for more information on creating and administering pools.
-.Ss Snapshots
-A snapshot is a read-only copy of a file system or volume. Snapshots can be
-created extremely quickly, and initially consume no additional space within the
-pool. As data within the active dataset changes, the snapshot consumes more
-data than would otherwise be shared with the active dataset.
-.Pp
-Snapshots can have arbitrary names. Snapshots of volumes can be cloned or
-rolled back, but cannot be accessed independently.
-.Pp
-File system snapshots can be accessed under the
-.Pa \&.zfs/snapshot
-directory in the root of the file system. Snapshots are automatically mounted
-on demand and may be unmounted at regular intervals. The visibility of the
-.Pa \&.zfs
-directory can be controlled by the
-.Sy snapdir
-property.
-.Ss Clones
-A clone is a writable volume or file system whose initial contents are the same
-as another dataset. As with snapshots, creating a clone is nearly
-instantaneous, and initially consumes no additional space.
-.Pp
-Clones can only be created from a snapshot. When a snapshot is cloned, it
-creates an implicit dependency between the parent and child. Even though the
-clone is created somewhere else in the dataset hierarchy, the original snapshot
-cannot be destroyed as long as a clone exists. The
-.Sy origin
-property exposes this dependency, and the
-.Cm destroy
-command lists any such dependencies, if they exist.
-.Pp
-The clone parent-child dependency relationship can be reversed by using the
-.Cm promote
-subcommand. This causes the "origin" file system to become a clone of the
-specified file system, which makes it possible to destroy the file system that
-the clone was created from.
-.Ss Mount Points
-Creating a
-.Tn ZFS
-file system is a simple operation, so the number of file systems per system is
-likely to be numerous. To cope with this,
-.Tn ZFS
-automatically manages mounting and unmounting file systems without the need to
-edit the
-.Pa /etc/fstab
-file. All automatically managed file systems are mounted by
-.Tn ZFS
-at boot time.
-.Pp
-By default, file systems are mounted under
-.Pa /path ,
-where
-.Ar path
-is the name of the file system in the
-.Tn ZFS
-namespace. Directories are created and destroyed as needed.
-.Pp
-A file system can also have a mount point set in the
-.Sy mountpoint
-property. This directory is created as needed, and
-.Tn ZFS
-automatically mounts the file system when the
-.Qq Nm Cm mount Fl a
-command is invoked (without editing
-.Pa /etc/fstab ) .
-The
-.Sy mountpoint
-property can be inherited, so if
-.Em pool/home
-has a mount point of
-.Pa /home ,
-then
-.Em pool/home/user
-automatically inherits a mount point of
-.Pa /home/user .
-.Pp
-A file system
-.Sy mountpoint
-property of
-.Cm none
-prevents the file system from being mounted.
-.Pp
-If needed,
-.Tn ZFS
-file systems can also be managed with traditional tools
-.Pq Xr mount 8 , Xr umount 8 , Xr fstab 5 .
-If a file system's mount point is set to
-.Cm legacy ,
-.Tn ZFS
-makes no attempt to manage the file system, and the administrator is
-responsible for mounting and unmounting the file system.
-.Ss Jails
-.No A Tn ZFS
-dataset can be attached to a jail by using the
-.Qq Nm Cm jail
-subcommand. You cannot attach a dataset to one jail and the children of the
-same dataset to another jail. You can also not attach the root file system
-of the jail or any dataset which needs to be mounted before the zfs rc script
-is run inside the jail, as it would be attached unmounted until it is
-mounted from the rc script inside the jail. To allow management of the
-dataset from within a jail, the
-.Sy jailed
-property has to be set and the jail needs access to the
-.Pa /dev/zfs
-device. The
-.Sy quota
-property cannot be changed from within a jail. See
-.Xr jail 8
-for information on how to allow mounting
-.Tn ZFS
-datasets from within a jail.
-.Pp
-.No A Tn ZFS
-dataset can be detached from a jail using the
-.Qq Nm Cm unjail
-subcommand.
-.Pp
-After a dataset is attached to a jail and the jailed property is set, a jailed
-file system cannot be mounted outside the jail, since the jail administrator
-might have set the mount point to an unacceptable value.
-.Ss Deduplication
-Deduplication is the process for removing redundant data at the block-level,
-reducing the total amount of data stored. If a file system has the
-.Cm dedup
-property enabled, duplicate data blocks are removed synchronously. The result
-is that only unique data is stored and common components are shared among
-files.
-.Ss Native Properties
-Properties are divided into two types, native properties and user-defined (or
-"user") properties. Native properties either export internal statistics or
-control
-.Tn ZFS
-behavior. In addition, native properties are either editable or read-only. User
-properties have no effect on
-.Tn ZFS
-behavior, but you can use them to annotate datasets in a way that is meaningful
-in your environment. For more information about user properties, see the
-.Qq Sx User Properties
-section, below.
-.Pp
-Every dataset has a set of properties that export statistics about the dataset
-as well as control various behaviors. Properties are inherited from the parent
-unless overridden by the child. Some properties apply only to certain types of
-datasets (file systems, volumes, or snapshots).
-.Pp
-The values of numeric properties can be specified using human-readable suffixes
-(for example,
-.Sy k , KB , M , Gb ,
-and so forth, up to
-.Sy Z
-for zettabyte). The following are all valid (and equal) specifications:
-.Bd -ragged -offset 4n
-1536M, 1.5g, 1.50GB
-.Ed
-.Pp
-The values of non-numeric properties are case sensitive and must be lowercase,
-except for
-.Sy mountpoint , sharenfs , No and Sy sharesmb .
-.Pp
-The following native properties consist of read-only statistics about the
-dataset. These properties can be neither set, nor inherited. Native properties
-apply to all dataset types unless otherwise noted.
-.Bl -tag -width 2n
-.It Sy available
-The amount of space available to the dataset and all its children, assuming
-that there is no other activity in the pool. Because space is shared within a
-pool, availability can be limited by any number of factors, including physical
-pool size, quotas, reservations, or other datasets within the pool.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy avail .
-.It Sy compressratio
-For non-snapshots, the compression ratio achieved for the
-.Sy used
-space of this dataset, expressed as a multiplier. The
-.Sy used
-property includes descendant datasets, and, for clones, does not include
-the space shared with the origin snapshot. For snapshots, the
-.Sy compressratio
-is the same as the
-.Sy refcompressratio
-property. Compression can be turned on by running:
-.Qq Nm Cm set compression=on Ar dataset
-The default value is
-.Cm off .
-.It Sy createtxg
-The transaction group (txg) in which the dataset was created.
-Bookmarks have the same
-.Sy createtxg
-as the snapshot they are initially tied to.
-This property is suitable for ordering a list of snapshots,
-e.g. for incremental send and receive.
-.It Sy creation
-The time this dataset was created.
-.It Sy clones
-For snapshots, this property is a comma-separated list of filesystems or
-volumes which are clones of this snapshot. The clones'
-.Sy origin
-property is this snapshot. If the
-.Sy clones
-property is not empty, then this snapshot can not be destroyed (even with the
-.Fl r
-or
-.Fl f
-options).
-.It Sy defer_destroy
-This property is
-.Cm on
-if the snapshot has been marked for deferred destroy by using the
-.Qq Nm Cm destroy -d
-command. Otherwise, the property is
-.Cm off .
-.It Sy filesystem_count
-The total number of filesystems and volumes that exist under this location in the
-dataset tree.
-This value is only available when a
-.Sy filesystem_limit
-has
-been set somewhere in the tree under which the dataset resides.
-.It Sy guid
-The 64 bit GUID of this dataset or bookmark which does not change over its
-entire lifetime.
-When a snapshot is sent to another pool, the received snapshot has the same
-GUID.
-Thus, the
-.Sy guid
-is suitable to identify a snapshot across pools.
-.It Sy logicalreferenced
-The amount of space that is
-.Qq logically
-accessible by this dataset.
-See the
-.Sy referenced
-property.
-The logical space ignores the effect of the
-.Sy compression
-and
-.Sy copies
-properties, giving a quantity closer to the amount of data that applications
-see.
-However, it does include space consumed by metadata.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy lrefer .
-.It Sy logicalused
-The amount of space that is
-.Qq logically
-consumed by this dataset and all its descendents.
-See the
-.Sy used
-property.
-The logical space ignores the effect of the
-.Sy compression
-and
-.Sy copies
-properties, giving a quantity closer to the amount of data that applications
-see.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy lused .
-.It Sy mounted
-For file systems, indicates whether the file system is currently mounted. This
-property can be either
-.Cm yes
-or
-.Cm no .
-.It Sy origin
-For cloned file systems or volumes, the snapshot from which the clone was
-created. See also the
-.Sy clones
-property.
-.It Sy receive_resume_token
-For filesystems or volumes which have saved partially-completed state from
-.Sy zfs receive -s ,
-this opaque token can be provided to
-.Sy zfs send -t
-to resume and complete the
-.Sy zfs receive .
-.It Sy referenced
-The amount of data that is accessible by this dataset, which may or may not be
-shared with other datasets in the pool. When a snapshot or clone is created, it
-initially references the same amount of space as the file system or snapshot it
-was created from, since its contents are identical.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy refer .
-.It Sy refcompressratio
-The compression ratio achieved for the
-.Sy referenced
-space of this dataset, expressed as a multiplier. See also the
-.Sy compressratio
-property.
-.It Sy snapshot_count
-The total number of snapshots that exist under this location in the dataset tree.
-This value is only available when a
-.Sy snapshot_limit
-has been set somewhere
-in the tree under which the dataset resides.
-.It Sy type
-The type of dataset:
-.Sy filesystem , volume , No or Sy snapshot .
-.It Sy used
-The amount of space consumed by this dataset and all its descendents. This is
-the value that is checked against this dataset's quota and reservation. The
-space used does not include this dataset's reservation, but does take into
-account the reservations of any descendent datasets. The amount of space that a
-dataset consumes from its parent, as well as the amount of space that are freed
-if this dataset is recursively destroyed, is the greater of its space used and
-its reservation.
-.Pp
-When snapshots (see the
-.Qq Sx Snapshots
-section) are created, their space is
-initially shared between the snapshot and the file system, and possibly with
-previous snapshots. As the file system changes, space that was previously
-shared becomes unique to the snapshot, and counted in the snapshot's space
-used. Additionally, deleting snapshots can increase the amount of space unique
-to (and used by) other snapshots.
-.Pp
-The amount of space used, available, or referenced does not take into account
-pending changes. Pending changes are generally accounted for within a few
-seconds. Committing a change to a disk using
-.Xr fsync 2
-or
-.Sy O_SYNC
-does not necessarily guarantee that the space usage information is updated
-immediately.
-.It Sy usedby*
-The
-.Sy usedby*
-properties decompose the
-.Sy used
-properties into the various reasons that space is used. Specifically,
-.Sy used No =
-.Sy usedbysnapshots + usedbydataset + usedbychildren + usedbyrefreservation .
-These properties are only available for datasets created
-with
-.Tn ZFS
-pool version 13 pools and higher.
-.It Sy usedbysnapshots
-The amount of space consumed by snapshots of this dataset. In particular, it is
-the amount of space that would be freed if all of this dataset's snapshots were
-destroyed. Note that this is not simply the sum of the snapshots'
-.Sy used
-properties because space can be shared by multiple snapshots.
-.It Sy usedbydataset
-The amount of space used by this dataset itself, which would be freed if the
-dataset were destroyed (after first removing any
-.Sy refreservation
-and destroying any necessary snapshots or descendents).
-.It Sy usedbychildren
-The amount of space used by children of this dataset, which would be freed if
-all the dataset's children were destroyed.
-.It Sy usedbyrefreservation
-The amount of space used by a
-.Sy refreservation
-set on this dataset, which would be freed if the
-.Sy refreservation
-was removed.
-.It Sy userused@ Ns Ar user
-The amount of space consumed by the specified user in this dataset. Space is
-charged to the owner of each file, as displayed by
-.Qq Nm ls Fl l .
-The amount of space charged is displayed by
-.Qq Nm du
-and
-.Qq Nm ls Fl s .
-See the
-.Qq Nm Cm userspace
-subcommand for more information.
-.Pp
-Unprivileged users can access only their own space usage. The root user, or a
-user who has been granted the
-.Sy userused
-privilege with
-.Qq Nm Cm allow ,
-can access everyone's usage.
-.Pp
-The
-.Sy userused@ Ns ...
-properties are not displayed by
-.Qq Nm Cm get all .
-The user's name must be appended after the
-.Sy @
-symbol, using one of the following forms:
-.Bl -bullet -offset 2n
-.It
-POSIX name (for example,
-.Em joe )
-.It
-POSIX numeric ID (for example,
-.Em 1001 )
-.El
-.It Sy userrefs
-This property is set to the number of user holds on this snapshot. User holds
-are set by using the
-.Qq Nm Cm hold
-command.
-.It Sy groupused@ Ns Ar group
-The amount of space consumed by the specified group in this dataset. Space is
-charged to the group of each file, as displayed by
-.Nm ls Fl l .
-See the
-.Sy userused@ Ns Ar user
-property for more information.
-.Pp
-Unprivileged users can only access their own groups' space usage. The root
-user, or a user who has been granted the
-.Sy groupused
-privilege with
-.Qq Nm Cm allow ,
-can access all groups' usage.
-.It Sy volblocksize Ns = Ns Ar blocksize
-For volumes, specifies the block size of the volume. The
-.Ar blocksize
-cannot be changed once the volume has been written, so it should be set at
-volume creation time. The default
-.Ar blocksize
-for volumes is 8 Kbytes. Any
-power of 2 from 512 bytes to 128 Kbytes is valid.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy volblock .
-.It Sy written
-The amount of
-.Sy referenced
-space written to this dataset since the previous snapshot.
-.It Sy written@ Ns Ar snapshot
-The amount of
-.Sy referenced
-space written to this dataset since the specified snapshot. This is the space
-that is referenced by this dataset but was not referenced by the specified
-snapshot.
-.Pp
-The
-.Ar snapshot
-may be specified as a short snapshot name (just the part after the
-.Sy @ ) ,
-in which case it will be interpreted as a snapshot in the same filesystem as
-this dataset. The
-.Ar snapshot
-may be a full snapshot name
-.Pq Em filesystem@snapshot ,
-which for clones may be a snapshot in the origin's filesystem (or the origin of
-the origin's filesystem, etc).
-.El
-.Pp
-The following native properties can be used to change the behavior of a
-.Tn ZFS
-dataset.
-.Bl -tag -width 2n
-.It Xo
-.Sy aclinherit Ns = Ns Cm discard |
-.Cm noallow |
-.Cm restricted |
-.Cm passthrough |
-.Cm passthrough-x
-.Xc
-Controls how
-.Tn ACL
-entries are inherited when files and directories are created. A file system
-with an
-.Sy aclinherit
-property of
-.Cm discard
-does not inherit any
-.Tn ACL
-entries. A file system with an
-.Sy aclinherit
-property value of
-.Cm noallow
-only inherits inheritable
-.Tn ACL
-entries that specify "deny" permissions. The property value
-.Cm restricted
-(the default) removes the
-.Em write_acl
-and
-.Em write_owner
-permissions when the
-.Tn ACL
-entry is inherited. A file system with an
-.Sy aclinherit
-property value of
-.Cm passthrough
-inherits all inheritable
-.Tn ACL
-entries without any modifications made to the
-.Tn ACL
-entries when they are inherited. A file system with an
-.Sy aclinherit
-property value of
-.Cm passthrough-x
-has the same meaning as
-.Cm passthrough ,
-except that the
-.Em owner@ , group@ , No and Em everyone@ Tn ACE Ns s
-inherit the execute permission only if the file creation mode also requests the
-execute bit.
-.Pp
-When the property value is set to
-.Cm passthrough ,
-files are created with a mode determined by the inheritable
-.Tn ACE Ns s.
-If no inheritable
-.Tn ACE Ns s
-exist that affect the mode, then the mode is set in accordance to the requested
-mode from the application.
-.It Sy aclmode Ns = Ns Cm discard | groupmask | passthrough | restricted
-Controls how an
-.Tn ACL
-is modified during
-.Xr chmod 2 .
-A file system with an
-.Sy aclmode
-property of
-.Cm discard
-(the default) deletes all
-.Tn ACL
-entries that do not represent the mode of the file. An
-.Sy aclmode
-property of
-.Cm groupmask
-reduces permissions granted in all
-.Em ALLOW
-entries found in the
-.Tn ACL
-such that they are no greater than the group permissions specified by
-.Xr chmod 2 .
-A file system with an
-.Sy aclmode
-property of
-.Cm passthrough
-indicates that no changes are made to the
-.Tn ACL
-other than creating or updating the necessary
-.Tn ACL
-entries to represent the new mode of the file or directory.
-An
-.Sy aclmode
-property of
-.Cm restricted
-will cause the
-.Xr chmod 2
-operation to return an error when used on any file or directory which has
-a non-trivial
-.Tn ACL
-whose entries can not be represented by a mode.
-.Xr chmod 2
-is required to change the set user ID, set group ID, or sticky bits on a file
-or directory, as they do not have equivalent
-.Tn ACL
-entries.
-In order to use
-.Xr chmod 2
-on a file or directory with a non-trivial
-.Tn ACL
-when
-.Sy aclmode
-is set to
-.Cm restricted ,
-you must first remove all
-.Tn ACL
-entries which do not represent the current mode.
-.It Sy atime Ns = Ns Cm on | off
-Controls whether the access time for files is updated when they are read.
-Turning this property off avoids producing write traffic when reading files and
-can result in significant performance gains, though it might confuse mailers
-and other similar utilities. The default value is
-.Cm on .
-.It Sy canmount Ns = Ns Cm on | off | noauto
-If this property is set to
-.Cm off ,
-the file system cannot be mounted, and is ignored by
-.Qq Nm Cm mount Fl a .
-Setting this property to
-.Cm off
-is similar to setting the
-.Sy mountpoint
-property to
-.Cm none ,
-except that the dataset still has a normal
-.Sy mountpoint
-property, which can be inherited. Setting this property to
-.Cm off
-allows datasets to be used solely as a mechanism to inherit properties. One
-example of setting
-.Sy canmount Ns = Ns Cm off
-is to have two datasets with the same
-.Sy mountpoint ,
-so that the children of both datasets appear in the same directory, but might
-have different inherited characteristics.
-.Pp
-When the
-.Cm noauto
-value is set, a dataset can only be mounted and unmounted explicitly. The
-dataset is not mounted automatically when the dataset is created or imported,
-nor is it mounted by the
-.Qq Nm Cm mount Fl a
-command or unmounted by the
-.Qq Nm Cm umount Fl a
-command.
-.Pp
-This property is not inherited.
-.It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256 | noparity | sha512 | skein
-Controls the checksum used to verify data integrity. The default value is
-.Cm on ,
-which automatically selects an appropriate algorithm (currently,
-.Cm fletcher4 ,
-but this may change in future releases). The value
-.Cm off
-disables integrity checking on user data.
-The value
-.Cm noparity
-not only
-disables integrity but also disables maintaining parity for user data. This
-setting is used internally by a dump device residing on a RAID-Z pool and should
-not be used by any other dataset.
-Disabling checksums is
-.Em NOT
-a recommended practice.
-The
-.Sy sha512 ,
-and
-.Sy skein
-checksum algorithms require enabling the appropriate features on the pool.
-Please see
-.Xr zpool-features 7
-for more information on these algorithms.
-.Pp
-Changing this property affects only newly-written data.
-.Pp
-The salted checksum algorithm
-.Pq Cm edonr
-is currently not supported on FreeBSD.
-.It Sy compression Ns = Ns Cm on | off | lzjb | gzip | gzip- Ns Ar N | Cm zle | Cm lz4
-Controls the compression algorithm used for this dataset.
-Setting compression to
-.Cm on
-indicates that the current default compression algorithm should be used.
-The default balances compression and decompression speed, with compression
-ratio and is expected to work well on a wide variety of workloads.
-Unlike all other settings for this property, on does not select a fixed
-compression type.
-As new compression algorithms are added to ZFS and enabled on a pool, the
-default compression algorithm may change.
-The current default compression algorthm is either
-.Cm lzjb
-or, if the
-.Sy lz4_compress
-feature is enabled,
-.Cm lz4 .
-The
-.Cm lzjb
-compression algorithm is optimized for performance while providing decent data
-compression. Setting compression to
-.Cm on
-uses the
-.Cm lzjb
-compression algorithm. The
-.Cm gzip
-compression algorithm uses the same compression as the
-.Xr gzip 1
-command. You can specify the
-.Cm gzip
-level by using the value
-.Cm gzip- Ns Ar N
-where
-.Ar N
-is an integer from 1 (fastest) to 9 (best compression ratio). Currently,
-.Cm gzip
-is equivalent to
-.Cm gzip-6
-(which is also the default for
-.Xr gzip 1 ) .
-The
-.Cm zle
-compression algorithm compresses runs of zeros.
-.Pp
-The
-.Sy lz4
-compression algorithm is a high-performance replacement
-for the
-.Sy lzjb
-algorithm. It features significantly faster
-compression and decompression, as well as a moderately higher
-compression ratio than
-.Sy lzjb ,
-but can only be used on pools with
-the
-.Sy lz4_compress
-feature set to
-.Sy enabled .
-See
-.Xr zpool-features 7
-for details on ZFS feature flags and the
-.Sy lz4_compress
-feature.
-.Pp
-This property can also be referred to by its shortened column name
-.Cm compress .
-Changing this property affects only newly-written data.
-.It Sy copies Ns = Ns Cm 1 | 2 | 3
-Controls the number of copies of data stored for this dataset. These copies are
-in addition to any redundancy provided by the pool, for example, mirroring or
-RAID-Z. The copies are stored on different disks, if possible. The space used
-by multiple copies is charged to the associated file and dataset, changing the
-.Sy used
-property and counting against quotas and reservations.
-.Pp
-Changing this property only affects newly-written data. Therefore, set this
-property at file system creation time by using the
-.Fl o Cm copies= Ns Ar N
-option.
-.It Sy dedup Ns = Ns Cm on | off | verify | sha256 Ns Oo Cm ,verify Oc | Sy sha512 Ns Oo Cm ,verify Oc | Sy skein Ns Oo Cm ,verify Oc
-Configures deduplication for a dataset. The default value is
-.Cm off .
-The default deduplication checksum is
-.Cm sha256
-(this may change in the future).
-When
-.Sy dedup
-is enabled, the checksum defined here overrides the
-.Sy checksum
-property. Setting the value to
-.Cm verify
-has the same effect as the setting
-.Cm sha256,verify .
-.Pp
-If set to
-.Cm verify ,
-.Tn ZFS
-will do a byte-to-byte comparsion in case of two blocks having the same
-signature to make sure the block contents are identical.
-.It Sy devices Ns = Ns Cm on | off
-The
-.Sy devices
-property is currently not supported on
-.Fx .
-.It Sy exec Ns = Ns Cm on | off
-Controls whether processes can be executed from within this file system. The
-default value is
-.Cm on .
-.It Sy mlslabel Ns = Ns Ar label | Cm none
-The
-.Sy mlslabel
-property is currently not supported on
-.Fx .
-.It Sy filesystem_limit Ns = Ns Ar count | Cm none
-Limits the number of filesystems and volumes that can exist under this point in
-the dataset tree.
-The limit is not enforced if the user is allowed to change
-the limit.
-Setting a
-.Sy filesystem_limit
-on a descendent of a filesystem that
-already has a
-.Sy filesystem_limit
-does not override the ancestor's
-.Sy filesystem_limit ,
-but rather imposes an additional limit.
-This feature must be enabled to be used
-.Po see
-.Xr zpool-features 7
-.Pc .
-.It Sy special_small_blocks Ns = Ns Ar size
-This value represents the threshold block size for including small file
-blocks into the special allocation class.
-Blocks smaller than or equal to this value will be assigned to the special
-allocation class while greater blocks will be assigned to the regular class.
-Valid values are zero or a power of two from 512B up to 128K.
-The default size is 0 which means no small file blocks will be allocated in
-the special class.
-.Pp
-Before setting this property, a special class vdev must be added to the
-pool.
-See
-.Xr zpool 8
-for more details on the special allocation class.
-.It Sy mountpoint Ns = Ns Ar path | Cm none | legacy
-Controls the mount point used for this file system.
-See the
-.Qq Sx Mount Points
-section for more information on how this property is used.
-.Pp
-When the
-.Sy mountpoint
-property is changed for a file system, the file system and any children that
-inherit the mount point are unmounted. If the new value is
-.Cm legacy ,
-then they remain unmounted. Otherwise, they are automatically remounted in the
-new location if the property was previously
-.Cm legacy
-or
-.Cm none ,
-or if they were mounted before the property was changed. In addition, any
-shared file systems are unshared and shared in the new location.
-.It Sy nbmand Ns = Ns Cm on | off
-The
-.Sy nbmand
-property is currently not supported on
-.Fx .
-.It Sy primarycache Ns = Ns Cm all | none | metadata
-Controls what is cached in the primary cache (ARC). If this property is set to
-.Cm all ,
-then both user data and metadata is cached. If this property is set to
-.Cm none ,
-then neither user data nor metadata is cached. If this property is set to
-.Cm metadata ,
-then only metadata is cached. The default value is
-.Cm all .
-.It Sy quota Ns = Ns Ar size | Cm none
-Limits the amount of space a dataset and its descendents can consume. This
-property enforces a hard limit on the amount of space used. This includes all
-space consumed by descendents, including file systems and snapshots. Setting a
-quota on a descendent of a dataset that already has a quota does not override
-the ancestor's quota, but rather imposes an additional limit.
-.Pp
-Quotas cannot be set on volumes, as the
-.Sy volsize
-property acts as an implicit quota.
-.It Sy snapshot_limit Ns = Ns Ar count | Cm none
-Limits the number of snapshots that can be created on a dataset and its
-descendents.
-Setting a
-.Sy snapshot_limit
-on a descendent of a dataset that already
-has a
-.Sy snapshot_limit
-does not override the ancestor's
-.Sy snapshot_limit ,
-but
-rather imposes an additional limit.
-The limit is not enforced if the user is
-allowed to change the limit.
-For example, this means that recursive snapshots
-taken from the global zone are counted against each delegated dataset within
-a jail.
-This feature must be enabled to be used
-.Po see
-.Xr zpool-features 7
-.Pc .
-.It Sy userquota@ Ns Ar user Ns = Ns Ar size | Cm none
-Limits the amount of space consumed by the specified user.
-Similar to the
-.Sy refquota
-property, the
-.Sy userquota
-space calculation does not include space that is used by descendent datasets,
-such as snapshots and clones. User space consumption is identified by the
-.Sy userspace@ Ns Ar user
-property.
-.Pp
-Enforcement of user quotas may be delayed by several seconds. This delay means
-that a user might exceed their quota before the system notices that they are
-over quota and begins to refuse additional writes with the
-.Em EDQUOT
-error message. See the
-.Cm userspace
-subcommand for more information.
-.Pp
-Unprivileged users can only access their own groups' space usage. The root
-user, or a user who has been granted the
-.Sy userquota
-privilege with
-.Qq Nm Cm allow ,
-can get and set everyone's quota.
-.Pp
-This property is not available on volumes, on file systems before version 4, or
-on pools before version 15. The
-.Sy userquota@ Ns ...
-properties are not displayed by
-.Qq Nm Cm get all .
-The user's name must be appended after the
-.Sy @
-symbol, using one of the following forms:
-.Bl -bullet -offset 2n
-.It
-POSIX name (for example,
-.Em joe )
-.It
-POSIX numeric ID (for example,
-.Em 1001 )
-.El
-.It Sy groupquota@ Ns Ar group Ns = Ns Ar size | Cm none
-Limits the amount of space consumed by the specified group. Group space
-consumption is identified by the
-.Sy userquota@ Ns Ar user
-property.
-.Pp
-Unprivileged users can access only their own groups' space usage. The root
-user, or a user who has been granted the
-.Sy groupquota
-privilege with
-.Qq Nm Cm allow ,
-can get and set all groups' quotas.
-.It Sy readonly Ns = Ns Cm on | off
-Controls whether this dataset can be modified. The default value is
-.Cm off .
-.It Sy recordsize Ns = Ns Ar size
-Specifies a suggested block size for files in the file system. This property is
-designed solely for use with database workloads that access files in fixed-size
-records.
-.Tn ZFS
-automatically tunes block sizes according to internal algorithms optimized for
-typical access patterns.
-.Pp
-For databases that create very large files but access them in small random
-chunks, these algorithms may be suboptimal. Specifying a
-.Sy recordsize
-greater than or equal to the record size of the database can result in
-significant performance gains. Use of this property for general purpose file
-systems is strongly discouraged, and may adversely affect performance.
-.Pp
-The size specified must be a power of two greater than or equal to 512 and less
-than or equal to 128 Kbytes.
-If the
-.Sy large_blocks
-feature is enabled on the pool, the size may be up to 1 Mbyte.
-See
-.Xr zpool-features 7
-for details on ZFS feature flags.
-.Pp
-Changing the file system's
-.Sy recordsize
-affects only files created afterward; existing files are unaffected.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy recsize .
-.It Sy redundant_metadata Ns = Ns Cm all | most
-Controls what types of metadata are stored redundantly.
-ZFS stores an extra copy of metadata, so that if a single block is corrupted,
-the amount of user data lost is limited.
-This extra copy is in addition to any redundancy provided at the pool level
-.Pq e.g. by mirroring or RAID-Z ,
-and is in addition to an extra copy specified by the
-.Sy copies
-property
-.Pq up to a total of 3 copies .
-For example if the pool is mirrored,
-.Cm copies Ns = Ns Ar 2 ,
-and
-.Cm redundant_metadata Ns = Ns Ar most ,
-then ZFS
-stores 6 copies of most metadata, and 4 copies of data and some
-metadata.
-.Pp
-When set to
-.Cm all ,
-ZFS stores an extra copy of all metadata.
-If a
-single on-disk block is corrupt, at worst a single block of user data
-.Po which is
-.Cm recordsize
-bytes long
-can be lost.
-.Pc
-.Pp
-When set to
-.Cm most ,
-ZFS stores an extra copy of most types of
-metadata.
-This can improve performance of random writes, because less
-metadata must be written.
-In practice, at worst about 100 blocks
-.Po of
-.Cm recordsize
-bytes each
-.Pc
-of user data can be lost if a single
-on-disk block is corrupt.
-The exact behavior of which metadata blocks
-are stored redundantly may change in future releases.
-.Pp
-The default value is
-.Cm all .
-.It Sy refquota Ns = Ns Ar size | Cm none
-Limits the amount of space a dataset can consume. This property enforces a hard
-limit on the amount of space used. This hard limit does not include space used
-by descendents, including file systems and snapshots.
-.It Sy refreservation Ns = Ns Ar size | Cm none | Cm auto
-The minimum amount of space guaranteed to a dataset, not including its
-descendents. When the amount of space used is below this value, the dataset is
-treated as if it were taking up the amount of space specified by
-.Sy refreservation .
-The
-.Sy refreservation
-reservation is accounted for in the parent datasets' space used, and counts
-against the parent datasets' quotas and reservations.
-.Pp
-If
-.Sy refreservation
-is set, a snapshot is only allowed if there is enough free pool space outside
-of this reservation to accommodate the current number of "referenced" bytes in
-the dataset.
-.Pp
-If
-.Sy refreservation
-is set to
-.Sy auto ,
-a volume is thick provisioned or not sparse.
-.Sy refreservation Ns = Cm auto
-is only supported on volumes.
-See
-.Sy volsize
-in the Native Properties
-section for more information about sparse volumes.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy refreserv .
-.It Sy reservation Ns = Ns Ar size | Cm none
-The minimum amount of space guaranteed to a dataset and its descendents. When
-the amount of space used is below this value, the dataset is treated as if it
-were taking up the amount of space specified by its reservation. Reservations
-are accounted for in the parent datasets' space used, and count against the
-parent datasets' quotas and reservations.
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy reserv .
-.It Sy secondarycache Ns = Ns Cm all | none | metadata
-Controls what is cached in the secondary cache (L2ARC). If this property is set
-to
-.Cm all ,
-then both user data and metadata is cached. If this property is set to
-.Cm none ,
-then neither user data nor metadata is cached. If this property is set to
-.Cm metadata ,
-then only metadata is cached. The default value is
-.Cm all .
-.It Sy setuid Ns = Ns Cm on | off
-Controls whether the
-.No set- Ns Tn UID
-bit is respected for the file system. The default value is
-.Cm on .
-.It Sy sharesmb Ns = Ns Cm on | off | Ar opts
-The
-.Sy sharesmb
-property currently has no effect on
-.Fx .
-.It Sy sharenfs Ns = Ns Cm on | off | Ar opts
-Controls whether the file system is shared via
-.Tn NFS ,
-and what options are used. A file system with a
-.Sy sharenfs
-property of
-.Cm off
-is managed the traditional way via
-.Xr exports 5 .
-Otherwise, the file system is automatically shared and unshared with the
-.Qq Nm Cm share
-and
-.Qq Nm Cm unshare
-commands. If the property is set to
-.Cm on
-no
-.Tn NFS
-export options are used. Otherwise,
-.Tn NFS
-export options are equivalent to the contents of this property. The export
-options may be comma-separated. See
-.Xr exports 5
-for a list of valid options.
-.Pp
-When the
-.Sy sharenfs
-property is changed for a dataset, the
-.Xr mountd 8
-daemon is reloaded.
-.It Sy logbias Ns = Ns Cm latency | throughput
-Provide a hint to
-.Tn ZFS
-about handling of synchronous requests in this dataset.
-If
-.Sy logbias
-is set to
-.Cm latency
-(the default),
-.Tn ZFS
-will use pool log devices (if configured) to handle the requests at low
-latency. If
-.Sy logbias
-is set to
-.Cm throughput ,
-.Tn ZFS
-will not use configured pool log devices.
-.Tn ZFS
-will instead optimize synchronous operations for global pool throughput and
-efficient use of resources.
-.It Sy snapdir Ns = Ns Cm hidden | visible
-Controls whether the
-.Pa \&.zfs
-directory is hidden or visible in the root of the file system as discussed in
-the
-.Qq Sx Snapshots
-section. The default value is
-.Cm hidden .
-.It Sy sync Ns = Ns Cm standard | always | disabled
-Controls the behavior of synchronous requests (e.g.
-.Xr fsync 2 ,
-O_DSYNC). This property accepts the following values:
-.Bl -tag -offset 4n -width 8n
-.It Sy standard
-This is the POSIX specified behavior of ensuring all synchronous requests are
-written to stable storage and all devices are flushed to ensure data is not
-cached by device controllers (this is the default).
-.It Sy always
-All file system transactions are written and flushed before their system calls
-return. This has a large performance penalty.
-.It Sy disabled
-Disables synchronous requests. File system transactions are only committed to
-stable storage periodically. This option will give the highest performance.
-However, it is very dangerous as
-.Tn ZFS
-would be ignoring the synchronous transaction demands of applications such as
-databases or
-.Tn NFS .
-Administrators should only use this option when the risks are understood.
-.El
-.It Sy volsize Ns = Ns Ar size
-For volumes, specifies the logical size of the volume. By default, creating a
-volume establishes a reservation of equal size. For storage pools with a
-version number of 9 or higher, a
-.Sy refreservation
-is set instead. Any changes to
-.Sy volsize
-are reflected in an equivalent change to the reservation (or
-.Sy refreservation ) .
-The
-.Sy volsize
-can only be set to a multiple of
-.Cm volblocksize ,
-and cannot be zero.
-.Pp
-The reservation is kept equal to the volume's logical size to prevent
-unexpected behavior for consumers. Without the reservation, the volume could
-run out of space, resulting in undefined behavior or data corruption, depending
-on how the volume is used. These effects can also occur when the volume size is
-changed while it is in use (particularly when shrinking the size). Extreme care
-should be used when adjusting the volume size.
-.Pp
-Though not recommended, a "sparse volume" (also known as "thin provisioned")
-can be created by specifying the
-.Fl s
-option to the
-.Qq Nm Cm create Fl V
-command, or by changing the value of the
-.Sy refreservation
-property, or
-.Sy reservation
-property on pool
-.Po
-version 8 or earlier
-.Pc
-after the volume has been created.
-A "sparse volume" is a volume where the value of
-.Sy refreservation
-is less then the size of the volume plus the space required to store its
-metadata.
-Consequently, writes to a sparse volume can fail with
-.Sy ENOSPC
-when the pool is low on space. For a sparse volume, changes to
-.Sy volsize
-are not reflected in the
-.Sy refreservation .
-A volume that is not sparse is said to be "thick provisioned".
-A sparse volume can become thick provisioned by setting
-.Sy refreservation
-to
-.Sy auto .
-.It Sy volmode Ns = Ns Cm default | geom | dev | none
-This property specifies how volumes should be exposed to the OS.
-Setting it to
-.Sy geom
-exposes volumes as
-.Xr geom 4
-providers, providing maximal functionality.
-Setting it to
-.Sy dev
-exposes volumes only as cdev device in devfs.
-Such volumes can be accessed only as raw disk device files, i.e. they
-can not be partitioned, mounted, participate in RAIDs, etc, but they
-are faster, and in some use scenarios with untrusted consumer, such as
-NAS or VM storage, can be more safe.
-Volumes with property set to
-.Sy none
-are not exposed outside ZFS, but can be snapshoted, cloned, replicated, etc,
-that can be suitable for backup purposes.
-Value
-.Sy default
-means that volumes exposition is controlled by system-wide sysctl/tunable
-.Va vfs.zfs.vol.mode ,
-where
-.Sy geom ,
-.Sy dev
-and
-.Sy none
-are encoded as 1, 2 and 3 respectively.
-The default values is
-.Sy geom .
-This property can be changed any time, but so far it is processed only
-during volume creation and pool import.
-.It Sy vscan Ns = Ns Cm off | on
-The
-.Sy vscan
-property is currently not supported on
-.Fx .
-.It Sy xattr Ns = Ns Cm off | on
-The
-.Sy xattr
-property is currently not supported on
-.Fx .
-.It Sy jailed Ns = Ns Cm off | on
-Controls whether the dataset is managed from a jail. See the
-.Qq Sx Jails
-section for more information. The default value is
-.Cm off .
-.El
-.Pp
-The following three properties cannot be changed after the file system is
-created, and therefore, should be set when the file system is created. If the
-properties are not set with the
-.Qq Nm Cm create
-or
-.Nm zpool Cm create
-commands, these properties are inherited from the parent dataset. If the parent
-dataset lacks these properties due to having been created prior to these
-features being supported, the new file system will have the default values for
-these properties.
-.Bl -tag -width 4n
-.It Sy casesensitivity Ns = Ns Cm sensitive | insensitive | mixed
-Indicates whether the file name matching algorithm used by the file system
-should be case-sensitive, case-insensitive, or allow a combination of both
-styles of matching. The default value for the
-.Sy casesensitivity
-property is
-.Cm sensitive .
-Traditionally, UNIX and POSIX file systems have case-sensitive file names.
-.Pp
-The
-.Cm mixed
-value for the
-.Sy casesensitivity
-property indicates that the
-file system can support requests for both case-sensitive and case-insensitive
-matching behavior.
-.It Sy normalization Ns = Ns Cm none | formC | formD | formKC | formKD
-Indicates whether the file system should perform a
-.Sy unicode
-normalization of file names whenever two file names are compared, and which
-normalization algorithm should be used. File names are always stored
-unmodified, names are normalized as part of any comparison process. If this
-property is set to a legal value other than
-.Cm none ,
-and the
-.Sy utf8only
-property was left unspecified, the
-.Sy utf8only
-property is automatically set to
-.Cm on .
-The default value of the
-.Sy normalization
-property is
-.Cm none .
-This property cannot be changed after the file system is created.
-.It Sy utf8only Ns = Ns Cm on | off
-Indicates whether the file system should reject file names that include
-characters that are not present in the
-.Sy UTF-8
-character code set. If this property is explicitly set to
-.Cm off ,
-the normalization property must either not be explicitly set or be set to
-.Cm none .
-The default value for the
-.Sy utf8only
-property is
-.Cm off .
-This property cannot be changed after the file system is created.
-.El
-.Pp
-The
-.Sy casesensitivity , normalization , No and Sy utf8only
-properties are also new permissions that can be assigned to non-privileged
-users by using the
-.Tn ZFS
-delegated administration feature.
-.Ss Temporary Mount Point Properties
-When a file system is mounted, either through
-.Xr mount 8
-for legacy mounts or the
-.Qq Nm Cm mount
-command for normal file systems, its mount options are set according to its
-properties. The correlation between properties and mount options is as follows:
-.Bl -column -offset 4n "PROPERTY" "MOUNT OPTION"
-.It "PROPERTY MOUNT OPTION"
-.It "atime atime/noatime"
-.It "exec exec/noexec"
-.It "readonly ro/rw"
-.It "setuid suid/nosuid"
-.El
-.Pp
-In addition, these options can be set on a per-mount basis using the
-.Fl o
-option, without affecting the property that is stored on disk. The values
-specified on the command line override the values stored in the dataset. These
-properties are reported as "temporary" by the
-.Qq Nm Cm get
-command. If the properties are changed while the dataset is mounted, the new
-setting overrides any temporary settings.
-.Ss User Properties
-In addition to the standard native properties,
-.Tn ZFS
-supports arbitrary user properties. User properties have no effect on
-.Tn ZFS
-behavior, but applications or administrators can use them to annotate datasets
-(file systems, volumes, and snapshots).
-.Pp
-User property names must contain a colon
-.Pq Sy \&:
-character to distinguish them from native properties. They may contain
-lowercase letters, numbers, and the following punctuation characters: colon
-.Pq Sy \&: ,
-dash
-.Pq Sy \&- ,
-period
-.Pq Sy \&.
-and underscore
-.Pq Sy \&_ .
-The expected convention is that the property name is divided into two portions
-such as
-.Em module Ns Sy \&: Ns Em property ,
-but this namespace is not enforced by
-.Tn ZFS .
-User property names can be at most 256 characters, and cannot begin with a dash
-.Pq Sy \&- .
-.Pp
-When making programmatic use of user properties, it is strongly suggested to
-use a reversed
-.Tn DNS
-domain name for the
-.Ar module
-component of property names to reduce the chance that two
-independently-developed packages use the same property name for different
-purposes. Property names beginning with
-.Em com.sun
-are reserved for use by Sun Microsystems.
-.Pp
-The values of user properties are arbitrary strings, are always inherited, and
-are never validated. All of the commands that operate on properties
-.Po
-.Qq Nm Cm list ,
-.Qq Nm Cm get ,
-.Qq Nm Cm set
-and so forth
-.Pc
-can be used to manipulate both native properties and user properties. Use the
-.Qq Nm Cm inherit
-command to clear a user property. If the property is not defined in any parent
-dataset, it is removed entirely. Property values are limited to 1024
-characters.
-.Sh SUBCOMMANDS
-All subcommands that modify state are logged persistently to the pool in their
-original form.
-.Bl -tag -width 2n
-.It Xo
-.Nm
-.Op Fl \&?
-.Xc
-.Pp
-Displays a help message.
-.It Xo
-.Nm
-.Cm create
-.Op Fl pu
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
-.Ar filesystem
-.Xc
-.Pp
-Creates a new
-.Tn ZFS
-file system. The file system is automatically mounted according to the
-.Sy mountpoint
-property inherited from the parent.
-.Bl -tag -width indent
-.It Fl p
-Creates all the non-existing parent datasets. Datasets created in this manner
-are automatically mounted according to the
-.Sy mountpoint
-property inherited from their parent. Any property specified on the command
-line using the
-.Fl o
-option is ignored. If the target filesystem already exists, the operation
-completes successfully.
-.It Fl u
-Newly created file system is not mounted.
-.It Fl o Ar property Ns = Ns Ar value
-Sets the specified property as if the command
-.Qq Nm Cm set Ar property Ns = Ns Ar value
-was invoked at the same time the dataset was created. Any editable
-.Tn ZFS
-property can also be set at creation time. Multiple
-.Fl o
-options can be specified. An error results if the same property is specified in
-multiple
-.Fl o
-options.
-.El
-.It Xo
-.Nm
-.Cm create
-.Op Fl ps
-.Op Fl b Ar blocksize
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
-.Fl V
-.Ar size volume
-.Xc
-.Pp
-Creates a volume of the given size. The volume is exported as a block device in
-.Pa /dev/zvol/path ,
-where
-.Ar path
-is the name of the volume in the
-.Tn ZFS
-namespace. The size represents the logical size as exported by the device. By
-default, a reservation of equal size is created.
-.Pp
-.Ar size
-is automatically rounded up to the nearest 128 Kbytes to ensure that
-the volume has an integral number of blocks regardless of
-.Ar blocksize .
-.Bl -tag -width indent
-.It Fl p
-Creates all the non-existing parent datasets. Datasets created in this manner
-are automatically mounted according to the
-.Sy mountpoint
-property inherited from their parent. Any property specified on the command
-line using the
-.Fl o
-option is ignored. If the target filesystem already exists, the operation
-completes successfully.
-.It Fl s
-Creates a sparse volume with no reservation. See
-.Sy volsize
-in the
-.Qq Sx Native Properties
-section for more information about sparse volumes.
-.It Fl b Ar blocksize
-Equivalent to
-.Fl o Cm volblocksize Ns = Ns Ar blocksize .
-If this option is specified in conjunction with
-.Fl o Cm volblocksize ,
-the resulting behavior is undefined.
-.It Fl o Ar property Ns = Ns Ar value
-Sets the specified property as if the
-.Qq Nm Cm set Ar property Ns = Ns Ar value
-command was invoked at the same time the dataset was created. Any editable
-.Tn ZFS
-property can also be set at creation time. Multiple
-.Fl o
-options can be specified. An error results if the same property is specified in
-multiple
-.Fl o
-options.
-.El
-.It Xo
-.Nm
-.Cm destroy
-.Op Fl fnpRrv
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Destroys the given dataset. By default, the command unshares any file systems
-that are currently shared, unmounts any file systems that are currently
-mounted, and refuses to destroy a dataset that has active dependents (children
-or clones).
-.Bl -tag -width indent
-.It Fl r
-Recursively destroy all children.
-.It Fl R
-Recursively destroy all dependents, including cloned file systems outside the
-target hierarchy.
-.It Fl f
-Force an unmount of any file systems using the
-.Qq Nm Cm unmount Fl f
-command. This option has no effect on non-file systems or unmounted file
-systems.
-.It Fl n
-Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in
-conjunction with the
-.Fl v
-or
-.Fl p
-flags to determine what data would be deleted.
-.It Fl p
-Print machine-parsable verbose information about the deleted data.
-.It Fl v
-Print verbose information about the deleted data.
-.El
-.Pp
-Extreme care should be taken when applying either the
-.Fl r
-or the
-.Fl R
-options, as they can destroy large portions of a pool and cause unexpected
-behavior for mounted file systems in use.
-.It Xo
-.Nm
-.Cm destroy
-.Op Fl dnpRrv
-.Sm off
-.Ar snapshot
-.Op % Ns Ar snapname
-.Op , Ns ...
-.Sm on
-.Xc
-.Pp
-The given snapshots are destroyed immediately if and only if the
-.Qq Nm Cm destroy
-command without the
-.Fl d
-option would have destroyed it. Such immediate destruction would occur, for
-example, if the snapshot had no clones and the user-initiated reference count
-were zero.
-.Pp
-If a snapshot does not qualify for immediate destruction, it is marked for
-deferred deletion. In this state, it exists as a usable, visible snapshot until
-both of the preconditions listed above are met, at which point it is destroyed.
-.Pp
-An inclusive range of snapshots may be specified by separating the
-first and last snapshots with a percent sign
-.Pq Sy % .
-The first and/or last snapshots may be left blank, in which case the
-filesystem's oldest or newest snapshot will be implied.
-.Pp
-Multiple snapshots
-(or ranges of snapshots) of the same filesystem or volume may be specified
-in a comma-separated list of snapshots.
-Only the snapshot's short name (the
-part after the
-.Sy @ )
-should be specified when using a range or comma-separated list to identify
-multiple snapshots.
-.Bl -tag -width indent
-.It Fl r
-Destroy (or mark for deferred deletion) all snapshots with this name in
-descendent file systems.
-.It Fl R
-Recursively destroy all clones of these snapshots, including the clones,
-snapshots, and children.
-If this flag is specified, the
-.Fl d
-flag will have no effect.
-.It Fl n
-Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in
-conjunction with the
-.Fl v
-or
-.Fl p
-flags to determine what data would be deleted.
-.It Fl p
-Print machine-parsable verbose information about the deleted data.
-.It Fl v
-Print verbose information about the deleted data.
-.It Fl d
-Defer snapshot deletion.
-.El
-.Pp
-Extreme care should be taken when applying either the
-.Fl r
-or the
-.Fl R
-options, as they can destroy large portions of a pool and cause unexpected
-behavior for mounted file systems in use.
-.It Xo
-.Nm
-.Cm destroy
-.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark
-.Xc
-.Pp
-The given bookmark is destroyed.
-.It Xo
-.Nm
-.Cm snapshot Ns | Ns Cm snap
-.Op Fl r
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
-.Ar filesystem@snapname Ns | Ns volume@snapname
-.Ar filesystem@snapname Ns | Ns volume@snapname Ns ...
-.Xc
-.Pp
-Creates snapshots with the given names. All previous modifications by
-successful system calls to the file system are part of the snapshots.
-Snapshots are taken atomically, so that all snapshots correspond to the same
-moment in time. See the
-.Qq Sx Snapshots
-section for details.
-.Bl -tag -width indent
-.It Fl r
-Recursively create snapshots of all descendent datasets
-.It Fl o Ar property Ns = Ns Ar value
-Sets the specified property; see
-.Qq Nm Cm create
-for details.
-.El
-.It Xo
-.Nm
-.Cm rollback
-.Op Fl rRf
-.Ar snapshot
-.Xc
-.Pp
-Roll back the given dataset to a previous snapshot. When a dataset is rolled
-back, all data that has changed since the snapshot is discarded, and the
-dataset reverts to the state at the time of the snapshot. By default, the
-command refuses to roll back to a snapshot other than the most recent one. In
-order to do so, all intermediate snapshots and bookmarks must be destroyed
-by specifying the
-.Fl r
-option.
-.Pp
-The
-.Fl rR
-options do not recursively destroy the child snapshots of a
-recursive snapshot.
-Only direct snapshots of the specified filesystem
-are destroyed by either of these options.
-To completely roll back a
-recursive snapshot, you must rollback the individual child snapshots.
-.Bl -tag -width indent
-.It Fl r
-Destroy any snapshots and bookmarks more recent than the one specified.
-.It Fl R
-Destroy any more recent snapshots and bookmarks, as well as any clones of those
-snapshots.
-.It Fl f
-Used with the
-.Fl R
-option to force an unmount of any clone file systems that are to be destroyed.
-.El
-.It Xo
-.Nm
-.Cm clone
-.Op Fl p
-.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
-.Ar snapshot filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Creates a clone of the given snapshot. See the
-.Qq Sx Clones
-section for details. The target dataset can be located anywhere in the
-.Tn ZFS
-hierarchy, and is created as the same type as the original.
-.Bl -tag -width indent
-.It Fl p
-Creates all the non-existing parent datasets. Datasets created in this manner
-are automatically mounted according to the
-.Sy mountpoint
-property inherited from their parent. If the target filesystem or volume
-already exists, the operation completes successfully.
-.It Fl o Ar property Ns = Ns Ar value
-Sets the specified property; see
-.Qq Nm Cm create
-for details.
-.El
-.It Xo
-.Nm
-.Cm promote
-.Ar clone-filesystem
-.Xc
-.Pp
-Promotes a clone file system to no longer be dependent on its "origin"
-snapshot. This makes it possible to destroy the file system that the clone was
-created from. The clone parent-child dependency relationship is reversed, so
-that the origin file system becomes a clone of the specified file system.
-.Pp
-The snapshot that was cloned, and any snapshots previous to this snapshot, are
-now owned by the promoted clone. The space they use moves from the origin file
-system to the promoted clone, so enough space must be available to accommodate
-these snapshots. No new space is consumed by this operation, but the space
-accounting is adjusted. The promoted clone must not have any conflicting
-snapshot names of its own. The
-.Cm rename
-subcommand can be used to rename any conflicting snapshots.
-.It Xo
-.Nm
-.Cm rename
-.Op Fl f
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Xc
-.It Xo
-.Nm
-.Cm rename
-.Op Fl f
-.Fl p
-.Ar filesystem Ns | Ns Ar volume
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.It Xo
-.Nm
-.Cm rename
-.Fl u
-.Op Fl p
-.Ar filesystem filesystem
-.Xc
-.Pp
-Renames the given dataset. The new target can be located anywhere in the
-.Tn ZFS
-hierarchy, with the exception of snapshots. Snapshots can only be renamed
-within the parent file system or volume. When renaming a snapshot, the parent
-file system of the snapshot does not need to be specified as part of the second
-argument. Renamed file systems can inherit new mount points, in which case they
-are unmounted and remounted at the new mount point.
-.Bl -tag -width indent
-.It Fl p
-Creates all the nonexistent parent datasets. Datasets created in this manner
-are automatically mounted according to the
-.Sy mountpoint
-property inherited from their parent.
-.It Fl u
-Do not remount file systems during rename. If a file system's
-.Sy mountpoint
-property is set to
-.Cm legacy
-or
-.Cm none ,
-file system is not unmounted even if this option is not given.
-.It Fl f
-Force unmount any filesystems that need to be unmounted in the process.
-This flag has no effect if used together with the
-.Fl u
-flag.
-.El
-.It Xo
-.Nm
-.Cm rename
-.Fl r
-.Ar snapshot snapshot
-.Xc
-.Pp
-Recursively rename the snapshots of all descendent datasets. Snapshots are the
-only dataset that can be renamed recursively.
-.It Xo
-.Nm
-.Cm rename
-.Ar bookmark bookmark
-.Xc
-.Pp
-Renames the given bookmark.
-Bookmarks can only be renamed within the parent file system or volume.
-When renaming a bookmark, the parent file system or volume of the bookmark
-does not need to be specified as part of the second argument.
-.It Xo
-.Nm
-.Cm list
-.Op Fl r Ns | Ns Fl d Ar depth
-.Op Fl Hp
-.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ...
-.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
-.Oo Fl s Ar property Oc Ns ...
-.Oo Fl S Ar property Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ...
-.Xc
-.Pp
-Lists the property information for the given datasets in tabular form. If
-specified, you can list property information by the absolute pathname or the
-relative pathname. By default, all file systems and volumes are displayed.
-Snapshots are displayed if the
-.Sy listsnaps
-property is
-.Cm on
-(the default is
-.Cm off ) .
-The following fields are displayed,
-.Sy name , used , available , referenced , mountpoint .
-.Bl -tag -width indent
-.It Fl r
-Recursively display any children of the dataset on the command line.
-.It Fl d Ar depth
-Recursively display any children of the dataset, limiting the recursion to
-.Ar depth .
-A depth of
-.Sy 1
-will display only the dataset and its direct children.
-.It Fl H
-Used for scripting mode. Do not print headers and separate fields by a single
-tab instead of arbitrary white space.
-.It Fl p
-Display numbers in parsable (exact) values.
-.It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ...
-A comma-separated list of properties to display. The property must be:
-.Bl -bullet -offset 2n
-.It
-One of the properties described in the
-.Qq Sx Native Properties
-section
-.It
-A user property
-.It
-The value
-.Cm name
-to display the dataset name
-.It
-The value
-.Cm space
-to display space usage properties on file systems and volumes. This is a
-shortcut for specifying
-.Fl o
-.Sy name,avail,used,usedsnap,usedds,usedrefreserv,usedchild
-.Fl t
-.Sy filesystem,volume
-syntax.
-.El
-.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
-A comma-separated list of types to display, where
-.Ar type
-is one of
-.Sy filesystem , snapshot , snap , volume , bookmark , No or Sy all .
-For example, specifying
-.Fl t Cm snapshot
-displays only snapshots.
-.It Fl s Ar property
-A property for sorting the output by column in ascending order based on the
-value of the property. The property must be one of the properties described in
-the
-.Qq Sx Properties
-section, or the special value
-.Cm name
-to sort by the dataset name. Multiple properties can be specified at one time
-using multiple
-.Fl s
-property options. Multiple
-.Fl s
-options are evaluated from left to right in decreasing order of importance.
-.Pp
-The following is a list of sorting criteria:
-.Bl -bullet -offset 2n
-.It
-Numeric types sort in numeric order.
-.It
-String types sort in alphabetical order.
-.It
-Types inappropriate for a row sort that row to the literal bottom, regardless
-of the specified ordering.
-.It
-If no sorting options are specified the existing behavior of
-.Qq Nm Cm list
-is preserved.
-.El
-.It Fl S Ar property
-Same as the
-.Fl s
-option, but sorts by property in descending order.
-.El
-.It Xo
-.Nm
-.Cm set
-.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Xc
-.Pp
-Sets the property or list of properties to the given value(s) for each dataset.
-Only some properties can be edited. See the "Properties" section for more
-information on what properties can be set and acceptable values. Numeric values
-can be specified as exact values, or in a human-readable form with a suffix of
-.Sy B , K , M , G , T , P , E , Z
-(for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or
-zettabytes, respectively). User properties can be set on snapshots. For more
-information, see the
-.Qq Sx User Properties
-section.
-.It Xo
-.Nm
-.Cm get
-.Op Fl r Ns | Ns Fl d Ar depth
-.Op Fl Hp
-.Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ...
-.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
-.Op Fl s Ar source Ns Oo , Ns Ar source Oc Ns ...
-.Ar all | property Ns Oo , Ns Ar property Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Ns ...
-.Xc
-.Pp
-Displays properties for the given datasets. If no datasets are specified, then
-the command displays properties for all datasets on the system. For each
-property, the following columns are displayed:
-.Pp
-.Bl -hang -width "property" -offset indent -compact
-.It name
-Dataset name
-.It property
-Property name
-.It value
-Property value
-.It source
-Property source. Can either be local, default, temporary, inherited, received,
-or none
-(\&-).
-.El
-.Pp
-All columns except the
-.Sy RECEIVED
-column are displayed by default. The columns to display can be specified
-by using the
-.Fl o
-option. This command takes a comma-separated list of properties as described in
-the
-.Qq Sx Native Properties
-and
-.Qq Sx User Properties
-sections.
-.Pp
-The special value
-.Cm all
-can be used to display all properties that apply to the given dataset's type
-(filesystem, volume, snapshot, or bookmark).
-.Bl -tag -width indent
-.It Fl r
-Recursively display properties for any children.
-.It Fl d Ar depth
-Recursively display any children of the dataset, limiting the recursion to
-.Ar depth .
-A depth of
-.Sy 1
-will display only the dataset and its direct children.
-.It Fl H
-Display output in a form more easily parsed by scripts. Any headers are
-omitted, and fields are explicitly separated by a single tab instead of an
-arbitrary amount of space.
-.It Fl p
-Display numbers in parsable (exact) values.
-.It Fl o Cm all | Ar field Ns Oo , Ns Ar field Oc Ns ...
-A comma-separated list of columns to display. Supported values are
-.Sy name,property,value,received,source .
-Default values are
-.Sy name,property,value,source .
-The keyword
-.Cm all
-specifies all columns.
-.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
-A comma-separated list of types to display, where
-.Ar type
-is one of
-.Sy filesystem , snapshot , volume , No or Sy all .
-For example, specifying
-.Fl t Cm snapshot
-displays only snapshots.
-.It Fl s Ar source Ns Oo , Ns Ar source Oc Ns ...
-A comma-separated list of sources to display. Those properties coming from a
-source other than those in this list are ignored. Each source must be one of
-the following:
-.Sy local,default,inherited,temporary,received,none .
-The default value is all sources.
-.El
-.It Xo
-.Nm
-.Cm inherit
-.Op Fl rS
-.Ar property
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ...
-.Xc
-.Pp
-Clears the specified property, causing it to be inherited from an ancestor,
-restored to default if no ancestor has the property set, or with the
-.Fl S
-option reverted to the received value if one exists.
-See the
-.Qq Sx Properties
-section for a listing of default values, and details on which properties can be
-inherited.
-.Bl -tag -width indent
-.It Fl r
-Recursively inherit the given property for all children.
-.It Fl S
-Revert the property to the received value if one exists; otherwise operate as
-if the
-.Fl S
-option was not specified.
-.El
-.It Xo
-.Nm
-.Cm remap
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Remap the indirect blocks in the given filesystem or volume so that they no
-longer reference blocks on previously removed vdevs and we can eventually
-shrink the size of the indirect mapping objects for the previously removed
-vdevs. Note that remapping all blocks might not be possible and that
-references from snapshots will still exist and cannot be remapped.
-.It Xo
-.Nm
-.Cm upgrade
-.Op Fl v
-.Xc
-.Pp
-Displays a list of file systems that are not the most recent version.
-.Bl -tag -width indent
-.It Fl v
-Displays
-.Tn ZFS
-filesystem versions supported by the current software. The current
-.Tn ZFS
-filesystem version and all previous supported versions are displayed, along
-with an explanation of the features provided with each version.
-.El
-.It Xo
-.Nm
-.Cm upgrade
-.Op Fl r
-.Op Fl V Ar version
-.Fl a | Ar filesystem
-.Xc
-.Pp
-Upgrades file systems to a new on-disk version. Once this is done, the file
-systems will no longer be accessible on systems running older versions of the
-software.
-.Qq Nm Cm send
-streams generated from new snapshots of these file systems cannot be accessed
-on systems running older versions of the software.
-.Pp
-In general, the file system version is independent of the pool version. See
-.Xr zpool 8
-for information on the
-.Nm zpool Cm upgrade
-command.
-.Pp
-In some cases, the file system version and the pool version are interrelated
-and the pool version must be upgraded before the file system version can be
-upgraded.
-.Bl -tag -width indent
-.It Fl r
-Upgrade the specified file system and all descendent file systems.
-.It Fl V Ar version
-Upgrade to the specified
-.Ar version .
-If the
-.Fl V
-flag is not specified, this command upgrades to the most recent version. This
-option can only be used to increase the version number, and only up to the most
-recent version supported by this software.
-.It Fl a
-Upgrade all file systems on all imported pools.
-.It Ar filesystem
-Upgrade the specified file system.
-.El
-.It Xo
-.Nm
-.Cm userspace
-.Op Fl Hinp
-.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
-.Oo Fl s Ar field Oc Ns ...
-.Oo Fl S Ar field Oc Ns ...
-.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
-.Ar filesystem Ns | Ns Ar snapshot
-.Xc
-.Pp
-Displays space consumed by, and quotas on, each user in the specified
-filesystem or snapshot. This corresponds to the
-.Sy userused@ Ns Ar user
-and
-.Sy userquota@ Ns Ar user
-properties.
-.Bl -tag -width indent
-.It Fl n
-Print numeric ID instead of user/group name.
-.It Fl H
-Do not print headers, use tab-delimited output.
-.It Fl p
-Use exact (parsable) numeric output.
-.It Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
-Display only the specified fields from the following set:
-.Sy type,name,used,quota .
-The default is to display all fields.
-.It Fl s Ar field
-Sort output by this field. The
-.Fl s
-and
-.Fl S
-flags may be specified multiple times to sort first by one field, then by
-another. The default is
-.Fl s Cm type Fl s Cm name .
-.It Fl S Ar field
-Sort by this field in reverse order. See
-.Fl s .
-.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
-Print only the specified types from the following set:
-.Sy all,posixuser,smbuser,posixgroup,smbgroup .
-.Pp
-The default is
-.Fl t Cm posixuser,smbuser .
-.Pp
-The default can be changed to include group types.
-.It Fl i
-Translate SID to POSIX ID. This flag currently has no effect on
-.Fx .
-.El
-.It Xo
-.Nm
-.Cm groupspace
-.Op Fl Hinp
-.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
-.Oo Fl s Ar field Oc Ns ...
-.Oo Fl S Ar field Oc Ns ...
-.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
-.Ar filesystem Ns | Ns Ar snapshot
-.Xc
-.Pp
-Displays space consumed by, and quotas on, each group in the specified
-filesystem or snapshot. This subcommand is identical to
-.Qq Nm Cm userspace ,
-except that the default types to display are
-.Fl t Sy posixgroup,smbgroup .
-.It Xo
-.Nm
-.Cm mount
-.Xc
-.Pp
-Displays all
-.Tn ZFS
-file systems currently mounted.
-.Bl -tag -width indent
-.It Fl f
-.El
-.It Xo
-.Nm
-.Cm mount
-.Op Fl vO
-.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ...
-.Fl a | Ar filesystem
-.Xc
-.Pp
-Mounts
-.Tn ZFS
-file systems.
-.Bl -tag -width indent
-.It Fl v
-Report mount progress.
-.It Fl O
-Perform an overlay mount. Overlay mounts are not supported on
-.Fx .
-.It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ...
-An optional, comma-separated list of mount options to use temporarily for the
-duration of the mount. See the
-.Qq Sx Temporary Mount Point Properties
-section for details.
-.It Fl a
-Mount all available
-.Tn ZFS
-file systems.
-This command may be executed on
-.Fx
-system startup by
-.Pa /etc/rc.d/zfs .
-For more information, see variable
-.Va zfs_enable
-in
-.Xr rc.conf 5 .
-.It Ar filesystem
-Mount the specified filesystem.
-.El
-.It Xo
-.Nm
-.Cm unmount Ns | Ns Cm umount
-.Op Fl f
-.Fl a | Ar filesystem Ns | Ns Ar mountpoint
-.Xc
-.Pp
-Unmounts currently mounted
-.Tn ZFS
-file systems.
-.Bl -tag -width indent
-.It Fl f
-Forcefully unmount the file system, even if it is currently in use.
-.It Fl a
-Unmount all available
-.Tn ZFS
-file systems.
-.It Ar filesystem | mountpoint
-Unmount the specified filesystem. The command can also be given a path to a
-.Tn ZFS
-file system mount point on the system.
-.El
-.It Xo
-.Nm
-.Cm share
-.Fl a | Ar filesystem
-.Xc
-.Pp
-Shares
-.Tn ZFS
-file systems that have the
-.Sy sharenfs
-property set.
-.Bl -tag -width indent
-.It Fl a
-Share all
-.Tn ZFS
-file systems that have the
-.Sy sharenfs
-property set.
-This command may be executed on
-.Fx
-system startup by
-.Pa /etc/rc.d/zfs .
-For more information, see variable
-.Va zfs_enable
-in
-.Xr rc.conf 5 .
-.It Ar filesystem
-Share the specified filesystem according to the
-.Tn sharenfs
-property. File systems are shared when the
-.Tn sharenfs
-property is set.
-.El
-.It Xo
-.Nm
-.Cm unshare
-.Fl a | Ar filesystem Ns | Ns Ar mountpoint
-.Xc
-.Pp
-Unshares
-.Tn ZFS
-file systems that have the
-.Tn sharenfs
-property set.
-.Bl -tag -width indent
-.It Fl a
-Unshares
-.Tn ZFS
-file systems that have the
-.Sy sharenfs
-property set.
-This command may be executed on
-.Fx
-system shutdown by
-.Pa /etc/rc.d/zfs .
-For more information, see variable
-.Va zfs_enable
-in
-.Xr rc.conf 5 .
-.It Ar filesystem | mountpoint
-Unshare the specified filesystem. The command can also be given a path to a
-.Tn ZFS
-file system shared on the system.
-.El
-.It Xo
-.Nm
-.Cm bookmark
-.Ar snapshot
-.Ar bookmark
-.Xc
-.Pp
-Creates a bookmark of the given snapshot.
-Bookmarks mark the point in time
-when the snapshot was created, and can be used as the incremental source for
-a
-.Qq Nm Cm send
-command.
-.Pp
-This feature must be enabled to be used.
-See
-.Xr zpool-features 7
-for details on ZFS feature flags and the
-.Sy bookmark
-feature.
-.It Xo
-.Nm
-.Cm send
-.Op Fl DLPRVcenpv
-.Op Fl i Ar snapshot | Fl I Ar snapshot
-.Ar snapshot
-.Xc
-.Pp
-Creates a stream representation of the last
-.Ar snapshot
-argument (not part of
-.Fl i
-or
-.Fl I )
-which is written to standard output. The output can be redirected to
-a file or to a different system (for example, using
-.Xr ssh 1 ) .
-By default, a full stream is generated.
-.Bl -tag -width indent
-.It Fl i Ar snapshot
-Generate an incremental stream from the first
-.Ar snapshot Pq the incremental source
-to the second
-.Ar snapshot Pq the incremental target .
-The incremental source can be specified as the last component of the
-snapshot name
-.Pq the Em @ No character and following
-and
-it is assumed to be from the same file system as the incremental target.
-.Pp
-If the destination is a clone, the source may be the origin snapshot, which
-must be fully specified (for example,
-.Cm pool/fs@origin ,
-not just
-.Cm @origin ) .
-.It Fl I Ar snapshot
-Generate a stream package that sends all intermediary snapshots from the first
-.Ar snapshot
-to the second
-.Ar snapshot .
-For example,
-.Ic -I @a fs@d
-is similar to
-.Ic -i @a fs@b; -i @b fs@c; -i @c fs@d .
-The incremental
-source may be specified as with the
-.Fl i
-option.
-.It Fl R, -replicate
-Generate a replication stream package, which will replicate the specified
-filesystem, and all descendent file systems, up to the named snapshot. When
-received, all properties, snapshots, descendent file systems, and clones are
-preserved.
-.Pp
-If the
-.Fl i
-or
-.Fl I
-flags are used in conjunction with the
-.Fl R
-flag, an incremental replication stream is generated. The current values of
-properties, and current snapshot and file system names are set when the stream
-is received. If the
-.Fl F
-flag is specified when this stream is received, snapshots and file systems that
-do not exist on the sending side are destroyed.
-.It Fl D, -dedup
-Generate a deduplicated stream. Blocks which would have been sent multiple
-times in the send stream will only be sent once. The receiving system must
-also support this feature to receive a deduplicated stream. This flag can
-be used regardless of the dataset's
-.Sy dedup
-property, but performance will be much better if the filesystem uses a
-dedup-capable checksum (eg.
-.Sy sha256 ) .
-.It Fl L, -large-block
-Generate a stream which may contain blocks larger than 128KB.
-This flag
-has no effect if the
-.Sy large_blocks
-pool feature is disabled, or if the
-.Sy recordsize
-property of this filesystem has never been set above 128KB.
-The receiving system must have the
-.Sy large_blocks
-pool feature enabled as well.
-See
-.Xr zpool-features 7
-for details on ZFS feature flags and the
-.Sy large_blocks
-feature.
-.It Fl e, -embed
-Generate a more compact stream by using WRITE_EMBEDDED records for blocks
-which are stored more compactly on disk by the
-.Sy embedded_data
-pool
-feature.
-This flag has no effect if the
-.Sy embedded_data
-feature is
-disabled.
-The receiving system must have the
-.Sy embedded_data
-feature
-enabled.
-If the
-.Sy lz4_compress
-feature is active on the sending system,
-then the receiving system must have that feature enabled as well.
-See
-.Xr zpool-features 7
-for details on ZFS feature flags and the
-.Sy embedded_data
-feature.
-.It Fl c, -compressed
-Generate a more compact stream by using compressed WRITE records for blocks
-which are compressed on disk and in memory (see the
-.Sy compression
-property for details).
-If the
-.Sy lz4_compress
-feature is active on the sending system, then the receiving system must have that
-feature enabled as well. If the
-.Sy large_blocks
-feature is enabled on the sending system but the
-.Fl L
-option is not supplied in conjunction with
-.Fl c
-then the data will be decompressed before sending so it can be split
-into smaller block sizes.
-.It Fl p, -props
-Include the dataset's properties in the stream. This flag is implicit when
-.Fl R
-is specified. The receiving system must also support this feature.
-.It Fl n, -dryrun
-Do a dry-run ("No-op") send. Do not generate any actual send data. This is
-useful in conjunction with the
-.Fl v
-or
-.Fl P
-flags to determine what data will be sent.
-In this case, the verbose output will be written to
-standard output (contrast with a non-dry-run, where the stream is written
-to standard output and the verbose output goes to standard error).
-.It Fl P, -parsable
-Print machine-parsable verbose information about the stream package generated.
-.It Fl v, -verbose
-Print verbose information about the stream package generated.
-This information includes a per-second report of how much data has been sent.
-.It Fl V
-Set the process title to a per-second report of how much data has been sent.
-.El
-.Pp
-The format of the stream is committed. You will be able to receive your streams
-on future versions of
-.Tn ZFS .
-.It Xo
-.Nm
-.Cm send
-.Op Fl LPcenv
-.Op Fl i Ar snapshot Ns | Ns Ar bookmark
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Xc
-.Pp
-Generate a send stream, which may be of a filesystem, and may be
-incremental from a bookmark.
-If the destination is a filesystem or volume,
-the pool must be read-only, or the filesystem must not be mounted.
-When the
-stream generated from a filesystem or volume is received, the default snapshot
-name will be
-.Pq --head-- .
-.Bl -tag -width indent
-.It Fl i Ar snapshot Ns | Ns Ar bookmark
-Generate an incremental send stream.
-The incremental source must be an earlier
-snapshot in the destination's history.
-It will commonly be an earlier
-snapshot in the destination's filesystem, in which case it can be
-specified as the last component of the name
-.Pq the Em # No or Em @ No character and following .
-.Pp
-If the incremental target is a clone, the incremental source can
-be the origin snapshot, or an earlier snapshot in the origin's filesystem,
-or the origin's origin, etc.
-.It Fl n, -dryrun
-Do a dry-run
-.Pq Qq No-op
-send.
-Do not generate any actual send data.
-This is useful in conjunction with the
-.Fl v
-or
-.Fl P
-flags to determine what data will be sent.
-In this case, the verbose output will be written to standard output
-.Po contrast with a non-dry-run, where the stream is written to standard output
-and the verbose output goes to standard error
-.Pc .
-.It Fl v, -verbose
-Print verbose information about the stream package generated.
-This information includes a per-second report of how much data has been sent.
-.It Fl L, -large-block
-Generate a stream which may contain blocks larger than 128KB.
-This flag
-has no effect if the
-.Sy large_blocks
-pool feature is disabled, or if the
-.Sy recordsize
-property of this filesystem has never been set above 128KB.
-The receiving system must have the
-.Sy large_blocks
-pool feature enabled as well.
-See
-.Xr zpool-features 7
-for details on ZFS feature flags and the
-.Sy large_blocks
-feature.
-.It Fl P, -parsable
-Print machine-parsable verbose information about the stream package generated.
-.It Fl c, -compressed
-Generate a more compact stream by using compressed WRITE records for blocks
-which are compressed on disk and in memory (see the
-.Sy compression
-property for details). If the
-.Sy lz4_compress
-feature is active on the sending system, then the receiving system must have
-that feature enabled as well. If the
-.Sy large_blocks
-feature is enabled on the sending system but the
-.Fl L
-option is not supplied in conjunction with
-.Fl c
-then the data will be decompressed before sending so it can be split
-into smaller block sizes.
-.It Fl e, -embed
-Generate a more compact stream by using WRITE_EMBEDDED records for blocks
-which are stored more compactly on disk by the
-.Sy embedded_data
-pool
-feature.
-This flag has no effect if the
-.Sy embedded_data
-feature is
-disabled.
-The receiving system must have the
-.Sy embedded_data
-feature
-enabled.
-If the
-.Sy lz4_compress
-feature is active on the sending system,
-then the receiving system must have that feature enabled as well.
-See
-.Xr zpool-features 7
-for details on ZFS feature flags and the
-.Sy embedded_data
-feature.
-.El
-.It Xo
-.Nm
-.Cm send
-.Op Fl Penv
-.Fl t
-.Ar receive_resume_token
-.Xc
-Creates a send stream which resumes an interrupted receive. The
-.Ar receive_resume_token
-is the value of this property on the filesystem
-or volume that was being received into. See the documentation for
-.Sy zfs receive -s
-for more details.
-.It Xo
-.Nm
-.Cm receive Ns | Ns Cm recv
-.Op Fl vnsFMu
-.Op Fl o Sy origin Ns = Ns Ar snapshot
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
-.Xc
-.It Xo
-.Nm
-.Cm receive Ns | Ns Cm recv
-.Op Fl vnsFMu
-.Op Fl d | e
-.Op Fl o Sy origin Ns = Ns Ar snapshot
-.Ar filesystem
-.Xc
-.Pp
-Creates a snapshot whose contents are as specified in the stream provided on
-standard input. If a full stream is received, then a new file system is created
-as well. Streams are created using the
-.Qq Nm Cm send
-subcommand, which by default creates a full stream.
-.Qq Nm Cm recv
-can be used as an alias for
-.Qq Nm Cm receive .
-.Pp
-If an incremental stream is received, then the destination file system must
-already exist, and its most recent snapshot must match the incremental stream's
-source. For
-.Sy zvol Ns s,
-the destination device link is destroyed and recreated, which means the
-.Sy zvol
-cannot be accessed during the
-.Sy receive
-operation.
-.Pp
-When a snapshot replication package stream that is generated by using the
-.Qq Nm Cm send Fl R
-command is received, any snapshots that do not exist on the sending location
-are destroyed by using the
-.Qq Nm Cm destroy Fl d
-command.
-.Pp
-The name of the snapshot (and file system, if a full stream is received) that
-this subcommand creates depends on the argument type and the
-.Fl d
-or
-.Fl e
-option.
-.Pp
-If the argument is a snapshot name, the specified
-.Ar snapshot
-is created. If the argument is a file system or volume name, a snapshot with
-the same name as the sent snapshot is created within the specified
-.Ar filesystem
-or
-.Ar volume .
-If the
-.Fl d
-or
-.Fl e
-option is specified, the snapshot name is determined by appending the sent
-snapshot's name to the specified
-.Ar filesystem .
-If the
-.Fl d
-option is specified, all but the pool name of the sent snapshot path is
-appended (for example,
-.Sy b/c@1
-appended from sent snapshot
-.Sy a/b/c@1 ) ,
-and if the
-.Fl e
-option is specified, only the tail of the sent snapshot path is appended (for
-example,
-.Sy c@1
-appended from sent snapshot
-.Sy a/b/c@1 ) .
-In the case of
-.Fl d ,
-any file systems needed to replicate the path of the sent snapshot are created
-within the specified file system.
-.Bl -tag -width indent
-.It Fl d
-Use the full sent snapshot path without the first element (without pool name)
-to determine the name of the new snapshot as described in the paragraph above.
-.It Fl e
-Use only the last element of the sent snapshot path to determine the name of
-the new snapshot as described in the paragraph above.
-.It Fl u
-File system that is associated with the received stream is not mounted.
-.It Fl v
-Print verbose information about the stream and the time required to perform the
-receive operation.
-.It Fl n
-Do not actually receive the stream. This can be useful in conjunction with the
-.Fl v
-option to verify the name the receive operation would use.
-.It Fl o Sy origin Ns = Ns Ar snapshot
-Forces the stream to be received as a clone of the given snapshot.
-If the stream is a full send stream, this will create the filesystem
-described by the stream as a clone of the specified snapshot. Which
-snapshot was specified will not affect the success or failure of the
-receive, as long as the snapshot does exist. If the stream is an
-incremental send stream, all the normal verification will be performed.
-.It Fl F
-Force a rollback of the file system to the most recent snapshot before
-performing the receive operation. If receiving an incremental replication
-stream (for example, one generated by
-.Qq Nm Cm send Fl R Bro Fl i | Fl I Brc ) ,
-destroy snapshots and file systems that do not exist on the sending side.
-.It Fl M
-Force an unmount of the file system while receiving a snapshot.
-This option is not supported on Linux.
-.It Fl s
-If the receive is interrupted, save the partially received state, rather
-than deleting it. Interruption may be due to premature termination of
-the stream
-.Po e.g. due to network failure or failure of the remote system
-if the stream is being read over a network connection
-.Pc ,
-a checksum error in the stream, termination of the
-.Nm zfs Cm receive
-process, or unclean shutdown of the system.
-.Pp
-The receive can be resumed with a stream generated by
-.Nm zfs Cm send Fl t Ar token ,
-where the
-.Ar token
-is the value of the
-.Sy receive_resume_token
-property of the filesystem or volume which is received into.
-.Pp
-To use this flag, the storage pool must have the
-.Sy extensible_dataset
-feature enabled. See
-.Xr zpool-features 7
-for details on ZFS feature flags.
-.El
-.It Xo
-.Nm
-.Cm receive Ns | Ns Cm recv
-.Fl A
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-Abort an interrupted
-.Nm zfs Cm receive Fl s ,
-deleting its saved partially received state.
-.It Xo
-.Nm
-.Cm allow
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Displays permissions that have been delegated on the specified filesystem or
-volume. See the other forms of
-.Qq Nm Cm allow
-for more information.
-.It Xo
-.Nm
-.Cm allow
-.Op Fl ldug
-.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ...
-.Ar perm Ns | Ns Ar @setname Ns
-.Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.It Xo
-.Nm
-.Cm allow
-.Op Fl ld
-.Fl e Ns | Ns Cm everyone
-.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Delegates
-.Tn ZFS
-administration permission for the file systems to non-privileged users.
-.Bl -tag -width indent
-.It Xo
-.Op Fl ug
-.Ar user Ns | Ns Ar group Ns Oo , Ar user Ns | Ns Ar group Oc Ns ...
-.Xc
-Specifies to whom the permissions are delegated. Multiple entities can be
-specified as a comma-separated list. If neither of the
-.Fl ug
-options are specified, then the argument is interpreted preferentially as the
-keyword
-.Cm everyone ,
-then as a user name, and lastly as a group name. To specify
-a user or group named
-.Qq everyone ,
-use the
-.Fl u
-or
-.Fl g
-options. To specify a group with the same name as a user, use the
-.Fl g
-option.
-.It Op Fl e Ns | Ns Cm everyone
-Specifies that the permissions be delegated to
-.Qq everyone .
-.It Xo
-.Ar perm Ns | Ns Ar @setname Ns Oo , Ns Ar perm Ns | Ns Ar @setname Oc Ns ...
-.Xc
-The permissions to delegate. Multiple permissions
-may be specified as a comma-separated list. Permission names are the same as
-.Tn ZFS
-subcommand and property names. See the property list below. Property set names,
-which begin with an at sign
-.Pq Sy @ ,
-may be specified. See the
-.Fl s
-form below for details.
-.It Xo
-.Op Fl ld
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-Specifies where the permissions are delegated. If neither of the
-.Fl ld
-options are specified, or both are, then the permissions are allowed for the
-file system or volume, and all of its descendents. If only the
-.Fl l
-option is used, then is allowed "locally" only for the specified file system.
-If only the
-.Fl d
-option is used, then is allowed only for the descendent file systems.
-.El
-.Pp
-Permissions are generally the ability to use a
-.Tn ZFS
-subcommand or change a
-.Tn ZFS
-property. The following permissions are available:
-.Bl -column -offset 4n "secondarycache" "subcommand"
-.It NAME Ta TYPE Ta NOTES
-.It allow Ta subcommand Ta Must Xo
-also have the permission that is being allowed
-.Xc
-.It clone Ta subcommand Ta Must Xo
-also have the 'create' ability and 'mount' ability in the origin file system
-.Xc
-.It create Ta subcommand Ta Must also have the 'mount' ability
-.It destroy Ta subcommand Ta Must also have the 'mount' ability
-.It diff Ta subcommand Ta Allows lookup of paths within a dataset given an
-object number, and the ability to create snapshots necessary to 'zfs diff'
-.It hold Ta subcommand Ta Allows adding a user hold to a snapshot
-.It mount Ta subcommand Ta Allows mount/umount of Tn ZFS No datasets
-.It promote Ta subcommand Ta Must Xo
-also have the 'mount' and 'promote' ability in the origin file system
-.Xc
-.It receive Ta subcommand Ta Must also have the 'mount' and 'create' ability
-.It release Ta subcommand Ta Allows Xo
-releasing a user hold which might destroy the snapshot
-.Xc
-.It rename Ta subcommand Ta Must Xo
-also have the 'mount' and 'create' ability in the new parent
-.Xc
-.It rollback Ta subcommand Ta Must also have the 'mount' ability
-.It send Ta subcommand
-.It share Ta subcommand Ta Allows Xo
-sharing file systems over the
-.Tn NFS
-protocol
-.Xc
-.It snapshot Ta subcommand Ta Must also have the 'mount' ability
-.It groupquota Ta other Ta Allows accessing any groupquota@... property
-.It groupused Ta other Ta Allows reading any groupused@... property
-.It userprop Ta other Ta Allows changing any user property
-.It userquota Ta other Ta Allows accessing any userquota@... property
-.It userused Ta other Ta Allows reading any userused@... property
-.It aclinherit Ta property
-.It aclmode Ta property
-.It atime Ta property
-.It canmount Ta property
-.It casesensitivity Ta property
-.It checksum Ta property
-.It compression Ta property
-.It copies Ta property
-.It dedup Ta property
-.It devices Ta property
-.It exec Ta property
-.It filesystem_limit Ta property
-.It logbias Ta property
-.It jailed Ta property
-.It mlslabel Ta property
-.It mountpoint Ta property
-.It nbmand Ta property
-.It normalization Ta property
-.It primarycache Ta property
-.It quota Ta property
-.It readonly Ta property
-.It recordsize Ta property
-.It refquota Ta property
-.It refreservation Ta property
-.It reservation Ta property
-.It secondarycache Ta property
-.It setuid Ta property
-.It sharenfs Ta property
-.It sharesmb Ta property
-.It snapdir Ta property
-.It snapshot_limit Ta property
-.It sync Ta property
-.It utf8only Ta property
-.It version Ta property
-.It volblocksize Ta property
-.It volsize Ta property
-.It vscan Ta property
-.It xattr Ta property
-.El
-.It Xo
-.Nm
-.Cm allow
-.Fl c
-.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Sets "create time" permissions. These permissions are granted (locally) to the
-creator of any newly-created descendent file system.
-.It Xo
-.Nm
-.Cm allow
-.Fl s
-.Ar @setname
-.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ...
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Defines or adds permissions to a permission set. The set can be used by other
-.Qq Nm Cm allow
-commands for the specified file system and its descendents. Sets are evaluated
-dynamically, so changes to a set are immediately reflected. Permission sets
-follow the same naming restrictions as ZFS file systems, but the name must
-begin with an "at sign"
-.Pq Sy @ ,
-and can be no more than 64 characters long.
-.It Xo
-.Nm
-.Cm unallow
-.Op Fl rldug
-.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ...
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.It Xo
-.Nm
-.Cm unallow
-.Op Fl rld
-.Fl e Ns | Ns Cm everyone
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.It Xo
-.Nm
-.Cm unallow
-.Op Fl r
-.Fl c
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Removes permissions that were granted with the
-.Qq Nm Cm allow
-command. No permissions are explicitly denied, so other permissions granted are
-still in effect. For example, if the permission is granted by an ancestor. If
-no permissions are specified, then all permissions for the specified
-.Ar user , group , No or everyone
-are removed. Specifying
-.Cm everyone
-.Po or using the Fl e
-option
-.Pc only removes the permissions that were granted to everyone ,
-not all permissions for every user and group. See the
-.Qq Nm Cm allow
-command for a description of the
-.Fl ldugec
-options.
-.Bl -tag -width indent
-.It Fl r
-Recursively remove the permissions from this file system and all descendents.
-.El
-.It Xo
-.Nm
-.Cm unallow
-.Op Fl r
-.Fl s
-.Ar @setname
-.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns
-.Ns ... Oc
-.Ar filesystem Ns | Ns Ar volume
-.Xc
-.Pp
-Removes permissions from a permission set. If no permissions are specified,
-then all permissions are removed, thus removing the set entirely.
-.It Xo
-.Nm
-.Cm hold
-.Op Fl r
-.Ar tag snapshot Ns ...
-.Xc
-.Pp
-Adds a single reference, named with the
-.Ar tag
-argument, to the specified snapshot or snapshots. Each snapshot has its own tag
-namespace, and tags must be unique within that space.
-.Pp
-If a hold exists on a snapshot, attempts to destroy that snapshot by using the
-.Qq Nm Cm destroy
-command returns
-.Em EBUSY .
-.Bl -tag -width indent
-.It Fl r
-Specifies that a hold with the given tag is applied recursively to the
-snapshots of all descendent file systems.
-.El
-.It Xo
-.Nm
-.Cm holds
-.Op Fl Hp
-.Op Fl r Ns | Ns Fl d Ar depth
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns
-.Ns ...
-.Xc
-.Pp
-Lists all existing user references for the given dataset or datasets.
-.Bl -tag -width indent
-.It Fl H
-Used for scripting mode. Do not print headers and separate fields by a single
-tab instead of arbitrary white space.
-.It Fl p
-Display numbers in parsable (exact) values.
-.It Fl r
-Lists the holds that are set on the descendent snapshots of the named datasets
-or snapshots, in addition to listing the holds on the named snapshots, if any.
-.It Fl d Ar depth
-Recursively display any holds on the named snapshots, or descendent snapshots of
-the named datasets or snapshots, limiting the recursion to
-.Ar depth .
-.El
-.It Xo
-.Nm
-.Cm release
-.Op Fl r
-.Ar tag snapshot Ns ...
-.Xc
-.Pp
-Removes a single reference, named with the
-.Ar tag
-argument, from the specified snapshot or snapshots. The tag must already exist
-for each snapshot.
-.Bl -tag -width indent
-.It Fl r
-Recursively releases a hold with the given tag on the snapshots of all
-descendent file systems.
-.El
-.It Xo
-.Nm
-.Cm diff
-.Op Fl FHt
-.Ar snapshot
-.Op Ar snapshot Ns | Ns Ar filesystem
-.Xc
-.Pp
-Display the difference between a snapshot of a given filesystem and another
-snapshot of that filesystem from a later time or the current contents of the
-filesystem. The first column is a character indicating the type of change,
-the other columns indicate pathname, new pathname
-.Pq in case of rename ,
-change in link count, and optionally file type and/or change time.
-.Pp
-The types of change are:
-.Bl -column -offset 2n indent
-.It \&- Ta path was removed
-.It \&+ Ta path was added
-.It \&M Ta path was modified
-.It \&R Ta path was renamed
-.El
-.Bl -tag -width indent
-.It Fl F
-Display an indication of the type of file, in a manner similar to the
-.Fl F
-option of
-.Xr ls 1 .
-.Bl -column -offset 2n indent
-.It \&B Ta block device
-.It \&C Ta character device
-.It \&F Ta regular file
-.It \&/ Ta directory
-.It \&@ Ta symbolic link
-.It \&= Ta socket
-.It \&> Ta door (not supported on Fx )
-.It \&| Ta named pipe (not supported on Fx )
-.It \&P Ta event port (not supported on Fx )
-.El
-.It Fl H
-Give more parsable tab-separated output, without header lines and without
-arrows.
-.It Fl t
-Display the path's inode change time as the first column of output.
-.El
-.It Xo
-.Nm
-.Cm program
-.Op Fl jn
-.Op Fl t Ar timeout
-.Op Fl m Ar memory_limit
-.Ar pool script
-.Op Ar arg1 No ...
-.Xc
-.Pp
-Executes
-.Ar script
-as a ZFS channel program on
-.Ar pool .
-The ZFS channel
-program interface allows ZFS administrative operations to be run
-programmatically via a Lua script.
-The entire script is executed atomically, with no other administrative
-operations taking effect concurrently.
-A library of ZFS calls is made available to channel program scripts.
-Channel programs may only be run with root privileges.
-.Pp
-For full documentation of the ZFS channel program interface, see the manual
-page for
-.Xr zfs-program 8 .
-.Bl -tag -width indent
-.It Fl j
-Display channel program output in JSON format.
-When this flag is specified and standard output is empty -
-channel program encountered an error.
-The details of such an error will be printed to standard error in plain text.
-.It Fl n
-Executes a read-only channel program, which runs faster.
-The program cannot change on-disk state by calling functions from
-the zfs.sync submodule.
-The program can be used to gather information such as properties and
-determining if changes would succeed (zfs.check.*).
-Without this flag, all pending changes must be synced to disk before
-a channel program can complete.
-.It Fl t Ar timeout
-Execution time limit, in milliseconds.
-If a channel program executes for longer than the provided timeout, it will
-be stopped and an error will be returned.
-The default timeout is 1000 ms, and can be set to a maximum of 10000 ms.
-.It Fl m Ar memory-limit
-Memory limit, in bytes.
-If a channel program attempts to allocate more memory than the given limit,
-it will be stopped and an error returned.
-The default memory limit is 10 MB, and can be set to a maximum of 100 MB.
-.Pp
-All remaining argument strings are passed directly to the channel program as
-arguments.
-See
-.Xr zfs-program 8
-for more information.
-.El
-.It Xo
-.Nm
-.Cm jail
-.Ar jailid filesystem
-.Xc
-.Pp
-Attaches the specified
-.Ar filesystem
-to the jail identified by JID
-.Ar jailid .
-From now on this file system tree can be managed from within a jail if the
-.Sy jailed
-property has been set. To use this functionality, the jail needs the
-.Va allow.mount
-and
-.Va allow.mount.zfs
-parameters set to 1 and the
-.Va enforce_statfs
-parameter set to a value lower than 2.
-.Pp
-See
-.Xr jail 8
-for more information on managing jails and configuring the parameters above.
-.It Xo
-.Nm
-.Cm unjail
-.Ar jailid filesystem
-.Xc
-.Pp
-Detaches the specified
-.Ar filesystem
-from the jail identified by JID
-.Ar jailid .
-.El
-.Sh EXIT STATUS
-The following exit values are returned:
-.Bl -tag -offset 2n -width 2n
-.It 0
-Successful completion.
-.It 1
-An error occurred.
-.It 2
-Invalid command line options were specified.
-.El
-.Sh EXAMPLES
-.Bl -tag -width 0n
-.It Sy Example 1 No Creating a Tn ZFS No File System Hierarchy
-.Pp
-The following commands create a file system named
-.Em pool/home
-and a file system named
-.Em pool/home/bob .
-The mount point
-.Pa /home
-is set for the parent file system, and is automatically inherited by the child
-file system.
-.Bd -literal -offset 2n
-.Li # Ic zfs create pool/home
-.Li # Ic zfs set mountpoint=/home pool/home
-.Li # Ic zfs create pool/home/bob
-.Ed
-.It Sy Example 2 No Creating a Tn ZFS No Snapshot
-.Pp
-The following command creates a snapshot named
-.Sy yesterday .
-This snapshot is mounted on demand in the
-.Pa \&.zfs/snapshot
-directory at the root of the
-.Em pool/home/bob
-file system.
-.Bd -literal -offset 2n
-.Li # Ic zfs snapshot pool/home/bob@yesterday
-.Ed
-.It Sy Example 3 No Creating and Destroying Multiple Snapshots
-.Pp
-The following command creates snapshots named
-.Em yesterday
-of
-.Em pool/home
-and all of its descendent file systems. Each snapshot is mounted on demand in
-the
-.Pa \&.zfs/snapshot
-directory at the root of its file system. The second command destroys the newly
-created snapshots.
-.Bd -literal -offset 2n
-.Li # Ic zfs snapshot -r pool/home@yesterday
-.Li # Ic zfs destroy -r pool/home@yesterday
-.Ed
-.It Sy Example 4 No Disabling and Enabling File System Compression
-.Pp
-The following command disables the
-.Sy compression
-property for all file systems under
-.Em pool/home .
-The next command explicitly enables
-.Sy compression
-for
-.Em pool/home/anne .
-.Bd -literal -offset 2n
-.Li # Ic zfs set compression=off pool/home
-.Li # Ic zfs set compression=on pool/home/anne
-.Ed
-.It Sy Example 5 No Listing Tn ZFS No Datasets
-.Pp
-The following command lists all active file systems and volumes in the system.
-Snapshots are displayed if the
-.Sy listsnaps
-property is
-.Cm on .
-The default is
-.Cm off .
-See
-.Xr zpool 8
-for more information on pool properties.
-.Bd -literal -offset 2n
-.Li # Ic zfs list
- NAME USED AVAIL REFER MOUNTPOINT
- pool 450K 457G 18K /pool
- pool/home 315K 457G 21K /home
- pool/home/anne 18K 457G 18K /home/anne
- pool/home/bob 276K 457G 276K /home/bob
-.Ed
-.It Sy Example 6 No Setting a Quota on a Tn ZFS No File System
-.Pp
-The following command sets a quota of 50 Gbytes for
-.Em pool/home/bob .
-.Bd -literal -offset 2n
-.Li # Ic zfs set quota=50G pool/home/bob
-.Ed
-.It Sy Example 7 No Listing Tn ZFS No Properties
-.Pp
-The following command lists all properties for
-.Em pool/home/bob .
-.Bd -literal -offset 2n
-.Li # Ic zfs get all pool/home/bob
-NAME PROPERTY VALUE SOURCE
-pool/home/bob type filesystem -
-pool/home/bob creation Tue Jul 21 15:53 2009 -
-pool/home/bob used 21K -
-pool/home/bob available 20.0G -
-pool/home/bob referenced 21K -
-pool/home/bob compressratio 1.00x -
-pool/home/bob mounted yes -
-pool/home/bob quota 20G local
-pool/home/bob reservation none default
-pool/home/bob recordsize 128K default
-pool/home/bob mountpoint /home/bob default
-pool/home/bob sharenfs off default
-pool/home/bob checksum on default
-pool/home/bob compression on local
-pool/home/bob atime on default
-pool/home/bob devices on default
-pool/home/bob exec on default
-pool/home/bob filesystem_limit none default
-pool/home/bob setuid on default
-pool/home/bob readonly off default
-pool/home/bob jailed off default
-pool/home/bob snapdir hidden default
-pool/home/bob snapshot_limit none default
-pool/home/bob aclmode discard default
-pool/home/bob aclinherit restricted default
-pool/home/bob canmount on default
-pool/home/bob xattr on default
-pool/home/bob copies 1 default
-pool/home/bob version 5 -
-pool/home/bob utf8only off -
-pool/home/bob normalization none -
-pool/home/bob casesensitivity sensitive -
-pool/home/bob vscan off default
-pool/home/bob nbmand off default
-pool/home/bob sharesmb off default
-pool/home/bob refquota none default
-pool/home/bob refreservation none default
-pool/home/bob primarycache all default
-pool/home/bob secondarycache all default
-pool/home/bob usedbysnapshots 0 -
-pool/home/bob usedbydataset 21K -
-pool/home/bob usedbychildren 0 -
-pool/home/bob usedbyrefreservation 0 -
-pool/home/bob logbias latency default
-pool/home/bob dedup off default
-pool/home/bob mlslabel -
-pool/home/bob sync standard default
-pool/home/bob refcompressratio 1.00x -
-.Ed
-.Pp
-The following command gets a single property value.
-.Bd -literal -offset 2n
-.Li # Ic zfs get -H -o value compression pool/home/bob
-on
-.Ed
-.Pp
-The following command lists all properties with local settings for
-.Em pool/home/bob .
-.Bd -literal -offset 2n
-.Li # Ic zfs get -s local -o name,property,value all pool/home/bob
-NAME PROPERTY VALUE
-pool/home/bob quota 20G
-pool/home/bob compression on
-.Ed
-.It Sy Example 8 No Rolling Back a Tn ZFS No File System
-.Pp
-The following command reverts the contents of
-.Em pool/home/anne
-to the snapshot named
-.Em yesterday ,
-deleting all intermediate snapshots.
-.Bd -literal -offset 2n
-.Li # Ic zfs rollback -r pool/home/anne@yesterday
-.Ed
-.It Sy Example 9 No Creating a Tn ZFS No Clone
-.Pp
-The following command creates a writable file system whose initial contents are
-the same as
-.Em pool/home/bob@yesterday .
-.Bd -literal -offset 2n
-.Li # Ic zfs clone pool/home/bob@yesterday pool/clone
-.Ed
-.It Sy Example 10 No Promoting a Tn ZFS No Clone
-.Pp
-The following commands illustrate how to test out changes to a file system, and
-then replace the original file system with the changed one, using clones, clone
-promotion, and renaming:
-.Bd -literal -offset 2n
-.Li # Ic zfs create pool/project/production
-.Ed
-.Pp
-Populate
-.Pa /pool/project/production
-with data and continue with the following commands:
-.Bd -literal -offset 2n
-.Li # Ic zfs snapshot pool/project/production@today
-.Li # Ic zfs clone pool/project/production@today pool/project/beta
-.Ed
-.Pp
-Now make changes to
-.Pa /pool/project/beta
-and continue with the following commands:
-.Bd -literal -offset 2n
-.Li # Ic zfs promote pool/project/beta
-.Li # Ic zfs rename pool/project/production pool/project/legacy
-.Li # Ic zfs rename pool/project/beta pool/project/production
-.Ed
-.Pp
-Once the legacy version is no longer needed, it can be destroyed.
-.Bd -literal -offset 2n
-.Li # Ic zfs destroy pool/project/legacy
-.Ed
-.It Sy Example 11 No Inheriting Tn ZFS No Properties
-.Pp
-The following command causes
-.Em pool/home/bob
-and
-.Em pool/home/anne
-to inherit the
-.Sy checksum
-property from their parent.
-.Bd -literal -offset 2n
-.Li # Ic zfs inherit checksum pool/home/bob pool/home/anne
-.Ed
-.It Sy Example 12 No Remotely Replicating Tn ZFS No Data
-.Pp
-The following commands send a full stream and then an incremental stream to a
-remote machine, restoring them into
-.Sy poolB/received/fs@a
-and
-.Sy poolB/received/fs@b ,
-respectively.
-.Sy poolB
-must contain the file system
-.Sy poolB/received ,
-and must not initially contain
-.Sy poolB/received/fs .
-.Bd -literal -offset 2n
-.Li # Ic zfs send pool/fs@a | ssh host zfs receive poolB/received/fs@a
-.Li # Ic zfs send -i a pool/fs@b | ssh host zfs receive poolB/received/fs
-.Ed
-.It Xo
-.Sy Example 13
-Using the
-.Qq zfs receive -d
-Option
-.Xc
-.Pp
-The following command sends a full stream of
-.Sy poolA/fsA/fsB@snap
-to a remote machine, receiving it into
-.Sy poolB/received/fsA/fsB@snap .
-The
-.Sy fsA/fsB@snap
-portion of the received snapshot's name is determined from the name of the sent
-snapshot.
-.Sy poolB
-must contain the file system
-.Sy poolB/received .
-If
-.Sy poolB/received/fsA
-does not exist, it is created as an empty file system.
-.Bd -literal -offset 2n
-.Li # Ic zfs send poolA/fsA/fsB@snap | ssh host zfs receive -d poolB/received
-.Ed
-.It Sy Example 14 No Setting User Properties
-.Pp
-The following example sets the user-defined
-.Sy com.example:department
-property for a dataset.
-.Bd -literal -offset 2n
-.Li # Ic zfs set com.example:department=12345 tank/accounting
-.Ed
-.It Sy Example 15 No Performing a Rolling Snapshot
-.Pp
-The following example shows how to maintain a history of snapshots with a
-consistent naming scheme. To keep a week's worth of snapshots, the user
-destroys the oldest snapshot, renames the remaining snapshots, and then creates
-a new snapshot, as follows:
-.Bd -literal -offset 2n
-.Li # Ic zfs destroy -r pool/users@7daysago
-.Li # Ic zfs rename -r pool/users@6daysago @7daysago
-.Li # Ic zfs rename -r pool/users@5daysago @6daysago
-.Li # Ic zfs rename -r pool/users@4daysago @5daysago
-.Li # Ic zfs rename -r pool/users@3daysago @4daysago
-.Li # Ic zfs rename -r pool/users@2daysago @3daysago
-.Li # Ic zfs rename -r pool/users@yesterday @2daysago
-.Li # Ic zfs rename -r pool/users@today @yesterday
-.Li # Ic zfs snapshot -r pool/users@today
-.Ed
-.It Xo
-.Sy Example 16
-Setting
-.Qq sharenfs
-Property Options on a ZFS File System
-.Xc
-.Pp
-The following command shows how to set
-.Sy sharenfs
-property options to enable root access for a specific network on the
-.Em tank/home
-file system. The contents of the
-.Sy sharenfs
-property are valid
-.Xr exports 5
-options.
-.Bd -literal -offset 2n
-.Li # Ic zfs set sharenfs="maproot=root,network 192.168.0.0/24" tank/home
-.Ed
-.Pp
-Another way to write this command with the same result is:
-.Bd -literal -offset 2n
-.Li # Ic set zfs sharenfs="-maproot=root -network 192.168.0.0/24" tank/home
-.Ed
-.It Xo
-.Sy Example 17
-Delegating
-.Tn ZFS
-Administration Permissions on a
-.Tn ZFS
-Dataset
-.Xc
-.Pp
-The following example shows how to set permissions so that user
-.Em cindys
-can create, destroy, mount, and take snapshots on
-.Em tank/cindys .
-The permissions on
-.Em tank/cindys
-are also displayed.
-.Bd -literal -offset 2n
-.Li # Ic zfs allow cindys create,destroy,mount,snapshot tank/cindys
-.Li # Ic zfs allow tank/cindys
----- Permissions on tank/cindys --------------------------------------
-Local+Descendent permissions:
- user cindys create,destroy,mount,snapshot
-.Ed
-.It Sy Example 18 No Delegating Create Time Permissions on a Tn ZFS No Dataset
-.Pp
-The following example shows how to grant anyone in the group
-.Em staff
-to create file systems in
-.Em tank/users .
-This syntax also allows staff members to destroy their own file systems, but
-not destroy anyone else's file system. The permissions on
-.Em tank/users
-are also displayed.
-.Bd -literal -offset 2n
-.Li # Ic zfs allow staff create,mount tank/users
-.Li # Ic zfs allow -c destroy tank/users
-.Li # Ic zfs allow tank/users
----- Permissions on tank/users ---------------------------------------
-Permission sets:
- destroy
-Local+Descendent permissions:
- group staff create,mount
-.Ed
-.It Xo
-.Sy Example 19
-Defining and Granting a Permission Set on a
-.Tn ZFS
-Dataset
-.Xc
-.Pp
-The following example shows how to define and grant a permission set on the
-.Em tank/users
-file system. The permissions on
-.Em tank/users
-are also displayed.
-.Bd -literal -offset 2n
-.Li # Ic zfs allow -s @pset create,destroy,snapshot,mount tank/users
-.Li # Ic zfs allow staff @pset tank/users
-.Li # Ic zfs allow tank/users
----- Permissions on tank/users ---------------------------------------
-Permission sets:
- @pset create,destroy,mount,snapshot
-Local+Descendent permissions:
- group staff @pset
-.Ed
-.It Sy Example 20 No Delegating Property Permissions on a Tn ZFS No Dataset
-.Pp
-The following example shows to grant the ability to set quotas and reservations
-on the
-.Sy users/home
-file system. The permissions on
-.Sy users/home
-are also displayed.
-.Bd -literal -offset 2n
-.Li # Ic zfs allow cindys quota,reservation users/home
-.Li # Ic zfs allow users/home
----- Permissions on users/home ---------------------------------------
-Local+Descendent permissions:
- user cindys quota,reservation
-.Li # Ic su - cindys
-.Li cindys% Ic zfs set quota=10G users/home/marks
-.Li cindys% Ic zfs get quota users/home/marks
-NAME PROPERTY VALUE SOURCE
-users/home/marks quota 10G local
-.Ed
-.It Sy Example 21 No Removing ZFS Delegated Permissions on a Tn ZFS No Dataset
-.Pp
-The following example shows how to remove the snapshot permission from the
-.Em staff
-group on the
-.Em tank/users
-file system. The permissions on
-.Em tank/users
-are also displayed.
-.Bd -literal -offset 2n
-.Li # Ic zfs unallow staff snapshot tank/users
-.Li # Ic zfs allow tank/users
----- Permissions on tank/users ---------------------------------------
-Permission sets:
- @pset create,destroy,mount,snapshot
-Local+Descendent permissions:
- group staff @pset
-.Ed
-.It Sy Example 22 Showing the differences between a snapshot and a ZFS Dataset
-.Pp
-The following example shows how to see what has changed between a prior
-snapshot of a ZFS Dataset and its current state. The
-.Fl F
-option is used to indicate type information for the files affected.
-.Bd -literal -offset 2n
-.Li # Ic zfs diff tank/test@before tank/test
-M / /tank/test/
-M F /tank/test/linked (+1)
-R F /tank/test/oldname -> /tank/test/newname
-- F /tank/test/deleted
-+ F /tank/test/created
-M F /tank/test/modified
-.Ed
-.El
-.Sh SEE ALSO
-.Xr chmod 2 ,
-.Xr fsync 2 ,
-.Xr exports 5 ,
-.Xr fstab 5 ,
-.Xr rc.conf 5 ,
-.Xr jail 8 ,
-.Xr mount 8 ,
-.Xr umount 8 ,
-.Xr zfs-program 8 ,
-.Xr zpool 8
-.Sh HISTORY
-The
-.Nm
-utility first appeared in
-.Fx 7.0 .
-.Sh AUTHORS
-This manual page is a
-.Xr mdoc 7
-reimplementation of the
-.Tn OpenSolaris
-manual page
-.Em zfs(1M) ,
-modified and customized for
-.Fx
-and licensed under the
-Common Development and Distribution License
-.Pq Tn CDDL .
-.Pp
-The
-.Xr mdoc 7
-implementation of this manual page was initially written by
-.An Martin Matuska Aq mm@FreeBSD.org .
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h
@@ -1,62 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- */
-
-#ifndef ZFS_ITER_H
-#define ZFS_ITER_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct zfs_sort_column {
- struct zfs_sort_column *sc_next;
- struct zfs_sort_column *sc_last;
- zfs_prop_t sc_prop;
- char *sc_user_prop;
- boolean_t sc_reverse;
-} zfs_sort_column_t;
-
-#define ZFS_ITER_RECURSE (1 << 0)
-#define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)
-#define ZFS_ITER_PROP_LISTSNAPS (1 << 2)
-#define ZFS_ITER_DEPTH_LIMIT (1 << 3)
-#define ZFS_ITER_RECVD_PROPS (1 << 4)
-#define ZFS_ITER_SIMPLE (1 << 5)
-#define ZFS_ITER_LITERAL_PROPS (1 << 6)
-
-int zfs_for_each(int, char **, int options, zfs_type_t,
- zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
-int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t);
-void zfs_free_sort_columns(zfs_sort_column_t *);
-boolean_t zfs_sort_only_by_name(const zfs_sort_column_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* ZFS_ITER_H */
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c
@@ -1,497 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#include <libintl.h>
-#include <libuutil.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-
-#include <libzfs.h>
-
-#include "zfs_util.h"
-#include "zfs_iter.h"
-
-/*
- * This is a private interface used to gather up all the datasets specified on
- * the command line so that we can iterate over them in order.
- *
- * First, we iterate over all filesystems, gathering them together into an
- * AVL tree. We report errors for any explicitly specified datasets
- * that we couldn't open.
- *
- * When finished, we have an AVL tree of ZFS handles. We go through and execute
- * the provided callback for each one, passing whatever data the user supplied.
- */
-
-typedef struct zfs_node {
- zfs_handle_t *zn_handle;
- uu_avl_node_t zn_avlnode;
-} zfs_node_t;
-
-typedef struct callback_data {
- uu_avl_t *cb_avl;
- int cb_flags;
- zfs_type_t cb_types;
- zfs_sort_column_t *cb_sortcol;
- zprop_list_t **cb_proplist;
- int cb_depth_limit;
- int cb_depth;
- uint8_t cb_props_table[ZFS_NUM_PROPS];
-} callback_data_t;
-
-uu_avl_pool_t *avl_pool;
-
-/*
- * Include snaps if they were requested or if this a zfs list where types
- * were not specified and the "listsnapshots" property is set on this pool.
- */
-static boolean_t
-zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb)
-{
- zpool_handle_t *zph;
-
- if ((cb->cb_flags & ZFS_ITER_PROP_LISTSNAPS) == 0)
- return (cb->cb_types & ZFS_TYPE_SNAPSHOT);
-
- zph = zfs_get_pool_handle(zhp);
- return (zpool_get_prop_int(zph, ZPOOL_PROP_LISTSNAPS, NULL));
-}
-
-/*
- * Called for each dataset. If the object is of an appropriate type,
- * add it to the avl tree and recurse over any children as necessary.
- */
-static int
-zfs_callback(zfs_handle_t *zhp, void *data)
-{
- callback_data_t *cb = data;
- boolean_t should_close = B_TRUE;
- boolean_t include_snaps = zfs_include_snapshots(zhp, cb);
- boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK);
-
- if ((zfs_get_type(zhp) & cb->cb_types) ||
- ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) {
- uu_avl_index_t idx;
- zfs_node_t *node = safe_malloc(sizeof (zfs_node_t));
-
- node->zn_handle = zhp;
- uu_avl_node_init(node, &node->zn_avlnode, avl_pool);
- if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol,
- &idx) == NULL) {
- if (cb->cb_proplist) {
- if ((*cb->cb_proplist) &&
- !(*cb->cb_proplist)->pl_all)
- zfs_prune_proplist(zhp,
- cb->cb_props_table);
-
- if (zfs_expand_proplist(zhp, cb->cb_proplist,
- (cb->cb_flags & ZFS_ITER_RECVD_PROPS),
- (cb->cb_flags & ZFS_ITER_LITERAL_PROPS))
- != 0) {
- free(node);
- return (-1);
- }
- }
- uu_avl_insert(cb->cb_avl, node, idx);
- should_close = B_FALSE;
- } else {
- free(node);
- }
- }
-
- /*
- * Recurse if necessary.
- */
- if (cb->cb_flags & ZFS_ITER_RECURSE &&
- ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
- cb->cb_depth < cb->cb_depth_limit)) {
- cb->cb_depth++;
- if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM)
- (void) zfs_iter_filesystems(zhp, zfs_callback, data);
- if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
- ZFS_TYPE_BOOKMARK)) == 0) && include_snaps)
- (void) zfs_iter_snapshots(zhp,
- (cb->cb_flags & ZFS_ITER_SIMPLE) != 0, zfs_callback,
- data, 0, 0);
- if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
- ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks)
- (void) zfs_iter_bookmarks(zhp, zfs_callback, data);
- cb->cb_depth--;
- }
-
- if (should_close)
- zfs_close(zhp);
-
- return (0);
-}
-
-int
-zfs_add_sort_column(zfs_sort_column_t **sc, const char *name,
- boolean_t reverse)
-{
- zfs_sort_column_t *col;
- zfs_prop_t prop;
-
- if ((prop = zfs_name_to_prop(name)) == ZPROP_INVAL &&
- !zfs_prop_user(name))
- return (-1);
-
- col = safe_malloc(sizeof (zfs_sort_column_t));
-
- col->sc_prop = prop;
- col->sc_reverse = reverse;
- if (prop == ZPROP_INVAL) {
- col->sc_user_prop = safe_malloc(strlen(name) + 1);
- (void) strcpy(col->sc_user_prop, name);
- }
-
- if (*sc == NULL) {
- col->sc_last = col;
- *sc = col;
- } else {
- (*sc)->sc_last->sc_next = col;
- (*sc)->sc_last = col;
- }
-
- return (0);
-}
-
-void
-zfs_free_sort_columns(zfs_sort_column_t *sc)
-{
- zfs_sort_column_t *col;
-
- while (sc != NULL) {
- col = sc->sc_next;
- free(sc->sc_user_prop);
- free(sc);
- sc = col;
- }
-}
-
-boolean_t
-zfs_sort_only_by_name(const zfs_sort_column_t *sc)
-{
-
- return (sc != NULL && sc->sc_next == NULL &&
- sc->sc_prop == ZFS_PROP_NAME);
-}
-
-/* ARGSUSED */
-static int
-zfs_compare(const void *larg, const void *rarg, void *unused)
-{
- zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
- zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
- const char *lname = zfs_get_name(l);
- const char *rname = zfs_get_name(r);
- char *lat, *rat;
- uint64_t lcreate, rcreate;
- int ret;
-
- lat = (char *)strchr(lname, '@');
- rat = (char *)strchr(rname, '@');
-
- if (lat != NULL)
- *lat = '\0';
- if (rat != NULL)
- *rat = '\0';
-
- ret = strcmp(lname, rname);
- if (ret == 0 && (lat != NULL || rat != NULL)) {
- /*
- * If we're comparing a dataset to one of its snapshots, we
- * always make the full dataset first.
- */
- if (lat == NULL) {
- ret = -1;
- } else if (rat == NULL) {
- ret = 1;
- } else {
- /*
- * If we have two snapshots from the same dataset, then
- * we want to sort them according to creation time. We
- * use the hidden CREATETXG property to get an absolute
- * ordering of snapshots.
- */
- lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
- rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
-
- /*
- * Both lcreate and rcreate being 0 means we don't have
- * properties and we should compare full name.
- */
- if (lcreate == 0 && rcreate == 0)
- ret = strcmp(lat + 1, rat + 1);
- else if (lcreate < rcreate)
- ret = -1;
- else if (lcreate > rcreate)
- ret = 1;
- }
- }
-
- if (lat != NULL)
- *lat = '@';
- if (rat != NULL)
- *rat = '@';
-
- return (ret);
-}
-
-/*
- * Sort datasets by specified columns.
- *
- * o Numeric types sort in ascending order.
- * o String types sort in alphabetical order.
- * o Types inappropriate for a row sort that row to the literal
- * bottom, regardless of the specified ordering.
- *
- * If no sort columns are specified, or two datasets compare equally
- * across all specified columns, they are sorted alphabetically by name
- * with snapshots grouped under their parents.
- */
-static int
-zfs_sort(const void *larg, const void *rarg, void *data)
-{
- zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
- zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
- zfs_sort_column_t *sc = (zfs_sort_column_t *)data;
- zfs_sort_column_t *psc;
-
- for (psc = sc; psc != NULL; psc = psc->sc_next) {
- char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN];
- char *lstr, *rstr;
- uint64_t lnum, rnum;
- boolean_t lvalid, rvalid;
- int ret = 0;
-
- /*
- * We group the checks below the generic code. If 'lstr' and
- * 'rstr' are non-NULL, then we do a string based comparison.
- * Otherwise, we compare 'lnum' and 'rnum'.
- */
- lstr = rstr = NULL;
- if (psc->sc_prop == ZPROP_INVAL) {
- nvlist_t *luser, *ruser;
- nvlist_t *lval, *rval;
-
- luser = zfs_get_user_props(l);
- ruser = zfs_get_user_props(r);
-
- lvalid = (nvlist_lookup_nvlist(luser,
- psc->sc_user_prop, &lval) == 0);
- rvalid = (nvlist_lookup_nvlist(ruser,
- psc->sc_user_prop, &rval) == 0);
-
- if (lvalid)
- verify(nvlist_lookup_string(lval,
- ZPROP_VALUE, &lstr) == 0);
- if (rvalid)
- verify(nvlist_lookup_string(rval,
- ZPROP_VALUE, &rstr) == 0);
- } else if (psc->sc_prop == ZFS_PROP_NAME) {
- lvalid = rvalid = B_TRUE;
-
- (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf));
- (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf));
-
- lstr = lbuf;
- rstr = rbuf;
- } else if (zfs_prop_is_string(psc->sc_prop)) {
- lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf,
- sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0);
- rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf,
- sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0);
-
- lstr = lbuf;
- rstr = rbuf;
- } else {
- lvalid = zfs_prop_valid_for_type(psc->sc_prop,
- zfs_get_type(l));
- rvalid = zfs_prop_valid_for_type(psc->sc_prop,
- zfs_get_type(r));
-
- if (lvalid)
- (void) zfs_prop_get_numeric(l, psc->sc_prop,
- &lnum, NULL, NULL, 0);
- if (rvalid)
- (void) zfs_prop_get_numeric(r, psc->sc_prop,
- &rnum, NULL, NULL, 0);
- }
-
- if (!lvalid && !rvalid)
- continue;
- else if (!lvalid)
- return (1);
- else if (!rvalid)
- return (-1);
-
- if (lstr)
- ret = strcmp(lstr, rstr);
- else if (lnum < rnum)
- ret = -1;
- else if (lnum > rnum)
- ret = 1;
-
- if (ret != 0) {
- if (psc->sc_reverse == B_TRUE)
- ret = (ret < 0) ? 1 : -1;
- return (ret);
- }
- }
-
- return (zfs_compare(larg, rarg, NULL));
-}
-
-int
-zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
- zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit,
- zfs_iter_f callback, void *data)
-{
- callback_data_t cb = {0};
- int ret = 0;
- zfs_node_t *node;
- uu_avl_walk_t *walk;
-
- avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
- offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);
-
- if (avl_pool == NULL)
- nomem();
-
- cb.cb_sortcol = sortcol;
- cb.cb_flags = flags;
- cb.cb_proplist = proplist;
- cb.cb_types = types;
- cb.cb_depth_limit = limit;
- /*
- * If cb_proplist is provided then in the zfs_handles created we
- * retain only those properties listed in cb_proplist and sortcol.
- * The rest are pruned. So, the caller should make sure that no other
- * properties other than those listed in cb_proplist/sortcol are
- * accessed.
- *
- * If cb_proplist is NULL then we retain all the properties. We
- * always retain the zoned property, which some other properties
- * need (userquota & friends), and the createtxg property, which
- * we need to sort snapshots.
- */
- if (cb.cb_proplist && *cb.cb_proplist) {
- zprop_list_t *p = *cb.cb_proplist;
-
- while (p) {
- if (p->pl_prop >= ZFS_PROP_TYPE &&
- p->pl_prop < ZFS_NUM_PROPS) {
- cb.cb_props_table[p->pl_prop] = B_TRUE;
- }
- p = p->pl_next;
- }
-
- while (sortcol) {
- if (sortcol->sc_prop >= ZFS_PROP_TYPE &&
- sortcol->sc_prop < ZFS_NUM_PROPS) {
- cb.cb_props_table[sortcol->sc_prop] = B_TRUE;
- }
- sortcol = sortcol->sc_next;
- }
-
- cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE;
- cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE;
- } else {
- (void) memset(cb.cb_props_table, B_TRUE,
- sizeof (cb.cb_props_table));
- }
-
- if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
- nomem();
-
- if (argc == 0) {
- /*
- * If given no arguments, iterate over all datasets.
- */
- cb.cb_flags |= ZFS_ITER_RECURSE;
- ret = zfs_iter_root(g_zfs, zfs_callback, &cb);
- } else {
- int i;
- zfs_handle_t *zhp;
- zfs_type_t argtype;
-
- /*
- * If we're recursive, then we always allow filesystems as
- * arguments. If we also are interested in snapshots or
- * bookmarks, then we can take volumes as well.
- */
- argtype = types;
- if (flags & ZFS_ITER_RECURSE) {
- argtype |= ZFS_TYPE_FILESYSTEM;
- if (types & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK))
- argtype |= ZFS_TYPE_VOLUME;
- }
-
- for (i = 0; i < argc; i++) {
- if (flags & ZFS_ITER_ARGS_CAN_BE_PATHS) {
- zhp = zfs_path_to_zhandle(g_zfs, argv[i],
- argtype);
- } else {
- zhp = zfs_open(g_zfs, argv[i], argtype);
- }
- if (zhp != NULL)
- ret |= zfs_callback(zhp, &cb);
- else
- ret = 1;
- }
- }
-
- /*
- * At this point we've got our AVL tree full of zfs handles, so iterate
- * over each one and execute the real user callback.
- */
- for (node = uu_avl_first(cb.cb_avl); node != NULL;
- node = uu_avl_next(cb.cb_avl, node))
- ret |= callback(node->zn_handle, data);
-
- /*
- * Finally, clean up the AVL tree.
- */
- if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
- nomem();
-
- while ((node = uu_avl_walk_next(walk)) != NULL) {
- uu_avl_remove(cb.cb_avl, node);
- zfs_close(node->zn_handle);
- free(node);
- }
-
- uu_avl_walk_end(walk);
- uu_avl_destroy(cb.cb_avl);
- uu_avl_pool_destroy(avl_pool);
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
@@ -1,7592 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
- * Copyright 2012 Milan Jurik. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2016 Nexenta Systems, Inc.
- * Copyright (c) 2019 Datto Inc.
- */
-
-#include <assert.h>
-#include <ctype.h>
-#include <errno.h>
-#include <getopt.h>
-#include <libgen.h>
-#include <libintl.h>
-#include <libuutil.h>
-#include <libnvpair.h>
-#include <locale.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <zone.h>
-#include <grp.h>
-#include <pwd.h>
-#include <signal.h>
-#include <sys/debug.h>
-#include <sys/list.h>
-#include <sys/mntent.h>
-#include <sys/mnttab.h>
-#include <sys/mount.h>
-#include <sys/stat.h>
-#include <sys/fs/zfs.h>
-#include <sys/types.h>
-#include <time.h>
-#include <err.h>
-#include <jail.h>
-
-#include <libzfs.h>
-#include <libzfs_core.h>
-#include <zfs_prop.h>
-#include <zfs_deleg.h>
-#include <libuutil.h>
-#ifdef illumos
-#include <aclutils.h>
-#include <directory.h>
-#include <idmap.h>
-#include <libshare.h>
-#endif
-
-#include "zfs_iter.h"
-#include "zfs_util.h"
-#include "zfs_comutil.h"
-
-libzfs_handle_t *g_zfs;
-
-static FILE *mnttab_file;
-static char history_str[HIS_MAX_RECORD_LEN];
-static boolean_t log_history = B_TRUE;
-
-static int zfs_do_clone(int argc, char **argv);
-static int zfs_do_create(int argc, char **argv);
-static int zfs_do_destroy(int argc, char **argv);
-static int zfs_do_get(int argc, char **argv);
-static int zfs_do_inherit(int argc, char **argv);
-static int zfs_do_list(int argc, char **argv);
-static int zfs_do_mount(int argc, char **argv);
-static int zfs_do_rename(int argc, char **argv);
-static int zfs_do_rollback(int argc, char **argv);
-static int zfs_do_set(int argc, char **argv);
-static int zfs_do_upgrade(int argc, char **argv);
-static int zfs_do_snapshot(int argc, char **argv);
-static int zfs_do_unmount(int argc, char **argv);
-static int zfs_do_share(int argc, char **argv);
-static int zfs_do_unshare(int argc, char **argv);
-static int zfs_do_send(int argc, char **argv);
-static int zfs_do_receive(int argc, char **argv);
-static int zfs_do_promote(int argc, char **argv);
-static int zfs_do_userspace(int argc, char **argv);
-static int zfs_do_allow(int argc, char **argv);
-static int zfs_do_unallow(int argc, char **argv);
-static int zfs_do_hold(int argc, char **argv);
-static int zfs_do_holds(int argc, char **argv);
-static int zfs_do_release(int argc, char **argv);
-static int zfs_do_diff(int argc, char **argv);
-static int zfs_do_jail(int argc, char **argv);
-static int zfs_do_unjail(int argc, char **argv);
-static int zfs_do_bookmark(int argc, char **argv);
-static int zfs_do_remap(int argc, char **argv);
-static int zfs_do_channel_program(int argc, char **argv);
-
-/*
- * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
- */
-
-#ifdef DEBUG
-const char *
-_umem_debug_init(void)
-{
- return ("default,verbose"); /* $UMEM_DEBUG setting */
-}
-
-const char *
-_umem_logging_init(void)
-{
- return ("fail,contents"); /* $UMEM_LOGGING setting */
-}
-#endif
-
-typedef enum {
- HELP_CLONE,
- HELP_CREATE,
- HELP_DESTROY,
- HELP_GET,
- HELP_INHERIT,
- HELP_UPGRADE,
- HELP_JAIL,
- HELP_UNJAIL,
- HELP_LIST,
- HELP_MOUNT,
- HELP_PROMOTE,
- HELP_RECEIVE,
- HELP_RENAME,
- HELP_ROLLBACK,
- HELP_SEND,
- HELP_SET,
- HELP_SHARE,
- HELP_SNAPSHOT,
- HELP_UNMOUNT,
- HELP_UNSHARE,
- HELP_ALLOW,
- HELP_UNALLOW,
- HELP_USERSPACE,
- HELP_GROUPSPACE,
- HELP_HOLD,
- HELP_HOLDS,
- HELP_RELEASE,
- HELP_DIFF,
- HELP_REMAP,
- HELP_BOOKMARK,
- HELP_CHANNEL_PROGRAM,
-} zfs_help_t;
-
-typedef struct zfs_command {
- const char *name;
- int (*func)(int argc, char **argv);
- zfs_help_t usage;
-} zfs_command_t;
-
-/*
- * Master command table. Each ZFS command has a name, associated function, and
- * usage message. The usage messages need to be internationalized, so we have
- * to have a function to return the usage message based on a command index.
- *
- * These commands are organized according to how they are displayed in the usage
- * message. An empty command (one with a NULL name) indicates an empty line in
- * the generic usage message.
- */
-static zfs_command_t command_table[] = {
- { "create", zfs_do_create, HELP_CREATE },
- { "destroy", zfs_do_destroy, HELP_DESTROY },
- { NULL },
- { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT },
- { "rollback", zfs_do_rollback, HELP_ROLLBACK },
- { "clone", zfs_do_clone, HELP_CLONE },
- { "promote", zfs_do_promote, HELP_PROMOTE },
- { "rename", zfs_do_rename, HELP_RENAME },
- { "bookmark", zfs_do_bookmark, HELP_BOOKMARK },
- { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM },
- { NULL },
- { "list", zfs_do_list, HELP_LIST },
- { NULL },
- { "set", zfs_do_set, HELP_SET },
- { "get", zfs_do_get, HELP_GET },
- { "inherit", zfs_do_inherit, HELP_INHERIT },
- { "upgrade", zfs_do_upgrade, HELP_UPGRADE },
- { "userspace", zfs_do_userspace, HELP_USERSPACE },
- { "groupspace", zfs_do_userspace, HELP_GROUPSPACE },
- { NULL },
- { "mount", zfs_do_mount, HELP_MOUNT },
- { "unmount", zfs_do_unmount, HELP_UNMOUNT },
- { "share", zfs_do_share, HELP_SHARE },
- { "unshare", zfs_do_unshare, HELP_UNSHARE },
- { NULL },
- { "send", zfs_do_send, HELP_SEND },
- { "receive", zfs_do_receive, HELP_RECEIVE },
- { NULL },
- { "allow", zfs_do_allow, HELP_ALLOW },
- { NULL },
- { "unallow", zfs_do_unallow, HELP_UNALLOW },
- { NULL },
- { "hold", zfs_do_hold, HELP_HOLD },
- { "holds", zfs_do_holds, HELP_HOLDS },
- { "release", zfs_do_release, HELP_RELEASE },
- { "diff", zfs_do_diff, HELP_DIFF },
- { NULL },
- { "jail", zfs_do_jail, HELP_JAIL },
- { "unjail", zfs_do_unjail, HELP_UNJAIL },
- { "remap", zfs_do_remap, HELP_REMAP },
-};
-
-#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
-
-zfs_command_t *current_command;
-
-static const char *
-get_usage(zfs_help_t idx)
-{
- switch (idx) {
- case HELP_CLONE:
- return (gettext("\tclone [-p] [-o property=value] ... "
- "<snapshot> <filesystem|volume>\n"));
- case HELP_CREATE:
- return (gettext("\tcreate [-pu] [-o property=value] ... "
- "<filesystem>\n"
- "\tcreate [-ps] [-b blocksize] [-o property=value] ... "
- "-V <size> <volume>\n"));
- case HELP_DESTROY:
- return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
- "\tdestroy [-dnpRrv] "
- "<filesystem|volume>@<snap>[%<snap>][,...]\n"
- "\tdestroy <filesystem|volume>#<bookmark>\n"));
- case HELP_GET:
- return (gettext("\tget [-rHp] [-d max] "
- "[-o \"all\" | field[,...]]\n"
- "\t [-t type[,...]] [-s source[,...]]\n"
- "\t <\"all\" | property[,...]> "
- "[filesystem|volume|snapshot|bookmark] ...\n"));
- case HELP_INHERIT:
- return (gettext("\tinherit [-rS] <property> "
- "<filesystem|volume|snapshot> ...\n"));
- case HELP_UPGRADE:
- return (gettext("\tupgrade [-v]\n"
- "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
- case HELP_JAIL:
- return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
- case HELP_UNJAIL:
- return (gettext("\tunjail <jailid|jailname> <filesystem>\n"));
- case HELP_LIST:
- return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
- "[-s property]...\n\t [-S property]... [-t type[,...]] "
- "[filesystem|volume|snapshot] ...\n"));
- case HELP_MOUNT:
- return (gettext("\tmount\n"
- "\tmount [-vO] [-o opts] <-a | filesystem>\n"));
- case HELP_PROMOTE:
- return (gettext("\tpromote <clone-filesystem>\n"));
- case HELP_RECEIVE:
- return (gettext("\treceive|recv [-vnsFMu] <filesystem|volume|"
- "snapshot>\n"
- "\treceive|recv [-vnsFMu] [-o origin=<snapshot>] [-d | -e] "
- "<filesystem>\n"
- "\treceive|recv -A <filesystem|volume>\n"));
- case HELP_RENAME:
- return (gettext("\trename [-f] <filesystem|volume|snapshot> "
- "<filesystem|volume|snapshot>\n"
- "\trename [-f] -p <filesystem|volume> <filesystem|volume>\n"
- "\trename -r <snapshot> <snapshot>\n"
- "\trename <bookmark> <bookmark>\n"
- "\trename -u [-p] <filesystem> <filesystem>"));
- case HELP_ROLLBACK:
- return (gettext("\trollback [-rRf] <snapshot>\n"));
- case HELP_SEND:
- return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] "
- "<snapshot>\n"
- "\tsend [-LPcenv] [-i snapshot|bookmark] "
- "<filesystem|volume|snapshot>\n"
- "\tsend [-nvPe] -t <receive_resume_token>\n"));
- case HELP_SET:
- return (gettext("\tset <property=value> ... "
- "<filesystem|volume|snapshot> ...\n"));
- case HELP_SHARE:
- return (gettext("\tshare <-a | filesystem>\n"));
- case HELP_SNAPSHOT:
- return (gettext("\tsnapshot|snap [-r] [-o property=value] ... "
- "<filesystem|volume>@<snap> ...\n"));
- case HELP_UNMOUNT:
- return (gettext("\tunmount|umount [-f] "
- "<-a | filesystem|mountpoint>\n"));
- case HELP_UNSHARE:
- return (gettext("\tunshare "
- "<-a | filesystem|mountpoint>\n"));
- case HELP_ALLOW:
- return (gettext("\tallow <filesystem|volume>\n"
- "\tallow [-ldug] "
- "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
- "\t <filesystem|volume>\n"
- "\tallow [-ld] -e <perm|@setname>[,...] "
- "<filesystem|volume>\n"
- "\tallow -c <perm|@setname>[,...] <filesystem|volume>\n"
- "\tallow -s @setname <perm|@setname>[,...] "
- "<filesystem|volume>\n"));
- case HELP_UNALLOW:
- return (gettext("\tunallow [-rldug] "
- "<\"everyone\"|user|group>[,...]\n"
- "\t [<perm|@setname>[,...]] <filesystem|volume>\n"
- "\tunallow [-rld] -e [<perm|@setname>[,...]] "
- "<filesystem|volume>\n"
- "\tunallow [-r] -c [<perm|@setname>[,...]] "
- "<filesystem|volume>\n"
- "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
- "<filesystem|volume>\n"));
- case HELP_USERSPACE:
- return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
- "[-s field] ...\n"
- "\t [-S field] ... [-t type[,...]] "
- "<filesystem|snapshot>\n"));
- case HELP_GROUPSPACE:
- return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
- "[-s field] ...\n"
- "\t [-S field] ... [-t type[,...]] "
- "<filesystem|snapshot>\n"));
- case HELP_HOLD:
- return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
- case HELP_HOLDS:
- return (gettext("\tholds [-Hp] [-r|-d depth] "
- "<filesystem|volume|snapshot> ...\n"));
- case HELP_RELEASE:
- return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
- case HELP_DIFF:
- return (gettext("\tdiff [-FHt] <snapshot> "
- "[snapshot|filesystem]\n"));
- case HELP_REMAP:
- return (gettext("\tremap <filesystem | volume>\n"));
- case HELP_BOOKMARK:
- return (gettext("\tbookmark <snapshot> <bookmark>\n"));
- case HELP_CHANNEL_PROGRAM:
- return (gettext("\tprogram [-jn] [-t <instruction limit>] "
- "[-m <memory limit (b)>] <pool> <program file> "
- "[lua args...]\n"));
- }
-
- abort();
- /* NOTREACHED */
-}
-
-void
-nomem(void)
-{
- (void) fprintf(stderr, gettext("internal error: out of memory\n"));
- exit(1);
-}
-
-/*
- * Utility function to guarantee malloc() success.
- */
-
-void *
-safe_malloc(size_t size)
-{
- void *data;
-
- if ((data = calloc(1, size)) == NULL)
- nomem();
-
- return (data);
-}
-
-void *
-safe_realloc(void *data, size_t size)
-{
- void *newp;
- if ((newp = realloc(data, size)) == NULL) {
- free(data);
- nomem();
- }
-
- return (newp);
-}
-
-static char *
-safe_strdup(char *str)
-{
- char *dupstr = strdup(str);
-
- if (dupstr == NULL)
- nomem();
-
- return (dupstr);
-}
-
-/*
- * Callback routine that will print out information for each of
- * the properties.
- */
-static int
-usage_prop_cb(int prop, void *cb)
-{
- FILE *fp = cb;
-
- (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop));
-
- if (zfs_prop_readonly(prop))
- (void) fprintf(fp, " NO ");
- else
- (void) fprintf(fp, "YES ");
-
- if (zfs_prop_inheritable(prop))
- (void) fprintf(fp, " YES ");
- else
- (void) fprintf(fp, " NO ");
-
- if (zfs_prop_values(prop) == NULL)
- (void) fprintf(fp, "-\n");
- else
- (void) fprintf(fp, "%s\n", zfs_prop_values(prop));
-
- return (ZPROP_CONT);
-}
-
-/*
- * Display usage message. If we're inside a command, display only the usage for
- * that command. Otherwise, iterate over the entire command table and display
- * a complete usage message.
- */
-static void
-usage(boolean_t requested)
-{
- int i;
- boolean_t show_properties = B_FALSE;
- FILE *fp = requested ? stdout : stderr;
-
- if (current_command == NULL) {
-
- (void) fprintf(fp, gettext("usage: zfs command args ...\n"));
- (void) fprintf(fp,
- gettext("where 'command' is one of the following:\n\n"));
-
- for (i = 0; i < NCOMMAND; i++) {
- if (command_table[i].name == NULL)
- (void) fprintf(fp, "\n");
- else
- (void) fprintf(fp, "%s",
- get_usage(command_table[i].usage));
- }
-
- (void) fprintf(fp, gettext("\nEach dataset is of the form: "
- "pool/[dataset/]*dataset[@name]\n"));
- } else {
- (void) fprintf(fp, gettext("usage:\n"));
- (void) fprintf(fp, "%s", get_usage(current_command->usage));
- }
-
- if (current_command != NULL &&
- (strcmp(current_command->name, "set") == 0 ||
- strcmp(current_command->name, "get") == 0 ||
- strcmp(current_command->name, "inherit") == 0 ||
- strcmp(current_command->name, "list") == 0))
- show_properties = B_TRUE;
-
- if (show_properties) {
- (void) fprintf(fp,
- gettext("\nThe following properties are supported:\n"));
-
- (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n",
- "PROPERTY", "EDIT", "INHERIT", "VALUES");
-
- /* Iterate over all properties */
- (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
- ZFS_TYPE_DATASET);
-
- (void) fprintf(fp, "\t%-15s ", "userused@...");
- (void) fprintf(fp, " NO NO <size>\n");
- (void) fprintf(fp, "\t%-15s ", "groupused@...");
- (void) fprintf(fp, " NO NO <size>\n");
- (void) fprintf(fp, "\t%-15s ", "userquota@...");
- (void) fprintf(fp, "YES NO <size> | none\n");
- (void) fprintf(fp, "\t%-15s ", "groupquota@...");
- (void) fprintf(fp, "YES NO <size> | none\n");
- (void) fprintf(fp, "\t%-15s ", "written@<snap>");
- (void) fprintf(fp, " NO NO <size>\n");
-
- (void) fprintf(fp, gettext("\nSizes are specified in bytes "
- "with standard units such as K, M, G, etc.\n"));
- (void) fprintf(fp, gettext("\nUser-defined properties can "
- "be specified by using a name containing a colon (:).\n"));
- (void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ "
- "properties must be appended with\n"
- "a user or group specifier of one of these forms:\n"
- " POSIX name (eg: \"matt\")\n"
- " POSIX id (eg: \"126829\")\n"
- " SMB name@domain (eg: \"matt@sun\")\n"
- " SMB SID (eg: \"S-1-234-567-89\")\n"));
- } else {
- (void) fprintf(fp,
- gettext("\nFor the property list, run: %s\n"),
- "zfs set|get");
- (void) fprintf(fp,
- gettext("\nFor the delegated permission list, run: %s\n"),
- "zfs allow|unallow");
- }
-
- /*
- * See comments at end of main().
- */
- if (getenv("ZFS_ABORT") != NULL) {
- (void) printf("dumping core by request\n");
- abort();
- }
-
- exit(requested ? 0 : 2);
-}
-
-/*
- * Take a property=value argument string and add it to the given nvlist.
- * Modifies the argument inplace.
- */
-static int
-parseprop(nvlist_t *props, char *propname)
-{
- char *propval, *strval;
-
- if ((propval = strchr(propname, '=')) == NULL) {
- (void) fprintf(stderr, gettext("missing "
- "'=' for property=value argument\n"));
- return (-1);
- }
- *propval = '\0';
- propval++;
- if (nvlist_lookup_string(props, propname, &strval) == 0) {
- (void) fprintf(stderr, gettext("property '%s' "
- "specified multiple times\n"), propname);
- return (-1);
- }
- if (nvlist_add_string(props, propname, propval) != 0)
- nomem();
- return (0);
-}
-
-static int
-parse_depth(char *opt, int *flags)
-{
- char *tmp;
- int depth;
-
- depth = (int)strtol(opt, &tmp, 0);
- if (*tmp) {
- (void) fprintf(stderr,
- gettext("%s is not an integer\n"), opt);
- usage(B_FALSE);
- }
- if (depth < 0) {
- (void) fprintf(stderr,
- gettext("Depth can not be negative.\n"));
- usage(B_FALSE);
- }
- *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
- return (depth);
-}
-
-#define PROGRESS_DELAY 2 /* seconds */
-
-static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
-static time_t pt_begin;
-static char *pt_header = NULL;
-static boolean_t pt_shown;
-
-static void
-start_progress_timer(void)
-{
- pt_begin = time(NULL) + PROGRESS_DELAY;
- pt_shown = B_FALSE;
-}
-
-static void
-set_progress_header(char *header)
-{
- assert(pt_header == NULL);
- pt_header = safe_strdup(header);
- if (pt_shown) {
- (void) printf("%s: ", header);
- (void) fflush(stdout);
- }
-}
-
-static void
-update_progress(char *update)
-{
- if (!pt_shown && time(NULL) > pt_begin) {
- int len = strlen(update);
-
- (void) printf("%s: %s%*.*s", pt_header, update, len, len,
- pt_reverse);
- (void) fflush(stdout);
- pt_shown = B_TRUE;
- } else if (pt_shown) {
- int len = strlen(update);
-
- (void) printf("%s%*.*s", update, len, len, pt_reverse);
- (void) fflush(stdout);
- }
-}
-
-static void
-finish_progress(char *done)
-{
- if (pt_shown) {
- (void) printf("%s\n", done);
- (void) fflush(stdout);
- }
- free(pt_header);
- pt_header = NULL;
-}
-
-/*
- * Check if the dataset is mountable and should be automatically mounted.
- */
-static boolean_t
-should_auto_mount(zfs_handle_t *zhp)
-{
- if (!zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp)))
- return (B_FALSE);
- return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON);
-}
-
-/*
- * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
- *
- * Given an existing dataset, create a writable copy whose initial contents
- * are the same as the source. The newly created dataset maintains a
- * dependency on the original; the original cannot be destroyed so long as
- * the clone exists.
- *
- * The '-p' flag creates all the non-existing ancestors of the target first.
- */
-static int
-zfs_do_clone(int argc, char **argv)
-{
- zfs_handle_t *zhp = NULL;
- boolean_t parents = B_FALSE;
- nvlist_t *props;
- int ret = 0;
- int c;
-
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
- nomem();
-
- /* check options */
- while ((c = getopt(argc, argv, "o:p")) != -1) {
- switch (c) {
- case 'o':
- if (parseprop(props, optarg) != 0)
- return (1);
- break;
- case 'p':
- parents = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- goto usage;
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing source dataset "
- "argument\n"));
- goto usage;
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing target dataset "
- "argument\n"));
- goto usage;
- }
- if (argc > 2) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- goto usage;
- }
-
- /* open the source dataset */
- if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
- return (1);
-
- if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM |
- ZFS_TYPE_VOLUME)) {
- /*
- * Now create the ancestors of the target dataset. If the
- * target already exists and '-p' option was used we should not
- * complain.
- */
- if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM |
- ZFS_TYPE_VOLUME))
- return (0);
- if (zfs_create_ancestors(g_zfs, argv[1]) != 0)
- return (1);
- }
-
- /* pass to libzfs */
- ret = zfs_clone(zhp, argv[1], props);
-
- /* create the mountpoint if necessary */
- if (ret == 0) {
- zfs_handle_t *clone;
-
- clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET);
- if (clone != NULL) {
- /*
- * If the user doesn't want the dataset
- * automatically mounted, then skip the mount/share
- * step.
- */
- if (should_auto_mount(clone)) {
- if ((ret = zfs_mount(clone, NULL, 0)) != 0) {
- (void) fprintf(stderr, gettext("clone "
- "successfully created, "
- "but not mounted\n"));
- } else if ((ret = zfs_share(clone)) != 0) {
- (void) fprintf(stderr, gettext("clone "
- "successfully created, "
- "but not shared\n"));
- }
- }
- zfs_close(clone);
- }
- }
-
- zfs_close(zhp);
- nvlist_free(props);
-
- return (!!ret);
-
-usage:
- if (zhp)
- zfs_close(zhp);
- nvlist_free(props);
- usage(B_FALSE);
- return (-1);
-}
-
-/*
- * zfs create [-pu] [-o prop=value] ... fs
- * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size
- *
- * Create a new dataset. This command can be used to create filesystems
- * and volumes. Snapshot creation is handled by 'zfs snapshot'.
- * For volumes, the user must specify a size to be used.
- *
- * The '-s' flag applies only to volumes, and indicates that we should not try
- * to set the reservation for this volume. By default we set a reservation
- * equal to the size for any volume. For pools with SPA_VERSION >=
- * SPA_VERSION_REFRESERVATION, we set a refreservation instead.
- *
- * The '-p' flag creates all the non-existing ancestors of the target first.
- *
- * The '-u' flag prevents mounting of newly created file system.
- */
-static int
-zfs_do_create(int argc, char **argv)
-{
- zfs_type_t type = ZFS_TYPE_FILESYSTEM;
- zfs_handle_t *zhp = NULL;
- uint64_t volsize = 0;
- int c;
- boolean_t noreserve = B_FALSE;
- boolean_t bflag = B_FALSE;
- boolean_t parents = B_FALSE;
- boolean_t nomount = B_FALSE;
- int ret = 1;
- nvlist_t *props;
- uint64_t intval;
-
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
- nomem();
-
- /* check options */
- while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) {
- switch (c) {
- case 'V':
- type = ZFS_TYPE_VOLUME;
- if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
- (void) fprintf(stderr, gettext("bad volume "
- "size '%s': %s\n"), optarg,
- libzfs_error_description(g_zfs));
- goto error;
- }
-
- if (nvlist_add_uint64(props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
- nomem();
- volsize = intval;
- break;
- case 'p':
- parents = B_TRUE;
- break;
- case 'b':
- bflag = B_TRUE;
- if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
- (void) fprintf(stderr, gettext("bad volume "
- "block size '%s': %s\n"), optarg,
- libzfs_error_description(g_zfs));
- goto error;
- }
-
- if (nvlist_add_uint64(props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- intval) != 0)
- nomem();
- break;
- case 'o':
- if (parseprop(props, optarg) != 0)
- goto error;
- break;
- case 's':
- noreserve = B_TRUE;
- break;
- case 'u':
- nomount = B_TRUE;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing size "
- "argument\n"));
- goto badusage;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- goto badusage;
- }
- }
-
- if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) {
- (void) fprintf(stderr, gettext("'-s' and '-b' can only be "
- "used when creating a volume\n"));
- goto badusage;
- }
- if (nomount && type != ZFS_TYPE_FILESYSTEM) {
- (void) fprintf(stderr, gettext("'-u' can only be "
- "used when creating a file system\n"));
- goto badusage;
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc == 0) {
- (void) fprintf(stderr, gettext("missing %s argument\n"),
- zfs_type_to_name(type));
- goto badusage;
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- goto badusage;
- }
-
- if (type == ZFS_TYPE_VOLUME && !noreserve) {
- zpool_handle_t *zpool_handle;
- nvlist_t *real_props = NULL;
- uint64_t spa_version;
- char *p;
- zfs_prop_t resv_prop;
- char *strval;
- char msg[1024];
-
- if ((p = strchr(argv[0], '/')) != NULL)
- *p = '\0';
- zpool_handle = zpool_open(g_zfs, argv[0]);
- if (p != NULL)
- *p = '/';
- if (zpool_handle == NULL)
- goto error;
- spa_version = zpool_get_prop_int(zpool_handle,
- ZPOOL_PROP_VERSION, NULL);
- if (spa_version >= SPA_VERSION_REFRESERVATION)
- resv_prop = ZFS_PROP_REFRESERVATION;
- else
- resv_prop = ZFS_PROP_RESERVATION;
-
- (void) snprintf(msg, sizeof (msg),
- gettext("cannot create '%s'"), argv[0]);
- if (props && (real_props = zfs_valid_proplist(g_zfs, type,
- props, 0, NULL, zpool_handle, msg)) == NULL) {
- zpool_close(zpool_handle);
- goto error;
- }
- zpool_close(zpool_handle);
-
- volsize = zvol_volsize_to_reservation(volsize, real_props);
- nvlist_free(real_props);
-
- if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
- &strval) != 0) {
- if (nvlist_add_uint64(props,
- zfs_prop_to_name(resv_prop), volsize) != 0) {
- nvlist_free(props);
- nomem();
- }
- }
- }
-
- if (parents && zfs_name_valid(argv[0], type)) {
- /*
- * Now create the ancestors of target dataset. If the target
- * already exists and '-p' option was used we should not
- * complain.
- */
- if (zfs_dataset_exists(g_zfs, argv[0], type)) {
- ret = 0;
- goto error;
- }
- if (zfs_create_ancestors(g_zfs, argv[0]) != 0)
- goto error;
- }
-
- /* pass to libzfs */
- if (zfs_create(g_zfs, argv[0], type, props) != 0)
- goto error;
-
- if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
- goto error;
-
- ret = 0;
-
- /*
- * Mount and/or share the new filesystem as appropriate. We provide a
- * verbose error message to let the user know that their filesystem was
- * in fact created, even if we failed to mount or share it.
- * If the user doesn't want the dataset automatically mounted,
- * then skip the mount/share step altogether.
- */
- if (!nomount && should_auto_mount(zhp)) {
- if (zfs_mount(zhp, NULL, 0) != 0) {
- (void) fprintf(stderr, gettext("filesystem "
- "successfully created, but not mounted\n"));
- ret = 1;
- } else if (zfs_share(zhp) != 0) {
- (void) fprintf(stderr, gettext("filesystem "
- "successfully created, but not shared\n"));
- ret = 1;
- }
- }
-
-error:
- if (zhp)
- zfs_close(zhp);
- nvlist_free(props);
- return (ret);
-badusage:
- nvlist_free(props);
- usage(B_FALSE);
- return (2);
-}
-
-/*
- * zfs destroy [-rRf] <fs, vol>
- * zfs destroy [-rRd] <snap>
- *
- * -r Recursively destroy all children
- * -R Recursively destroy all dependents, including clones
- * -f Force unmounting of any dependents
- * -d If we can't destroy now, mark for deferred destruction
- *
- * Destroys the given dataset. By default, it will unmount any filesystems,
- * and refuse to destroy a dataset that has any dependents. A dependent can
- * either be a child, or a clone of a child.
- */
-typedef struct destroy_cbdata {
- boolean_t cb_first;
- boolean_t cb_force;
- boolean_t cb_recurse;
- boolean_t cb_error;
- boolean_t cb_doclones;
- zfs_handle_t *cb_target;
- boolean_t cb_defer_destroy;
- boolean_t cb_verbose;
- boolean_t cb_parsable;
- boolean_t cb_dryrun;
- nvlist_t *cb_nvl;
- nvlist_t *cb_batchedsnaps;
-
- /* first snap in contiguous run */
- char *cb_firstsnap;
- /* previous snap in contiguous run */
- char *cb_prevsnap;
- int64_t cb_snapused;
- char *cb_snapspec;
- char *cb_bookmark;
-} destroy_cbdata_t;
-
-/*
- * Check for any dependents based on the '-r' or '-R' flags.
- */
-static int
-destroy_check_dependent(zfs_handle_t *zhp, void *data)
-{
- destroy_cbdata_t *cbp = data;
- const char *tname = zfs_get_name(cbp->cb_target);
- const char *name = zfs_get_name(zhp);
-
- if (strncmp(tname, name, strlen(tname)) == 0 &&
- (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
- /*
- * This is a direct descendant, not a clone somewhere else in
- * the hierarchy.
- */
- if (cbp->cb_recurse)
- goto out;
-
- if (cbp->cb_first) {
- (void) fprintf(stderr, gettext("cannot destroy '%s': "
- "%s has children\n"),
- zfs_get_name(cbp->cb_target),
- zfs_type_to_name(zfs_get_type(cbp->cb_target)));
- (void) fprintf(stderr, gettext("use '-r' to destroy "
- "the following datasets:\n"));
- cbp->cb_first = B_FALSE;
- cbp->cb_error = B_TRUE;
- }
-
- (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
- } else {
- /*
- * This is a clone. We only want to report this if the '-r'
- * wasn't specified, or the target is a snapshot.
- */
- if (!cbp->cb_recurse &&
- zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
- goto out;
-
- if (cbp->cb_first) {
- (void) fprintf(stderr, gettext("cannot destroy '%s': "
- "%s has dependent clones\n"),
- zfs_get_name(cbp->cb_target),
- zfs_type_to_name(zfs_get_type(cbp->cb_target)));
- (void) fprintf(stderr, gettext("use '-R' to destroy "
- "the following datasets:\n"));
- cbp->cb_first = B_FALSE;
- cbp->cb_error = B_TRUE;
- cbp->cb_dryrun = B_TRUE;
- }
-
- (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
- }
-
-out:
- zfs_close(zhp);
- return (0);
-}
-
-static int
-destroy_callback(zfs_handle_t *zhp, void *data)
-{
- destroy_cbdata_t *cb = data;
- const char *name = zfs_get_name(zhp);
-
- if (cb->cb_verbose) {
- if (cb->cb_parsable) {
- (void) printf("destroy\t%s\n", name);
- } else if (cb->cb_dryrun) {
- (void) printf(gettext("would destroy %s\n"),
- name);
- } else {
- (void) printf(gettext("will destroy %s\n"),
- name);
- }
- }
-
- /*
- * Ignore pools (which we've already flagged as an error before getting
- * here).
- */
- if (strchr(zfs_get_name(zhp), '/') == NULL &&
- zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
- zfs_close(zhp);
- return (0);
- }
- if (cb->cb_dryrun) {
- zfs_close(zhp);
- return (0);
- }
-
- /*
- * We batch up all contiguous snapshots (even of different
- * filesystems) and destroy them with one ioctl. We can't
- * simply do all snap deletions and then all fs deletions,
- * because we must delete a clone before its origin.
- */
- if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
- fnvlist_add_boolean(cb->cb_batchedsnaps, name);
- } else {
- int error = zfs_destroy_snaps_nvl(g_zfs,
- cb->cb_batchedsnaps, B_FALSE);
- fnvlist_free(cb->cb_batchedsnaps);
- cb->cb_batchedsnaps = fnvlist_alloc();
-
- if (error != 0 ||
- zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
- zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
- zfs_close(zhp);
- return (-1);
- }
- }
-
- zfs_close(zhp);
- return (0);
-}
-
-static int
-destroy_print_cb(zfs_handle_t *zhp, void *arg)
-{
- destroy_cbdata_t *cb = arg;
- const char *name = zfs_get_name(zhp);
- int err = 0;
-
- if (nvlist_exists(cb->cb_nvl, name)) {
- if (cb->cb_firstsnap == NULL)
- cb->cb_firstsnap = strdup(name);
- if (cb->cb_prevsnap != NULL)
- free(cb->cb_prevsnap);
- /* this snap continues the current range */
- cb->cb_prevsnap = strdup(name);
- if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
- nomem();
- if (cb->cb_verbose) {
- if (cb->cb_parsable) {
- (void) printf("destroy\t%s\n", name);
- } else if (cb->cb_dryrun) {
- (void) printf(gettext("would destroy %s\n"),
- name);
- } else {
- (void) printf(gettext("will destroy %s\n"),
- name);
- }
- }
- } else if (cb->cb_firstsnap != NULL) {
- /* end of this range */
- uint64_t used = 0;
- err = lzc_snaprange_space(cb->cb_firstsnap,
- cb->cb_prevsnap, &used);
- cb->cb_snapused += used;
- free(cb->cb_firstsnap);
- cb->cb_firstsnap = NULL;
- free(cb->cb_prevsnap);
- cb->cb_prevsnap = NULL;
- }
- zfs_close(zhp);
- return (err);
-}
-
-static int
-destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
-{
- int err = 0;
- assert(cb->cb_firstsnap == NULL);
- assert(cb->cb_prevsnap == NULL);
- err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0);
- if (cb->cb_firstsnap != NULL) {
- uint64_t used = 0;
- if (err == 0) {
- err = lzc_snaprange_space(cb->cb_firstsnap,
- cb->cb_prevsnap, &used);
- }
- cb->cb_snapused += used;
- free(cb->cb_firstsnap);
- cb->cb_firstsnap = NULL;
- free(cb->cb_prevsnap);
- cb->cb_prevsnap = NULL;
- }
- return (err);
-}
-
-static int
-snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
-{
- destroy_cbdata_t *cb = arg;
- int err = 0;
-
- /* Check for clones. */
- if (!cb->cb_doclones && !cb->cb_defer_destroy) {
- cb->cb_target = zhp;
- cb->cb_first = B_TRUE;
- err = zfs_iter_dependents(zhp, B_TRUE,
- destroy_check_dependent, cb);
- }
-
- if (err == 0) {
- if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)))
- nomem();
- }
- zfs_close(zhp);
- return (err);
-}
-
-static int
-gather_snapshots(zfs_handle_t *zhp, void *arg)
-{
- destroy_cbdata_t *cb = arg;
- int err = 0;
-
- err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
- if (err == ENOENT)
- err = 0;
- if (err != 0)
- goto out;
-
- if (cb->cb_verbose) {
- err = destroy_print_snapshots(zhp, cb);
- if (err != 0)
- goto out;
- }
-
- if (cb->cb_recurse)
- err = zfs_iter_filesystems(zhp, gather_snapshots, cb);
-
-out:
- zfs_close(zhp);
- return (err);
-}
-
-static int
-destroy_clones(destroy_cbdata_t *cb)
-{
- nvpair_t *pair;
- for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL);
- pair != NULL;
- pair = nvlist_next_nvpair(cb->cb_nvl, pair)) {
- zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair),
- ZFS_TYPE_SNAPSHOT);
- if (zhp != NULL) {
- boolean_t defer = cb->cb_defer_destroy;
- int err = 0;
-
- /*
- * We can't defer destroy non-snapshots, so set it to
- * false while destroying the clones.
- */
- cb->cb_defer_destroy = B_FALSE;
- err = zfs_iter_dependents(zhp, B_FALSE,
- destroy_callback, cb);
- cb->cb_defer_destroy = defer;
- zfs_close(zhp);
- if (err != 0)
- return (err);
- }
- }
- return (0);
-}
-
-static int
-zfs_do_destroy(int argc, char **argv)
-{
- destroy_cbdata_t cb = { 0 };
- int rv = 0;
- int err = 0;
- int c;
- zfs_handle_t *zhp = NULL;
- char *at, *pound;
- zfs_type_t type = ZFS_TYPE_DATASET;
-
- /* check options */
- while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
- switch (c) {
- case 'v':
- cb.cb_verbose = B_TRUE;
- break;
- case 'p':
- cb.cb_verbose = B_TRUE;
- cb.cb_parsable = B_TRUE;
- break;
- case 'n':
- cb.cb_dryrun = B_TRUE;
- break;
- case 'd':
- cb.cb_defer_destroy = B_TRUE;
- type = ZFS_TYPE_SNAPSHOT;
- break;
- case 'f':
- cb.cb_force = B_TRUE;
- break;
- case 'r':
- cb.cb_recurse = B_TRUE;
- break;
- case 'R':
- cb.cb_recurse = B_TRUE;
- cb.cb_doclones = B_TRUE;
- break;
- case '?':
- default:
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc == 0) {
- (void) fprintf(stderr, gettext("missing dataset argument\n"));
- usage(B_FALSE);
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- at = strchr(argv[0], '@');
- pound = strchr(argv[0], '#');
- if (at != NULL) {
-
- /* Build the list of snaps to destroy in cb_nvl. */
- cb.cb_nvl = fnvlist_alloc();
-
- *at = '\0';
- zhp = zfs_open(g_zfs, argv[0],
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- return (1);
-
- cb.cb_snapspec = at + 1;
- if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
- cb.cb_error) {
- rv = 1;
- goto out;
- }
-
- if (nvlist_empty(cb.cb_nvl)) {
- (void) fprintf(stderr, gettext("could not find any "
- "snapshots to destroy; check snapshot names.\n"));
- rv = 1;
- goto out;
- }
-
- if (cb.cb_verbose) {
- char buf[16];
- zfs_nicenum(cb.cb_snapused, buf, sizeof (buf));
- if (cb.cb_parsable) {
- (void) printf("reclaim\t%llu\n",
- cb.cb_snapused);
- } else if (cb.cb_dryrun) {
- (void) printf(gettext("would reclaim %s\n"),
- buf);
- } else {
- (void) printf(gettext("will reclaim %s\n"),
- buf);
- }
- }
-
- if (!cb.cb_dryrun) {
- if (cb.cb_doclones) {
- cb.cb_batchedsnaps = fnvlist_alloc();
- err = destroy_clones(&cb);
- if (err == 0) {
- err = zfs_destroy_snaps_nvl(g_zfs,
- cb.cb_batchedsnaps, B_FALSE);
- }
- if (err != 0) {
- rv = 1;
- goto out;
- }
- }
- if (err == 0) {
- err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
- cb.cb_defer_destroy);
- }
- }
-
- if (err != 0)
- rv = 1;
- } else if (pound != NULL) {
- int err;
- nvlist_t *nvl;
-
- if (cb.cb_dryrun) {
- (void) fprintf(stderr,
- "dryrun is not supported with bookmark\n");
- return (-1);
- }
-
- if (cb.cb_defer_destroy) {
- (void) fprintf(stderr,
- "defer destroy is not supported with bookmark\n");
- return (-1);
- }
-
- if (cb.cb_recurse) {
- (void) fprintf(stderr,
- "recursive is not supported with bookmark\n");
- return (-1);
- }
-
- if (!zfs_bookmark_exists(argv[0])) {
- (void) fprintf(stderr, gettext("bookmark '%s' "
- "does not exist.\n"), argv[0]);
- return (1);
- }
-
- nvl = fnvlist_alloc();
- fnvlist_add_boolean(nvl, argv[0]);
-
- err = lzc_destroy_bookmarks(nvl, NULL);
- if (err != 0) {
- (void) zfs_standard_error(g_zfs, err,
- "cannot destroy bookmark");
- }
-
- nvlist_free(cb.cb_nvl);
-
- return (err);
- } else {
- /* Open the given dataset */
- if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
- return (1);
-
- cb.cb_target = zhp;
-
- /*
- * Perform an explicit check for pools before going any further.
- */
- if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
- zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
- (void) fprintf(stderr, gettext("cannot destroy '%s': "
- "operation does not apply to pools\n"),
- zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use 'zfs destroy -r "
- "%s' to destroy all datasets in the pool\n"),
- zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use 'zpool destroy %s' "
- "to destroy the pool itself\n"), zfs_get_name(zhp));
- rv = 1;
- goto out;
- }
-
- /*
- * Check for any dependents and/or clones.
- */
- cb.cb_first = B_TRUE;
- if (!cb.cb_doclones &&
- zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
- &cb) != 0) {
- rv = 1;
- goto out;
- }
-
- if (cb.cb_error) {
- rv = 1;
- goto out;
- }
-
- cb.cb_batchedsnaps = fnvlist_alloc();
- if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
- &cb) != 0) {
- rv = 1;
- goto out;
- }
-
- /*
- * Do the real thing. The callback will close the
- * handle regardless of whether it succeeds or not.
- */
- err = destroy_callback(zhp, &cb);
- zhp = NULL;
- if (err == 0) {
- err = zfs_destroy_snaps_nvl(g_zfs,
- cb.cb_batchedsnaps, cb.cb_defer_destroy);
- }
- if (err != 0)
- rv = 1;
- }
-
-out:
- fnvlist_free(cb.cb_batchedsnaps);
- fnvlist_free(cb.cb_nvl);
- if (zhp != NULL)
- zfs_close(zhp);
- return (rv);
-}
-
-static boolean_t
-is_recvd_column(zprop_get_cbdata_t *cbp)
-{
- int i;
- zfs_get_column_t col;
-
- for (i = 0; i < ZFS_GET_NCOLS &&
- (col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
- if (col == GET_COL_RECVD)
- return (B_TRUE);
- return (B_FALSE);
-}
-
-/*
- * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
- * < all | property[,property]... > < fs | snap | vol > ...
- *
- * -r recurse over any child datasets
- * -H scripted mode. Headers are stripped, and fields are separated
- * by tabs instead of spaces.
- * -o Set of fields to display. One of "name,property,value,
- * received,source". Default is "name,property,value,source".
- * "all" is an alias for all five.
- * -s Set of sources to allow. One of
- * "local,default,inherited,received,temporary,none". Default is
- * all six.
- * -p Display values in parsable (literal) format.
- *
- * Prints properties for the given datasets. The user can control which
- * columns to display as well as which property types to allow.
- */
-
-/*
- * Invoked to display the properties for a single dataset.
- */
-static int
-get_callback(zfs_handle_t *zhp, void *data)
-{
- char buf[ZFS_MAXPROPLEN];
- char rbuf[ZFS_MAXPROPLEN];
- zprop_source_t sourcetype;
- char source[ZFS_MAX_DATASET_NAME_LEN];
- zprop_get_cbdata_t *cbp = data;
- nvlist_t *user_props = zfs_get_user_props(zhp);
- zprop_list_t *pl = cbp->cb_proplist;
- nvlist_t *propval;
- char *strval;
- char *sourceval;
- boolean_t received = is_recvd_column(cbp);
-
- for (; pl != NULL; pl = pl->pl_next) {
- char *recvdval = NULL;
- /*
- * Skip the special fake placeholder. This will also skip over
- * the name property when 'all' is specified.
- */
- if (pl->pl_prop == ZFS_PROP_NAME &&
- pl == cbp->cb_proplist)
- continue;
-
- if (pl->pl_prop != ZPROP_INVAL) {
- if (zfs_prop_get(zhp, pl->pl_prop, buf,
- sizeof (buf), &sourcetype, source,
- sizeof (source),
- cbp->cb_literal) != 0) {
- if (pl->pl_all)
- continue;
- if (!zfs_prop_valid_for_type(pl->pl_prop,
- ZFS_TYPE_DATASET)) {
- (void) fprintf(stderr,
- gettext("No such property '%s'\n"),
- zfs_prop_to_name(pl->pl_prop));
- continue;
- }
- sourcetype = ZPROP_SRC_NONE;
- (void) strlcpy(buf, "-", sizeof (buf));
- }
-
- if (received && (zfs_prop_get_recvd(zhp,
- zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
- cbp->cb_literal) == 0))
- recvdval = rbuf;
-
- zprop_print_one_property(zfs_get_name(zhp), cbp,
- zfs_prop_to_name(pl->pl_prop),
- buf, sourcetype, source, recvdval);
- } else if (zfs_prop_userquota(pl->pl_user_prop)) {
- sourcetype = ZPROP_SRC_LOCAL;
-
- if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
- buf, sizeof (buf), cbp->cb_literal) != 0) {
- sourcetype = ZPROP_SRC_NONE;
- (void) strlcpy(buf, "-", sizeof (buf));
- }
-
- zprop_print_one_property(zfs_get_name(zhp), cbp,
- pl->pl_user_prop, buf, sourcetype, source, NULL);
- } else if (zfs_prop_written(pl->pl_user_prop)) {
- sourcetype = ZPROP_SRC_LOCAL;
-
- if (zfs_prop_get_written(zhp, pl->pl_user_prop,
- buf, sizeof (buf), cbp->cb_literal) != 0) {
- sourcetype = ZPROP_SRC_NONE;
- (void) strlcpy(buf, "-", sizeof (buf));
- }
-
- zprop_print_one_property(zfs_get_name(zhp), cbp,
- pl->pl_user_prop, buf, sourcetype, source, NULL);
- } else {
- if (nvlist_lookup_nvlist(user_props,
- pl->pl_user_prop, &propval) != 0) {
- if (pl->pl_all)
- continue;
- sourcetype = ZPROP_SRC_NONE;
- strval = "-";
- } else {
- verify(nvlist_lookup_string(propval,
- ZPROP_VALUE, &strval) == 0);
- verify(nvlist_lookup_string(propval,
- ZPROP_SOURCE, &sourceval) == 0);
-
- if (strcmp(sourceval,
- zfs_get_name(zhp)) == 0) {
- sourcetype = ZPROP_SRC_LOCAL;
- } else if (strcmp(sourceval,
- ZPROP_SOURCE_VAL_RECVD) == 0) {
- sourcetype = ZPROP_SRC_RECEIVED;
- } else {
- sourcetype = ZPROP_SRC_INHERITED;
- (void) strlcpy(source,
- sourceval, sizeof (source));
- }
- }
-
- if (received && (zfs_prop_get_recvd(zhp,
- pl->pl_user_prop, rbuf, sizeof (rbuf),
- cbp->cb_literal) == 0))
- recvdval = rbuf;
-
- zprop_print_one_property(zfs_get_name(zhp), cbp,
- pl->pl_user_prop, strval, sourcetype,
- source, recvdval);
- }
- }
-
- return (0);
-}
-
-static int
-zfs_do_get(int argc, char **argv)
-{
- zprop_get_cbdata_t cb = { 0 };
- int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
- int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK;
- char *value, *fields;
- int ret = 0;
- int limit = 0;
- zprop_list_t fake_name = { 0 };
-
- /*
- * Set up default columns and sources.
- */
- cb.cb_sources = ZPROP_SRC_ALL;
- cb.cb_columns[0] = GET_COL_NAME;
- cb.cb_columns[1] = GET_COL_PROPERTY;
- cb.cb_columns[2] = GET_COL_VALUE;
- cb.cb_columns[3] = GET_COL_SOURCE;
- cb.cb_type = ZFS_TYPE_DATASET;
-
- /* check options */
- while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
- switch (c) {
- case 'p':
- cb.cb_literal = B_TRUE;
- break;
- case 'd':
- limit = parse_depth(optarg, &flags);
- break;
- case 'r':
- flags |= ZFS_ITER_RECURSE;
- break;
- case 'H':
- cb.cb_scripted = B_TRUE;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case 'o':
- /*
- * Process the set of columns to display. We zero out
- * the structure to give us a blank slate.
- */
- bzero(&cb.cb_columns, sizeof (cb.cb_columns));
- i = 0;
- while (*optarg != '\0') {
- static char *col_subopts[] =
- { "name", "property", "value", "received",
- "source", "all", NULL };
-
- if (i == ZFS_GET_NCOLS) {
- (void) fprintf(stderr, gettext("too "
- "many fields given to -o "
- "option\n"));
- usage(B_FALSE);
- }
-
- switch (getsubopt(&optarg, col_subopts,
- &value)) {
- case 0:
- cb.cb_columns[i++] = GET_COL_NAME;
- break;
- case 1:
- cb.cb_columns[i++] = GET_COL_PROPERTY;
- break;
- case 2:
- cb.cb_columns[i++] = GET_COL_VALUE;
- break;
- case 3:
- cb.cb_columns[i++] = GET_COL_RECVD;
- flags |= ZFS_ITER_RECVD_PROPS;
- break;
- case 4:
- cb.cb_columns[i++] = GET_COL_SOURCE;
- break;
- case 5:
- if (i > 0) {
- (void) fprintf(stderr,
- gettext("\"all\" conflicts "
- "with specific fields "
- "given to -o option\n"));
- usage(B_FALSE);
- }
- cb.cb_columns[0] = GET_COL_NAME;
- cb.cb_columns[1] = GET_COL_PROPERTY;
- cb.cb_columns[2] = GET_COL_VALUE;
- cb.cb_columns[3] = GET_COL_RECVD;
- cb.cb_columns[4] = GET_COL_SOURCE;
- flags |= ZFS_ITER_RECVD_PROPS;
- i = ZFS_GET_NCOLS;
- break;
- default:
- (void) fprintf(stderr,
- gettext("invalid column name "
- "'%s'\n"), suboptarg);
- usage(B_FALSE);
- }
- }
- break;
-
- case 's':
- cb.cb_sources = 0;
- while (*optarg != '\0') {
- static char *source_subopts[] = {
- "local", "default", "inherited",
- "received", "temporary", "none",
- NULL };
-
- switch (getsubopt(&optarg, source_subopts,
- &value)) {
- case 0:
- cb.cb_sources |= ZPROP_SRC_LOCAL;
- break;
- case 1:
- cb.cb_sources |= ZPROP_SRC_DEFAULT;
- break;
- case 2:
- cb.cb_sources |= ZPROP_SRC_INHERITED;
- break;
- case 3:
- cb.cb_sources |= ZPROP_SRC_RECEIVED;
- break;
- case 4:
- cb.cb_sources |= ZPROP_SRC_TEMPORARY;
- break;
- case 5:
- cb.cb_sources |= ZPROP_SRC_NONE;
- break;
- default:
- (void) fprintf(stderr,
- gettext("invalid source "
- "'%s'\n"), suboptarg);
- usage(B_FALSE);
- }
- }
- break;
-
- case 't':
- types = 0;
- flags &= ~ZFS_ITER_PROP_LISTSNAPS;
- while (*optarg != '\0') {
- static char *type_subopts[] = { "filesystem",
- "volume", "snapshot", "bookmark",
- "all", NULL };
-
- switch (getsubopt(&optarg, type_subopts,
- &value)) {
- case 0:
- types |= ZFS_TYPE_FILESYSTEM;
- break;
- case 1:
- types |= ZFS_TYPE_VOLUME;
- break;
- case 2:
- types |= ZFS_TYPE_SNAPSHOT;
- break;
- case 3:
- types |= ZFS_TYPE_BOOKMARK;
- break;
- case 4:
- types = ZFS_TYPE_DATASET |
- ZFS_TYPE_BOOKMARK;
- break;
-
- default:
- (void) fprintf(stderr,
- gettext("invalid type '%s'\n"),
- suboptarg);
- usage(B_FALSE);
- }
- }
- break;
-
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing property "
- "argument\n"));
- usage(B_FALSE);
- }
-
- fields = argv[0];
-
- if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
- != 0)
- usage(B_FALSE);
-
- argc--;
- argv++;
-
- /*
- * As part of zfs_expand_proplist(), we keep track of the maximum column
- * width for each property. For the 'NAME' (and 'SOURCE') columns, we
- * need to know the maximum name length. However, the user likely did
- * not specify 'name' as one of the properties to fetch, so we need to
- * make sure we always include at least this property for
- * print_get_headers() to work properly.
- */
- if (cb.cb_proplist != NULL) {
- fake_name.pl_prop = ZFS_PROP_NAME;
- fake_name.pl_width = strlen(gettext("NAME"));
- fake_name.pl_next = cb.cb_proplist;
- cb.cb_proplist = &fake_name;
- }
-
- cb.cb_first = B_TRUE;
-
- /* run for each object */
- ret = zfs_for_each(argc, argv, flags, types, NULL,
- &cb.cb_proplist, limit, get_callback, &cb);
-
- if (cb.cb_proplist == &fake_name)
- zprop_free_list(fake_name.pl_next);
- else
- zprop_free_list(cb.cb_proplist);
-
- return (ret);
-}
-
-/*
- * inherit [-rS] <property> <fs|vol> ...
- *
- * -r Recurse over all children
- * -S Revert to received value, if any
- *
- * For each dataset specified on the command line, inherit the given property
- * from its parent. Inheriting a property at the pool level will cause it to
- * use the default value. The '-r' flag will recurse over all children, and is
- * useful for setting a property on a hierarchy-wide basis, regardless of any
- * local modifications for each dataset.
- */
-
-typedef struct inherit_cbdata {
- const char *cb_propname;
- boolean_t cb_received;
-} inherit_cbdata_t;
-
-static int
-inherit_recurse_cb(zfs_handle_t *zhp, void *data)
-{
- inherit_cbdata_t *cb = data;
- zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
-
- /*
- * If we're doing it recursively, then ignore properties that
- * are not valid for this type of dataset.
- */
- if (prop != ZPROP_INVAL &&
- !zfs_prop_valid_for_type(prop, zfs_get_type(zhp)))
- return (0);
-
- return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
-}
-
-static int
-inherit_cb(zfs_handle_t *zhp, void *data)
-{
- inherit_cbdata_t *cb = data;
-
- return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
-}
-
-static int
-zfs_do_inherit(int argc, char **argv)
-{
- int c;
- zfs_prop_t prop;
- inherit_cbdata_t cb = { 0 };
- char *propname;
- int ret = 0;
- int flags = 0;
- boolean_t received = B_FALSE;
-
- /* check options */
- while ((c = getopt(argc, argv, "rS")) != -1) {
- switch (c) {
- case 'r':
- flags |= ZFS_ITER_RECURSE;
- break;
- case 'S':
- received = B_TRUE;
- break;
- case '?':
- default:
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing property argument\n"));
- usage(B_FALSE);
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing dataset argument\n"));
- usage(B_FALSE);
- }
-
- propname = argv[0];
- argc--;
- argv++;
-
- if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
- if (zfs_prop_readonly(prop)) {
- (void) fprintf(stderr, gettext(
- "%s property is read-only\n"),
- propname);
- return (1);
- }
- if (!zfs_prop_inheritable(prop) && !received) {
- (void) fprintf(stderr, gettext("'%s' property cannot "
- "be inherited\n"), propname);
- if (prop == ZFS_PROP_QUOTA ||
- prop == ZFS_PROP_RESERVATION ||
- prop == ZFS_PROP_REFQUOTA ||
- prop == ZFS_PROP_REFRESERVATION) {
- (void) fprintf(stderr, gettext("use 'zfs set "
- "%s=none' to clear\n"), propname);
- (void) fprintf(stderr, gettext("use 'zfs "
- "inherit -S %s' to revert to received "
- "value\n"), propname);
- }
- return (1);
- }
- if (received && (prop == ZFS_PROP_VOLSIZE ||
- prop == ZFS_PROP_VERSION)) {
- (void) fprintf(stderr, gettext("'%s' property cannot "
- "be reverted to a received value\n"), propname);
- return (1);
- }
- } else if (!zfs_prop_user(propname)) {
- (void) fprintf(stderr, gettext("invalid property '%s'\n"),
- propname);
- usage(B_FALSE);
- }
-
- cb.cb_propname = propname;
- cb.cb_received = received;
-
- if (flags & ZFS_ITER_RECURSE) {
- ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
- NULL, NULL, 0, inherit_recurse_cb, &cb);
- } else {
- ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
- NULL, NULL, 0, inherit_cb, &cb);
- }
-
- return (ret);
-}
-
-typedef struct upgrade_cbdata {
- uint64_t cb_numupgraded;
- uint64_t cb_numsamegraded;
- uint64_t cb_numfailed;
- uint64_t cb_version;
- boolean_t cb_newer;
- boolean_t cb_foundone;
- char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN];
-} upgrade_cbdata_t;
-
-static int
-same_pool(zfs_handle_t *zhp, const char *name)
-{
- int len1 = strcspn(name, "/@");
- const char *zhname = zfs_get_name(zhp);
- int len2 = strcspn(zhname, "/@");
-
- if (len1 != len2)
- return (B_FALSE);
- return (strncmp(name, zhname, len1) == 0);
-}
-
-static int
-upgrade_list_callback(zfs_handle_t *zhp, void *data)
-{
- upgrade_cbdata_t *cb = data;
- int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
-
- /* list if it's old/new */
- if ((!cb->cb_newer && version < ZPL_VERSION) ||
- (cb->cb_newer && version > ZPL_VERSION)) {
- char *str;
- if (cb->cb_newer) {
- str = gettext("The following filesystems are "
- "formatted using a newer software version and\n"
- "cannot be accessed on the current system.\n\n");
- } else {
- str = gettext("The following filesystems are "
- "out of date, and can be upgraded. After being\n"
- "upgraded, these filesystems (and any 'zfs send' "
- "streams generated from\n"
- "subsequent snapshots) will no longer be "
- "accessible by older software versions.\n\n");
- }
-
- if (!cb->cb_foundone) {
- (void) puts(str);
- (void) printf(gettext("VER FILESYSTEM\n"));
- (void) printf(gettext("--- ------------\n"));
- cb->cb_foundone = B_TRUE;
- }
-
- (void) printf("%2u %s\n", version, zfs_get_name(zhp));
- }
-
- return (0);
-}
-
-static int
-upgrade_set_callback(zfs_handle_t *zhp, void *data)
-{
- upgrade_cbdata_t *cb = data;
- int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
- int needed_spa_version;
- int spa_version;
-
- if (zfs_spa_version(zhp, &spa_version) < 0)
- return (-1);
-
- needed_spa_version = zfs_spa_version_map(cb->cb_version);
-
- if (needed_spa_version < 0)
- return (-1);
-
- if (spa_version < needed_spa_version) {
- /* can't upgrade */
- (void) printf(gettext("%s: can not be "
- "upgraded; the pool version needs to first "
- "be upgraded\nto version %d\n\n"),
- zfs_get_name(zhp), needed_spa_version);
- cb->cb_numfailed++;
- return (0);
- }
-
- /* upgrade */
- if (version < cb->cb_version) {
- char verstr[16];
- (void) snprintf(verstr, sizeof (verstr),
- "%llu", cb->cb_version);
- if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
- /*
- * If they did "zfs upgrade -a", then we could
- * be doing ioctls to different pools. We need
- * to log this history once to each pool, and bypass
- * the normal history logging that happens in main().
- */
- (void) zpool_log_history(g_zfs, history_str);
- log_history = B_FALSE;
- }
- if (zfs_prop_set(zhp, "version", verstr) == 0)
- cb->cb_numupgraded++;
- else
- cb->cb_numfailed++;
- (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp));
- } else if (version > cb->cb_version) {
- /* can't downgrade */
- (void) printf(gettext("%s: can not be downgraded; "
- "it is already at version %u\n"),
- zfs_get_name(zhp), version);
- cb->cb_numfailed++;
- } else {
- cb->cb_numsamegraded++;
- }
- return (0);
-}
-
-/*
- * zfs upgrade
- * zfs upgrade -v
- * zfs upgrade [-r] [-V <version>] <-a | filesystem>
- */
-static int
-zfs_do_upgrade(int argc, char **argv)
-{
- boolean_t all = B_FALSE;
- boolean_t showversions = B_FALSE;
- int ret = 0;
- upgrade_cbdata_t cb = { 0 };
- int c;
- int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
-
- /* check options */
- while ((c = getopt(argc, argv, "rvV:a")) != -1) {
- switch (c) {
- case 'r':
- flags |= ZFS_ITER_RECURSE;
- break;
- case 'v':
- showversions = B_TRUE;
- break;
- case 'V':
- if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
- optarg, &cb.cb_version) != 0) {
- (void) fprintf(stderr,
- gettext("invalid version %s\n"), optarg);
- usage(B_FALSE);
- }
- break;
- case 'a':
- all = B_TRUE;
- break;
- case '?':
- default:
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version))
- usage(B_FALSE);
- if (showversions && (flags & ZFS_ITER_RECURSE || all ||
- cb.cb_version || argc))
- usage(B_FALSE);
- if ((all || argc) && (showversions))
- usage(B_FALSE);
- if (all && argc)
- usage(B_FALSE);
-
- if (showversions) {
- /* Show info on available versions. */
- (void) printf(gettext("The following filesystem versions are "
- "supported:\n\n"));
- (void) printf(gettext("VER DESCRIPTION\n"));
- (void) printf("--- -----------------------------------------"
- "---------------\n");
- (void) printf(gettext(" 1 Initial ZFS filesystem version\n"));
- (void) printf(gettext(" 2 Enhanced directory entries\n"));
- (void) printf(gettext(" 3 Case insensitive and filesystem "
- "user identifier (FUID)\n"));
- (void) printf(gettext(" 4 userquota, groupquota "
- "properties\n"));
- (void) printf(gettext(" 5 System attributes\n"));
- (void) printf(gettext("\nFor more information on a particular "
- "version, including supported releases,\n"));
- (void) printf("see the ZFS Administration Guide.\n\n");
- ret = 0;
- } else if (argc || all) {
- /* Upgrade filesystems */
- if (cb.cb_version == 0)
- cb.cb_version = ZPL_VERSION;
- ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
- NULL, NULL, 0, upgrade_set_callback, &cb);
- (void) printf(gettext("%llu filesystems upgraded\n"),
- cb.cb_numupgraded);
- if (cb.cb_numsamegraded) {
- (void) printf(gettext("%llu filesystems already at "
- "this version\n"),
- cb.cb_numsamegraded);
- }
- if (cb.cb_numfailed != 0)
- ret = 1;
- } else {
- /* List old-version filesystems */
- boolean_t found;
- (void) printf(gettext("This system is currently running "
- "ZFS filesystem version %llu.\n\n"), ZPL_VERSION);
-
- flags |= ZFS_ITER_RECURSE;
- ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
- NULL, NULL, 0, upgrade_list_callback, &cb);
-
- found = cb.cb_foundone;
- cb.cb_foundone = B_FALSE;
- cb.cb_newer = B_TRUE;
-
- ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
- NULL, NULL, 0, upgrade_list_callback, &cb);
-
- if (!cb.cb_foundone && !found) {
- (void) printf(gettext("All filesystems are "
- "formatted with the current version.\n"));
- }
- }
-
- return (ret);
-}
-
-/*
- * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
- * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
- * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
- * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
- *
- * -H Scripted mode; elide headers and separate columns by tabs.
- * -i Translate SID to POSIX ID.
- * -n Print numeric ID instead of user/group name.
- * -o Control which fields to display.
- * -p Use exact (parsable) numeric output.
- * -s Specify sort columns, descending order.
- * -S Specify sort columns, ascending order.
- * -t Control which object types to display.
- *
- * Displays space consumed by, and quotas on, each user in the specified
- * filesystem or snapshot.
- */
-
-/* us_field_types, us_field_hdr and us_field_names should be kept in sync */
-enum us_field_types {
- USFIELD_TYPE,
- USFIELD_NAME,
- USFIELD_USED,
- USFIELD_QUOTA
-};
-static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" };
-static char *us_field_names[] = { "type", "name", "used", "quota" };
-#define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *))
-
-#define USTYPE_PSX_GRP (1 << 0)
-#define USTYPE_PSX_USR (1 << 1)
-#define USTYPE_SMB_GRP (1 << 2)
-#define USTYPE_SMB_USR (1 << 3)
-#define USTYPE_ALL \
- (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR)
-
-static int us_type_bits[] = {
- USTYPE_PSX_GRP,
- USTYPE_PSX_USR,
- USTYPE_SMB_GRP,
- USTYPE_SMB_USR,
- USTYPE_ALL
-};
-static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
- "smbuser", "all" };
-
-typedef struct us_node {
- nvlist_t *usn_nvl;
- uu_avl_node_t usn_avlnode;
- uu_list_node_t usn_listnode;
-} us_node_t;
-
-typedef struct us_cbdata {
- nvlist_t **cb_nvlp;
- uu_avl_pool_t *cb_avl_pool;
- uu_avl_t *cb_avl;
- boolean_t cb_numname;
- boolean_t cb_nicenum;
- boolean_t cb_sid2posix;
- zfs_userquota_prop_t cb_prop;
- zfs_sort_column_t *cb_sortcol;
- size_t cb_width[USFIELD_LAST];
-} us_cbdata_t;
-
-static boolean_t us_populated = B_FALSE;
-
-typedef struct {
- zfs_sort_column_t *si_sortcol;
- boolean_t si_numname;
-} us_sort_info_t;
-
-static int
-us_field_index(char *field)
-{
- int i;
-
- for (i = 0; i < USFIELD_LAST; i++) {
- if (strcmp(field, us_field_names[i]) == 0)
- return (i);
- }
-
- return (-1);
-}
-
-static int
-us_compare(const void *larg, const void *rarg, void *unused)
-{
- const us_node_t *l = larg;
- const us_node_t *r = rarg;
- us_sort_info_t *si = (us_sort_info_t *)unused;
- zfs_sort_column_t *sortcol = si->si_sortcol;
- boolean_t numname = si->si_numname;
- nvlist_t *lnvl = l->usn_nvl;
- nvlist_t *rnvl = r->usn_nvl;
- int rc = 0;
- boolean_t lvb, rvb;
-
- for (; sortcol != NULL; sortcol = sortcol->sc_next) {
- char *lvstr = "";
- char *rvstr = "";
- uint32_t lv32 = 0;
- uint32_t rv32 = 0;
- uint64_t lv64 = 0;
- uint64_t rv64 = 0;
- zfs_prop_t prop = sortcol->sc_prop;
- const char *propname = NULL;
- boolean_t reverse = sortcol->sc_reverse;
-
- switch (prop) {
- case ZFS_PROP_TYPE:
- propname = "type";
- (void) nvlist_lookup_uint32(lnvl, propname, &lv32);
- (void) nvlist_lookup_uint32(rnvl, propname, &rv32);
- if (rv32 != lv32)
- rc = (rv32 < lv32) ? 1 : -1;
- break;
- case ZFS_PROP_NAME:
- propname = "name";
- if (numname) {
-compare_nums:
- (void) nvlist_lookup_uint64(lnvl, propname,
- &lv64);
- (void) nvlist_lookup_uint64(rnvl, propname,
- &rv64);
- if (rv64 != lv64)
- rc = (rv64 < lv64) ? 1 : -1;
- } else {
- if ((nvlist_lookup_string(lnvl, propname,
- &lvstr) == ENOENT) ||
- (nvlist_lookup_string(rnvl, propname,
- &rvstr) == ENOENT)) {
- goto compare_nums;
- }
- rc = strcmp(lvstr, rvstr);
- }
- break;
- case ZFS_PROP_USED:
- case ZFS_PROP_QUOTA:
- if (!us_populated)
- break;
- if (prop == ZFS_PROP_USED)
- propname = "used";
- else
- propname = "quota";
- (void) nvlist_lookup_uint64(lnvl, propname, &lv64);
- (void) nvlist_lookup_uint64(rnvl, propname, &rv64);
- if (rv64 != lv64)
- rc = (rv64 < lv64) ? 1 : -1;
- break;
-
- default:
- break;
- }
-
- if (rc != 0) {
- if (rc < 0)
- return (reverse ? 1 : -1);
- else
- return (reverse ? -1 : 1);
- }
- }
-
- /*
- * If entries still seem to be the same, check if they are of the same
- * type (smbentity is added only if we are doing SID to POSIX ID
- * translation where we can have duplicate type/name combinations).
- */
- if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
- nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
- lvb != rvb)
- return (lvb < rvb ? -1 : 1);
-
- return (0);
-}
-
-static inline const char *
-us_type2str(unsigned field_type)
-{
- switch (field_type) {
- case USTYPE_PSX_USR:
- return ("POSIX User");
- case USTYPE_PSX_GRP:
- return ("POSIX Group");
- case USTYPE_SMB_USR:
- return ("SMB User");
- case USTYPE_SMB_GRP:
- return ("SMB Group");
- default:
- return ("Undefined");
- }
-}
-
-static int
-userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
-{
- us_cbdata_t *cb = (us_cbdata_t *)arg;
- zfs_userquota_prop_t prop = cb->cb_prop;
- char *name = NULL;
- char *propname;
- char sizebuf[32];
- us_node_t *node;
- uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
- uu_avl_t *avl = cb->cb_avl;
- uu_avl_index_t idx;
- nvlist_t *props;
- us_node_t *n;
- zfs_sort_column_t *sortcol = cb->cb_sortcol;
- unsigned type = 0;
- const char *typestr;
- size_t namelen;
- size_t typelen;
- size_t sizelen;
- int typeidx, nameidx, sizeidx;
- us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
- boolean_t smbentity = B_FALSE;
-
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
- nomem();
- node = safe_malloc(sizeof (us_node_t));
- uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
- node->usn_nvl = props;
-
- if (domain != NULL && domain[0] != '\0') {
- /* SMB */
- char sid[MAXNAMELEN + 32];
- uid_t id;
-#ifdef illumos
- int err;
- int flag = IDMAP_REQ_FLG_USE_CACHE;
-#endif
-
- smbentity = B_TRUE;
-
- (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
-
- if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
- type = USTYPE_SMB_GRP;
-#ifdef illumos
- err = sid_to_id(sid, B_FALSE, &id);
-#endif
- } else {
- type = USTYPE_SMB_USR;
-#ifdef illumos
- err = sid_to_id(sid, B_TRUE, &id);
-#endif
- }
-
-#ifdef illumos
- if (err == 0) {
- rid = id;
- if (!cb->cb_sid2posix) {
- if (type == USTYPE_SMB_USR) {
- (void) idmap_getwinnamebyuid(rid, flag,
- &name, NULL);
- } else {
- (void) idmap_getwinnamebygid(rid, flag,
- &name, NULL);
- }
- if (name == NULL)
- name = sid;
- }
- }
-#endif
- }
-
- if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
- /* POSIX or -i */
- if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
- type = USTYPE_PSX_GRP;
- if (!cb->cb_numname) {
- struct group *g;
-
- if ((g = getgrgid(rid)) != NULL)
- name = g->gr_name;
- }
- } else {
- type = USTYPE_PSX_USR;
- if (!cb->cb_numname) {
- struct passwd *p;
-
- if ((p = getpwuid(rid)) != NULL)
- name = p->pw_name;
- }
- }
- }
-
- /*
- * Make sure that the type/name combination is unique when doing
- * SID to POSIX ID translation (hence changing the type from SMB to
- * POSIX).
- */
- if (cb->cb_sid2posix &&
- nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
- nomem();
-
- /* Calculate/update width of TYPE field */
- typestr = us_type2str(type);
- typelen = strlen(gettext(typestr));
- typeidx = us_field_index("type");
- if (typelen > cb->cb_width[typeidx])
- cb->cb_width[typeidx] = typelen;
- if (nvlist_add_uint32(props, "type", type) != 0)
- nomem();
-
- /* Calculate/update width of NAME field */
- if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
- if (nvlist_add_uint64(props, "name", rid) != 0)
- nomem();
- namelen = snprintf(NULL, 0, "%u", rid);
- } else {
- if (nvlist_add_string(props, "name", name) != 0)
- nomem();
- namelen = strlen(name);
- }
- nameidx = us_field_index("name");
- if (namelen > cb->cb_width[nameidx])
- cb->cb_width[nameidx] = namelen;
-
- /*
- * Check if this type/name combination is in the list and update it;
- * otherwise add new node to the list.
- */
- if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
- uu_avl_insert(avl, node, idx);
- } else {
- nvlist_free(props);
- free(node);
- node = n;
- props = node->usn_nvl;
- }
-
- /* Calculate/update width of USED/QUOTA fields */
- if (cb->cb_nicenum)
- zfs_nicenum(space, sizebuf, sizeof (sizebuf));
- else
- (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", space);
- sizelen = strlen(sizebuf);
- if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) {
- propname = "used";
- if (!nvlist_exists(props, "quota"))
- (void) nvlist_add_uint64(props, "quota", 0);
- } else {
- propname = "quota";
- if (!nvlist_exists(props, "used"))
- (void) nvlist_add_uint64(props, "used", 0);
- }
- sizeidx = us_field_index(propname);
- if (sizelen > cb->cb_width[sizeidx])
- cb->cb_width[sizeidx] = sizelen;
-
- if (nvlist_add_uint64(props, propname, space) != 0)
- nomem();
-
- return (0);
-}
-
-static void
-print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
- size_t *width, us_node_t *node)
-{
- nvlist_t *nvl = node->usn_nvl;
- char valstr[MAXNAMELEN];
- boolean_t first = B_TRUE;
- int cfield = 0;
- int field;
- uint32_t ustype;
-
- /* Check type */
- (void) nvlist_lookup_uint32(nvl, "type", &ustype);
- if (!(ustype & types))
- return;
-
- while ((field = fields[cfield]) != USFIELD_LAST) {
- nvpair_t *nvp = NULL;
- data_type_t type;
- uint32_t val32;
- uint64_t val64;
- char *strval = NULL;
-
- while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
- if (strcmp(nvpair_name(nvp),
- us_field_names[field]) == 0)
- break;
- }
-
- type = nvpair_type(nvp);
- switch (type) {
- case DATA_TYPE_UINT32:
- (void) nvpair_value_uint32(nvp, &val32);
- break;
- case DATA_TYPE_UINT64:
- (void) nvpair_value_uint64(nvp, &val64);
- break;
- case DATA_TYPE_STRING:
- (void) nvpair_value_string(nvp, &strval);
- break;
- default:
- (void) fprintf(stderr, "invalid data type\n");
- }
-
- switch (field) {
- case USFIELD_TYPE:
- strval = (char *)us_type2str(val32);
- break;
- case USFIELD_NAME:
- if (type == DATA_TYPE_UINT64) {
- (void) sprintf(valstr, "%llu", val64);
- strval = valstr;
- }
- break;
- case USFIELD_USED:
- case USFIELD_QUOTA:
- if (type == DATA_TYPE_UINT64) {
- if (parsable) {
- (void) sprintf(valstr, "%llu", val64);
- } else {
- zfs_nicenum(val64, valstr,
- sizeof (valstr));
- }
- if (field == USFIELD_QUOTA &&
- strcmp(valstr, "0") == 0)
- strval = "none";
- else
- strval = valstr;
- }
- break;
- }
-
- if (!first) {
- if (scripted)
- (void) printf("\t");
- else
- (void) printf(" ");
- }
- if (scripted)
- (void) printf("%s", strval);
- else if (field == USFIELD_TYPE || field == USFIELD_NAME)
- (void) printf("%-*s", width[field], strval);
- else
- (void) printf("%*s", width[field], strval);
-
- first = B_FALSE;
- cfield++;
- }
-
- (void) printf("\n");
-}
-
-static void
-print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
- size_t *width, boolean_t rmnode, uu_avl_t *avl)
-{
- us_node_t *node;
- const char *col;
- int cfield = 0;
- int field;
-
- if (!scripted) {
- boolean_t first = B_TRUE;
-
- while ((field = fields[cfield]) != USFIELD_LAST) {
- col = gettext(us_field_hdr[field]);
- if (field == USFIELD_TYPE || field == USFIELD_NAME) {
- (void) printf(first ? "%-*s" : " %-*s",
- width[field], col);
- } else {
- (void) printf(first ? "%*s" : " %*s",
- width[field], col);
- }
- first = B_FALSE;
- cfield++;
- }
- (void) printf("\n");
- }
-
- for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
- print_us_node(scripted, parsable, fields, types, width, node);
- if (rmnode)
- nvlist_free(node->usn_nvl);
- }
-}
-
-static int
-zfs_do_userspace(int argc, char **argv)
-{
- zfs_handle_t *zhp;
- zfs_userquota_prop_t p;
-
- uu_avl_pool_t *avl_pool;
- uu_avl_t *avl_tree;
- uu_avl_walk_t *walk;
- char *delim;
- char deffields[] = "type,name,used,quota";
- char *ofield = NULL;
- char *tfield = NULL;
- int cfield = 0;
- int fields[256];
- int i;
- boolean_t scripted = B_FALSE;
- boolean_t prtnum = B_FALSE;
- boolean_t parsable = B_FALSE;
- boolean_t sid2posix = B_FALSE;
- int ret = 0;
- int c;
- zfs_sort_column_t *sortcol = NULL;
- int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
- us_cbdata_t cb;
- us_node_t *node;
- us_node_t *rmnode;
- uu_list_pool_t *listpool;
- uu_list_t *list;
- uu_avl_index_t idx = 0;
- uu_list_index_t idx2 = 0;
-
- if (argc < 2)
- usage(B_FALSE);
-
- if (strcmp(argv[0], "groupspace") == 0)
- /* Toggle default group types */
- types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
-
- while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
- switch (c) {
- case 'n':
- prtnum = B_TRUE;
- break;
- case 'H':
- scripted = B_TRUE;
- break;
- case 'p':
- parsable = B_TRUE;
- break;
- case 'o':
- ofield = optarg;
- break;
- case 's':
- case 'S':
- if (zfs_add_sort_column(&sortcol, optarg,
- c == 's' ? B_FALSE : B_TRUE) != 0) {
- (void) fprintf(stderr,
- gettext("invalid field '%s'\n"), optarg);
- usage(B_FALSE);
- }
- break;
- case 't':
- tfield = optarg;
- break;
- case 'i':
- sid2posix = B_TRUE;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing dataset name\n"));
- usage(B_FALSE);
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- /* Use default output fields if not specified using -o */
- if (ofield == NULL)
- ofield = deffields;
- do {
- if ((delim = strchr(ofield, ',')) != NULL)
- *delim = '\0';
- if ((fields[cfield++] = us_field_index(ofield)) == -1) {
- (void) fprintf(stderr, gettext("invalid type '%s' "
- "for -o option\n"), ofield);
- return (-1);
- }
- if (delim != NULL)
- ofield = delim + 1;
- } while (delim != NULL);
- fields[cfield] = USFIELD_LAST;
-
- /* Override output types (-t option) */
- if (tfield != NULL) {
- types = 0;
-
- do {
- boolean_t found = B_FALSE;
-
- if ((delim = strchr(tfield, ',')) != NULL)
- *delim = '\0';
- for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
- i++) {
- if (strcmp(tfield, us_type_names[i]) == 0) {
- found = B_TRUE;
- types |= us_type_bits[i];
- break;
- }
- }
- if (!found) {
- (void) fprintf(stderr, gettext("invalid type "
- "'%s' for -t option\n"), tfield);
- return (-1);
- }
- if (delim != NULL)
- tfield = delim + 1;
- } while (delim != NULL);
- }
-
- if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
- return (1);
-
- if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
- offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
- nomem();
- if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
- nomem();
-
- /* Always add default sorting columns */
- (void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
- (void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
-
- cb.cb_sortcol = sortcol;
- cb.cb_numname = prtnum;
- cb.cb_nicenum = !parsable;
- cb.cb_avl_pool = avl_pool;
- cb.cb_avl = avl_tree;
- cb.cb_sid2posix = sid2posix;
-
- for (i = 0; i < USFIELD_LAST; i++)
- cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
-
- for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
- if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) &&
- !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
- ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) &&
- !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))))
- continue;
- cb.cb_prop = p;
- if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0)
- return (ret);
- }
-
- /* Sort the list */
- if ((node = uu_avl_first(avl_tree)) == NULL)
- return (0);
-
- us_populated = B_TRUE;
-
- listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
- offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
- list = uu_list_create(listpool, NULL, UU_DEFAULT);
- uu_list_node_init(node, &node->usn_listnode, listpool);
-
- while (node != NULL) {
- rmnode = node;
- node = uu_avl_next(avl_tree, node);
- uu_avl_remove(avl_tree, rmnode);
- if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
- uu_list_insert(list, rmnode, idx2);
- }
-
- for (node = uu_list_first(list); node != NULL;
- node = uu_list_next(list, node)) {
- us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
-
- if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
- uu_avl_insert(avl_tree, node, idx);
- }
-
- uu_list_destroy(list);
- uu_list_pool_destroy(listpool);
-
- /* Print and free node nvlist memory */
- print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
- cb.cb_avl);
-
- zfs_free_sort_columns(sortcol);
-
- /* Clean up the AVL tree */
- if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
- nomem();
-
- while ((node = uu_avl_walk_next(walk)) != NULL) {
- uu_avl_remove(cb.cb_avl, node);
- free(node);
- }
-
- uu_avl_walk_end(walk);
- uu_avl_destroy(avl_tree);
- uu_avl_pool_destroy(avl_pool);
-
- return (ret);
-}
-
-/*
- * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ...
- * [-t type[,...]] [filesystem|volume|snapshot] ...
- *
- * -H Scripted mode; elide headers and separate columns by tabs.
- * -p Display values in parsable (literal) format.
- * -r Recurse over all children.
- * -d Limit recursion by depth.
- * -o Control which fields to display.
- * -s Specify sort columns, descending order.
- * -S Specify sort columns, ascending order.
- * -t Control which object types to display.
- *
- * When given no arguments, list all filesystems in the system.
- * Otherwise, list the specified datasets, optionally recursing down them if
- * '-r' is specified.
- */
-typedef struct list_cbdata {
- boolean_t cb_first;
- boolean_t cb_literal;
- boolean_t cb_scripted;
- zprop_list_t *cb_proplist;
-} list_cbdata_t;
-
-/*
- * Given a list of columns to display, output appropriate headers for each one.
- */
-static void
-print_header(list_cbdata_t *cb)
-{
- zprop_list_t *pl = cb->cb_proplist;
- char headerbuf[ZFS_MAXPROPLEN];
- const char *header;
- int i;
- boolean_t first = B_TRUE;
- boolean_t right_justify;
-
- for (; pl != NULL; pl = pl->pl_next) {
- if (!first) {
- (void) printf(" ");
- } else {
- first = B_FALSE;
- }
-
- right_justify = B_FALSE;
- if (pl->pl_prop != ZPROP_INVAL) {
- header = zfs_prop_column_name(pl->pl_prop);
- right_justify = zfs_prop_align_right(pl->pl_prop);
- } else {
- for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
- headerbuf[i] = toupper(pl->pl_user_prop[i]);
- headerbuf[i] = '\0';
- header = headerbuf;
- }
-
- if (pl->pl_next == NULL && !right_justify)
- (void) printf("%s", header);
- else if (right_justify)
- (void) printf("%*s", pl->pl_width, header);
- else
- (void) printf("%-*s", pl->pl_width, header);
- }
-
- (void) printf("\n");
-}
-
-/*
- * Given a dataset and a list of fields, print out all the properties according
- * to the described layout.
- */
-static void
-print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
-{
- zprop_list_t *pl = cb->cb_proplist;
- boolean_t first = B_TRUE;
- char property[ZFS_MAXPROPLEN];
- nvlist_t *userprops = zfs_get_user_props(zhp);
- nvlist_t *propval;
- char *propstr;
- boolean_t right_justify;
-
- for (; pl != NULL; pl = pl->pl_next) {
- if (!first) {
- if (cb->cb_scripted)
- (void) printf("\t");
- else
- (void) printf(" ");
- } else {
- first = B_FALSE;
- }
-
- if (pl->pl_prop == ZFS_PROP_NAME) {
- (void) strlcpy(property, zfs_get_name(zhp),
- sizeof (property));
- propstr = property;
- right_justify = zfs_prop_align_right(pl->pl_prop);
- } else if (pl->pl_prop != ZPROP_INVAL) {
- if (zfs_prop_get(zhp, pl->pl_prop, property,
- sizeof (property), NULL, NULL, 0,
- cb->cb_literal) != 0)
- propstr = "-";
- else
- propstr = property;
- right_justify = zfs_prop_align_right(pl->pl_prop);
- } else if (zfs_prop_userquota(pl->pl_user_prop)) {
- if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
- property, sizeof (property), cb->cb_literal) != 0)
- propstr = "-";
- else
- propstr = property;
- right_justify = B_TRUE;
- } else if (zfs_prop_written(pl->pl_user_prop)) {
- if (zfs_prop_get_written(zhp, pl->pl_user_prop,
- property, sizeof (property), cb->cb_literal) != 0)
- propstr = "-";
- else
- propstr = property;
- right_justify = B_TRUE;
- } else {
- if (nvlist_lookup_nvlist(userprops,
- pl->pl_user_prop, &propval) != 0)
- propstr = "-";
- else
- verify(nvlist_lookup_string(propval,
- ZPROP_VALUE, &propstr) == 0);
- right_justify = B_FALSE;
- }
-
- /*
- * If this is being called in scripted mode, or if this is the
- * last column and it is left-justified, don't include a width
- * format specifier.
- */
- if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
- (void) printf("%s", propstr);
- else if (right_justify)
- (void) printf("%*s", pl->pl_width, propstr);
- else
- (void) printf("%-*s", pl->pl_width, propstr);
- }
-
- (void) printf("\n");
-}
-
-/*
- * Generic callback function to list a dataset or snapshot.
- */
-static int
-list_callback(zfs_handle_t *zhp, void *data)
-{
- list_cbdata_t *cbp = data;
-
- if (cbp->cb_first) {
- if (!cbp->cb_scripted)
- print_header(cbp);
- cbp->cb_first = B_FALSE;
- }
-
- print_dataset(zhp, cbp);
-
- return (0);
-}
-
-static int
-zfs_do_list(int argc, char **argv)
-{
- int c;
- static char default_fields[] =
- "name,used,available,referenced,mountpoint";
- int types = ZFS_TYPE_DATASET;
- boolean_t types_specified = B_FALSE;
- char *fields = NULL;
- list_cbdata_t cb = { 0 };
- char *value;
- int limit = 0;
- int ret = 0;
- zfs_sort_column_t *sortcol = NULL;
- int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
-
- /* check options */
- while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
- switch (c) {
- case 'o':
- fields = optarg;
- break;
- case 'p':
- cb.cb_literal = B_TRUE;
- flags |= ZFS_ITER_LITERAL_PROPS;
- break;
- case 'd':
- limit = parse_depth(optarg, &flags);
- break;
- case 'r':
- flags |= ZFS_ITER_RECURSE;
- break;
- case 'H':
- cb.cb_scripted = B_TRUE;
- break;
- case 's':
- if (zfs_add_sort_column(&sortcol, optarg,
- B_FALSE) != 0) {
- (void) fprintf(stderr,
- gettext("invalid property '%s'\n"), optarg);
- usage(B_FALSE);
- }
- break;
- case 'S':
- if (zfs_add_sort_column(&sortcol, optarg,
- B_TRUE) != 0) {
- (void) fprintf(stderr,
- gettext("invalid property '%s'\n"), optarg);
- usage(B_FALSE);
- }
- break;
- case 't':
- types = 0;
- types_specified = B_TRUE;
- flags &= ~ZFS_ITER_PROP_LISTSNAPS;
- while (*optarg != '\0') {
- static char *type_subopts[] = { "filesystem",
- "volume", "snapshot", "snap", "bookmark",
- "all", NULL };
-
- switch (getsubopt(&optarg, type_subopts,
- &value)) {
- case 0:
- types |= ZFS_TYPE_FILESYSTEM;
- break;
- case 1:
- types |= ZFS_TYPE_VOLUME;
- break;
- case 2:
- case 3:
- types |= ZFS_TYPE_SNAPSHOT;
- break;
- case 4:
- types |= ZFS_TYPE_BOOKMARK;
- break;
- case 5:
- types = ZFS_TYPE_DATASET |
- ZFS_TYPE_BOOKMARK;
- break;
- default:
- (void) fprintf(stderr,
- gettext("invalid type '%s'\n"),
- suboptarg);
- usage(B_FALSE);
- }
- }
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (fields == NULL)
- fields = default_fields;
-
- /*
- * If we are only going to list snapshot names and sort by name,
- * then we can use faster version.
- */
- if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
- flags |= ZFS_ITER_SIMPLE;
-
- /*
- * If "-o space" and no types were specified, don't display snapshots.
- */
- if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
- types &= ~ZFS_TYPE_SNAPSHOT;
-
- /*
- * If the user specifies '-o all', the zprop_get_list() doesn't
- * normally include the name of the dataset. For 'zfs list', we always
- * want this property to be first.
- */
- if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
- != 0)
- usage(B_FALSE);
-
- cb.cb_first = B_TRUE;
-
- ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
- limit, list_callback, &cb);
-
- zprop_free_list(cb.cb_proplist);
- zfs_free_sort_columns(sortcol);
-
- if (ret == 0 && cb.cb_first && !cb.cb_scripted)
- (void) printf(gettext("no datasets available\n"));
-
- return (ret);
-}
-
-/*
- * zfs rename [-f] <fs | snap | vol> <fs | snap | vol>
- * zfs rename [-f] -p <fs | vol> <fs | vol>
- * zfs rename -r <snap> <snap>
- * zfs rename <bmark> <bmark>
- * zfs rename -u [-p] <fs> <fs>
- *
- * Renames the given dataset to another of the same type.
- *
- * The '-p' flag creates all the non-existing ancestors of the target first.
- */
-/* ARGSUSED */
-static int
-zfs_do_rename(int argc, char **argv)
-{
- zfs_handle_t *zhp;
- renameflags_t flags = { 0 };
- int c;
- int ret = 0;
- int types;
- boolean_t parents = B_FALSE;
- boolean_t bookmarks = B_FALSE;
- char *snapshot = NULL;
-
- /* check options */
- while ((c = getopt(argc, argv, "fpru")) != -1) {
- switch (c) {
- case 'p':
- parents = B_TRUE;
- break;
- case 'r':
- flags.recurse = B_TRUE;
- break;
- case 'u':
- flags.nounmount = B_TRUE;
- break;
- case 'f':
- flags.forceunmount = B_TRUE;
- break;
- case '?':
- default:
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing source dataset "
- "argument\n"));
- usage(B_FALSE);
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing target dataset "
- "argument\n"));
- usage(B_FALSE);
- }
- if (argc > 2) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- if (flags.recurse && parents) {
- (void) fprintf(stderr, gettext("-p and -r options are mutually "
- "exclusive\n"));
- usage(B_FALSE);
- }
-
- if (flags.recurse && strchr(argv[0], '@') == NULL) {
- (void) fprintf(stderr, gettext("source dataset for recursive "
- "rename must be a snapshot\n"));
- usage(B_FALSE);
- }
-
- if (flags.nounmount && parents) {
- (void) fprintf(stderr, gettext("-u and -p options are mutually "
- "exclusive\n"));
- usage(B_FALSE);
- }
-
- if (strchr(argv[0], '#') != NULL)
- bookmarks = B_TRUE;
-
- if (bookmarks && (flags.nounmount || flags.recurse ||
- flags.forceunmount || parents)) {
- (void) fprintf(stderr, gettext("options are not supported "
- "for renaming bookmarks\n"));
- usage(B_FALSE);
- }
-
- if (flags.nounmount)
- types = ZFS_TYPE_FILESYSTEM;
- else if (parents)
- types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
- else if (bookmarks)
- types = ZFS_TYPE_BOOKMARK;
- else
- types = ZFS_TYPE_DATASET;
-
- if (flags.recurse) {
- /*
- * When we do recursive rename we are fine when the given
- * snapshot for the given dataset doesn't exist - it can
- * still exists below.
- */
-
- snapshot = strchr(argv[0], '@');
- assert(snapshot != NULL);
- *snapshot = '\0';
- snapshot++;
- }
-
- if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
- return (1);
-
- /* If we were asked and the name looks good, try to create ancestors. */
- if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
- zfs_create_ancestors(g_zfs, argv[1]) != 0) {
- zfs_close(zhp);
- return (1);
- }
-
- ret = (zfs_rename(zhp, snapshot, argv[1], flags) != 0);
-
- zfs_close(zhp);
- return (ret);
-}
-
-/*
- * zfs promote <fs>
- *
- * Promotes the given clone fs to be the parent
- */
-/* ARGSUSED */
-static int
-zfs_do_promote(int argc, char **argv)
-{
- zfs_handle_t *zhp;
- int ret = 0;
-
- /* check options */
- if (argc > 1 && argv[1][0] == '-') {
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- argv[1][1]);
- usage(B_FALSE);
- }
-
- /* check number of arguments */
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing clone filesystem"
- " argument\n"));
- usage(B_FALSE);
- }
- if (argc > 2) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- return (1);
-
- ret = (zfs_promote(zhp) != 0);
-
-
- zfs_close(zhp);
- return (ret);
-}
-
-/*
- * zfs rollback [-rRf] <snapshot>
- *
- * -r Delete any intervening snapshots before doing rollback
- * -R Delete any snapshots and their clones
- * -f ignored for backwards compatability
- *
- * Given a filesystem, rollback to a specific snapshot, discarding any changes
- * since then and making it the active dataset. If more recent snapshots exist,
- * the command will complain unless the '-r' flag is given.
- */
-typedef struct rollback_cbdata {
- uint64_t cb_create;
- uint8_t cb_younger_ds_printed;
- boolean_t cb_first;
- int cb_doclones;
- char *cb_target;
- int cb_error;
- boolean_t cb_recurse;
-} rollback_cbdata_t;
-
-static int
-rollback_check_dependent(zfs_handle_t *zhp, void *data)
-{
- rollback_cbdata_t *cbp = data;
-
- if (cbp->cb_first && cbp->cb_recurse) {
- (void) fprintf(stderr, gettext("cannot rollback to "
- "'%s': clones of previous snapshots exist\n"),
- cbp->cb_target);
- (void) fprintf(stderr, gettext("use '-R' to "
- "force deletion of the following clones and "
- "dependents:\n"));
- cbp->cb_first = 0;
- cbp->cb_error = 1;
- }
-
- (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
-
- zfs_close(zhp);
- return (0);
-}
-
-/*
- * Report some snapshots/bookmarks more recent than the one specified.
- * Used when '-r' is not specified. We reuse this same callback for the
- * snapshot dependents - if 'cb_dependent' is set, then this is a
- * dependent and we should report it without checking the transaction group.
- */
-static int
-rollback_check(zfs_handle_t *zhp, void *data)
-{
- rollback_cbdata_t *cbp = data;
- /*
- * Max number of younger snapshots and/or bookmarks to display before
- * we stop the iteration.
- */
- const uint8_t max_younger = 32;
-
- if (cbp->cb_doclones) {
- zfs_close(zhp);
- return (0);
- }
-
- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
- if (cbp->cb_first && !cbp->cb_recurse) {
- (void) fprintf(stderr, gettext("cannot "
- "rollback to '%s': more recent snapshots "
- "or bookmarks exist\n"),
- cbp->cb_target);
- (void) fprintf(stderr, gettext("use '-r' to "
- "force deletion of the following "
- "snapshots and bookmarks:\n"));
- cbp->cb_first = 0;
- cbp->cb_error = 1;
- }
-
- if (cbp->cb_recurse) {
- if (zfs_iter_dependents(zhp, B_TRUE,
- rollback_check_dependent, cbp) != 0) {
- zfs_close(zhp);
- return (-1);
- }
- } else {
- (void) fprintf(stderr, "%s\n",
- zfs_get_name(zhp));
- cbp->cb_younger_ds_printed++;
- }
- }
- zfs_close(zhp);
-
- if (cbp->cb_younger_ds_printed == max_younger) {
- /*
- * This non-recursive rollback is going to fail due to the
- * presence of snapshots and/or bookmarks that are younger than
- * the rollback target.
- * We printed some of the offending objects, now we stop
- * zfs_iter_snapshot/bookmark iteration so we can fail fast and
- * avoid iterating over the rest of the younger objects
- */
- (void) fprintf(stderr, gettext("Output limited to %d "
- "snapshots/bookmarks\n"), max_younger);
- return (-1);
- }
- return (0);
-}
-
-static int
-zfs_do_rollback(int argc, char **argv)
-{
- int ret = 0;
- int c;
- boolean_t force = B_FALSE;
- rollback_cbdata_t cb = { 0 };
- zfs_handle_t *zhp, *snap;
- char parentname[ZFS_MAX_DATASET_NAME_LEN];
- char *delim;
- uint64_t min_txg = 0;
-
- /* check options */
- while ((c = getopt(argc, argv, "rRf")) != -1) {
- switch (c) {
- case 'r':
- cb.cb_recurse = 1;
- break;
- case 'R':
- cb.cb_recurse = 1;
- cb.cb_doclones = 1;
- break;
- case 'f':
- force = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing dataset argument\n"));
- usage(B_FALSE);
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- /* open the snapshot */
- if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
- return (1);
-
- /* open the parent dataset */
- (void) strlcpy(parentname, argv[0], sizeof (parentname));
- verify((delim = strrchr(parentname, '@')) != NULL);
- *delim = '\0';
- if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) {
- zfs_close(snap);
- return (1);
- }
-
- /*
- * Check for more recent snapshots and/or clones based on the presence
- * of '-r' and '-R'.
- */
- cb.cb_target = argv[0];
- cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
- cb.cb_first = B_TRUE;
- cb.cb_error = 0;
-
- if (cb.cb_create > 0)
- min_txg = cb.cb_create;
-
- if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb,
- min_txg, 0)) != 0)
- goto out;
- if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0)
- goto out;
-
- if ((ret = cb.cb_error) != 0)
- goto out;
-
- /*
- * Rollback parent to the given snapshot.
- */
- ret = zfs_rollback(zhp, snap, force);
-
-out:
- zfs_close(snap);
- zfs_close(zhp);
-
- if (ret == 0)
- return (0);
- else
- return (1);
-}
-
-/*
- * zfs set property=value ... { fs | snap | vol } ...
- *
- * Sets the given properties for all datasets specified on the command line.
- */
-
-static int
-set_callback(zfs_handle_t *zhp, void *data)
-{
- nvlist_t *props = data;
-
- if (zfs_prop_set_list(zhp, props) != 0) {
- switch (libzfs_errno(g_zfs)) {
- case EZFS_MOUNTFAILED:
- (void) fprintf(stderr, gettext("property may be set "
- "but unable to remount filesystem\n"));
- break;
- case EZFS_SHARENFSFAILED:
- (void) fprintf(stderr, gettext("property may be set "
- "but unable to reshare filesystem\n"));
- break;
- }
- return (1);
- }
- return (0);
-}
-
-static int
-zfs_do_set(int argc, char **argv)
-{
- nvlist_t *props = NULL;
- int ds_start = -1; /* argv idx of first dataset arg */
- int ret = 0;
-
- /* check for options */
- if (argc > 1 && argv[1][0] == '-') {
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- argv[1][1]);
- usage(B_FALSE);
- }
-
- /* check number of arguments */
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing arguments\n"));
- usage(B_FALSE);
- }
- if (argc < 3) {
- if (strchr(argv[1], '=') == NULL) {
- (void) fprintf(stderr, gettext("missing property=value "
- "argument(s)\n"));
- } else {
- (void) fprintf(stderr, gettext("missing dataset "
- "name(s)\n"));
- }
- usage(B_FALSE);
- }
-
- /* validate argument order: prop=val args followed by dataset args */
- for (int i = 1; i < argc; i++) {
- if (strchr(argv[i], '=') != NULL) {
- if (ds_start > 0) {
- /* out-of-order prop=val argument */
- (void) fprintf(stderr, gettext("invalid "
- "argument order\n"), i);
- usage(B_FALSE);
- }
- } else if (ds_start < 0) {
- ds_start = i;
- }
- }
- if (ds_start < 0) {
- (void) fprintf(stderr, gettext("missing dataset name(s)\n"));
- usage(B_FALSE);
- }
-
- /* Populate a list of property settings */
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
- nomem();
- for (int i = 1; i < ds_start; i++) {
- if ((ret = parseprop(props, argv[i])) != 0)
- goto error;
- }
-
- ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
- ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
-
-error:
- nvlist_free(props);
- return (ret);
-}
-
-typedef struct snap_cbdata {
- nvlist_t *sd_nvl;
- boolean_t sd_recursive;
- const char *sd_snapname;
-} snap_cbdata_t;
-
-static int
-zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
-{
- snap_cbdata_t *sd = arg;
- char *name;
- int rv = 0;
- int error;
-
- if (sd->sd_recursive &&
- zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
- zfs_close(zhp);
- return (0);
- }
-
- error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
- if (error == -1)
- nomem();
- fnvlist_add_boolean(sd->sd_nvl, name);
- free(name);
-
- if (sd->sd_recursive)
- rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
- zfs_close(zhp);
- return (rv);
-}
-
-/*
- * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
- *
- * Creates a snapshot with the given name. While functionally equivalent to
- * 'zfs create', it is a separate command to differentiate intent.
- */
-static int
-zfs_do_snapshot(int argc, char **argv)
-{
- int ret = 0;
- int c;
- nvlist_t *props;
- snap_cbdata_t sd = { 0 };
- boolean_t multiple_snaps = B_FALSE;
-
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
- nomem();
- if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
- nomem();
-
- /* check options */
- while ((c = getopt(argc, argv, "ro:")) != -1) {
- switch (c) {
- case 'o':
- if (parseprop(props, optarg) != 0)
- return (1);
- break;
- case 'r':
- sd.sd_recursive = B_TRUE;
- multiple_snaps = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- goto usage;
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing snapshot argument\n"));
- goto usage;
- }
-
- if (argc > 1)
- multiple_snaps = B_TRUE;
- for (; argc > 0; argc--, argv++) {
- char *atp;
- zfs_handle_t *zhp;
-
- atp = strchr(argv[0], '@');
- if (atp == NULL)
- goto usage;
- *atp = '\0';
- sd.sd_snapname = atp + 1;
- zhp = zfs_open(g_zfs, argv[0],
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- goto usage;
- if (zfs_snapshot_cb(zhp, &sd) != 0)
- goto usage;
- }
-
- ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
- nvlist_free(sd.sd_nvl);
- nvlist_free(props);
- if (ret != 0 && multiple_snaps)
- (void) fprintf(stderr, gettext("no snapshots were created\n"));
- return (ret != 0);
-
-usage:
- nvlist_free(sd.sd_nvl);
- nvlist_free(props);
- usage(B_FALSE);
- return (-1);
-}
-
-/*
- * Send a backup stream to stdout.
- */
-static int
-zfs_do_send(int argc, char **argv)
-{
- char *fromname = NULL;
- char *toname = NULL;
- char *resume_token = NULL;
- char *cp;
- zfs_handle_t *zhp;
- sendflags_t flags = { 0 };
- int c, err;
- nvlist_t *dbgnv = NULL;
- boolean_t extraverbose = B_FALSE;
-
- struct option long_options[] = {
- {"replicate", no_argument, NULL, 'R'},
- {"props", no_argument, NULL, 'p'},
- {"parsable", no_argument, NULL, 'P'},
- {"dedup", no_argument, NULL, 'D'},
- {"verbose", no_argument, NULL, 'v'},
- {"dryrun", no_argument, NULL, 'n'},
- {"large-block", no_argument, NULL, 'L'},
- {"embed", no_argument, NULL, 'e'},
- {"resume", required_argument, NULL, 't'},
- {"compressed", no_argument, NULL, 'c'},
- {0, 0, 0, 0}
- };
-
- /* check options */
- while ((c = getopt_long(argc, argv, ":i:I:RbDpVvnPLet:c", long_options,
- NULL)) != -1) {
- switch (c) {
- case 'i':
- if (fromname)
- usage(B_FALSE);
- fromname = optarg;
- break;
- case 'I':
- if (fromname)
- usage(B_FALSE);
- fromname = optarg;
- flags.doall = B_TRUE;
- break;
- case 'R':
- flags.replicate = B_TRUE;
- break;
- case 'p':
- flags.props = B_TRUE;
- break;
- case 'P':
- flags.parsable = B_TRUE;
- flags.verbose = B_TRUE;
- break;
- case 'V':
- flags.progress = B_TRUE;
- flags.progressastitle = B_TRUE;
- break;
- case 'v':
- if (flags.verbose)
- extraverbose = B_TRUE;
- flags.verbose = B_TRUE;
- flags.progress = B_TRUE;
- break;
- case 'D':
- flags.dedup = B_TRUE;
- break;
- case 'n':
- flags.dryrun = B_TRUE;
- break;
- case 'L':
- flags.largeblock = B_TRUE;
- break;
- case 'e':
- flags.embed_data = B_TRUE;
- break;
- case 't':
- resume_token = optarg;
- break;
- case 'c':
- flags.compress = B_TRUE;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- /*FALLTHROUGH*/
- default:
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (resume_token != NULL) {
- if (fromname != NULL || flags.replicate || flags.props ||
- flags.dedup) {
- (void) fprintf(stderr,
- gettext("invalid flags combined with -t\n"));
- usage(B_FALSE);
- }
- if (argc != 0) {
- (void) fprintf(stderr, gettext("no additional "
- "arguments are permitted with -t\n"));
- usage(B_FALSE);
- }
- } else {
- if (argc < 1) {
- (void) fprintf(stderr,
- gettext("missing snapshot argument\n"));
- usage(B_FALSE);
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
- }
-
- if (!flags.dryrun && isatty(STDOUT_FILENO)) {
- (void) fprintf(stderr,
- gettext("Error: Stream can not be written to a terminal.\n"
- "You must redirect standard output.\n"));
- return (1);
- }
-
- if (resume_token != NULL) {
- return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
- resume_token));
- }
-
- /*
- * Special case sending a filesystem, or from a bookmark.
- */
- if (strchr(argv[0], '@') == NULL ||
- (fromname && strchr(fromname, '#') != NULL)) {
- char frombuf[ZFS_MAX_DATASET_NAME_LEN];
-
- if (flags.replicate || flags.doall || flags.props ||
- flags.dedup || (strchr(argv[0], '@') == NULL &&
- (flags.dryrun || flags.verbose || flags.progress))) {
- (void) fprintf(stderr, gettext("Error: "
- "Unsupported flag with filesystem or bookmark.\n"));
- return (1);
- }
-
- zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
- if (zhp == NULL)
- return (1);
-
- if (fromname != NULL &&
- (fromname[0] == '#' || fromname[0] == '@')) {
- /*
- * Incremental source name begins with # or @.
- * Default to same fs as target.
- */
- (void) strncpy(frombuf, argv[0], sizeof (frombuf));
- cp = strchr(frombuf, '@');
- if (cp != NULL)
- *cp = '\0';
- (void) strlcat(frombuf, fromname, sizeof (frombuf));
- fromname = frombuf;
- }
- err = zfs_send_one(zhp, fromname, STDOUT_FILENO, flags);
- zfs_close(zhp);
- return (err != 0);
- }
-
- cp = strchr(argv[0], '@');
- *cp = '\0';
- toname = cp + 1;
- zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- return (1);
-
- /*
- * If they specified the full path to the snapshot, chop off
- * everything except the short name of the snapshot, but special
- * case if they specify the origin.
- */
- if (fromname && (cp = strchr(fromname, '@')) != NULL) {
- char origin[ZFS_MAX_DATASET_NAME_LEN];
- zprop_source_t src;
-
- (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
- origin, sizeof (origin), &src, NULL, 0, B_FALSE);
-
- if (strcmp(origin, fromname) == 0) {
- fromname = NULL;
- flags.fromorigin = B_TRUE;
- } else {
- *cp = '\0';
- if (cp != fromname && strcmp(argv[0], fromname)) {
- (void) fprintf(stderr,
- gettext("incremental source must be "
- "in same filesystem\n"));
- usage(B_FALSE);
- }
- fromname = cp + 1;
- if (strchr(fromname, '@') || strchr(fromname, '/')) {
- (void) fprintf(stderr,
- gettext("invalid incremental source\n"));
- usage(B_FALSE);
- }
- }
- }
-
- if (flags.replicate && fromname == NULL)
- flags.doall = B_TRUE;
-
- err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
- extraverbose ? &dbgnv : NULL);
-
- if (extraverbose && dbgnv != NULL) {
- /*
- * dump_nvlist prints to stdout, but that's been
- * redirected to a file. Make it print to stderr
- * instead.
- */
- (void) dup2(STDERR_FILENO, STDOUT_FILENO);
- dump_nvlist(dbgnv, 0);
- nvlist_free(dbgnv);
- }
- zfs_close(zhp);
-
- return (err != 0);
-}
-
-/*
- * Restore a backup stream from stdin.
- */
-static int
-zfs_do_receive(int argc, char **argv)
-{
- int c, err = 0;
- recvflags_t flags = { 0 };
- boolean_t abort_resumable = B_FALSE;
-
- nvlist_t *props;
- nvpair_t *nvp = NULL;
-
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
- nomem();
-
- /* check options */
- while ((c = getopt(argc, argv, ":o:denuvMFsA")) != -1) {
- switch (c) {
- case 'o':
- if (parseprop(props, optarg) != 0)
- return (1);
- break;
- case 'd':
- flags.isprefix = B_TRUE;
- break;
- case 'e':
- flags.isprefix = B_TRUE;
- flags.istail = B_TRUE;
- break;
- case 'n':
- flags.dryrun = B_TRUE;
- break;
- case 'u':
- flags.nomount = B_TRUE;
- break;
- case 'v':
- flags.verbose = B_TRUE;
- break;
- case 's':
- flags.resumable = B_TRUE;
- break;
- case 'F':
- flags.force = B_TRUE;
- break;
- case 'M':
- flags.forceunmount = B_TRUE;
- break;
- case 'A':
- abort_resumable = B_TRUE;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing snapshot argument\n"));
- usage(B_FALSE);
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- while ((nvp = nvlist_next_nvpair(props, nvp))) {
- if (strcmp(nvpair_name(nvp), "origin") != 0) {
- (void) fprintf(stderr, gettext("invalid option"));
- usage(B_FALSE);
- }
- }
-
- if (abort_resumable) {
- if (flags.isprefix || flags.istail || flags.dryrun ||
- flags.resumable || flags.nomount) {
- (void) fprintf(stderr, gettext("invalid option"));
- usage(B_FALSE);
- }
-
- char namebuf[ZFS_MAX_DATASET_NAME_LEN];
- (void) snprintf(namebuf, sizeof (namebuf),
- "%s/%%recv", argv[0]);
-
- if (zfs_dataset_exists(g_zfs, namebuf,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
- zfs_handle_t *zhp = zfs_open(g_zfs,
- namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- return (1);
- err = zfs_destroy(zhp, B_FALSE);
- } else {
- zfs_handle_t *zhp = zfs_open(g_zfs,
- argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- usage(B_FALSE);
- if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
- zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
- NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
- (void) fprintf(stderr,
- gettext("'%s' does not have any "
- "resumable receive state to abort\n"),
- argv[0]);
- return (1);
- }
- err = zfs_destroy(zhp, B_FALSE);
- }
-
- return (err != 0);
- }
-
- if (isatty(STDIN_FILENO)) {
- (void) fprintf(stderr,
- gettext("Error: Backup stream can not be read "
- "from a terminal.\n"
- "You must redirect standard input.\n"));
- return (1);
- }
- err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
-
- return (err != 0);
-}
-
-/*
- * allow/unallow stuff
- */
-/* copied from zfs/sys/dsl_deleg.h */
-#define ZFS_DELEG_PERM_CREATE "create"
-#define ZFS_DELEG_PERM_DESTROY "destroy"
-#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
-#define ZFS_DELEG_PERM_ROLLBACK "rollback"
-#define ZFS_DELEG_PERM_CLONE "clone"
-#define ZFS_DELEG_PERM_PROMOTE "promote"
-#define ZFS_DELEG_PERM_RENAME "rename"
-#define ZFS_DELEG_PERM_MOUNT "mount"
-#define ZFS_DELEG_PERM_SHARE "share"
-#define ZFS_DELEG_PERM_SEND "send"
-#define ZFS_DELEG_PERM_RECEIVE "receive"
-#define ZFS_DELEG_PERM_ALLOW "allow"
-#define ZFS_DELEG_PERM_USERPROP "userprop"
-#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */
-#define ZFS_DELEG_PERM_USERQUOTA "userquota"
-#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
-#define ZFS_DELEG_PERM_USERUSED "userused"
-#define ZFS_DELEG_PERM_GROUPUSED "groupused"
-#define ZFS_DELEG_PERM_HOLD "hold"
-#define ZFS_DELEG_PERM_RELEASE "release"
-#define ZFS_DELEG_PERM_DIFF "diff"
-#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
-#define ZFS_DELEG_PERM_REMAP "remap"
-
-#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
-
-static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
- { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
- { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
- { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
- { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
- { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
- { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
- { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
- { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
- { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
- { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
- { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
- { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
- { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
- { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
- { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
- { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
- { ZFS_DELEG_PERM_REMAP, ZFS_DELEG_NOTE_REMAP },
-
- { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
- { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
- { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
- { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
- { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
- { NULL, ZFS_DELEG_NOTE_NONE }
-};
-
-/* permission structure */
-typedef struct deleg_perm {
- zfs_deleg_who_type_t dp_who_type;
- const char *dp_name;
- boolean_t dp_local;
- boolean_t dp_descend;
-} deleg_perm_t;
-
-/* */
-typedef struct deleg_perm_node {
- deleg_perm_t dpn_perm;
-
- uu_avl_node_t dpn_avl_node;
-} deleg_perm_node_t;
-
-typedef struct fs_perm fs_perm_t;
-
-/* permissions set */
-typedef struct who_perm {
- zfs_deleg_who_type_t who_type;
- const char *who_name; /* id */
- char who_ug_name[256]; /* user/group name */
- fs_perm_t *who_fsperm; /* uplink */
-
- uu_avl_t *who_deleg_perm_avl; /* permissions */
-} who_perm_t;
-
-/* */
-typedef struct who_perm_node {
- who_perm_t who_perm;
- uu_avl_node_t who_avl_node;
-} who_perm_node_t;
-
-typedef struct fs_perm_set fs_perm_set_t;
-/* fs permissions */
-struct fs_perm {
- const char *fsp_name;
-
- uu_avl_t *fsp_sc_avl; /* sets,create */
- uu_avl_t *fsp_uge_avl; /* user,group,everyone */
-
- fs_perm_set_t *fsp_set; /* uplink */
-};
-
-/* */
-typedef struct fs_perm_node {
- fs_perm_t fspn_fsperm;
- uu_avl_t *fspn_avl;
-
- uu_list_node_t fspn_list_node;
-} fs_perm_node_t;
-
-/* top level structure */
-struct fs_perm_set {
- uu_list_pool_t *fsps_list_pool;
- uu_list_t *fsps_list; /* list of fs_perms */
-
- uu_avl_pool_t *fsps_named_set_avl_pool;
- uu_avl_pool_t *fsps_who_perm_avl_pool;
- uu_avl_pool_t *fsps_deleg_perm_avl_pool;
-};
-
-static inline const char *
-deleg_perm_type(zfs_deleg_note_t note)
-{
- /* subcommands */
- switch (note) {
- /* SUBCOMMANDS */
- /* OTHER */
- case ZFS_DELEG_NOTE_GROUPQUOTA:
- case ZFS_DELEG_NOTE_GROUPUSED:
- case ZFS_DELEG_NOTE_USERPROP:
- case ZFS_DELEG_NOTE_USERQUOTA:
- case ZFS_DELEG_NOTE_USERUSED:
- /* other */
- return (gettext("other"));
- default:
- return (gettext("subcommand"));
- }
-}
-
-static int
-who_type2weight(zfs_deleg_who_type_t who_type)
-{
- int res;
- switch (who_type) {
- case ZFS_DELEG_NAMED_SET_SETS:
- case ZFS_DELEG_NAMED_SET:
- res = 0;
- break;
- case ZFS_DELEG_CREATE_SETS:
- case ZFS_DELEG_CREATE:
- res = 1;
- break;
- case ZFS_DELEG_USER_SETS:
- case ZFS_DELEG_USER:
- res = 2;
- break;
- case ZFS_DELEG_GROUP_SETS:
- case ZFS_DELEG_GROUP:
- res = 3;
- break;
- case ZFS_DELEG_EVERYONE_SETS:
- case ZFS_DELEG_EVERYONE:
- res = 4;
- break;
- default:
- res = -1;
- }
-
- return (res);
-}
-
-/* ARGSUSED */
-static int
-who_perm_compare(const void *larg, const void *rarg, void *unused)
-{
- const who_perm_node_t *l = larg;
- const who_perm_node_t *r = rarg;
- zfs_deleg_who_type_t ltype = l->who_perm.who_type;
- zfs_deleg_who_type_t rtype = r->who_perm.who_type;
- int lweight = who_type2weight(ltype);
- int rweight = who_type2weight(rtype);
- int res = lweight - rweight;
- if (res == 0)
- res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
- ZFS_MAX_DELEG_NAME-1);
-
- if (res == 0)
- return (0);
- if (res > 0)
- return (1);
- else
- return (-1);
-}
-
-/* ARGSUSED */
-static int
-deleg_perm_compare(const void *larg, const void *rarg, void *unused)
-{
- const deleg_perm_node_t *l = larg;
- const deleg_perm_node_t *r = rarg;
- int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
- ZFS_MAX_DELEG_NAME-1);
-
- if (res == 0)
- return (0);
-
- if (res > 0)
- return (1);
- else
- return (-1);
-}
-
-static inline void
-fs_perm_set_init(fs_perm_set_t *fspset)
-{
- bzero(fspset, sizeof (fs_perm_set_t));
-
- if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
- sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
- NULL, UU_DEFAULT)) == NULL)
- nomem();
- if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
- UU_DEFAULT)) == NULL)
- nomem();
-
- if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
- "named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
- who_perm_node_t, who_avl_node), who_perm_compare,
- UU_DEFAULT)) == NULL)
- nomem();
-
- if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
- "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
- who_perm_node_t, who_avl_node), who_perm_compare,
- UU_DEFAULT)) == NULL)
- nomem();
-
- if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
- "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
- deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT))
- == NULL)
- nomem();
-}
-
-static inline void fs_perm_fini(fs_perm_t *);
-static inline void who_perm_fini(who_perm_t *);
-
-static inline void
-fs_perm_set_fini(fs_perm_set_t *fspset)
-{
- fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
-
- while (node != NULL) {
- fs_perm_node_t *next_node =
- uu_list_next(fspset->fsps_list, node);
- fs_perm_t *fsperm = &node->fspn_fsperm;
- fs_perm_fini(fsperm);
- uu_list_remove(fspset->fsps_list, node);
- free(node);
- node = next_node;
- }
-
- uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
- uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
- uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
-}
-
-static inline void
-deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
- const char *name)
-{
- deleg_perm->dp_who_type = type;
- deleg_perm->dp_name = name;
-}
-
-static inline void
-who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
- zfs_deleg_who_type_t type, const char *name)
-{
- uu_avl_pool_t *pool;
- pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
-
- bzero(who_perm, sizeof (who_perm_t));
-
- if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
- UU_DEFAULT)) == NULL)
- nomem();
-
- who_perm->who_type = type;
- who_perm->who_name = name;
- who_perm->who_fsperm = fsperm;
-}
-
-static inline void
-who_perm_fini(who_perm_t *who_perm)
-{
- deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl);
-
- while (node != NULL) {
- deleg_perm_node_t *next_node =
- uu_avl_next(who_perm->who_deleg_perm_avl, node);
-
- uu_avl_remove(who_perm->who_deleg_perm_avl, node);
- free(node);
- node = next_node;
- }
-
- uu_avl_destroy(who_perm->who_deleg_perm_avl);
-}
-
-static inline void
-fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
-{
- uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool;
- uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool;
-
- bzero(fsperm, sizeof (fs_perm_t));
-
- if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
- == NULL)
- nomem();
-
- if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
- == NULL)
- nomem();
-
- fsperm->fsp_set = fspset;
- fsperm->fsp_name = fsname;
-}
-
-static inline void
-fs_perm_fini(fs_perm_t *fsperm)
-{
- who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl);
- while (node != NULL) {
- who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl,
- node);
- who_perm_t *who_perm = &node->who_perm;
- who_perm_fini(who_perm);
- uu_avl_remove(fsperm->fsp_sc_avl, node);
- free(node);
- node = next_node;
- }
-
- node = uu_avl_first(fsperm->fsp_uge_avl);
- while (node != NULL) {
- who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl,
- node);
- who_perm_t *who_perm = &node->who_perm;
- who_perm_fini(who_perm);
- uu_avl_remove(fsperm->fsp_uge_avl, node);
- free(node);
- node = next_node;
- }
-
- uu_avl_destroy(fsperm->fsp_sc_avl);
- uu_avl_destroy(fsperm->fsp_uge_avl);
-}
-
-static void
-set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
- zfs_deleg_who_type_t who_type, const char *name, char locality)
-{
- uu_avl_index_t idx = 0;
-
- deleg_perm_node_t *found_node = NULL;
- deleg_perm_t *deleg_perm = &node->dpn_perm;
-
- deleg_perm_init(deleg_perm, who_type, name);
-
- if ((found_node = uu_avl_find(avl, node, NULL, &idx))
- == NULL)
- uu_avl_insert(avl, node, idx);
- else {
- node = found_node;
- deleg_perm = &node->dpn_perm;
- }
-
-
- switch (locality) {
- case ZFS_DELEG_LOCAL:
- deleg_perm->dp_local = B_TRUE;
- break;
- case ZFS_DELEG_DESCENDENT:
- deleg_perm->dp_descend = B_TRUE;
- break;
- case ZFS_DELEG_NA:
- break;
- default:
- assert(B_FALSE); /* invalid locality */
- }
-}
-
-static inline int
-parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
-{
- nvpair_t *nvp = NULL;
- fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
- uu_avl_t *avl = who_perm->who_deleg_perm_avl;
- zfs_deleg_who_type_t who_type = who_perm->who_type;
-
- while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
- const char *name = nvpair_name(nvp);
- data_type_t type = nvpair_type(nvp);
- uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
- deleg_perm_node_t *node =
- safe_malloc(sizeof (deleg_perm_node_t));
-
- assert(type == DATA_TYPE_BOOLEAN);
-
- uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
- set_deleg_perm_node(avl, node, who_type, name, locality);
- }
-
- return (0);
-}
-
-static inline int
-parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
-{
- nvpair_t *nvp = NULL;
- fs_perm_set_t *fspset = fsperm->fsp_set;
-
- while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
- nvlist_t *nvl2 = NULL;
- const char *name = nvpair_name(nvp);
- uu_avl_t *avl = NULL;
- uu_avl_pool_t *avl_pool = NULL;
- zfs_deleg_who_type_t perm_type = name[0];
- char perm_locality = name[1];
- const char *perm_name = name + 3;
- boolean_t is_set = B_TRUE;
- who_perm_t *who_perm = NULL;
-
- assert('$' == name[2]);
-
- if (nvpair_value_nvlist(nvp, &nvl2) != 0)
- return (-1);
-
- switch (perm_type) {
- case ZFS_DELEG_CREATE:
- case ZFS_DELEG_CREATE_SETS:
- case ZFS_DELEG_NAMED_SET:
- case ZFS_DELEG_NAMED_SET_SETS:
- avl_pool = fspset->fsps_named_set_avl_pool;
- avl = fsperm->fsp_sc_avl;
- break;
- case ZFS_DELEG_USER:
- case ZFS_DELEG_USER_SETS:
- case ZFS_DELEG_GROUP:
- case ZFS_DELEG_GROUP_SETS:
- case ZFS_DELEG_EVERYONE:
- case ZFS_DELEG_EVERYONE_SETS:
- avl_pool = fspset->fsps_who_perm_avl_pool;
- avl = fsperm->fsp_uge_avl;
- break;
-
- default:
- assert(!"unhandled zfs_deleg_who_type_t");
- }
-
- if (is_set) {
- who_perm_node_t *found_node = NULL;
- who_perm_node_t *node = safe_malloc(
- sizeof (who_perm_node_t));
- who_perm = &node->who_perm;
- uu_avl_index_t idx = 0;
-
- uu_avl_node_init(node, &node->who_avl_node, avl_pool);
- who_perm_init(who_perm, fsperm, perm_type, perm_name);
-
- if ((found_node = uu_avl_find(avl, node, NULL, &idx))
- == NULL) {
- if (avl == fsperm->fsp_uge_avl) {
- uid_t rid = 0;
- struct passwd *p = NULL;
- struct group *g = NULL;
- const char *nice_name = NULL;
-
- switch (perm_type) {
- case ZFS_DELEG_USER_SETS:
- case ZFS_DELEG_USER:
- rid = atoi(perm_name);
- p = getpwuid(rid);
- if (p)
- nice_name = p->pw_name;
- break;
- case ZFS_DELEG_GROUP_SETS:
- case ZFS_DELEG_GROUP:
- rid = atoi(perm_name);
- g = getgrgid(rid);
- if (g)
- nice_name = g->gr_name;
- break;
-
- default:
- break;
- }
-
- if (nice_name != NULL)
- (void) strlcpy(
- node->who_perm.who_ug_name,
- nice_name, 256);
- else {
- /* User or group unknown */
- (void) snprintf(
- node->who_perm.who_ug_name,
- sizeof (
- node->who_perm.who_ug_name),
- "(unknown: %d)", rid);
- }
- }
-
- uu_avl_insert(avl, node, idx);
- } else {
- node = found_node;
- who_perm = &node->who_perm;
- }
- }
-
- (void) parse_who_perm(who_perm, nvl2, perm_locality);
- }
-
- return (0);
-}
-
-static inline int
-parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
-{
- nvpair_t *nvp = NULL;
- uu_avl_index_t idx = 0;
-
- while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
- nvlist_t *nvl2 = NULL;
- const char *fsname = nvpair_name(nvp);
- data_type_t type = nvpair_type(nvp);
- fs_perm_t *fsperm = NULL;
- fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t));
- if (node == NULL)
- nomem();
-
- fsperm = &node->fspn_fsperm;
-
- assert(DATA_TYPE_NVLIST == type);
-
- uu_list_node_init(node, &node->fspn_list_node,
- fspset->fsps_list_pool);
-
- idx = uu_list_numnodes(fspset->fsps_list);
- fs_perm_init(fsperm, fspset, fsname);
-
- if (nvpair_value_nvlist(nvp, &nvl2) != 0)
- return (-1);
-
- (void) parse_fs_perm(fsperm, nvl2);
-
- uu_list_insert(fspset->fsps_list, node, idx);
- }
-
- return (0);
-}
-
-static inline const char *
-deleg_perm_comment(zfs_deleg_note_t note)
-{
- const char *str = "";
-
- /* subcommands */
- switch (note) {
- /* SUBCOMMANDS */
- case ZFS_DELEG_NOTE_ALLOW:
- str = gettext("Must also have the permission that is being"
- "\n\t\t\t\tallowed");
- break;
- case ZFS_DELEG_NOTE_CLONE:
- str = gettext("Must also have the 'create' ability and 'mount'"
- "\n\t\t\t\tability in the origin file system");
- break;
- case ZFS_DELEG_NOTE_CREATE:
- str = gettext("Must also have the 'mount' ability");
- break;
- case ZFS_DELEG_NOTE_DESTROY:
- str = gettext("Must also have the 'mount' ability");
- break;
- case ZFS_DELEG_NOTE_DIFF:
- str = gettext("Allows lookup of paths within a dataset;"
- "\n\t\t\t\tgiven an object number. Ordinary users need this"
- "\n\t\t\t\tin order to use zfs diff");
- break;
- case ZFS_DELEG_NOTE_HOLD:
- str = gettext("Allows adding a user hold to a snapshot");
- break;
- case ZFS_DELEG_NOTE_MOUNT:
- str = gettext("Allows mount/umount of ZFS datasets");
- break;
- case ZFS_DELEG_NOTE_PROMOTE:
- str = gettext("Must also have the 'mount'\n\t\t\t\tand"
- " 'promote' ability in the origin file system");
- break;
- case ZFS_DELEG_NOTE_RECEIVE:
- str = gettext("Must also have the 'mount' and 'create'"
- " ability");
- break;
- case ZFS_DELEG_NOTE_RELEASE:
- str = gettext("Allows releasing a user hold which\n\t\t\t\t"
- "might destroy the snapshot");
- break;
- case ZFS_DELEG_NOTE_RENAME:
- str = gettext("Must also have the 'mount' and 'create'"
- "\n\t\t\t\tability in the new parent");
- break;
- case ZFS_DELEG_NOTE_ROLLBACK:
- str = gettext("");
- break;
- case ZFS_DELEG_NOTE_SEND:
- str = gettext("");
- break;
- case ZFS_DELEG_NOTE_SHARE:
- str = gettext("Allows sharing file systems over NFS or SMB"
- "\n\t\t\t\tprotocols");
- break;
- case ZFS_DELEG_NOTE_SNAPSHOT:
- str = gettext("");
- break;
-/*
- * case ZFS_DELEG_NOTE_VSCAN:
- * str = gettext("");
- * break;
- */
- /* OTHER */
- case ZFS_DELEG_NOTE_GROUPQUOTA:
- str = gettext("Allows accessing any groupquota@... property");
- break;
- case ZFS_DELEG_NOTE_GROUPUSED:
- str = gettext("Allows reading any groupused@... property");
- break;
- case ZFS_DELEG_NOTE_USERPROP:
- str = gettext("Allows changing any user property");
- break;
- case ZFS_DELEG_NOTE_USERQUOTA:
- str = gettext("Allows accessing any userquota@... property");
- break;
- case ZFS_DELEG_NOTE_USERUSED:
- str = gettext("Allows reading any userused@... property");
- break;
- /* other */
- default:
- str = "";
- }
-
- return (str);
-}
-
-struct allow_opts {
- boolean_t local;
- boolean_t descend;
- boolean_t user;
- boolean_t group;
- boolean_t everyone;
- boolean_t create;
- boolean_t set;
- boolean_t recursive; /* unallow only */
- boolean_t prt_usage;
-
- boolean_t prt_perms;
- char *who;
- char *perms;
- const char *dataset;
-};
-
-static inline int
-prop_cmp(const void *a, const void *b)
-{
- const char *str1 = *(const char **)a;
- const char *str2 = *(const char **)b;
- return (strcmp(str1, str2));
-}
-
-static void
-allow_usage(boolean_t un, boolean_t requested, const char *msg)
-{
- const char *opt_desc[] = {
- "-h", gettext("show this help message and exit"),
- "-l", gettext("set permission locally"),
- "-d", gettext("set permission for descents"),
- "-u", gettext("set permission for user"),
- "-g", gettext("set permission for group"),
- "-e", gettext("set permission for everyone"),
- "-c", gettext("set create time permission"),
- "-s", gettext("define permission set"),
- /* unallow only */
- "-r", gettext("remove permissions recursively"),
- };
- size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
- size_t allow_size = unallow_size - 2;
- const char *props[ZFS_NUM_PROPS];
- int i;
- size_t count = 0;
- FILE *fp = requested ? stdout : stderr;
- zprop_desc_t *pdtbl = zfs_prop_get_table();
- const char *fmt = gettext("%-16s %-14s\t%s\n");
-
- (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
- HELP_ALLOW));
- (void) fprintf(fp, gettext("Options:\n"));
- for (i = 0; i < (un ? unallow_size : allow_size); i++) {
- const char *opt = opt_desc[i++];
- const char *optdsc = opt_desc[i];
- (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc);
- }
-
- (void) fprintf(fp, gettext("\nThe following permissions are "
- "supported:\n\n"));
- (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
- gettext("NOTES"));
- for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
- const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
- zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
- const char *perm_type = deleg_perm_type(perm_note);
- const char *perm_comment = deleg_perm_comment(perm_note);
- (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
- }
-
- for (i = 0; i < ZFS_NUM_PROPS; i++) {
- zprop_desc_t *pd = &pdtbl[i];
- if (pd->pd_visible != B_TRUE)
- continue;
-
- if (pd->pd_attr == PROP_READONLY)
- continue;
-
- props[count++] = pd->pd_name;
- }
- props[count] = NULL;
-
- qsort(props, count, sizeof (char *), prop_cmp);
-
- for (i = 0; i < count; i++)
- (void) fprintf(fp, fmt, props[i], gettext("property"), "");
-
- if (msg != NULL)
- (void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
-
- exit(requested ? 0 : 2);
-}
-
-static inline const char *
-munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
- char **permsp)
-{
- if (un && argc == expected_argc - 1)
- *permsp = NULL;
- else if (argc == expected_argc)
- *permsp = argv[argc - 2];
- else
- allow_usage(un, B_FALSE,
- gettext("wrong number of parameters\n"));
-
- return (argv[argc - 1]);
-}
-
-static void
-parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
-{
- int uge_sum = opts->user + opts->group + opts->everyone;
- int csuge_sum = opts->create + opts->set + uge_sum;
- int ldcsuge_sum = csuge_sum + opts->local + opts->descend;
- int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum;
-
- if (uge_sum > 1)
- allow_usage(un, B_FALSE,
- gettext("-u, -g, and -e are mutually exclusive\n"));
-
- if (opts->prt_usage) {
- if (argc == 0 && all_sum == 0)
- allow_usage(un, B_TRUE, NULL);
- else
- usage(B_FALSE);
- }
-
- if (opts->set) {
- if (csuge_sum > 1)
- allow_usage(un, B_FALSE,
- gettext("invalid options combined with -s\n"));
-
- opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
- if (argv[0][0] != '@')
- allow_usage(un, B_FALSE,
- gettext("invalid set name: missing '@' prefix\n"));
- opts->who = argv[0];
- } else if (opts->create) {
- if (ldcsuge_sum > 1)
- allow_usage(un, B_FALSE,
- gettext("invalid options combined with -c\n"));
- opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
- } else if (opts->everyone) {
- if (csuge_sum > 1)
- allow_usage(un, B_FALSE,
- gettext("invalid options combined with -e\n"));
- opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
- } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone")
- == 0) {
- opts->everyone = B_TRUE;
- argc--;
- argv++;
- opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
- } else if (argc == 1 && !un) {
- opts->prt_perms = B_TRUE;
- opts->dataset = argv[argc-1];
- } else {
- opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
- opts->who = argv[0];
- }
-
- if (!opts->local && !opts->descend) {
- opts->local = B_TRUE;
- opts->descend = B_TRUE;
- }
-}
-
-static void
-store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
- const char *who, char *perms, nvlist_t *top_nvl)
-{
- int i;
- char ld[2] = { '\0', '\0' };
- char who_buf[MAXNAMELEN + 32];
- char base_type = '\0';
- char set_type = '\0';
- nvlist_t *base_nvl = NULL;
- nvlist_t *set_nvl = NULL;
- nvlist_t *nvl;
-
- if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
- nomem();
- if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0)
- nomem();
-
- switch (type) {
- case ZFS_DELEG_NAMED_SET_SETS:
- case ZFS_DELEG_NAMED_SET:
- set_type = ZFS_DELEG_NAMED_SET_SETS;
- base_type = ZFS_DELEG_NAMED_SET;
- ld[0] = ZFS_DELEG_NA;
- break;
- case ZFS_DELEG_CREATE_SETS:
- case ZFS_DELEG_CREATE:
- set_type = ZFS_DELEG_CREATE_SETS;
- base_type = ZFS_DELEG_CREATE;
- ld[0] = ZFS_DELEG_NA;
- break;
- case ZFS_DELEG_USER_SETS:
- case ZFS_DELEG_USER:
- set_type = ZFS_DELEG_USER_SETS;
- base_type = ZFS_DELEG_USER;
- if (local)
- ld[0] = ZFS_DELEG_LOCAL;
- if (descend)
- ld[1] = ZFS_DELEG_DESCENDENT;
- break;
- case ZFS_DELEG_GROUP_SETS:
- case ZFS_DELEG_GROUP:
- set_type = ZFS_DELEG_GROUP_SETS;
- base_type = ZFS_DELEG_GROUP;
- if (local)
- ld[0] = ZFS_DELEG_LOCAL;
- if (descend)
- ld[1] = ZFS_DELEG_DESCENDENT;
- break;
- case ZFS_DELEG_EVERYONE_SETS:
- case ZFS_DELEG_EVERYONE:
- set_type = ZFS_DELEG_EVERYONE_SETS;
- base_type = ZFS_DELEG_EVERYONE;
- if (local)
- ld[0] = ZFS_DELEG_LOCAL;
- if (descend)
- ld[1] = ZFS_DELEG_DESCENDENT;
- break;
-
- default:
- assert(set_type != '\0' && base_type != '\0');
- }
-
- if (perms != NULL) {
- char *curr = perms;
- char *end = curr + strlen(perms);
-
- while (curr < end) {
- char *delim = strchr(curr, ',');
- if (delim == NULL)
- delim = end;
- else
- *delim = '\0';
-
- if (curr[0] == '@')
- nvl = set_nvl;
- else
- nvl = base_nvl;
-
- (void) nvlist_add_boolean(nvl, curr);
- if (delim != end)
- *delim = ',';
- curr = delim + 1;
- }
-
- for (i = 0; i < 2; i++) {
- char locality = ld[i];
- if (locality == 0)
- continue;
-
- if (!nvlist_empty(base_nvl)) {
- if (who != NULL)
- (void) snprintf(who_buf,
- sizeof (who_buf), "%c%c$%s",
- base_type, locality, who);
- else
- (void) snprintf(who_buf,
- sizeof (who_buf), "%c%c$",
- base_type, locality);
-
- (void) nvlist_add_nvlist(top_nvl, who_buf,
- base_nvl);
- }
-
-
- if (!nvlist_empty(set_nvl)) {
- if (who != NULL)
- (void) snprintf(who_buf,
- sizeof (who_buf), "%c%c$%s",
- set_type, locality, who);
- else
- (void) snprintf(who_buf,
- sizeof (who_buf), "%c%c$",
- set_type, locality);
-
- (void) nvlist_add_nvlist(top_nvl, who_buf,
- set_nvl);
- }
- }
- } else {
- for (i = 0; i < 2; i++) {
- char locality = ld[i];
- if (locality == 0)
- continue;
-
- if (who != NULL)
- (void) snprintf(who_buf, sizeof (who_buf),
- "%c%c$%s", base_type, locality, who);
- else
- (void) snprintf(who_buf, sizeof (who_buf),
- "%c%c$", base_type, locality);
- (void) nvlist_add_boolean(top_nvl, who_buf);
-
- if (who != NULL)
- (void) snprintf(who_buf, sizeof (who_buf),
- "%c%c$%s", set_type, locality, who);
- else
- (void) snprintf(who_buf, sizeof (who_buf),
- "%c%c$", set_type, locality);
- (void) nvlist_add_boolean(top_nvl, who_buf);
- }
- }
-}
-
-static int
-construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
-{
- if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
- nomem();
-
- if (opts->set) {
- store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
- opts->descend, opts->who, opts->perms, *nvlp);
- } else if (opts->create) {
- store_allow_perm(ZFS_DELEG_CREATE, opts->local,
- opts->descend, NULL, opts->perms, *nvlp);
- } else if (opts->everyone) {
- store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
- opts->descend, NULL, opts->perms, *nvlp);
- } else {
- char *curr = opts->who;
- char *end = curr + strlen(curr);
-
- while (curr < end) {
- const char *who;
- zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
- char *endch;
- char *delim = strchr(curr, ',');
- char errbuf[256];
- char id[64];
- struct passwd *p = NULL;
- struct group *g = NULL;
-
- uid_t rid;
- if (delim == NULL)
- delim = end;
- else
- *delim = '\0';
-
- rid = (uid_t)strtol(curr, &endch, 0);
- if (opts->user) {
- who_type = ZFS_DELEG_USER;
- if (*endch != '\0')
- p = getpwnam(curr);
- else
- p = getpwuid(rid);
-
- if (p != NULL)
- rid = p->pw_uid;
- else if (*endch != '\0') {
- (void) snprintf(errbuf, 256, gettext(
- "invalid user %s\n"), curr);
- allow_usage(un, B_TRUE, errbuf);
- }
- } else if (opts->group) {
- who_type = ZFS_DELEG_GROUP;
- if (*endch != '\0')
- g = getgrnam(curr);
- else
- g = getgrgid(rid);
-
- if (g != NULL)
- rid = g->gr_gid;
- else if (*endch != '\0') {
- (void) snprintf(errbuf, 256, gettext(
- "invalid group %s\n"), curr);
- allow_usage(un, B_TRUE, errbuf);
- }
- } else {
- if (*endch != '\0') {
- p = getpwnam(curr);
- } else {
- p = getpwuid(rid);
- }
-
- if (p == NULL) {
- if (*endch != '\0') {
- g = getgrnam(curr);
- } else {
- g = getgrgid(rid);
- }
- }
-
- if (p != NULL) {
- who_type = ZFS_DELEG_USER;
- rid = p->pw_uid;
- } else if (g != NULL) {
- who_type = ZFS_DELEG_GROUP;
- rid = g->gr_gid;
- } else {
- (void) snprintf(errbuf, 256, gettext(
- "invalid user/group %s\n"), curr);
- allow_usage(un, B_TRUE, errbuf);
- }
- }
-
- (void) sprintf(id, "%u", rid);
- who = id;
-
- store_allow_perm(who_type, opts->local,
- opts->descend, who, opts->perms, *nvlp);
- curr = delim + 1;
- }
- }
-
- return (0);
-}
-
-static void
-print_set_creat_perms(uu_avl_t *who_avl)
-{
- const char *sc_title[] = {
- gettext("Permission sets:\n"),
- gettext("Create time permissions:\n"),
- NULL
- };
- const char **title_ptr = sc_title;
- who_perm_node_t *who_node = NULL;
- int prev_weight = -1;
-
- for (who_node = uu_avl_first(who_avl); who_node != NULL;
- who_node = uu_avl_next(who_avl, who_node)) {
- uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
- zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
- const char *who_name = who_node->who_perm.who_name;
- int weight = who_type2weight(who_type);
- boolean_t first = B_TRUE;
- deleg_perm_node_t *deleg_node;
-
- if (prev_weight != weight) {
- (void) printf(*title_ptr++);
- prev_weight = weight;
- }
-
- if (who_name == NULL || strnlen(who_name, 1) == 0)
- (void) printf("\t");
- else
- (void) printf("\t%s ", who_name);
-
- for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
- deleg_node = uu_avl_next(avl, deleg_node)) {
- if (first) {
- (void) printf("%s",
- deleg_node->dpn_perm.dp_name);
- first = B_FALSE;
- } else
- (void) printf(",%s",
- deleg_node->dpn_perm.dp_name);
- }
-
- (void) printf("\n");
- }
-}
-
-static void
-print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
- const char *title)
-{
- who_perm_node_t *who_node = NULL;
- boolean_t prt_title = B_TRUE;
- uu_avl_walk_t *walk;
-
- if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
- nomem();
-
- while ((who_node = uu_avl_walk_next(walk)) != NULL) {
- const char *who_name = who_node->who_perm.who_name;
- const char *nice_who_name = who_node->who_perm.who_ug_name;
- uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
- zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
- char delim = ' ';
- deleg_perm_node_t *deleg_node;
- boolean_t prt_who = B_TRUE;
-
- for (deleg_node = uu_avl_first(avl);
- deleg_node != NULL;
- deleg_node = uu_avl_next(avl, deleg_node)) {
- if (local != deleg_node->dpn_perm.dp_local ||
- descend != deleg_node->dpn_perm.dp_descend)
- continue;
-
- if (prt_who) {
- const char *who = NULL;
- if (prt_title) {
- prt_title = B_FALSE;
- (void) printf(title);
- }
-
- switch (who_type) {
- case ZFS_DELEG_USER_SETS:
- case ZFS_DELEG_USER:
- who = gettext("user");
- if (nice_who_name)
- who_name = nice_who_name;
- break;
- case ZFS_DELEG_GROUP_SETS:
- case ZFS_DELEG_GROUP:
- who = gettext("group");
- if (nice_who_name)
- who_name = nice_who_name;
- break;
- case ZFS_DELEG_EVERYONE_SETS:
- case ZFS_DELEG_EVERYONE:
- who = gettext("everyone");
- who_name = NULL;
- break;
-
- default:
- assert(who != NULL);
- }
-
- prt_who = B_FALSE;
- if (who_name == NULL)
- (void) printf("\t%s", who);
- else
- (void) printf("\t%s %s", who, who_name);
- }
-
- (void) printf("%c%s", delim,
- deleg_node->dpn_perm.dp_name);
- delim = ',';
- }
-
- if (!prt_who)
- (void) printf("\n");
- }
-
- uu_avl_walk_end(walk);
-}
-
-static void
-print_fs_perms(fs_perm_set_t *fspset)
-{
- fs_perm_node_t *node = NULL;
- char buf[MAXNAMELEN + 32];
- const char *dsname = buf;
-
- for (node = uu_list_first(fspset->fsps_list); node != NULL;
- node = uu_list_next(fspset->fsps_list, node)) {
- uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
- uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
- int left = 0;
-
- (void) snprintf(buf, sizeof (buf),
- gettext("---- Permissions on %s "),
- node->fspn_fsperm.fsp_name);
- (void) printf(dsname);
- left = 70 - strlen(buf);
- while (left-- > 0)
- (void) printf("-");
- (void) printf("\n");
-
- print_set_creat_perms(sc_avl);
- print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
- gettext("Local permissions:\n"));
- print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
- gettext("Descendent permissions:\n"));
- print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
- gettext("Local+Descendent permissions:\n"));
- }
-}
-
-static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
-
-struct deleg_perms {
- boolean_t un;
- nvlist_t *nvl;
-};
-
-static int
-set_deleg_perms(zfs_handle_t *zhp, void *data)
-{
- struct deleg_perms *perms = (struct deleg_perms *)data;
- zfs_type_t zfs_type = zfs_get_type(zhp);
-
- if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME)
- return (0);
-
- return (zfs_set_fsacl(zhp, perms->un, perms->nvl));
-}
-
-static int
-zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
-{
- zfs_handle_t *zhp;
- nvlist_t *perm_nvl = NULL;
- nvlist_t *update_perm_nvl = NULL;
- int error = 1;
- int c;
- struct allow_opts opts = { 0 };
-
- const char *optstr = un ? "ldugecsrh" : "ldugecsh";
-
- /* check opts */
- while ((c = getopt(argc, argv, optstr)) != -1) {
- switch (c) {
- case 'l':
- opts.local = B_TRUE;
- break;
- case 'd':
- opts.descend = B_TRUE;
- break;
- case 'u':
- opts.user = B_TRUE;
- break;
- case 'g':
- opts.group = B_TRUE;
- break;
- case 'e':
- opts.everyone = B_TRUE;
- break;
- case 's':
- opts.set = B_TRUE;
- break;
- case 'c':
- opts.create = B_TRUE;
- break;
- case 'r':
- opts.recursive = B_TRUE;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case 'h':
- opts.prt_usage = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check arguments */
- parse_allow_args(argc, argv, un, &opts);
-
- /* try to open the dataset */
- if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
- ZFS_TYPE_VOLUME)) == NULL) {
- (void) fprintf(stderr, "Failed to open dataset: %s\n",
- opts.dataset);
- return (-1);
- }
-
- if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
- goto cleanup2;
-
- fs_perm_set_init(&fs_perm_set);
- if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
- (void) fprintf(stderr, "Failed to parse fsacl permissions\n");
- goto cleanup1;
- }
-
- if (opts.prt_perms)
- print_fs_perms(&fs_perm_set);
- else {
- (void) construct_fsacl_list(un, &opts, &update_perm_nvl);
- if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
- goto cleanup0;
-
- if (un && opts.recursive) {
- struct deleg_perms data = { un, update_perm_nvl };
- if (zfs_iter_filesystems(zhp, set_deleg_perms,
- &data) != 0)
- goto cleanup0;
- }
- }
-
- error = 0;
-
-cleanup0:
- nvlist_free(perm_nvl);
- nvlist_free(update_perm_nvl);
-cleanup1:
- fs_perm_set_fini(&fs_perm_set);
-cleanup2:
- zfs_close(zhp);
-
- return (error);
-}
-
-static int
-zfs_do_allow(int argc, char **argv)
-{
- return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
-}
-
-static int
-zfs_do_unallow(int argc, char **argv)
-{
- return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
-}
-
-static int
-zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
-{
- int errors = 0;
- int i;
- const char *tag;
- boolean_t recursive = B_FALSE;
- const char *opts = holding ? "rt" : "r";
- int c;
-
- /* check options */
- while ((c = getopt(argc, argv, opts)) != -1) {
- switch (c) {
- case 'r':
- recursive = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 2)
- usage(B_FALSE);
-
- tag = argv[0];
- --argc;
- ++argv;
-
- if (holding && tag[0] == '.') {
- /* tags starting with '.' are reserved for libzfs */
- (void) fprintf(stderr, gettext("tag may not start with '.'\n"));
- usage(B_FALSE);
- }
-
- for (i = 0; i < argc; ++i) {
- zfs_handle_t *zhp;
- char parent[ZFS_MAX_DATASET_NAME_LEN];
- const char *delim;
- char *path = argv[i];
-
- delim = strchr(path, '@');
- if (delim == NULL) {
- (void) fprintf(stderr,
- gettext("'%s' is not a snapshot\n"), path);
- ++errors;
- continue;
- }
- (void) strncpy(parent, path, delim - path);
- parent[delim - path] = '\0';
-
- zhp = zfs_open(g_zfs, parent,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL) {
- ++errors;
- continue;
- }
- if (holding) {
- if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
- ++errors;
- } else {
- if (zfs_release(zhp, delim+1, tag, recursive) != 0)
- ++errors;
- }
- zfs_close(zhp);
- }
-
- return (errors != 0);
-}
-
-/*
- * zfs hold [-r] [-t] <tag> <snap> ...
- *
- * -r Recursively hold
- *
- * Apply a user-hold with the given tag to the list of snapshots.
- */
-static int
-zfs_do_hold(int argc, char **argv)
-{
- return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
-}
-
-/*
- * zfs release [-r] <tag> <snap> ...
- *
- * -r Recursively release
- *
- * Release a user-hold with the given tag from the list of snapshots.
- */
-static int
-zfs_do_release(int argc, char **argv)
-{
- return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
-}
-
-typedef struct holds_cbdata {
- boolean_t cb_recursive;
- const char *cb_snapname;
- nvlist_t **cb_nvlp;
- size_t cb_max_namelen;
- size_t cb_max_taglen;
-} holds_cbdata_t;
-
-#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y"
-#define DATETIME_BUF_LEN (32)
-/*
- *
- */
-static void
-print_holds(boolean_t scripted, boolean_t literal, size_t nwidth,
- size_t tagwidth, nvlist_t *nvl)
-{
- int i;
- nvpair_t *nvp = NULL;
- char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
- const char *col;
-
- if (!scripted) {
- for (i = 0; i < 3; i++) {
- col = gettext(hdr_cols[i]);
- if (i < 2)
- (void) printf("%-*s ", i ? tagwidth : nwidth,
- col);
- else
- (void) printf("%s\n", col);
- }
- }
-
- while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
- char *zname = nvpair_name(nvp);
- nvlist_t *nvl2;
- nvpair_t *nvp2 = NULL;
- (void) nvpair_value_nvlist(nvp, &nvl2);
- while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
- char tsbuf[DATETIME_BUF_LEN];
- char *tagname = nvpair_name(nvp2);
- uint64_t val = 0;
- time_t time;
- struct tm t;
-
- (void) nvpair_value_uint64(nvp2, &val);
- if (literal)
- snprintf(tsbuf, DATETIME_BUF_LEN, "%llu", val);
- else {
- time = (time_t)val;
- (void) localtime_r(&time, &t);
- (void) strftime(tsbuf, DATETIME_BUF_LEN,
- gettext(STRFTIME_FMT_STR), &t);
- }
-
- if (scripted) {
- (void) printf("%s\t%s\t%s\n", zname,
- tagname, tsbuf);
- } else {
- (void) printf("%-*s %-*s %s\n", nwidth,
- zname, tagwidth, tagname, tsbuf);
- }
- }
- }
-}
-
-/*
- * Generic callback function to list a dataset or snapshot.
- */
-static int
-holds_callback(zfs_handle_t *zhp, void *data)
-{
- holds_cbdata_t *cbp = data;
- nvlist_t *top_nvl = *cbp->cb_nvlp;
- nvlist_t *nvl = NULL;
- nvpair_t *nvp = NULL;
- const char *zname = zfs_get_name(zhp);
- size_t znamelen = strlen(zname);
-
- if (cbp->cb_recursive && cbp->cb_snapname != NULL) {
- const char *snapname;
- char *delim = strchr(zname, '@');
- if (delim == NULL)
- return (0);
-
- snapname = delim + 1;
- if (strcmp(cbp->cb_snapname, snapname))
- return (0);
- }
-
- if (zfs_get_holds(zhp, &nvl) != 0)
- return (-1);
-
- if (znamelen > cbp->cb_max_namelen)
- cbp->cb_max_namelen = znamelen;
-
- while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
- const char *tag = nvpair_name(nvp);
- size_t taglen = strlen(tag);
- if (taglen > cbp->cb_max_taglen)
- cbp->cb_max_taglen = taglen;
- }
-
- return (nvlist_add_nvlist(top_nvl, zname, nvl));
-}
-
-/*
- * zfs holds [-Hp] [-r | -d max] <dataset|snap> ...
- *
- * -H Suppress header output
- * -p Output literal values
- * -r Recursively search for holds
- * -d max Limit depth of recursive search
- */
-static int
-zfs_do_holds(int argc, char **argv)
-{
- int errors = 0;
- int c;
- int i;
- boolean_t scripted = B_FALSE;
- boolean_t literal = B_FALSE;
- boolean_t recursive = B_FALSE;
- const char *opts = "d:rHp";
- nvlist_t *nvl;
-
- int types = ZFS_TYPE_SNAPSHOT;
- holds_cbdata_t cb = { 0 };
-
- int limit = 0;
- int ret = 0;
- int flags = 0;
-
- /* check options */
- while ((c = getopt(argc, argv, opts)) != -1) {
- switch (c) {
- case 'd':
- limit = parse_depth(optarg, &flags);
- recursive = B_TRUE;
- break;
- case 'r':
- recursive = B_TRUE;
- break;
- case 'H':
- scripted = B_TRUE;
- break;
- case 'p':
- literal = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- if (recursive) {
- types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
- flags |= ZFS_ITER_RECURSE;
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1)
- usage(B_FALSE);
-
- if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
- nomem();
-
- for (i = 0; i < argc; ++i) {
- char *snapshot = argv[i];
- const char *delim;
- const char *snapname = NULL;
-
- delim = strchr(snapshot, '@');
- if (delim != NULL) {
- snapname = delim + 1;
- if (recursive)
- snapshot[delim - snapshot] = '\0';
- }
-
- cb.cb_recursive = recursive;
- cb.cb_snapname = snapname;
- cb.cb_nvlp = &nvl;
-
- /*
- * 1. collect holds data, set format options
- */
- ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit,
- holds_callback, &cb);
- if (ret != 0)
- ++errors;
- }
-
- /*
- * 2. print holds data
- */
- print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen,
- nvl);
-
- if (nvlist_empty(nvl))
- (void) printf(gettext("no datasets available\n"));
-
- nvlist_free(nvl);
-
- return (0 != errors);
-}
-
-#define CHECK_SPINNER 30
-#define SPINNER_TIME 3 /* seconds */
-#define MOUNT_TIME 1 /* seconds */
-
-typedef struct get_all_state {
- boolean_t ga_verbose;
- get_all_cb_t *ga_cbp;
-} get_all_state_t;
-
-static int
-get_one_dataset(zfs_handle_t *zhp, void *data)
-{
- static char *spin[] = { "-", "\\", "|", "/" };
- static int spinval = 0;
- static int spincheck = 0;
- static time_t last_spin_time = (time_t)0;
- get_all_state_t *state = data;
- zfs_type_t type = zfs_get_type(zhp);
-
- if (state->ga_verbose) {
- if (--spincheck < 0) {
- time_t now = time(NULL);
- if (last_spin_time + SPINNER_TIME < now) {
- update_progress(spin[spinval++ % 4]);
- last_spin_time = now;
- }
- spincheck = CHECK_SPINNER;
- }
- }
-
- /*
- * Interate over any nested datasets.
- */
- if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
- zfs_close(zhp);
- return (1);
- }
-
- /*
- * Skip any datasets whose type does not match.
- */
- if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
- zfs_close(zhp);
- return (0);
- }
- libzfs_add_handle(state->ga_cbp, zhp);
- assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
-
- return (0);
-}
-
-static void
-get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
-{
- get_all_state_t state = {
- .ga_verbose = verbose,
- .ga_cbp = cbp
- };
-
- if (verbose)
- set_progress_header(gettext("Reading ZFS config"));
- (void) zfs_iter_root(g_zfs, get_one_dataset, &state);
-
- if (verbose)
- finish_progress(gettext("done."));
-}
-
-/*
- * Generic callback for sharing or mounting filesystems. Because the code is so
- * similar, we have a common function with an extra parameter to determine which
- * mode we are using.
- */
-typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
-
-typedef struct share_mount_state {
- share_mount_op_t sm_op;
- boolean_t sm_verbose;
- int sm_flags;
- char *sm_options;
- char *sm_proto; /* only valid for OP_SHARE */
- pthread_mutex_t sm_lock; /* protects the remaining fields */
- uint_t sm_total; /* number of filesystems to process */
- uint_t sm_done; /* number of filesystems processed */
- int sm_status; /* -1 if any of the share/mount operations failed */
-} share_mount_state_t;
-
-/*
- * Share or mount a dataset.
- */
-static int
-share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
- boolean_t explicit, const char *options)
-{
- char mountpoint[ZFS_MAXPROPLEN];
- char shareopts[ZFS_MAXPROPLEN];
- char smbshareopts[ZFS_MAXPROPLEN];
- const char *cmdname = op == OP_SHARE ? "share" : "mount";
- struct mnttab mnt;
- uint64_t zoned, canmount;
- boolean_t shared_nfs, shared_smb;
-
- assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
-
- /*
- * Check to make sure we can mount/share this dataset. If we
- * are in the global zone and the filesystem is exported to a
- * local zone, or if we are in a local zone and the
- * filesystem is not exported, then it is an error.
- */
- zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
-
- if (zoned && getzoneid() == GLOBAL_ZONEID) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "dataset is exported to a local zone\n"), cmdname,
- zfs_get_name(zhp));
- return (1);
-
- } else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "permission denied\n"), cmdname,
- zfs_get_name(zhp));
- return (1);
- }
-
- /*
- * Ignore any filesystems which don't apply to us. This
- * includes those with a legacy mountpoint, or those with
- * legacy share options.
- */
- verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
- sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
- sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
- sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
-
- if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
- strcmp(smbshareopts, "off") == 0) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot share '%s': "
- "legacy share\n"), zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("to "
- "share this filesystem set "
- "sharenfs property on\n"));
- return (1);
- }
-
- /*
- * We cannot share or mount legacy filesystems. If the
- * shareopts is non-legacy but the mountpoint is legacy, we
- * treat it as a legacy share.
- */
- if (strcmp(mountpoint, "legacy") == 0) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use %s(8) to "
- "%s this filesystem\n"), cmdname, cmdname);
- return (1);
- }
-
- if (strcmp(mountpoint, "none") == 0) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot %s '%s': no "
- "mountpoint set\n"), cmdname, zfs_get_name(zhp));
- return (1);
- }
-
- /*
- * canmount explicit outcome
- * on no pass through
- * on yes pass through
- * off no return 0
- * off yes display error, return 1
- * noauto no return 0
- * noauto yes pass through
- */
- canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
- if (canmount == ZFS_CANMOUNT_OFF) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "'canmount' property is set to 'off'\n"), cmdname,
- zfs_get_name(zhp));
- return (1);
- } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
- return (0);
- }
-
- /*
- * If this filesystem is inconsistent and has a receive resume
- * token, we can not mount it.
- */
- if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
- zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
- NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "Contains partially-completed state from "
- "\"zfs receive -r\", which can be resumed with "
- "\"zfs send -t\"\n"),
- cmdname, zfs_get_name(zhp));
- return (1);
- }
-
- /*
- * At this point, we have verified that the mountpoint and/or
- * shareopts are appropriate for auto management. If the
- * filesystem is already mounted or shared, return (failing
- * for explicit requests); otherwise mount or share the
- * filesystem.
- */
- switch (op) {
- case OP_SHARE:
-
- shared_nfs = zfs_is_shared_nfs(zhp, NULL);
- shared_smb = zfs_is_shared_smb(zhp, NULL);
-
- if ((shared_nfs && shared_smb) ||
- (shared_nfs && strcmp(shareopts, "on") == 0 &&
- strcmp(smbshareopts, "off") == 0) ||
- (shared_smb && strcmp(smbshareopts, "on") == 0 &&
- strcmp(shareopts, "off") == 0)) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot share "
- "'%s': filesystem already shared\n"),
- zfs_get_name(zhp));
- return (1);
- }
-
- if (!zfs_is_mounted(zhp, NULL) &&
- zfs_mount(zhp, NULL, 0) != 0)
- return (1);
-
- if (protocol == NULL) {
- if (zfs_shareall(zhp) != 0)
- return (1);
- } else if (strcmp(protocol, "nfs") == 0) {
- if (zfs_share_nfs(zhp))
- return (1);
- } else if (strcmp(protocol, "smb") == 0) {
- if (zfs_share_smb(zhp))
- return (1);
- } else {
- (void) fprintf(stderr, gettext("cannot share "
- "'%s': invalid share type '%s' "
- "specified\n"),
- zfs_get_name(zhp), protocol);
- return (1);
- }
-
- break;
-
- case OP_MOUNT:
- if (options == NULL)
- mnt.mnt_mntopts = "";
- else
- mnt.mnt_mntopts = (char *)options;
-
- if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
- zfs_is_mounted(zhp, NULL)) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot mount "
- "'%s': filesystem already mounted\n"),
- zfs_get_name(zhp));
- return (1);
- }
-
- if (zfs_mount(zhp, options, flags) != 0)
- return (1);
- break;
- }
-
- return (0);
-}
-
-/*
- * Reports progress in the form "(current/total)". Not thread-safe.
- */
-static void
-report_mount_progress(int current, int total)
-{
- static time_t last_progress_time = 0;
- time_t now = time(NULL);
- char info[32];
-
- /* display header if we're here for the first time */
- if (current == 1) {
- set_progress_header(gettext("Mounting ZFS filesystems"));
- } else if (current != total && last_progress_time + MOUNT_TIME >= now) {
- /* too soon to report again */
- return;
- }
-
- last_progress_time = now;
-
- (void) sprintf(info, "(%d/%d)", current, total);
-
- if (current == total)
- finish_progress(info);
- else
- update_progress(info);
-}
-
-/*
- * zfs_foreach_mountpoint() callback that mounts or shares on filesystem and
- * updates the progress meter
- */
-static int
-share_mount_one_cb(zfs_handle_t *zhp, void *arg)
-{
- share_mount_state_t *sms = arg;
- int ret;
-
- ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto,
- B_FALSE, sms->sm_options);
-
- pthread_mutex_lock(&sms->sm_lock);
- if (ret != 0)
- sms->sm_status = ret;
- sms->sm_done++;
- if (sms->sm_verbose)
- report_mount_progress(sms->sm_done, sms->sm_total);
- pthread_mutex_unlock(&sms->sm_lock);
- return (ret);
-}
-
-static void
-append_options(char *mntopts, char *newopts)
-{
- int len = strlen(mntopts);
-
- /* original length plus new string to append plus 1 for the comma */
- if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) {
- (void) fprintf(stderr, gettext("the opts argument for "
- "'%c' option is too long (more than %d chars)\n"),
- "-o", MNT_LINE_MAX);
- usage(B_FALSE);
- }
-
- if (*mntopts)
- mntopts[len++] = ',';
-
- (void) strcpy(&mntopts[len], newopts);
-}
-
-static int
-share_mount(int op, int argc, char **argv)
-{
- int do_all = 0;
- boolean_t verbose = B_FALSE;
- int c, ret = 0;
- char *options = NULL;
- int flags = 0;
-
- /* check options */
- while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
- != -1) {
- switch (c) {
- case 'a':
- do_all = 1;
- break;
- case 'v':
- verbose = B_TRUE;
- break;
- case 'o':
- if (*optarg == '\0') {
- (void) fprintf(stderr, gettext("empty mount "
- "options (-o) specified\n"));
- usage(B_FALSE);
- }
-
- if (options == NULL)
- options = safe_malloc(MNT_LINE_MAX + 1);
-
- /* option validation is done later */
- append_options(options, optarg);
- break;
-
- case 'O':
- warnx("no overlay mounts support on FreeBSD, ignoring");
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (do_all) {
- char *protocol = NULL;
-
- if (op == OP_SHARE && argc > 0) {
- if (strcmp(argv[0], "nfs") != 0 &&
- strcmp(argv[0], "smb") != 0) {
- (void) fprintf(stderr, gettext("share type "
- "must be 'nfs' or 'smb'\n"));
- usage(B_FALSE);
- }
- protocol = argv[0];
- argc--;
- argv++;
- }
-
- if (argc != 0) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- start_progress_timer();
- get_all_cb_t cb = { 0 };
- get_all_datasets(&cb, verbose);
-
- if (cb.cb_used == 0) {
- if (options != NULL)
- free(options);
- return (0);
- }
-
-#ifdef illumos
- if (op == OP_SHARE) {
- sa_init_selective_arg_t sharearg;
- sharearg.zhandle_arr = cb.cb_handles;
- sharearg.zhandle_len = cb.cb_used;
- if ((ret = zfs_init_libshare_arg(g_zfs,
- SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
- (void) fprintf(stderr, gettext(
- "Could not initialize libshare, %d"), ret);
- return (ret);
- }
- }
-#endif
- share_mount_state_t share_mount_state = { 0 };
- share_mount_state.sm_op = op;
- share_mount_state.sm_verbose = verbose;
- share_mount_state.sm_flags = flags;
- share_mount_state.sm_options = options;
- share_mount_state.sm_proto = protocol;
- share_mount_state.sm_total = cb.cb_used;
- pthread_mutex_init(&share_mount_state.sm_lock, NULL);
-
- /*
- * libshare isn't mt-safe, so only do the operation in parallel
- * if we're mounting.
- */
- zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
- share_mount_one_cb, &share_mount_state, op == OP_MOUNT);
- ret = share_mount_state.sm_status;
-
- for (int i = 0; i < cb.cb_used; i++)
- zfs_close(cb.cb_handles[i]);
- free(cb.cb_handles);
- } else if (argc == 0) {
- struct mnttab entry;
-
- if ((op == OP_SHARE) || (options != NULL)) {
- (void) fprintf(stderr, gettext("missing filesystem "
- "argument (specify -a for all)\n"));
- usage(B_FALSE);
- }
-
- /*
- * When mount is given no arguments, go through /etc/mnttab and
- * display any active ZFS mounts. We hide any snapshots, since
- * they are controlled automatically.
- */
- rewind(mnttab_file);
- while (getmntent(mnttab_file, &entry) == 0) {
- if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
- strchr(entry.mnt_special, '@') != NULL)
- continue;
-
- (void) printf("%-30s %s\n", entry.mnt_special,
- entry.mnt_mountp);
- }
-
- } else {
- zfs_handle_t *zhp;
-
- if (argc > 1) {
- (void) fprintf(stderr,
- gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- if ((zhp = zfs_open(g_zfs, argv[0],
- ZFS_TYPE_FILESYSTEM)) == NULL) {
- ret = 1;
- } else {
- ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
- options);
- zfs_close(zhp);
- }
- }
-
- return (ret);
-}
-
-/*
- * zfs mount -a [nfs]
- * zfs mount filesystem
- *
- * Mount all filesystems, or mount the given filesystem.
- */
-static int
-zfs_do_mount(int argc, char **argv)
-{
- return (share_mount(OP_MOUNT, argc, argv));
-}
-
-/*
- * zfs share -a [nfs | smb]
- * zfs share filesystem
- *
- * Share all filesystems, or share the given filesystem.
- */
-static int
-zfs_do_share(int argc, char **argv)
-{
- return (share_mount(OP_SHARE, argc, argv));
-}
-
-typedef struct unshare_unmount_node {
- zfs_handle_t *un_zhp;
- char *un_mountp;
- uu_avl_node_t un_avlnode;
-} unshare_unmount_node_t;
-
-/* ARGSUSED */
-static int
-unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
-{
- const unshare_unmount_node_t *l = larg;
- const unshare_unmount_node_t *r = rarg;
-
- return (strcmp(l->un_mountp, r->un_mountp));
-}
-
-/*
- * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an
- * absolute path, find the entry /etc/mnttab, verify that its a ZFS filesystem,
- * and unmount it appropriately.
- */
-static int
-unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
-{
- zfs_handle_t *zhp;
- int ret = 0;
- struct stat64 statbuf;
- struct extmnttab entry;
- const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
- ino_t path_inode;
-
- /*
- * Search for the path in /etc/mnttab. Rather than looking for the
- * specific path, which can be fooled by non-standard paths (i.e. ".."
- * or "//"), we stat() the path and search for the corresponding
- * (major,minor) device pair.
- */
- if (stat64(path, &statbuf) != 0) {
- (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
- cmdname, path, strerror(errno));
- return (1);
- }
- path_inode = statbuf.st_ino;
-
- /*
- * Search for the given (major,minor) pair in the mount table.
- */
-#ifdef illumos
- rewind(mnttab_file);
- while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
- if (entry.mnt_major == major(statbuf.st_dev) &&
- entry.mnt_minor == minor(statbuf.st_dev))
- break;
- }
-#else
- {
- struct statfs sfs;
-
- if (statfs(path, &sfs) != 0) {
- (void) fprintf(stderr, "%s: %s\n", path,
- strerror(errno));
- ret = -1;
- }
- statfs2mnttab(&sfs, &entry);
- }
-#endif
- if (ret != 0) {
- if (op == OP_SHARE) {
- (void) fprintf(stderr, gettext("cannot %s '%s': not "
- "currently mounted\n"), cmdname, path);
- return (1);
- }
- (void) fprintf(stderr, gettext("warning: %s not in mnttab\n"),
- path);
- if ((ret = umount2(path, flags)) != 0)
- (void) fprintf(stderr, gettext("%s: %s\n"), path,
- strerror(errno));
- return (ret != 0);
- }
-
- if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
- (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
- "filesystem\n"), cmdname, path);
- return (1);
- }
-
- if ((zhp = zfs_open(g_zfs, entry.mnt_special,
- ZFS_TYPE_FILESYSTEM)) == NULL)
- return (1);
-
- ret = 1;
- if (stat64(entry.mnt_mountp, &statbuf) != 0) {
- (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
- cmdname, path, strerror(errno));
- goto out;
- } else if (statbuf.st_ino != path_inode) {
- (void) fprintf(stderr, gettext("cannot "
- "%s '%s': not a mountpoint\n"), cmdname, path);
- goto out;
- }
-
- if (op == OP_SHARE) {
- char nfs_mnt_prop[ZFS_MAXPROPLEN];
- char smbshare_prop[ZFS_MAXPROPLEN];
-
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop,
- sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0);
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop,
- sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0);
-
- if (strcmp(nfs_mnt_prop, "off") == 0 &&
- strcmp(smbshare_prop, "off") == 0) {
- (void) fprintf(stderr, gettext("cannot unshare "
- "'%s': legacy share\n"), path);
-#ifdef illumos
- (void) fprintf(stderr, gettext("use "
- "unshare(1M) to unshare this filesystem\n"));
-#endif
- } else if (!zfs_is_shared(zhp)) {
- (void) fprintf(stderr, gettext("cannot unshare '%s': "
- "not currently shared\n"), path);
- } else {
- ret = zfs_unshareall_bypath(zhp, path);
- }
- } else {
- char mtpt_prop[ZFS_MAXPROPLEN];
-
- verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop,
- sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0);
-
- if (is_manual) {
- ret = zfs_unmount(zhp, NULL, flags);
- } else if (strcmp(mtpt_prop, "legacy") == 0) {
- (void) fprintf(stderr, gettext("cannot unmount "
- "'%s': legacy mountpoint\n"),
- zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use umount(8) "
- "to unmount this filesystem\n"));
- } else {
- ret = zfs_unmountall(zhp, flags);
- }
- }
-
-out:
- zfs_close(zhp);
-
- return (ret != 0);
-}
-
-/*
- * Generic callback for unsharing or unmounting a filesystem.
- */
-static int
-unshare_unmount(int op, int argc, char **argv)
-{
- int do_all = 0;
- int flags = 0;
- int ret = 0;
- int c;
- zfs_handle_t *zhp;
- char nfs_mnt_prop[ZFS_MAXPROPLEN];
- char sharesmb[ZFS_MAXPROPLEN];
-
- /* check options */
- while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
- switch (c) {
- case 'a':
- do_all = 1;
- break;
- case 'f':
- flags = MS_FORCE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (do_all) {
- /*
- * We could make use of zfs_for_each() to walk all datasets in
- * the system, but this would be very inefficient, especially
- * since we would have to linearly search /etc/mnttab for each
- * one. Instead, do one pass through /etc/mnttab looking for
- * zfs entries and call zfs_unmount() for each one.
- *
- * Things get a little tricky if the administrator has created
- * mountpoints beneath other ZFS filesystems. In this case, we
- * have to unmount the deepest filesystems first. To accomplish
- * this, we place all the mountpoints in an AVL tree sorted by
- * the special type (dataset name), and walk the result in
- * reverse to make sure to get any snapshots first.
- */
- struct mnttab entry;
- uu_avl_pool_t *pool;
- uu_avl_t *tree = NULL;
- unshare_unmount_node_t *node;
- uu_avl_index_t idx;
- uu_avl_walk_t *walk;
-
- if (argc != 0) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- if (((pool = uu_avl_pool_create("unmount_pool",
- sizeof (unshare_unmount_node_t),
- offsetof(unshare_unmount_node_t, un_avlnode),
- unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
- ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
- nomem();
-
- rewind(mnttab_file);
- while (getmntent(mnttab_file, &entry) == 0) {
-
- /* ignore non-ZFS entries */
- if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
- continue;
-
- /* ignore snapshots */
- if (strchr(entry.mnt_special, '@') != NULL)
- continue;
-
- if ((zhp = zfs_open(g_zfs, entry.mnt_special,
- ZFS_TYPE_FILESYSTEM)) == NULL) {
- ret = 1;
- continue;
- }
-
- /*
- * Ignore datasets that are excluded/restricted by
- * parent pool name.
- */
- if (zpool_skip_pool(zfs_get_pool_name(zhp))) {
- zfs_close(zhp);
- continue;
- }
-
- switch (op) {
- case OP_SHARE:
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
- nfs_mnt_prop,
- sizeof (nfs_mnt_prop),
- NULL, NULL, 0, B_FALSE) == 0);
- if (strcmp(nfs_mnt_prop, "off") != 0)
- break;
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
- nfs_mnt_prop,
- sizeof (nfs_mnt_prop),
- NULL, NULL, 0, B_FALSE) == 0);
- if (strcmp(nfs_mnt_prop, "off") == 0)
- continue;
- break;
- case OP_MOUNT:
- /* Ignore legacy mounts */
- verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
- nfs_mnt_prop,
- sizeof (nfs_mnt_prop),
- NULL, NULL, 0, B_FALSE) == 0);
- if (strcmp(nfs_mnt_prop, "legacy") == 0)
- continue;
- /* Ignore canmount=noauto mounts */
- if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
- ZFS_CANMOUNT_NOAUTO)
- continue;
- default:
- break;
- }
-
- node = safe_malloc(sizeof (unshare_unmount_node_t));
- node->un_zhp = zhp;
- node->un_mountp = safe_strdup(entry.mnt_mountp);
-
- uu_avl_node_init(node, &node->un_avlnode, pool);
-
- if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
- uu_avl_insert(tree, node, idx);
- } else {
- zfs_close(node->un_zhp);
- free(node->un_mountp);
- free(node);
- }
- }
-
- /*
- * Walk the AVL tree in reverse, unmounting each filesystem and
- * removing it from the AVL tree in the process.
- */
- if ((walk = uu_avl_walk_start(tree,
- UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
- nomem();
-
- while ((node = uu_avl_walk_next(walk)) != NULL) {
- uu_avl_remove(tree, node);
-
- switch (op) {
- case OP_SHARE:
- if (zfs_unshareall_bypath(node->un_zhp,
- node->un_mountp) != 0)
- ret = 1;
- break;
-
- case OP_MOUNT:
- if (zfs_unmount(node->un_zhp,
- node->un_mountp, flags) != 0)
- ret = 1;
- break;
- }
-
- zfs_close(node->un_zhp);
- free(node->un_mountp);
- free(node);
- }
-
- uu_avl_walk_end(walk);
- uu_avl_destroy(tree);
- uu_avl_pool_destroy(pool);
-
- } else {
- if (argc != 1) {
- if (argc == 0)
- (void) fprintf(stderr,
- gettext("missing filesystem argument\n"));
- else
- (void) fprintf(stderr,
- gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- /*
- * We have an argument, but it may be a full path or a ZFS
- * filesystem. Pass full paths off to unmount_path() (shared by
- * manual_unmount), otherwise open the filesystem and pass to
- * zfs_unmount().
- */
- if (argv[0][0] == '/')
- return (unshare_unmount_path(op, argv[0],
- flags, B_FALSE));
-
- if ((zhp = zfs_open(g_zfs, argv[0],
- ZFS_TYPE_FILESYSTEM)) == NULL)
- return (1);
-
- verify(zfs_prop_get(zhp, op == OP_SHARE ?
- ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
- nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
- NULL, 0, B_FALSE) == 0);
-
- switch (op) {
- case OP_SHARE:
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
- nfs_mnt_prop,
- sizeof (nfs_mnt_prop),
- NULL, NULL, 0, B_FALSE) == 0);
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
- sharesmb, sizeof (sharesmb), NULL, NULL,
- 0, B_FALSE) == 0);
-
- if (strcmp(nfs_mnt_prop, "off") == 0 &&
- strcmp(sharesmb, "off") == 0) {
- (void) fprintf(stderr, gettext("cannot "
- "unshare '%s': legacy share\n"),
- zfs_get_name(zhp));
-#ifdef illumos
- (void) fprintf(stderr, gettext("use "
- "unshare(1M) to unshare this "
- "filesystem\n"));
-#endif
- ret = 1;
- } else if (!zfs_is_shared(zhp)) {
- (void) fprintf(stderr, gettext("cannot "
- "unshare '%s': not currently "
- "shared\n"), zfs_get_name(zhp));
- ret = 1;
- } else if (zfs_unshareall(zhp) != 0) {
- ret = 1;
- }
- break;
-
- case OP_MOUNT:
- if (strcmp(nfs_mnt_prop, "legacy") == 0) {
- (void) fprintf(stderr, gettext("cannot "
- "unmount '%s': legacy "
- "mountpoint\n"), zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use "
- "umount(8) to unmount this "
- "filesystem\n"));
- ret = 1;
- } else if (!zfs_is_mounted(zhp, NULL)) {
- (void) fprintf(stderr, gettext("cannot "
- "unmount '%s': not currently "
- "mounted\n"),
- zfs_get_name(zhp));
- ret = 1;
- } else if (zfs_unmountall(zhp, flags) != 0) {
- ret = 1;
- }
- break;
- }
-
- zfs_close(zhp);
- }
-
- return (ret);
-}
-
-/*
- * zfs unmount -a
- * zfs unmount filesystem
- *
- * Unmount all filesystems, or a specific ZFS filesystem.
- */
-static int
-zfs_do_unmount(int argc, char **argv)
-{
- return (unshare_unmount(OP_MOUNT, argc, argv));
-}
-
-/*
- * zfs unshare -a
- * zfs unshare filesystem
- *
- * Unshare all filesystems, or a specific ZFS filesystem.
- */
-static int
-zfs_do_unshare(int argc, char **argv)
-{
- return (unshare_unmount(OP_SHARE, argc, argv));
-}
-
-/*
- * Attach/detach the given dataset to/from the given jail
- */
-/* ARGSUSED */
-static int
-do_jail(int argc, char **argv, int attach)
-{
- zfs_handle_t *zhp;
- int jailid, ret;
-
- /* check number of arguments */
- if (argc < 3) {
- (void) fprintf(stderr, gettext("missing argument(s)\n"));
- usage(B_FALSE);
- }
- if (argc > 3) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- jailid = jail_getid(argv[1]);
- if (jailid < 0) {
- (void) fprintf(stderr, gettext("invalid jail id or name\n"));
- usage(B_FALSE);
- }
-
- zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
- if (zhp == NULL)
- return (1);
-
- ret = (zfs_jail(zhp, jailid, attach) != 0);
-
- zfs_close(zhp);
- return (ret);
-}
-
-/*
- * zfs jail jailid filesystem
- *
- * Attach the given dataset to the given jail
- */
-/* ARGSUSED */
-static int
-zfs_do_jail(int argc, char **argv)
-{
-
- return (do_jail(argc, argv, 1));
-}
-
-/*
- * zfs unjail jailid filesystem
- *
- * Detach the given dataset from the given jail
- */
-/* ARGSUSED */
-static int
-zfs_do_unjail(int argc, char **argv)
-{
-
- return (do_jail(argc, argv, 0));
-}
-
-/*
- * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is
- * 'legacy'. Otherwise, complain that use should be using 'zfs mount'.
- */
-static int
-manual_mount(int argc, char **argv)
-{
- zfs_handle_t *zhp;
- char mountpoint[ZFS_MAXPROPLEN];
- char mntopts[MNT_LINE_MAX] = { '\0' };
- int ret = 0;
- int c;
- int flags = 0;
- char *dataset, *path;
-
- /* check options */
- while ((c = getopt(argc, argv, ":mo:O")) != -1) {
- switch (c) {
- case 'o':
- (void) strlcpy(mntopts, optarg, sizeof (mntopts));
- break;
- case 'O':
- flags |= MS_OVERLAY;
- break;
- case 'm':
- flags |= MS_NOMNTTAB;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- (void) fprintf(stderr, gettext("usage: mount [-o opts] "
- "<path>\n"));
- return (2);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check that we only have two arguments */
- if (argc != 2) {
- if (argc == 0)
- (void) fprintf(stderr, gettext("missing dataset "
- "argument\n"));
- else if (argc == 1)
- (void) fprintf(stderr,
- gettext("missing mountpoint argument\n"));
- else
- (void) fprintf(stderr, gettext("too many arguments\n"));
- (void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
- return (2);
- }
-
- dataset = argv[0];
- path = argv[1];
-
- /* try to open the dataset */
- if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
- return (1);
-
- (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
- sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
-
- /* check for legacy mountpoint and complain appropriately */
- ret = 0;
- if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
- if (zmount(dataset, path, flags, MNTTYPE_ZFS,
- NULL, 0, mntopts, sizeof (mntopts)) != 0) {
- (void) fprintf(stderr, gettext("mount failed: %s\n"),
- strerror(errno));
- ret = 1;
- }
- } else {
- (void) fprintf(stderr, gettext("filesystem '%s' cannot be "
- "mounted using 'mount -t zfs'\n"), dataset);
- (void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
- "instead.\n"), path);
- (void) fprintf(stderr, gettext("If you must use 'mount -t zfs' "
- "or /etc/fstab, use 'zfs set mountpoint=legacy'.\n"));
- (void) fprintf(stderr, gettext("See zfs(8) for more "
- "information.\n"));
- ret = 1;
- }
-
- return (ret);
-}
-
-/*
- * Called when invoked as /etc/fs/zfs/umount. Unlike a manual mount, we allow
- * unmounts of non-legacy filesystems, as this is the dominant administrative
- * interface.
- */
-static int
-manual_unmount(int argc, char **argv)
-{
- int flags = 0;
- int c;
-
- /* check options */
- while ((c = getopt(argc, argv, "f")) != -1) {
- switch (c) {
- case 'f':
- flags = MS_FORCE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- (void) fprintf(stderr, gettext("usage: unmount [-f] "
- "<path>\n"));
- return (2);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check arguments */
- if (argc != 1) {
- if (argc == 0)
- (void) fprintf(stderr, gettext("missing path "
- "argument\n"));
- else
- (void) fprintf(stderr, gettext("too many arguments\n"));
- (void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
- return (2);
- }
-
- return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE));
-}
-
-static int
-find_command_idx(char *command, int *idx)
-{
- int i;
-
- for (i = 0; i < NCOMMAND; i++) {
- if (command_table[i].name == NULL)
- continue;
-
- if (strcmp(command, command_table[i].name) == 0) {
- *idx = i;
- return (0);
- }
- }
- return (1);
-}
-
-static int
-zfs_do_diff(int argc, char **argv)
-{
- zfs_handle_t *zhp;
- int flags = 0;
- char *tosnap = NULL;
- char *fromsnap = NULL;
- char *atp, *copy;
- int err = 0;
- int c;
-
- while ((c = getopt(argc, argv, "FHt")) != -1) {
- switch (c) {
- case 'F':
- flags |= ZFS_DIFF_CLASSIFY;
- break;
- case 'H':
- flags |= ZFS_DIFF_PARSEABLE;
- break;
- case 't':
- flags |= ZFS_DIFF_TIMESTAMP;
- break;
- default:
- (void) fprintf(stderr,
- gettext("invalid option '%c'\n"), optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr,
- gettext("must provide at least one snapshot name\n"));
- usage(B_FALSE);
- }
-
- if (argc > 2) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- fromsnap = argv[0];
- tosnap = (argc == 2) ? argv[1] : NULL;
-
- copy = NULL;
- if (*fromsnap != '@')
- copy = strdup(fromsnap);
- else if (tosnap)
- copy = strdup(tosnap);
- if (copy == NULL)
- usage(B_FALSE);
-
- if ((atp = strchr(copy, '@')) != NULL)
- *atp = '\0';
-
- if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL)
- return (1);
-
- free(copy);
-
- /*
- * Ignore SIGPIPE so that the library can give us
- * information on any failure
- */
- (void) sigignore(SIGPIPE);
-
- err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
-
- zfs_close(zhp);
-
- return (err != 0);
-}
-
-/*
- * zfs remap <filesystem | volume>
- *
- * Remap the indirect blocks in the given fileystem or volume.
- */
-static int
-zfs_do_remap(int argc, char **argv)
-{
- const char *fsname;
- int err = 0;
- int c;
-
- /* check options */
- while ((c = getopt(argc, argv, "")) != -1) {
- switch (c) {
- case '?':
- (void) fprintf(stderr,
- gettext("invalid option '%c'\n"), optopt);
- usage(B_FALSE);
- }
- }
-
- if (argc != 2) {
- (void) fprintf(stderr, gettext("wrong number of arguments\n"));
- usage(B_FALSE);
- }
-
- fsname = argv[1];
- err = zfs_remap_indirects(g_zfs, fsname);
-
- return (err);
-}
-
-/*
- * zfs bookmark <fs@snap> <fs#bmark>
- *
- * Creates a bookmark with the given name from the given snapshot.
- */
-static int
-zfs_do_bookmark(int argc, char **argv)
-{
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- zfs_handle_t *zhp;
- nvlist_t *nvl;
- int ret = 0;
- int c;
-
- /* check options */
- while ((c = getopt(argc, argv, "")) != -1) {
- switch (c) {
- case '?':
- (void) fprintf(stderr,
- gettext("invalid option '%c'\n"), optopt);
- goto usage;
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing snapshot argument\n"));
- goto usage;
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing bookmark argument\n"));
- goto usage;
- }
-
- if (strchr(argv[1], '#') == NULL) {
- (void) fprintf(stderr,
- gettext("invalid bookmark name '%s' -- "
- "must contain a '#'\n"), argv[1]);
- goto usage;
- }
-
- if (argv[0][0] == '@') {
- /*
- * Snapshot name begins with @.
- * Default to same fs as bookmark.
- */
- (void) strncpy(snapname, argv[1], sizeof (snapname));
- *strchr(snapname, '#') = '\0';
- (void) strlcat(snapname, argv[0], sizeof (snapname));
- } else {
- (void) strncpy(snapname, argv[0], sizeof (snapname));
- }
- zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT);
- if (zhp == NULL)
- goto usage;
- zfs_close(zhp);
-
-
- nvl = fnvlist_alloc();
- fnvlist_add_string(nvl, argv[1], snapname);
- ret = lzc_bookmark(nvl, NULL);
- fnvlist_free(nvl);
-
- if (ret != 0) {
- const char *err_msg = NULL;
- char errbuf[1024];
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot create bookmark '%s'"), argv[1]);
-
- switch (ret) {
- case EXDEV:
- err_msg = "bookmark is in a different pool";
- break;
- case EEXIST:
- err_msg = "bookmark exists";
- break;
- case EINVAL:
- err_msg = "invalid argument";
- break;
- case ENOTSUP:
- err_msg = "bookmark feature not enabled";
- break;
- case ENOSPC:
- err_msg = "out of space";
- break;
- default:
- (void) zfs_standard_error(g_zfs, ret, errbuf);
- break;
- }
- if (err_msg != NULL) {
- (void) fprintf(stderr, "%s: %s\n", errbuf,
- dgettext(TEXT_DOMAIN, err_msg));
- }
- }
-
- return (ret != 0);
-
-usage:
- usage(B_FALSE);
- return (-1);
-}
-
-static int
-zfs_do_channel_program(int argc, char **argv)
-{
- int ret, fd;
- char c;
- char *progbuf, *filename, *poolname;
- size_t progsize, progread;
- nvlist_t *outnvl = NULL;
- uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT;
- uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT;
- boolean_t sync_flag = B_TRUE, json_output = B_FALSE;
- zpool_handle_t *zhp;
-
- /* check options */
- while (-1 !=
- (c = getopt(argc, argv, "jnt:(instr-limit)m:(memory-limit)"))) {
- switch (c) {
- case 't':
- case 'm': {
- uint64_t arg;
- char *endp;
-
- errno = 0;
- arg = strtoull(optarg, &endp, 0);
- if (errno != 0 || *endp != '\0') {
- (void) fprintf(stderr, gettext(
- "invalid argument "
- "'%s': expected integer\n"), optarg);
- goto usage;
- }
-
- if (c == 't') {
- if (arg > ZCP_MAX_INSTRLIMIT || arg == 0) {
- (void) fprintf(stderr, gettext(
- "Invalid instruction limit: "
- "%s\n"), optarg);
- return (1);
- } else {
- instrlimit = arg;
- }
- } else {
- ASSERT3U(c, ==, 'm');
- if (arg > ZCP_MAX_MEMLIMIT || arg == 0) {
- (void) fprintf(stderr, gettext(
- "Invalid memory limit: "
- "%s\n"), optarg);
- return (1);
- } else {
- memlimit = arg;
- }
- }
- break;
- }
- case 'n': {
- sync_flag = B_FALSE;
- break;
- }
- case 'j': {
- json_output = B_TRUE;
- break;
- }
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- goto usage;
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 2) {
- (void) fprintf(stderr,
- gettext("invalid number of arguments\n"));
- goto usage;
- }
-
- poolname = argv[0];
- filename = argv[1];
- if (strcmp(filename, "-") == 0) {
- fd = 0;
- filename = "standard input";
- } else if ((fd = open(filename, O_RDONLY)) < 0) {
- (void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
- filename, strerror(errno));
- return (1);
- }
-
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
- (void) fprintf(stderr, gettext("cannot open pool '%s'"),
- poolname);
- return (1);
- }
- zpool_close(zhp);
-
- /*
- * Read in the channel program, expanding the program buffer as
- * necessary.
- */
- progread = 0;
- progsize = 1024;
- progbuf = safe_malloc(progsize);
- do {
- ret = read(fd, progbuf + progread, progsize - progread);
- progread += ret;
- if (progread == progsize && ret > 0) {
- progsize *= 2;
- progbuf = safe_realloc(progbuf, progsize);
- }
- } while (ret > 0);
-
- if (fd != 0)
- (void) close(fd);
- if (ret < 0) {
- free(progbuf);
- (void) fprintf(stderr,
- gettext("cannot read '%s': %s\n"),
- filename, strerror(errno));
- return (1);
- }
- progbuf[progread] = '\0';
-
- /*
- * Any remaining arguments are passed as arguments to the lua script as
- * a string array:
- * {
- * "argv" -> [ "arg 1", ... "arg n" ],
- * }
- */
- nvlist_t *argnvl = fnvlist_alloc();
- fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2);
-
- if (sync_flag) {
- ret = lzc_channel_program(poolname, progbuf,
- instrlimit, memlimit, argnvl, &outnvl);
- } else {
- ret = lzc_channel_program_nosync(poolname, progbuf,
- instrlimit, memlimit, argnvl, &outnvl);
- }
-
- if (ret != 0) {
- /*
- * On error, report the error message handed back by lua if one
- * exists. Otherwise, generate an appropriate error message,
- * falling back on strerror() for an unexpected return code.
- */
- char *errstring = NULL;
- const char *msg = gettext("Channel program execution failed");
- if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) {
- (void) nvlist_lookup_string(outnvl,
- ZCP_RET_ERROR, &errstring);
- if (errstring == NULL)
- errstring = strerror(ret);
- } else {
- switch (ret) {
- case EINVAL:
- errstring =
- "Invalid instruction or memory limit.";
- break;
- case ENOMEM:
- errstring = "Return value too large.";
- break;
- case ENOSPC:
- errstring = "Memory limit exhausted.";
- break;
-#ifdef illumos
- case ETIME:
-#else
- case ETIMEDOUT:
-#endif
- errstring = "Timed out.";
- break;
- case EPERM:
- errstring = "Permission denied. Channel "
- "programs must be run as root.";
- break;
- default:
- (void) zfs_standard_error(g_zfs, ret, msg);
- }
- }
- if (errstring != NULL)
- (void) fprintf(stderr, "%s:\n%s\n", msg, errstring);
- } else {
- if (json_output) {
- (void) nvlist_print_json(stdout, outnvl);
- } else if (nvlist_empty(outnvl)) {
- (void) fprintf(stdout, gettext("Channel program fully "
- "executed and did not produce output.\n"));
- } else {
- (void) fprintf(stdout, gettext("Channel program fully "
- "executed and produced output:\n"));
- dump_nvlist(outnvl, 4);
- }
- }
-
- free(progbuf);
- fnvlist_free(outnvl);
- fnvlist_free(argnvl);
- return (ret != 0);
-
-usage:
- usage(B_FALSE);
- return (-1);
-}
-
-int
-main(int argc, char **argv)
-{
- int ret = 0;
- int i;
- char *progname;
- char *cmdname;
-
- (void) setlocale(LC_ALL, "");
- (void) textdomain(TEXT_DOMAIN);
-
- opterr = 0;
-
- if ((g_zfs = libzfs_init()) == NULL) {
- (void) fprintf(stderr, gettext("internal error: failed to "
- "initialize ZFS library\n"));
- return (1);
- }
-
- zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
-
- libzfs_print_on_error(g_zfs, B_TRUE);
-
- if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) {
- (void) fprintf(stderr, gettext("internal error: unable to "
- "open %s\n"), MNTTAB);
- return (1);
- }
-
- /*
- * This command also doubles as the /etc/fs mount and unmount program.
- * Determine if we should take this behavior based on argv[0].
- */
- progname = basename(argv[0]);
- if (strcmp(progname, "mount") == 0) {
- ret = manual_mount(argc, argv);
- } else if (strcmp(progname, "umount") == 0) {
- ret = manual_unmount(argc, argv);
- } else {
- /*
- * Make sure the user has specified some command.
- */
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing command\n"));
- usage(B_FALSE);
- }
-
- cmdname = argv[1];
-
- /*
- * The 'umount' command is an alias for 'unmount'
- */
- if (strcmp(cmdname, "umount") == 0)
- cmdname = "unmount";
-
- /*
- * The 'recv' command is an alias for 'receive'
- */
- if (strcmp(cmdname, "recv") == 0)
- cmdname = "receive";
-
- /*
- * The 'snap' command is an alias for 'snapshot'
- */
- if (strcmp(cmdname, "snap") == 0)
- cmdname = "snapshot";
-
- /*
- * Special case '-?'
- */
- if (strcmp(cmdname, "-?") == 0)
- usage(B_TRUE);
-
- /*
- * Run the appropriate command.
- */
- libzfs_mnttab_cache(g_zfs, B_TRUE);
- if (find_command_idx(cmdname, &i) == 0) {
- current_command = &command_table[i];
- ret = command_table[i].func(argc - 1, argv + 1);
- } else if (strchr(cmdname, '=') != NULL) {
- verify(find_command_idx("set", &i) == 0);
- current_command = &command_table[i];
- ret = command_table[i].func(argc, argv);
- } else {
- (void) fprintf(stderr, gettext("unrecognized "
- "command '%s'\n"), cmdname);
- usage(B_FALSE);
- }
- libzfs_mnttab_cache(g_zfs, B_FALSE);
- }
-
- (void) fclose(mnttab_file);
-
- if (ret == 0 && log_history)
- (void) zpool_log_history(g_zfs, history_str);
-
- libzfs_fini(g_zfs);
-
- /*
- * The 'ZFS_ABORT' environment variable causes us to dump core on exit
- * for the purposes of running ::findleaks.
- */
- if (getenv("ZFS_ABORT") != NULL) {
- (void) printf("dumping core by request\n");
- abort();
- }
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h
@@ -1,42 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _ZFS_UTIL_H
-#define _ZFS_UTIL_H
-
-#include <libzfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void * safe_malloc(size_t size);
-void nomem(void);
-extern libzfs_handle_t *g_zfs;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_UTIL_H */
Index: head/cddl/contrib/opensolaris/cmd/zhack/zhack.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zhack/zhack.c
+++ head/cddl/contrib/opensolaris/cmd/zhack/zhack.c
@@ -1,535 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- */
-
-/*
- * zhack is a debugging tool that can write changes to ZFS pool using libzpool
- * for testing purposes. Altering pools with zhack is unsupported and may
- * result in corrupted pools.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/zfs_znode.h>
-#include <sys/dsl_synctask.h>
-#include <sys/vdev.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_pool.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/zfeature.h>
-#include <sys/dmu_tx.h>
-#undef verify
-#include <libzfs.h>
-
-extern boolean_t zfeature_checks_disable;
-
-const char cmdname[] = "zhack";
-libzfs_handle_t *g_zfs;
-static importargs_t g_importargs;
-static char *g_pool;
-static boolean_t g_readonly;
-
-static void
-usage(void)
-{
- (void) fprintf(stderr,
- "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n"
- "where <subcommand> <args> is one of the following:\n"
- "\n", cmdname);
-
- (void) fprintf(stderr,
- " feature stat <pool>\n"
- " print information about enabled features\n"
- " feature enable [-d desc] <pool> <feature>\n"
- " add a new enabled feature to the pool\n"
- " -d <desc> sets the feature's description\n"
- " feature ref [-md] <pool> <feature>\n"
- " change the refcount on the given feature\n"
- " -d decrease instead of increase the refcount\n"
- " -m add the feature to the label if increasing refcount\n"
- "\n"
- " <feature> : should be a feature guid\n");
- exit(1);
-}
-
-
-static void
-fatal(spa_t *spa, void *tag, const char *fmt, ...)
-{
- va_list ap;
-
- if (spa != NULL) {
- spa_close(spa, tag);
- (void) spa_export(g_pool, NULL, B_TRUE, B_FALSE);
- }
-
- va_start(ap, fmt);
- (void) fprintf(stderr, "%s: ", cmdname);
- (void) vfprintf(stderr, fmt, ap);
- va_end(ap);
- (void) fprintf(stderr, "\n");
-
- exit(1);
-}
-
-/* ARGSUSED */
-static int
-space_delta_cb(dmu_object_type_t bonustype, void *data,
- uint64_t *userp, uint64_t *groupp)
-{
- /*
- * Is it a valid type of object to track?
- */
- if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
- return (ENOENT);
- (void) fprintf(stderr, "modifying object that needs user accounting");
- abort();
- /* NOTREACHED */
-}
-
-/*
- * Target is the dataset whose pool we want to open.
- */
-static void
-zhack_import(char *target, boolean_t readonly)
-{
- nvlist_t *config;
- nvlist_t *props;
- int error;
-
- kernel_init(readonly ? FREAD : (FREAD | FWRITE));
- g_zfs = libzfs_init();
- ASSERT(g_zfs != NULL);
-
- dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb);
-
- g_readonly = readonly;
- g_importargs.unique = B_TRUE;
- g_importargs.can_be_active = readonly;
- g_pool = strdup(target);
-
- error = zpool_tryimport(g_zfs, target, &config, &g_importargs);
- if (error)
- fatal(NULL, FTAG, "cannot import '%s': %s", target,
- libzfs_error_description(g_zfs));
-
- props = NULL;
- if (readonly) {
- VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_uint64(props,
- zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0);
- }
-
- zfeature_checks_disable = B_TRUE;
- error = spa_import(target, config, props,
- (readonly ? ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL));
- zfeature_checks_disable = B_FALSE;
- if (error == EEXIST)
- error = 0;
-
- if (error)
- fatal(NULL, FTAG, "can't import '%s': %s", target,
- strerror(error));
-}
-
-static void
-zhack_spa_open(char *target, boolean_t readonly, void *tag, spa_t **spa)
-{
- int err;
-
- zhack_import(target, readonly);
-
- zfeature_checks_disable = B_TRUE;
- err = spa_open(target, spa, tag);
- zfeature_checks_disable = B_FALSE;
-
- if (err != 0)
- fatal(*spa, FTAG, "cannot open '%s': %s", target,
- strerror(err));
- if (spa_version(*spa) < SPA_VERSION_FEATURES) {
- fatal(*spa, FTAG, "'%s' has version %d, features not enabled",
- target, (int)spa_version(*spa));
- }
-}
-
-static void
-dump_obj(objset_t *os, uint64_t obj, const char *name)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
-
- (void) printf("%s_obj:\n", name);
-
- for (zap_cursor_init(&zc, os, obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- if (za.za_integer_length == 8) {
- ASSERT(za.za_num_integers == 1);
- (void) printf("\t%s = %llu\n",
- za.za_name, (u_longlong_t)za.za_first_integer);
- } else {
- ASSERT(za.za_integer_length == 1);
- char val[1024];
- VERIFY(zap_lookup(os, obj, za.za_name,
- 1, sizeof (val), val) == 0);
- (void) printf("\t%s = %s\n", za.za_name, val);
- }
- }
- zap_cursor_fini(&zc);
-}
-
-static void
-dump_mos(spa_t *spa)
-{
- nvlist_t *nv = spa->spa_label_features;
-
- (void) printf("label config:\n");
- for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
- pair != NULL;
- pair = nvlist_next_nvpair(nv, pair)) {
- (void) printf("\t%s\n", nvpair_name(pair));
- }
-}
-
-static void
-zhack_do_feature_stat(int argc, char **argv)
-{
- spa_t *spa;
- objset_t *os;
- char *target;
-
- argc--;
- argv++;
-
- if (argc < 1) {
- (void) fprintf(stderr, "error: missing pool name\n");
- usage();
- }
- target = argv[0];
-
- zhack_spa_open(target, B_TRUE, FTAG, &spa);
- os = spa->spa_meta_objset;
-
- dump_obj(os, spa->spa_feat_for_read_obj, "for_read");
- dump_obj(os, spa->spa_feat_for_write_obj, "for_write");
- dump_obj(os, spa->spa_feat_desc_obj, "descriptions");
- if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
- dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg");
- }
- dump_mos(spa);
-
- spa_close(spa, FTAG);
-}
-
-static void
-zhack_feature_enable_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- zfeature_info_t *feature = arg;
-
- feature_enable_sync(spa, feature, tx);
-
- spa_history_log_internal(spa, "zhack enable feature", tx,
- "guid=%s flags=%x",
- feature->fi_guid, feature->fi_flags);
-}
-
-static void
-zhack_do_feature_enable(int argc, char **argv)
-{
- char c;
- char *desc, *target;
- spa_t *spa;
- objset_t *mos;
- zfeature_info_t feature;
- spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
-
- /*
- * Features are not added to the pool's label until their refcounts
- * are incremented, so fi_mos can just be left as false for now.
- */
- desc = NULL;
- feature.fi_uname = "zhack";
- feature.fi_flags = 0;
- feature.fi_depends = nodeps;
- feature.fi_feature = SPA_FEATURE_NONE;
-
- optind = 1;
- while ((c = getopt(argc, argv, "rmd:")) != -1) {
- switch (c) {
- case 'r':
- feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
- break;
- case 'd':
- desc = strdup(optarg);
- break;
- default:
- usage();
- break;
- }
- }
-
- if (desc == NULL)
- desc = strdup("zhack injected");
- feature.fi_desc = desc;
-
- argc -= optind;
- argv += optind;
-
- if (argc < 2) {
- (void) fprintf(stderr, "error: missing feature or pool name\n");
- usage();
- }
- target = argv[0];
- feature.fi_guid = argv[1];
-
- if (!zfeature_is_valid_guid(feature.fi_guid))
- fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
-
- zhack_spa_open(target, B_FALSE, FTAG, &spa);
- mos = spa->spa_meta_objset;
-
- if (zfeature_is_supported(feature.fi_guid))
- fatal(spa, FTAG, "'%s' is a real feature, will not enable");
- if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
- fatal(spa, FTAG, "feature already enabled: %s",
- feature.fi_guid);
-
- VERIFY0(dsl_sync_task(spa_name(spa), NULL,
- zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL));
-
- spa_close(spa, FTAG);
-
- free(desc);
-}
-
-static void
-feature_incr_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- zfeature_info_t *feature = arg;
- uint64_t refcount;
-
- VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
- feature_sync(spa, feature, refcount + 1, tx);
- spa_history_log_internal(spa, "zhack feature incr", tx,
- "name=%s", feature->fi_guid);
-}
-
-static void
-feature_decr_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- zfeature_info_t *feature = arg;
- uint64_t refcount;
-
- VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
- feature_sync(spa, feature, refcount - 1, tx);
- spa_history_log_internal(spa, "zhack feature decr", tx,
- "name=%s", feature->fi_guid);
-}
-
-static void
-zhack_do_feature_ref(int argc, char **argv)
-{
- char c;
- char *target;
- boolean_t decr = B_FALSE;
- spa_t *spa;
- objset_t *mos;
- zfeature_info_t feature;
- spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
-
- /*
- * fi_desc does not matter here because it was written to disk
- * when the feature was enabled, but we need to properly set the
- * feature for read or write based on the information we read off
- * disk later.
- */
- feature.fi_uname = "zhack";
- feature.fi_flags = 0;
- feature.fi_desc = NULL;
- feature.fi_depends = nodeps;
- feature.fi_feature = SPA_FEATURE_NONE;
-
- optind = 1;
- while ((c = getopt(argc, argv, "md")) != -1) {
- switch (c) {
- case 'm':
- feature.fi_flags |= ZFEATURE_FLAG_MOS;
- break;
- case 'd':
- decr = B_TRUE;
- break;
- default:
- usage();
- break;
- }
- }
- argc -= optind;
- argv += optind;
-
- if (argc < 2) {
- (void) fprintf(stderr, "error: missing feature or pool name\n");
- usage();
- }
- target = argv[0];
- feature.fi_guid = argv[1];
-
- if (!zfeature_is_valid_guid(feature.fi_guid))
- fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
-
- zhack_spa_open(target, B_FALSE, FTAG, &spa);
- mos = spa->spa_meta_objset;
-
- if (zfeature_is_supported(feature.fi_guid)) {
- fatal(spa, FTAG,
- "'%s' is a real feature, will not change refcount");
- }
-
- if (0 == zap_contains(mos, spa->spa_feat_for_read_obj,
- feature.fi_guid)) {
- feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT;
- } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj,
- feature.fi_guid)) {
- feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
- } else {
- fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid);
- }
-
- if (decr) {
- uint64_t count;
- if (feature_get_refcount_from_disk(spa, &feature,
- &count) == 0 && count != 0) {
- fatal(spa, FTAG, "feature refcount already 0: %s",
- feature.fi_guid);
- }
- }
-
- VERIFY0(dsl_sync_task(spa_name(spa), NULL,
- decr ? feature_decr_sync : feature_incr_sync, &feature,
- 5, ZFS_SPACE_CHECK_NORMAL));
-
- spa_close(spa, FTAG);
-}
-
-static int
-zhack_do_feature(int argc, char **argv)
-{
- char *subcommand;
-
- argc--;
- argv++;
- if (argc == 0) {
- (void) fprintf(stderr,
- "error: no feature operation specified\n");
- usage();
- }
-
- subcommand = argv[0];
- if (strcmp(subcommand, "stat") == 0) {
- zhack_do_feature_stat(argc, argv);
- } else if (strcmp(subcommand, "enable") == 0) {
- zhack_do_feature_enable(argc, argv);
- } else if (strcmp(subcommand, "ref") == 0) {
- zhack_do_feature_ref(argc, argv);
- } else {
- (void) fprintf(stderr, "error: unknown subcommand: %s\n",
- subcommand);
- usage();
- }
-
- return (0);
-}
-
-#define MAX_NUM_PATHS 1024
-
-int
-main(int argc, char **argv)
-{
- extern void zfs_prop_init(void);
-
- char *path[MAX_NUM_PATHS];
- const char *subcommand;
- int rv = 0;
- char c;
-
- g_importargs.path = path;
-
- dprintf_setup(&argc, argv);
- zfs_prop_init();
-
- while ((c = getopt(argc, argv, "c:d:")) != -1) {
- switch (c) {
- case 'c':
- g_importargs.cachefile = optarg;
- break;
- case 'd':
- assert(g_importargs.paths < MAX_NUM_PATHS);
- g_importargs.path[g_importargs.paths++] = optarg;
- break;
- default:
- usage();
- break;
- }
- }
-
- argc -= optind;
- argv += optind;
- optind = 1;
-
- if (argc == 0) {
- (void) fprintf(stderr, "error: no command specified\n");
- usage();
- }
-
- subcommand = argv[0];
-
- if (strcmp(subcommand, "feature") == 0) {
- rv = zhack_do_feature(argc, argv);
- } else {
- (void) fprintf(stderr, "error: unknown subcommand: %s\n",
- subcommand);
- usage();
- }
-
- if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) {
- fatal(NULL, FTAG, "pool export failed; "
- "changes may not be committed to disk\n");
- }
-
- libzfs_fini(g_zfs);
- kernel_fini();
-
- return (rv);
-}
Index: head/cddl/contrib/opensolaris/cmd/zinject/translate.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zinject/translate.c
+++ head/cddl/contrib/opensolaris/cmd/zinject/translate.c
@@ -1,492 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- */
-
-#include <libzfs.h>
-
-#include <sys/zfs_context.h>
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <sys/file.h>
-#include <sys/mntent.h>
-#include <sys/mnttab.h>
-#include <sys/param.h>
-#include <sys/stat.h>
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dnode.h>
-#include <sys/vdev_impl.h>
-
-#include "zinject.h"
-
-extern void kernel_init(int);
-extern void kernel_fini(void);
-
-static int debug;
-
-static void
-ziprintf(const char *fmt, ...)
-{
- va_list ap;
-
- if (!debug)
- return;
-
- va_start(ap, fmt);
- (void) vprintf(fmt, ap);
- va_end(ap);
-}
-
-static void
-compress_slashes(const char *src, char *dest)
-{
- while (*src != '\0') {
- *dest = *src++;
- while (*dest == '/' && *src == '/')
- ++src;
- ++dest;
- }
- *dest = '\0';
-}
-
-/*
- * Given a full path to a file, translate into a dataset name and a relative
- * path within the dataset. 'dataset' must be at least MAXNAMELEN characters,
- * and 'relpath' must be at least MAXPATHLEN characters. We also pass a stat64
- * buffer, which we need later to get the object ID.
- */
-static int
-parse_pathname(const char *inpath, char *dataset, char *relpath,
- struct stat64 *statbuf)
-{
- struct statfs sfs;
- const char *rel;
- char fullpath[MAXPATHLEN];
-
- compress_slashes(inpath, fullpath);
-
- if (fullpath[0] != '/') {
- (void) fprintf(stderr, "invalid object '%s': must be full "
- "path\n", fullpath);
- usage();
- return (-1);
- }
-
- if (strlen(fullpath) >= MAXPATHLEN) {
- (void) fprintf(stderr, "invalid object; pathname too long\n");
- return (-1);
- }
-
- if (stat64(fullpath, statbuf) != 0) {
- (void) fprintf(stderr, "cannot open '%s': %s\n",
- fullpath, strerror(errno));
- return (-1);
- }
-
- if (statfs(fullpath, &sfs) == -1) {
- (void) fprintf(stderr, "cannot find mountpoint for '%s': %s\n",
- fullpath, strerror(errno));
- return (-1);
- }
-
- if (strcmp(sfs.f_fstypename, MNTTYPE_ZFS) != 0) {
- (void) fprintf(stderr, "invalid path '%s': not a ZFS "
- "filesystem\n", fullpath);
- return (-1);
- }
-
- if (strncmp(fullpath, sfs.f_mntonname, strlen(sfs.f_mntonname)) != 0) {
- (void) fprintf(stderr, "invalid path '%s': mountpoint "
- "doesn't match path\n", fullpath);
- return (-1);
- }
-
- (void) strcpy(dataset, sfs.f_mntfromname);
-
- rel = fullpath + strlen(sfs.f_mntonname);
- if (rel[0] == '/')
- rel++;
- (void) strcpy(relpath, rel);
-
- return (0);
-}
-
-/*
- * Convert from a (dataset, path) pair into a (objset, object) pair. Note that
- * we grab the object number from the inode number, since looking this up via
- * libzpool is a real pain.
- */
-/* ARGSUSED */
-static int
-object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
- zinject_record_t *record)
-{
- objset_t *os;
- int err;
-
- /*
- * Before doing any libzpool operations, call sync() to ensure that the
- * on-disk state is consistent with the in-core state.
- */
- sync();
-
- err = dmu_objset_own(dataset, DMU_OST_ZFS, B_TRUE, FTAG, &os);
- if (err != 0) {
- (void) fprintf(stderr, "cannot open dataset '%s': %s\n",
- dataset, strerror(err));
- return (-1);
- }
-
- record->zi_objset = dmu_objset_id(os);
- record->zi_object = statbuf->st_ino;
-
- dmu_objset_disown(os, FTAG);
-
- return (0);
-}
-
-/*
- * Calculate the real range based on the type, level, and range given.
- */
-static int
-calculate_range(const char *dataset, err_type_t type, int level, char *range,
- zinject_record_t *record)
-{
- objset_t *os = NULL;
- dnode_t *dn = NULL;
- int err;
- int ret = -1;
-
- /*
- * Determine the numeric range from the string.
- */
- if (range == NULL) {
- /*
- * If range is unspecified, set the range to [0,-1], which
- * indicates that the whole object should be treated as an
- * error.
- */
- record->zi_start = 0;
- record->zi_end = -1ULL;
- } else {
- char *end;
-
- /* XXX add support for suffixes */
- record->zi_start = strtoull(range, &end, 10);
-
-
- if (*end == '\0')
- record->zi_end = record->zi_start + 1;
- else if (*end == ',')
- record->zi_end = strtoull(end + 1, &end, 10);
-
- if (*end != '\0') {
- (void) fprintf(stderr, "invalid range '%s': must be "
- "a numeric range of the form 'start[,end]'\n",
- range);
- goto out;
- }
- }
-
- switch (type) {
- case TYPE_DATA:
- break;
-
- case TYPE_DNODE:
- /*
- * If this is a request to inject faults into the dnode, then we
- * must translate the current (objset,object) pair into an
- * offset within the metadnode for the objset. Specifying any
- * kind of range with type 'dnode' is illegal.
- */
- if (range != NULL) {
- (void) fprintf(stderr, "range cannot be specified when "
- "type is 'dnode'\n");
- goto out;
- }
-
- record->zi_start = record->zi_object * sizeof (dnode_phys_t);
- record->zi_end = record->zi_start + sizeof (dnode_phys_t);
- record->zi_object = 0;
- break;
- }
-
- /*
- * Get the dnode associated with object, so we can calculate the block
- * size.
- */
- if ((err = dmu_objset_own(dataset, DMU_OST_ANY,
- B_TRUE, FTAG, &os)) != 0) {
- (void) fprintf(stderr, "cannot open dataset '%s': %s\n",
- dataset, strerror(err));
- goto out;
- }
-
- if (record->zi_object == 0) {
- dn = DMU_META_DNODE(os);
- } else {
- err = dnode_hold(os, record->zi_object, FTAG, &dn);
- if (err != 0) {
- (void) fprintf(stderr, "failed to hold dnode "
- "for object %llu\n",
- (u_longlong_t)record->zi_object);
- goto out;
- }
- }
-
-
- ziprintf("data shift: %d\n", (int)dn->dn_datablkshift);
- ziprintf(" ind shift: %d\n", (int)dn->dn_indblkshift);
-
- /*
- * Translate range into block IDs.
- */
- if (record->zi_start != 0 || record->zi_end != -1ULL) {
- record->zi_start >>= dn->dn_datablkshift;
- record->zi_end >>= dn->dn_datablkshift;
- }
-
- /*
- * Check level, and then translate level 0 blkids into ranges
- * appropriate for level of indirection.
- */
- record->zi_level = level;
- if (level > 0) {
- ziprintf("level 0 blkid range: [%llu, %llu]\n",
- record->zi_start, record->zi_end);
-
- if (level >= dn->dn_nlevels) {
- (void) fprintf(stderr, "level %d exceeds max level "
- "of object (%d)\n", level, dn->dn_nlevels - 1);
- goto out;
- }
-
- if (record->zi_start != 0 || record->zi_end != 0) {
- int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- for (; level > 0; level--) {
- record->zi_start >>= shift;
- record->zi_end >>= shift;
- }
- }
- }
-
- ret = 0;
-out:
- if (dn) {
- if (dn != DMU_META_DNODE(os))
- dnode_rele(dn, FTAG);
- }
- if (os)
- dmu_objset_disown(os, FTAG);
-
- return (ret);
-}
-
-int
-translate_record(err_type_t type, const char *object, const char *range,
- int level, zinject_record_t *record, char *poolname, char *dataset)
-{
- char path[MAXPATHLEN];
- char *slash;
- struct stat64 statbuf;
- int ret = -1;
-
- kernel_init(FREAD);
-
- debug = (getenv("ZINJECT_DEBUG") != NULL);
-
- ziprintf("translating: %s\n", object);
-
- if (MOS_TYPE(type)) {
- /*
- * MOS objects are treated specially.
- */
- switch (type) {
- case TYPE_MOS:
- record->zi_type = 0;
- break;
- case TYPE_MOSDIR:
- record->zi_type = DMU_OT_OBJECT_DIRECTORY;
- break;
- case TYPE_METASLAB:
- record->zi_type = DMU_OT_OBJECT_ARRAY;
- break;
- case TYPE_CONFIG:
- record->zi_type = DMU_OT_PACKED_NVLIST;
- break;
- case TYPE_BPOBJ:
- record->zi_type = DMU_OT_BPOBJ;
- break;
- case TYPE_SPACEMAP:
- record->zi_type = DMU_OT_SPACE_MAP;
- break;
- case TYPE_ERRLOG:
- record->zi_type = DMU_OT_ERROR_LOG;
- break;
- }
-
- dataset[0] = '\0';
- (void) strcpy(poolname, object);
- return (0);
- }
-
- /*
- * Convert a full path into a (dataset, file) pair.
- */
- if (parse_pathname(object, dataset, path, &statbuf) != 0)
- goto err;
-
- ziprintf(" dataset: %s\n", dataset);
- ziprintf(" path: %s\n", path);
-
- /*
- * Convert (dataset, file) into (objset, object)
- */
- if (object_from_path(dataset, path, &statbuf, record) != 0)
- goto err;
-
- ziprintf("raw objset: %llu\n", record->zi_objset);
- ziprintf("raw object: %llu\n", record->zi_object);
-
- /*
- * For the given object, calculate the real (type, level, range)
- */
- if (calculate_range(dataset, type, level, (char *)range, record) != 0)
- goto err;
-
- ziprintf(" objset: %llu\n", record->zi_objset);
- ziprintf(" object: %llu\n", record->zi_object);
- if (record->zi_start == 0 &&
- record->zi_end == -1ULL)
- ziprintf(" range: all\n");
- else
- ziprintf(" range: [%llu, %llu]\n", record->zi_start,
- record->zi_end);
-
- /*
- * Copy the pool name
- */
- (void) strcpy(poolname, dataset);
- if ((slash = strchr(poolname, '/')) != NULL)
- *slash = '\0';
-
- ret = 0;
-
-err:
- kernel_fini();
- return (ret);
-}
-
-int
-translate_raw(const char *str, zinject_record_t *record)
-{
- /*
- * A raw bookmark of the form objset:object:level:blkid, where each
- * number is a hexidecimal value.
- */
- if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset,
- (u_longlong_t *)&record->zi_object, &record->zi_level,
- (u_longlong_t *)&record->zi_start) != 4) {
- (void) fprintf(stderr, "bad raw spec '%s': must be of the form "
- "'objset:object:level:blkid'\n", str);
- return (-1);
- }
-
- record->zi_end = record->zi_start;
-
- return (0);
-}
-
-int
-translate_device(const char *pool, const char *device, err_type_t label_type,
- zinject_record_t *record)
-{
- char *end;
- zpool_handle_t *zhp;
- nvlist_t *tgt;
- boolean_t isspare, iscache;
-
- /*
- * Given a device name or GUID, create an appropriate injection record
- * with zi_guid set.
- */
- if ((zhp = zpool_open(g_zfs, pool)) == NULL)
- return (-1);
-
- record->zi_guid = strtoull(device, &end, 16);
- if (record->zi_guid == 0 || *end != '\0') {
- tgt = zpool_find_vdev(zhp, device, &isspare, &iscache, NULL);
-
- if (tgt == NULL) {
- (void) fprintf(stderr, "cannot find device '%s' in "
- "pool '%s'\n", device, pool);
- return (-1);
- }
-
- verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
- &record->zi_guid) == 0);
- }
-
- /*
- * Device faults can take on three different forms:
- * 1). delayed or hanging I/O
- * 2). zfs label faults
- * 3). generic disk faults
- */
- if (record->zi_timer != 0) {
- record->zi_cmd = ZINJECT_DELAY_IO;
- } else if (label_type != TYPE_INVAL) {
- record->zi_cmd = ZINJECT_LABEL_FAULT;
- } else {
- record->zi_cmd = ZINJECT_DEVICE_FAULT;
- }
-
- switch (label_type) {
- case TYPE_LABEL_UBERBLOCK:
- record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);
- record->zi_end = record->zi_start + VDEV_UBERBLOCK_RING - 1;
- break;
- case TYPE_LABEL_NVLIST:
- record->zi_start = offsetof(vdev_label_t, vl_vdev_phys);
- record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1;
- break;
- case TYPE_LABEL_PAD1:
- record->zi_start = offsetof(vdev_label_t, vl_pad1);
- record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
- break;
- case TYPE_LABEL_PAD2:
- record->zi_start = offsetof(vdev_label_t, vl_be);
- record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
- break;
- }
- return (0);
-}
Index: head/cddl/contrib/opensolaris/cmd/zinject/zinject.h
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zinject/zinject.h
+++ head/cddl/contrib/opensolaris/cmd/zinject/zinject.h
@@ -1,70 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _ZINJECT_H
-#define _ZINJECT_H
-
-#include <sys/zfs_ioctl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
- TYPE_DATA, /* plain file contents */
- TYPE_DNODE, /* metadnode contents */
- TYPE_MOS, /* all MOS data */
- TYPE_MOSDIR, /* MOS object directory */
- TYPE_METASLAB, /* metaslab objects */
- TYPE_CONFIG, /* MOS config */
- TYPE_BPOBJ, /* block pointer list */
- TYPE_SPACEMAP, /* space map objects */
- TYPE_ERRLOG, /* persistent error log */
- TYPE_LABEL_UBERBLOCK, /* label specific uberblock */
- TYPE_LABEL_NVLIST, /* label specific nvlist */
- TYPE_LABEL_PAD1, /* label specific 8K pad1 area */
- TYPE_LABEL_PAD2, /* label specific 8K pad2 area */
- TYPE_INVAL
-} err_type_t;
-
-#define MOS_TYPE(t) \
- ((t) >= TYPE_MOS && (t) < TYPE_LABEL_UBERBLOCK)
-
-#define LABEL_TYPE(t) \
- ((t) >= TYPE_LABEL_UBERBLOCK && (t) < TYPE_INVAL)
-
-int translate_record(err_type_t type, const char *object, const char *range,
- int level, zinject_record_t *record, char *poolname, char *dataset);
-int translate_raw(const char *raw, zinject_record_t *record);
-int translate_device(const char *pool, const char *device,
- err_type_t label_type, zinject_record_t *record);
-void usage(void);
-
-extern libzfs_handle_t *g_zfs;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZINJECT_H */
Index: head/cddl/contrib/opensolaris/cmd/zinject/zinject.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zinject/zinject.c
+++ head/cddl/contrib/opensolaris/cmd/zinject/zinject.c
@@ -1,1093 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- */
-
-/*
- * ZFS Fault Injector
- *
- * This userland component takes a set of options and uses libzpool to translate
- * from a user-visible object type and name to an internal representation.
- * There are two basic types of faults: device faults and data faults.
- *
- *
- * DEVICE FAULTS
- *
- * Errors can be injected into a particular vdev using the '-d' option. This
- * option takes a path or vdev GUID to uniquely identify the device within a
- * pool. There are two types of errors that can be injected, EIO and ENXIO,
- * that can be controlled through the '-e' option. The default is ENXIO. For
- * EIO failures, any attempt to read data from the device will return EIO, but
- * subsequent attempt to reopen the device will succeed. For ENXIO failures,
- * any attempt to read from the device will return EIO, but any attempt to
- * reopen the device will also return ENXIO.
- * For label faults, the -L option must be specified. This allows faults
- * to be injected into either the nvlist, uberblock, pad1, or pad2 region
- * of all the labels for the specified device.
- *
- * This form of the command looks like:
- *
- * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
- *
- *
- * DATA FAULTS
- *
- * We begin with a tuple of the form:
- *
- * <type,level,range,object>
- *
- * type A string describing the type of data to target. Each type
- * implicitly describes how to interpret 'object'. Currently,
- * the following values are supported:
- *
- * data User data for a file
- * dnode Dnode for a file or directory
- *
- * The following MOS objects are special. Instead of injecting
- * errors on a particular object or blkid, we inject errors across
- * all objects of the given type.
- *
- * mos Any data in the MOS
- * mosdir object directory
- * config pool configuration
- * bpobj blkptr list
- * spacemap spacemap
- * metaslab metaslab
- * errlog persistent error log
- *
- * level Object level. Defaults to '0', not applicable to all types. If
- * a range is given, this corresponds to the indirect block
- * corresponding to the specific range.
- *
- * range A numerical range [start,end) within the object. Defaults to
- * the full size of the file.
- *
- * object A string describing the logical location of the object. For
- * files and directories (currently the only supported types),
- * this is the path of the object on disk.
- *
- * This is translated, via libzpool, into the following internal representation:
- *
- * <type,objset,object,level,range>
- *
- * These types should be self-explanatory. This tuple is then passed to the
- * kernel via a special ioctl() to initiate fault injection for the given
- * object. Note that 'type' is not strictly necessary for fault injection, but
- * is used when translating existing faults into a human-readable string.
- *
- *
- * The command itself takes one of the forms:
- *
- * zinject
- * zinject <-a | -u pool>
- * zinject -c <id|all>
- * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
- * [-r range] <object>
- * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
- *
- * With no arguments, the command prints all currently registered injection
- * handlers, with their numeric identifiers.
- *
- * The '-c' option will clear the given handler, or all handlers if 'all' is
- * specified.
- *
- * The '-e' option takes a string describing the errno to simulate. This must
- * be either 'io' or 'checksum'. In most cases this will result in the same
- * behavior, but RAID-Z will produce a different set of ereports for this
- * situation.
- *
- * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is
- * specified, then the ARC cache is flushed appropriately. If '-u' is
- * specified, then the underlying SPA is unloaded. Either of these flags can be
- * specified independently of any other handlers. The '-m' flag automatically
- * does an unmount and remount of the underlying dataset to aid in flushing the
- * cache.
- *
- * The '-f' flag controls the frequency of errors injected, expressed as a
- * integer percentage between 1 and 100. The default is 100.
- *
- * The this form is responsible for actually injecting the handler into the
- * framework. It takes the arguments described above, translates them to the
- * internal tuple using libzpool, and then issues an ioctl() to register the
- * handler.
- *
- * The final form can target a specific bookmark, regardless of whether a
- * human-readable interface has been designed. It allows developers to specify
- * a particular block by number.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-
-#include <sys/fs/zfs.h>
-#include <sys/param.h>
-#include <sys/mount.h>
-
-#include <libzfs.h>
-#include <libzfs_compat.h>
-
-#undef verify /* both libzfs.h and zfs_context.h want to define this */
-
-#include "zinject.h"
-
-libzfs_handle_t *g_zfs;
-int zfs_fd;
-
-#ifndef ECKSUM
-#define ECKSUM EBADE
-#endif
-
-static const char *errtable[TYPE_INVAL] = {
- "data",
- "dnode",
- "mos",
- "mosdir",
- "metaslab",
- "config",
- "bpobj",
- "spacemap",
- "errlog",
- "uber",
- "nvlist",
- "pad1",
- "pad2"
-};
-
-static err_type_t
-name_to_type(const char *arg)
-{
- int i;
- for (i = 0; i < TYPE_INVAL; i++)
- if (strcmp(errtable[i], arg) == 0)
- return (i);
-
- return (TYPE_INVAL);
-}
-
-static const char *
-type_to_name(uint64_t type)
-{
- switch (type) {
- case DMU_OT_OBJECT_DIRECTORY:
- return ("mosdir");
- case DMU_OT_OBJECT_ARRAY:
- return ("metaslab");
- case DMU_OT_PACKED_NVLIST:
- return ("config");
- case DMU_OT_BPOBJ:
- return ("bpobj");
- case DMU_OT_SPACE_MAP:
- return ("spacemap");
- case DMU_OT_ERROR_LOG:
- return ("errlog");
- default:
- return ("-");
- }
-}
-
-
-/*
- * Print usage message.
- */
-void
-usage(void)
-{
- (void) printf(
- "usage:\n"
- "\n"
- "\tzinject\n"
- "\n"
- "\t\tList all active injection records.\n"
- "\n"
- "\tzinject -c <id|all>\n"
- "\n"
- "\t\tClear the particular record (if given a numeric ID), or\n"
- "\t\tall records if 'all' is specificed.\n"
- "\n"
- "\tzinject -p <function name> pool\n"
- "\n"
- "\t\tInject a panic fault at the specified function. Only \n"
- "\t\tfunctions which call spa_vdev_config_exit(), or \n"
- "\t\tspa_vdev_exit() will trigger a panic.\n"
- "\n"
- "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
- "\t [-T <read|write|free|claim|all> pool\n"
- "\n"
- "\t\tInject a fault into a particular device or the device's\n"
- "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n "
- "\t\t'pad1', or 'pad2'.\n"
- "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
- "\n"
- "\tzinject -d device -A <degrade|fault> pool\n"
- "\n"
- "\t\tPerform a specific action on a particular device\n"
- "\n"
- "\tzinject -d device -D latency:lanes pool\n"
- "\n"
- "\t\tAdd an artificial delay to IO requests on a particular\n"
- "\t\tdevice, such that the requests take a minimum of 'latency'\n"
- "\t\tmilliseconds to complete. Each delay has an associated\n"
- "\t\tnumber of 'lanes' which defines the number of concurrent\n"
- "\t\tIO requests that can be processed.\n"
- "\n"
- "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
- "\t\tthe device will only be able to service a single IO request\n"
- "\t\tat a time with each request taking 10 ms to complete. So,\n"
- "\t\tif only a single request is submitted every 10 ms, the\n"
- "\t\taverage latency will be 10 ms; but if more than one request\n"
- "\t\tis submitted every 10 ms, the average latency will be more\n"
- "\t\tthan 10 ms.\n"
- "\n"
- "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
- "\t\tlanes (-D 10:2), then the device will be able to service\n"
- "\t\ttwo requests at a time, each with a minimum latency of\n"
- "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
- "\t\tthe average latency will be 10 ms; but if more than two\n"
- "\t\trequests are submitted every 10 ms, the average latency\n"
- "\t\twill be more than 10 ms.\n"
- "\n"
- "\t\tAlso note, these delays are additive. So two invocations\n"
- "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
- "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
- "\t\tlanes with differing target latencies. For example, an\n"
- "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
- "\t\tcreate 3 lanes on the device; one lane with a latency\n"
- "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
- "\n"
- "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
- "\n"
- "\t\tCause the pool to stop writing blocks yet not\n"
- "\t\treport errors for a duration. Simulates buggy hardware\n"
- "\t\tthat fails to honor cache flush requests.\n"
- "\t\tDefault duration is 30 seconds. The machine is panicked\n"
- "\t\tat the end of the duration.\n"
- "\n"
- "\tzinject -b objset:object:level:blkid pool\n"
- "\n"
- "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
- "\t\tspecified by the remaining tuple. Each number is in\n"
- "\t\thexidecimal, and only one block can be specified.\n"
- "\n"
- "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n"
- "\t [-a] [-m] [-u] [-f freq] <object>\n"
- "\n"
- "\t\tInject an error into the object specified by the '-t' option\n"
- "\t\tand the object descriptor. The 'object' parameter is\n"
- "\t\tinterperted depending on the '-t' option.\n"
- "\n"
- "\t\t-q\tQuiet mode. Only print out the handler number added.\n"
- "\t\t-e\tInject a specific error. Must be either 'io' or\n"
- "\t\t\t'checksum'. Default is 'io'.\n"
- "\t\t-l\tInject error at a particular block level. Default is "
- "0.\n"
- "\t\t-m\tAutomatically remount underlying filesystem.\n"
- "\t\t-r\tInject error over a particular logical range of an\n"
- "\t\t\tobject. Will be translated to the appropriate blkid\n"
- "\t\t\trange according to the object's properties.\n"
- "\t\t-a\tFlush the ARC cache. Can be specified without any\n"
- "\t\t\tassociated object.\n"
- "\t\t-u\tUnload the associated pool. Can be specified with only\n"
- "\t\t\ta pool object.\n"
- "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n"
- "\t\t\ta percentage between 1 and 100.\n"
- "\n"
- "\t-t data\t\tInject an error into the plain file contents of a\n"
- "\t\t\tfile. The object must be specified as a complete path\n"
- "\t\t\tto a file on a ZFS filesystem.\n"
- "\n"
- "\t-t dnode\tInject an error into the metadnode in the block\n"
- "\t\t\tcorresponding to the dnode for a file or directory. The\n"
- "\t\t\t'-r' option is incompatible with this mode. The object\n"
- "\t\t\tis specified as a complete path to a file or directory\n"
- "\t\t\ton a ZFS filesystem.\n"
- "\n"
- "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
- "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n"
- "\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n"
- "\t\t\tthe poolname.\n");
-}
-
-static int
-iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
- void *data)
-{
- zfs_cmd_t zc = { 0 };
- int ret;
-
- while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
- if ((ret = func((int)zc.zc_guid, zc.zc_name,
- &zc.zc_inject_record, data)) != 0)
- return (ret);
-
- if (errno != ENOENT) {
- (void) fprintf(stderr, "Unable to list handlers: %s\n",
- strerror(errno));
- return (-1);
- }
-
- return (0);
-}
-
-static int
-print_data_handler(int id, const char *pool, zinject_record_t *record,
- void *data)
-{
- int *count = data;
-
- if (record->zi_guid != 0 || record->zi_func[0] != '\0')
- return (0);
-
- if (*count == 0) {
- (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n",
- "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE");
- (void) printf("--- --------------- ------ "
- "------ -------- --- ---------------\n");
- }
-
- *count += 1;
-
- (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool,
- (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object,
- type_to_name(record->zi_type), record->zi_level);
-
- if (record->zi_start == 0 &&
- record->zi_end == -1ULL)
- (void) printf("all\n");
- else
- (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
- (u_longlong_t)record->zi_end);
-
- return (0);
-}
-
-static int
-print_device_handler(int id, const char *pool, zinject_record_t *record,
- void *data)
-{
- int *count = data;
-
- if (record->zi_guid == 0 || record->zi_func[0] != '\0')
- return (0);
-
- if (record->zi_cmd == ZINJECT_DELAY_IO)
- return (0);
-
- if (*count == 0) {
- (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
- (void) printf("--- --------------- ----------------\n");
- }
-
- *count += 1;
-
- (void) printf("%3d %-15s %llx\n", id, pool,
- (u_longlong_t)record->zi_guid);
-
- return (0);
-}
-
-static int
-print_delay_handler(int id, const char *pool, zinject_record_t *record,
- void *data)
-{
- int *count = data;
-
- if (record->zi_guid == 0 || record->zi_func[0] != '\0')
- return (0);
-
- if (record->zi_cmd != ZINJECT_DELAY_IO)
- return (0);
-
- if (*count == 0) {
- (void) printf("%3s %-15s %-15s %-15s %s\n",
- "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
- (void) printf("--- --------------- --------------- "
- "--------------- ----------------\n");
- }
-
- *count += 1;
-
- (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool,
- (u_longlong_t)NSEC2MSEC(record->zi_timer),
- (u_longlong_t)record->zi_nlanes,
- (u_longlong_t)record->zi_guid);
-
- return (0);
-}
-
-static int
-print_panic_handler(int id, const char *pool, zinject_record_t *record,
- void *data)
-{
- int *count = data;
-
- if (record->zi_func[0] == '\0')
- return (0);
-
- if (*count == 0) {
- (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION");
- (void) printf("--- --------------- ----------------\n");
- }
-
- *count += 1;
-
- (void) printf("%3d %-15s %s\n", id, pool, record->zi_func);
-
- return (0);
-}
-
-/*
- * Print all registered error handlers. Returns the number of handlers
- * registered.
- */
-static int
-print_all_handlers(void)
-{
- int count = 0, total = 0;
-
- (void) iter_handlers(print_device_handler, &count);
- if (count > 0) {
- total += count;
- (void) printf("\n");
- count = 0;
- }
-
- (void) iter_handlers(print_delay_handler, &count);
- if (count > 0) {
- total += count;
- (void) printf("\n");
- count = 0;
- }
-
- (void) iter_handlers(print_data_handler, &count);
- if (count > 0) {
- total += count;
- (void) printf("\n");
- count = 0;
- }
-
- (void) iter_handlers(print_panic_handler, &count);
-
- return (count + total);
-}
-
-/* ARGSUSED */
-static int
-cancel_one_handler(int id, const char *pool, zinject_record_t *record,
- void *data)
-{
- zfs_cmd_t zc = { 0 };
-
- zc.zc_guid = (uint64_t)id;
-
- if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
- (void) fprintf(stderr, "failed to remove handler %d: %s\n",
- id, strerror(errno));
- return (1);
- }
-
- return (0);
-}
-
-/*
- * Remove all fault injection handlers.
- */
-static int
-cancel_all_handlers(void)
-{
- int ret = iter_handlers(cancel_one_handler, NULL);
-
- if (ret == 0)
- (void) printf("removed all registered handlers\n");
-
- return (ret);
-}
-
-/*
- * Remove a specific fault injection handler.
- */
-static int
-cancel_handler(int id)
-{
- zfs_cmd_t zc = { 0 };
-
- zc.zc_guid = (uint64_t)id;
-
- if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
- (void) fprintf(stderr, "failed to remove handler %d: %s\n",
- id, strerror(errno));
- return (1);
- }
-
- (void) printf("removed handler %d\n", id);
-
- return (0);
-}
-
-/*
- * Register a new fault injection handler.
- */
-static int
-register_handler(const char *pool, int flags, zinject_record_t *record,
- int quiet)
-{
- zfs_cmd_t zc = { 0 };
-
- (void) strcpy(zc.zc_name, pool);
- zc.zc_inject_record = *record;
- zc.zc_guid = flags;
-
- if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
- (void) fprintf(stderr, "failed to add handler: %s\n",
- strerror(errno));
- return (1);
- }
-
- if (flags & ZINJECT_NULL)
- return (0);
-
- if (quiet) {
- (void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
- } else {
- (void) printf("Added handler %llu with the following "
- "properties:\n", (u_longlong_t)zc.zc_guid);
- (void) printf(" pool: %s\n", pool);
- if (record->zi_guid) {
- (void) printf(" vdev: %llx\n",
- (u_longlong_t)record->zi_guid);
- } else if (record->zi_func[0] != '\0') {
- (void) printf(" panic function: %s\n",
- record->zi_func);
- } else if (record->zi_duration > 0) {
- (void) printf(" time: %lld seconds\n",
- (u_longlong_t)record->zi_duration);
- } else if (record->zi_duration < 0) {
- (void) printf(" txgs: %lld \n",
- (u_longlong_t)-record->zi_duration);
- } else {
- (void) printf("objset: %llu\n",
- (u_longlong_t)record->zi_objset);
- (void) printf("object: %llu\n",
- (u_longlong_t)record->zi_object);
- (void) printf(" type: %llu\n",
- (u_longlong_t)record->zi_type);
- (void) printf(" level: %d\n", record->zi_level);
- if (record->zi_start == 0 &&
- record->zi_end == -1ULL)
- (void) printf(" range: all\n");
- else
- (void) printf(" range: [%llu, %llu)\n",
- (u_longlong_t)record->zi_start,
- (u_longlong_t)record->zi_end);
- }
- }
-
- return (0);
-}
-
-int
-perform_action(const char *pool, zinject_record_t *record, int cmd)
-{
- zfs_cmd_t zc = { 0 };
-
- ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
- (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
- zc.zc_guid = record->zi_guid;
- zc.zc_cookie = cmd;
-
- if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
- return (0);
-
- return (1);
-}
-
-static int
-parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
-{
- unsigned long scan_delay;
- unsigned long scan_nlanes;
-
- if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2)
- return (1);
-
- /*
- * We explicitly disallow a delay of zero here, because we key
- * off this value being non-zero in translate_device(), to
- * determine if the fault is a ZINJECT_DELAY_IO fault or not.
- */
- if (scan_delay == 0)
- return (1);
-
- /*
- * The units for the CLI delay parameter is milliseconds, but
- * the data passed to the kernel is interpreted as nanoseconds.
- * Thus we scale the milliseconds to nanoseconds here, and this
- * nanosecond value is used to pass the delay to the kernel.
- */
- *delay = MSEC2NSEC(scan_delay);
- *nlanes = scan_nlanes;
-
- return (0);
-}
-
-int
-main(int argc, char **argv)
-{
- int c;
- char *range = NULL;
- char *cancel = NULL;
- char *end;
- char *raw = NULL;
- char *device = NULL;
- int level = 0;
- int quiet = 0;
- int error = 0;
- int domount = 0;
- int io_type = ZIO_TYPES;
- int action = VDEV_STATE_UNKNOWN;
- err_type_t type = TYPE_INVAL;
- err_type_t label = TYPE_INVAL;
- zinject_record_t record = { 0 };
- char pool[MAXNAMELEN];
- char dataset[MAXNAMELEN];
- zfs_handle_t *zhp;
- int nowrites = 0;
- int dur_txg = 0;
- int dur_secs = 0;
- int ret;
- int flags = 0;
-
- if ((g_zfs = libzfs_init()) == NULL) {
- (void) fprintf(stderr, "internal error: failed to "
- "initialize ZFS library\n");
- return (1);
- }
-
- libzfs_print_on_error(g_zfs, B_TRUE);
-
- if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
- (void) fprintf(stderr, "failed to open ZFS device\n");
- return (1);
- }
-
- if (argc == 1) {
- /*
- * No arguments. Print the available handlers. If there are no
- * available handlers, direct the user to '-h' for help
- * information.
- */
- if (print_all_handlers() == 0) {
- (void) printf("No handlers registered.\n");
- (void) printf("Run 'zinject -h' for usage "
- "information.\n");
- }
-
- return (0);
- }
-
- while ((c = getopt(argc, argv,
- ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
- switch (c) {
- case 'a':
- flags |= ZINJECT_FLUSH_ARC;
- break;
- case 'A':
- if (strcasecmp(optarg, "degrade") == 0) {
- action = VDEV_STATE_DEGRADED;
- } else if (strcasecmp(optarg, "fault") == 0) {
- action = VDEV_STATE_FAULTED;
- } else {
- (void) fprintf(stderr, "invalid action '%s': "
- "must be 'degrade' or 'fault'\n", optarg);
- usage();
- return (1);
- }
- break;
- case 'b':
- raw = optarg;
- break;
- case 'c':
- cancel = optarg;
- break;
- case 'd':
- device = optarg;
- break;
- case 'D':
- ret = parse_delay(optarg, &record.zi_timer,
- &record.zi_nlanes);
- if (ret != 0) {
- (void) fprintf(stderr, "invalid i/o delay "
- "value: '%s'\n", optarg);
- usage();
- return (1);
- }
- break;
- case 'e':
- if (strcasecmp(optarg, "io") == 0) {
- error = EIO;
- } else if (strcasecmp(optarg, "checksum") == 0) {
- error = ECKSUM;
- } else if (strcasecmp(optarg, "nxio") == 0) {
- error = ENXIO;
- } else if (strcasecmp(optarg, "dtl") == 0) {
- error = ECHILD;
- } else {
- (void) fprintf(stderr, "invalid error type "
- "'%s': must be 'io', 'checksum' or "
- "'nxio'\n", optarg);
- usage();
- return (1);
- }
- break;
- case 'f':
- record.zi_freq = atoi(optarg);
- if (record.zi_freq < 1 || record.zi_freq > 100) {
- (void) fprintf(stderr, "frequency range must "
- "be in the range (0, 100]\n");
- return (1);
- }
- break;
- case 'F':
- record.zi_failfast = B_TRUE;
- break;
- case 'g':
- dur_txg = 1;
- record.zi_duration = (int)strtol(optarg, &end, 10);
- if (record.zi_duration <= 0 || *end != '\0') {
- (void) fprintf(stderr, "invalid duration '%s': "
- "must be a positive integer\n", optarg);
- usage();
- return (1);
- }
- /* store duration of txgs as its negative */
- record.zi_duration *= -1;
- break;
- case 'h':
- usage();
- return (0);
- case 'I':
- /* default duration, if one hasn't yet been defined */
- nowrites = 1;
- if (dur_secs == 0 && dur_txg == 0)
- record.zi_duration = 30;
- break;
- case 'l':
- level = (int)strtol(optarg, &end, 10);
- if (*end != '\0') {
- (void) fprintf(stderr, "invalid level '%s': "
- "must be an integer\n", optarg);
- usage();
- return (1);
- }
- break;
- case 'm':
- domount = 1;
- break;
- case 'p':
- (void) strlcpy(record.zi_func, optarg,
- sizeof (record.zi_func));
- record.zi_cmd = ZINJECT_PANIC;
- break;
- case 'q':
- quiet = 1;
- break;
- case 'r':
- range = optarg;
- break;
- case 's':
- dur_secs = 1;
- record.zi_duration = (int)strtol(optarg, &end, 10);
- if (record.zi_duration <= 0 || *end != '\0') {
- (void) fprintf(stderr, "invalid duration '%s': "
- "must be a positive integer\n", optarg);
- usage();
- return (1);
- }
- break;
- case 'T':
- if (strcasecmp(optarg, "read") == 0) {
- io_type = ZIO_TYPE_READ;
- } else if (strcasecmp(optarg, "write") == 0) {
- io_type = ZIO_TYPE_WRITE;
- } else if (strcasecmp(optarg, "free") == 0) {
- io_type = ZIO_TYPE_FREE;
- } else if (strcasecmp(optarg, "claim") == 0) {
- io_type = ZIO_TYPE_CLAIM;
- } else if (strcasecmp(optarg, "all") == 0) {
- io_type = ZIO_TYPES;
- } else {
- (void) fprintf(stderr, "invalid I/O type "
- "'%s': must be 'read', 'write', 'free', "
- "'claim' or 'all'\n", optarg);
- usage();
- return (1);
- }
- break;
- case 't':
- if ((type = name_to_type(optarg)) == TYPE_INVAL &&
- !MOS_TYPE(type)) {
- (void) fprintf(stderr, "invalid type '%s'\n",
- optarg);
- usage();
- return (1);
- }
- break;
- case 'u':
- flags |= ZINJECT_UNLOAD_SPA;
- break;
- case 'L':
- if ((label = name_to_type(optarg)) == TYPE_INVAL &&
- !LABEL_TYPE(type)) {
- (void) fprintf(stderr, "invalid label type "
- "'%s'\n", optarg);
- usage();
- return (1);
- }
- break;
- case ':':
- (void) fprintf(stderr, "option -%c requires an "
- "operand\n", optopt);
- usage();
- return (1);
- case '?':
- (void) fprintf(stderr, "invalid option '%c'\n",
- optopt);
- usage();
- return (2);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (record.zi_duration != 0)
- record.zi_cmd = ZINJECT_IGNORED_WRITES;
-
- if (cancel != NULL) {
- /*
- * '-c' is invalid with any other options.
- */
- if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
- (void) fprintf(stderr, "cancel (-c) incompatible with "
- "any other options\n");
- usage();
- return (2);
- }
- if (argc != 0) {
- (void) fprintf(stderr, "extraneous argument to '-c'\n");
- usage();
- return (2);
- }
-
- if (strcmp(cancel, "all") == 0) {
- return (cancel_all_handlers());
- } else {
- int id = (int)strtol(cancel, &end, 10);
- if (*end != '\0') {
- (void) fprintf(stderr, "invalid handle id '%s':"
- " must be an integer or 'all'\n", cancel);
- usage();
- return (1);
- }
- return (cancel_handler(id));
- }
- }
-
- if (device != NULL) {
- /*
- * Device (-d) injection uses a completely different mechanism
- * for doing injection, so handle it separately here.
- */
- if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
- (void) fprintf(stderr, "device (-d) incompatible with "
- "data error injection\n");
- usage();
- return (2);
- }
-
- if (argc != 1) {
- (void) fprintf(stderr, "device (-d) injection requires "
- "a single pool name\n");
- usage();
- return (2);
- }
-
- (void) strcpy(pool, argv[0]);
- dataset[0] = '\0';
-
- if (error == ECKSUM) {
- (void) fprintf(stderr, "device error type must be "
- "'io' or 'nxio'\n");
- return (1);
- }
-
- record.zi_iotype = io_type;
- if (translate_device(pool, device, label, &record) != 0)
- return (1);
- if (!error)
- error = ENXIO;
-
- if (action != VDEV_STATE_UNKNOWN)
- return (perform_action(pool, &record, action));
-
- } else if (raw != NULL) {
- if (range != NULL || type != TYPE_INVAL || level != 0 ||
- record.zi_cmd != ZINJECT_UNINITIALIZED) {
- (void) fprintf(stderr, "raw (-b) format with "
- "any other options\n");
- usage();
- return (2);
- }
-
- if (argc != 1) {
- (void) fprintf(stderr, "raw (-b) format expects a "
- "single pool name\n");
- usage();
- return (2);
- }
-
- (void) strcpy(pool, argv[0]);
- dataset[0] = '\0';
-
- if (error == ENXIO) {
- (void) fprintf(stderr, "data error type must be "
- "'checksum' or 'io'\n");
- return (1);
- }
-
- record.zi_cmd = ZINJECT_DATA_FAULT;
- if (translate_raw(raw, &record) != 0)
- return (1);
- if (!error)
- error = EIO;
- } else if (record.zi_cmd == ZINJECT_PANIC) {
- if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || device != NULL) {
- (void) fprintf(stderr, "panic (-p) incompatible with "
- "other options\n");
- usage();
- return (2);
- }
-
- if (argc < 1 || argc > 2) {
- (void) fprintf(stderr, "panic (-p) injection requires "
- "a single pool name and an optional id\n");
- usage();
- return (2);
- }
-
- (void) strcpy(pool, argv[0]);
- if (argv[1] != NULL)
- record.zi_type = atoi(argv[1]);
- dataset[0] = '\0';
- } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
- if (nowrites == 0) {
- (void) fprintf(stderr, "-s or -g meaningless "
- "without -I (ignore writes)\n");
- usage();
- return (2);
- } else if (dur_secs && dur_txg) {
- (void) fprintf(stderr, "choose a duration either "
- "in seconds (-s) or a number of txgs (-g) "
- "but not both\n");
- usage();
- return (2);
- } else if (argc != 1) {
- (void) fprintf(stderr, "ignore writes (-I) "
- "injection requires a single pool name\n");
- usage();
- return (2);
- }
-
- (void) strcpy(pool, argv[0]);
- dataset[0] = '\0';
- } else if (type == TYPE_INVAL) {
- if (flags == 0) {
- (void) fprintf(stderr, "at least one of '-b', '-d', "
- "'-t', '-a', '-p', '-I' or '-u' "
- "must be specified\n");
- usage();
- return (2);
- }
-
- if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
- (void) strcpy(pool, argv[0]);
- dataset[0] = '\0';
- } else if (argc != 0) {
- (void) fprintf(stderr, "extraneous argument for "
- "'-f'\n");
- usage();
- return (2);
- }
-
- flags |= ZINJECT_NULL;
- } else {
- if (argc != 1) {
- (void) fprintf(stderr, "missing object\n");
- usage();
- return (2);
- }
-
- if (error == ENXIO) {
- (void) fprintf(stderr, "data error type must be "
- "'checksum' or 'io'\n");
- return (1);
- }
-
- record.zi_cmd = ZINJECT_DATA_FAULT;
- if (translate_record(type, argv[0], range, level, &record, pool,
- dataset) != 0)
- return (1);
- if (!error)
- error = EIO;
- }
-
- /*
- * If this is pool-wide metadata, unmount everything. The ioctl() will
- * unload the pool, so that we trigger spa-wide reopen of metadata next
- * time we access the pool.
- */
- if (dataset[0] != '\0' && domount) {
- if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL)
- return (1);
-
- if (zfs_unmount(zhp, NULL, 0) != 0)
- return (1);
- }
-
- record.zi_error = error;
-
- ret = register_handler(pool, flags, &record, quiet);
-
- if (dataset[0] != '\0' && domount)
- ret = (zfs_mount(zhp, NULL, 0) != 0);
-
- libzfs_fini(g_zfs);
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/cmd/zlook/zlook.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zlook/zlook.c
+++ head/cddl/contrib/opensolaris/cmd/zlook/zlook.c
@@ -1,411 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-/*
- * This is a test program that uses ioctls to the ZFS Unit Test driver
- * to perform readdirs or lookups using flags not normally available
- * to user-land programs. This allows testing of the flags'
- * behavior outside of a complicated consumer, such as the SMB driver.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stropts.h>
-#include <errno.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/dirent.h>
-#include <sys/attr.h>
-#include <stddef.h>
-#include <fcntl.h>
-#include <string.h>
-#include <time.h>
-
-#define _KERNEL
-
-#include <sys/fs/zut.h>
-#include <sys/extdirent.h>
-
-#undef _KERNEL
-
-#define MAXBUF (64 * 1024)
-#define BIGBUF 4096
-#define LILBUF (sizeof (dirent_t))
-
-#define DIRENT_NAMELEN(reclen) \
- ((reclen) - (offsetof(dirent_t, d_name[0])))
-
-static void
-usage(char *pnam)
-{
- (void) fprintf(stderr, "Usage:\n %s -l [-is] dir-to-look-in "
- "file-in-dir [xfile-on-file]\n", pnam);
- (void) fprintf(stderr, " %s -i [-ls] dir-to-look-in "
- "file-in-dir [xfile-on-file]\n", pnam);
- (void) fprintf(stderr, " %s -s [-il] dir-to-look-in "
- "file-in-dir [xfile-on-file]\n", pnam);
- (void) fprintf(stderr, "\t Perform a lookup\n");
- (void) fprintf(stderr, "\t -l == lookup\n");
- (void) fprintf(stderr, "\t -i == request FIGNORECASE\n");
- (void) fprintf(stderr, "\t -s == request stat(2) and xvattr info\n");
- (void) fprintf(stderr, " %s -r [-ea] [-b buffer-size-in-bytes] "
- "dir-to-look-in [file-in-dir]\n", pnam);
- (void) fprintf(stderr, " %s -e [-ra] [-b buffer-size-in-bytes] "
- "dir-to-look-in [file-in-dir]\n", pnam);
- (void) fprintf(stderr, " %s -a [-re] [-b buffer-size-in-bytes] "
- "dir-to-look-in [file-in-dir]\n", pnam);
- (void) fprintf(stderr, "\t Perform a readdir\n");
- (void) fprintf(stderr, "\t -r == readdir\n");
- (void) fprintf(stderr, "\t -e == request extended entries\n");
- (void) fprintf(stderr, "\t -a == request access filtering\n");
- (void) fprintf(stderr, "\t -b == buffer size (default 4K)\n");
- (void) fprintf(stderr, " %s -A path\n", pnam);
- (void) fprintf(stderr, "\t Look up _PC_ACCESS_FILTERING "
- "for path with pathconf(2)\n");
- (void) fprintf(stderr, " %s -E path\n", pnam);
- (void) fprintf(stderr, "\t Look up _PC_SATTR_EXISTS "
- "for path with pathconf(2)\n");
- (void) fprintf(stderr, " %s -S path\n", pnam);
- (void) fprintf(stderr, "\t Look up _PC_SATTR_EXISTS "
- "for path with pathconf(2)\n");
- exit(EINVAL);
-}
-
-static void
-print_extd_entries(zut_readdir_t *r)
-{
- struct edirent *eodp;
- char *bufstart;
-
- eodp = (edirent_t *)(uintptr_t)r->zr_buf;
- bufstart = (char *)eodp;
- while ((char *)eodp < bufstart + r->zr_bytes) {
- char *blanks = " ";
- int i = 0;
- while (i < EDIRENT_NAMELEN(eodp->ed_reclen)) {
- if (!eodp->ed_name[i])
- break;
- (void) printf("%c", eodp->ed_name[i++]);
- }
- if (i < 16)
- (void) printf("%.*s", 16 - i, blanks);
- (void) printf("\t%x\n", eodp->ed_eflags);
- eodp = (edirent_t *)((intptr_t)eodp + eodp->ed_reclen);
- }
-}
-
-static void
-print_entries(zut_readdir_t *r)
-{
- dirent64_t *dp;
- char *bufstart;
-
- dp = (dirent64_t *)(intptr_t)r->zr_buf;
- bufstart = (char *)dp;
- while ((char *)dp < bufstart + r->zr_bytes) {
- int i = 0;
- while (i < DIRENT_NAMELEN(dp->d_reclen)) {
- if (!dp->d_name[i])
- break;
- (void) printf("%c", dp->d_name[i++]);
- }
- (void) printf("\n");
- dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
- }
-}
-
-static void
-print_stats(struct stat64 *sb)
-{
- char timebuf[512];
-
- (void) printf("st_mode\t\t\t%04lo\n", (unsigned long)sb->st_mode);
- (void) printf("st_ino\t\t\t%llu\n", (unsigned long long)sb->st_ino);
- (void) printf("st_nlink\t\t%lu\n", (unsigned long)sb->st_nlink);
- (void) printf("st_uid\t\t\t%d\n", sb->st_uid);
- (void) printf("st_gid\t\t\t%d\n", sb->st_gid);
- (void) printf("st_size\t\t\t%lld\n", (long long)sb->st_size);
- (void) printf("st_blksize\t\t%ld\n", (long)sb->st_blksize);
- (void) printf("st_blocks\t\t%lld\n", (long long)sb->st_blocks);
-
- timebuf[0] = 0;
- if (ctime_r(&sb->st_atime, timebuf, 512)) {
- (void) printf("st_atime\t\t");
- (void) printf("%s", timebuf);
- }
- timebuf[0] = 0;
- if (ctime_r(&sb->st_mtime, timebuf, 512)) {
- (void) printf("st_mtime\t\t");
- (void) printf("%s", timebuf);
- }
- timebuf[0] = 0;
- if (ctime_r(&sb->st_ctime, timebuf, 512)) {
- (void) printf("st_ctime\t\t");
- (void) printf("%s", timebuf);
- }
-}
-
-static void
-print_xvs(uint64_t xvs)
-{
- uint_t bits;
- int idx = 0;
-
- if (xvs == 0)
- return;
-
- (void) printf("-------------------\n");
- (void) printf("Attribute bit(s) set:\n");
- (void) printf("-------------------\n");
-
- bits = xvs & ((1 << F_ATTR_ALL) - 1);
- while (bits) {
- uint_t rest = bits >> 1;
- if (bits & 1) {
- (void) printf("%s", attr_to_name((f_attr_t)idx));
- if (rest)
- (void) printf(", ");
- }
- idx++;
- bits = rest;
- }
- (void) printf("\n");
-}
-
-int
-main(int argc, char **argv)
-{
- zut_lookup_t lk = {0};
- zut_readdir_t rd = {0};
- boolean_t checking = B_FALSE;
- boolean_t looking = B_FALSE;
- boolean_t reading = B_FALSE;
- boolean_t bflag = B_FALSE;
- long rddir_bufsize = BIGBUF;
- int error = 0;
- int check;
- int fd;
- int c;
-
- while ((c = getopt(argc, argv, "lisaerb:ASE")) != -1) {
- switch (c) {
- case 'l':
- looking = B_TRUE;
- break;
- case 'i':
- lk.zl_reqflags |= ZUT_IGNORECASE;
- looking = B_TRUE;
- break;
- case 's':
- lk.zl_reqflags |= ZUT_GETSTAT;
- looking = B_TRUE;
- break;
- case 'a':
- rd.zr_reqflags |= ZUT_ACCFILTER;
- reading = B_TRUE;
- break;
- case 'e':
- rd.zr_reqflags |= ZUT_EXTRDDIR;
- reading = B_TRUE;
- break;
- case 'r':
- reading = B_TRUE;
- break;
- case 'b':
- reading = B_TRUE;
- bflag = B_TRUE;
- rddir_bufsize = strtol(optarg, NULL, 0);
- break;
- case 'A':
- checking = B_TRUE;
- check = _PC_ACCESS_FILTERING;
- break;
- case 'S':
- checking = B_TRUE;
- check = _PC_SATTR_ENABLED;
- break;
- case 'E':
- checking = B_TRUE;
- check = _PC_SATTR_EXISTS;
- break;
- case '?':
- default:
- usage(argv[0]); /* no return */
- }
- }
-
- if ((checking && looking) || (checking && reading) ||
- (looking && reading) || (!reading && bflag) ||
- (!checking && !reading && !looking))
- usage(argv[0]); /* no return */
-
- if (rddir_bufsize < LILBUF || rddir_bufsize > MAXBUF) {
- (void) fprintf(stderr, "Sorry, buffer size "
- "must be >= %d and less than or equal to %d bytes.\n",
- (int)LILBUF, MAXBUF);
- exit(EINVAL);
- }
-
- if (checking) {
- char pathbuf[MAXPATHLEN];
- long result;
-
- if (argc - optind < 1)
- usage(argv[0]); /* no return */
- (void) strlcpy(pathbuf, argv[optind], MAXPATHLEN);
- result = pathconf(pathbuf, check);
- (void) printf("pathconf(2) check for %s\n", pathbuf);
- switch (check) {
- case _PC_SATTR_ENABLED:
- (void) printf("System attributes ");
- if (result != 0)
- (void) printf("Enabled\n");
- else
- (void) printf("Not enabled\n");
- break;
- case _PC_SATTR_EXISTS:
- (void) printf("System attributes ");
- if (result != 0)
- (void) printf("Exist\n");
- else
- (void) printf("Do not exist\n");
- break;
- case _PC_ACCESS_FILTERING:
- (void) printf("Access filtering ");
- if (result != 0)
- (void) printf("Available\n");
- else
- (void) printf("Not available\n");
- break;
- }
- return (result);
- }
-
- if ((fd = open(ZUT_DEV, O_RDONLY)) < 0) {
- perror(ZUT_DEV);
- return (ENXIO);
- }
-
- if (reading) {
- char *buf;
-
- if (argc - optind < 1)
- usage(argv[0]); /* no return */
-
- (void) strlcpy(rd.zr_dir, argv[optind], MAXPATHLEN);
- if (argc - optind > 1) {
- (void) strlcpy(rd.zr_file, argv[optind + 1],
- MAXNAMELEN);
- rd.zr_reqflags |= ZUT_XATTR;
- }
-
- if ((buf = malloc(rddir_bufsize)) == NULL) {
- error = errno;
- perror("malloc");
- (void) close(fd);
- return (error);
- }
-
- rd.zr_buf = (uint64_t)(uintptr_t)buf;
- rd.zr_buflen = rddir_bufsize;
-
- while (!rd.zr_eof) {
- int ierr;
-
- if ((ierr = ioctl(fd, ZUT_IOC_READDIR, &rd)) != 0) {
- (void) fprintf(stderr,
- "IOCTL error: %s (%d)\n",
- strerror(ierr), ierr);
- free(buf);
- (void) close(fd);
- return (ierr);
- }
- if (rd.zr_retcode) {
- (void) fprintf(stderr,
- "readdir result: %s (%d)\n",
- strerror(rd.zr_retcode), rd.zr_retcode);
- free(buf);
- (void) close(fd);
- return (rd.zr_retcode);
- }
- if (rd.zr_reqflags & ZUT_EXTRDDIR)
- print_extd_entries(&rd);
- else
- print_entries(&rd);
- }
- free(buf);
- } else {
- int ierr;
-
- if (argc - optind < 2)
- usage(argv[0]); /* no return */
-
- (void) strlcpy(lk.zl_dir, argv[optind], MAXPATHLEN);
- (void) strlcpy(lk.zl_file, argv[optind + 1], MAXNAMELEN);
- if (argc - optind > 2) {
- (void) strlcpy(lk.zl_xfile,
- argv[optind + 2], MAXNAMELEN);
- lk.zl_reqflags |= ZUT_XATTR;
- }
-
- if ((ierr = ioctl(fd, ZUT_IOC_LOOKUP, &lk)) != 0) {
- (void) fprintf(stderr,
- "IOCTL error: %s (%d)\n",
- strerror(ierr), ierr);
- (void) close(fd);
- return (ierr);
- }
-
- (void) printf("\nLookup of ");
- if (lk.zl_reqflags & ZUT_XATTR) {
- (void) printf("extended attribute \"%s\" of ",
- lk.zl_xfile);
- }
- (void) printf("file \"%s\" ", lk.zl_file);
- (void) printf("in directory \"%s\" ", lk.zl_dir);
- if (lk.zl_retcode) {
- (void) printf("failed: %s (%d)\n",
- strerror(lk.zl_retcode), lk.zl_retcode);
- (void) close(fd);
- return (lk.zl_retcode);
- }
-
- (void) printf("succeeded.\n");
- if (lk.zl_reqflags & ZUT_IGNORECASE) {
- (void) printf("----------------------------\n");
- (void) printf("dirent flags: 0x%0x\n", lk.zl_deflags);
- (void) printf("real name: %s\n", lk.zl_real);
- }
- if (lk.zl_reqflags & ZUT_GETSTAT) {
- (void) printf("----------------------------\n");
- print_stats(&lk.zl_statbuf);
- print_xvs(lk.zl_xvattrs);
- }
- }
-
- (void) close(fd);
- return (0);
-}
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
@@ -1,674 +0,0 @@
-'\" te
-.\" Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>.
-.\" All Rights Reserved.
-.\"
-.\" The contents of this file are subject to the terms of the
-.\" Common Development and Distribution License (the "License").
-.\" You may not use this file except in compliance with the License.
-.\"
-.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-.\" or http://www.opensolaris.org/os/licensing.
-.\" See the License for the specific language governing permissions
-.\" and limitations under the License.
-.\"
-.\" When distributing Covered Code, include this CDDL HEADER in each
-.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-.\" If applicable, add the following below this CDDL HEADER, with the
-.\" fields enclosed by brackets "[]" replaced with your own identifying
-.\" information: Portions Copyright [yyyy] [name of copyright owner]
-.\"
-.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
-.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
-.\" Copyright (c) 2013, Joyent, Inc. All rights reserved.
-.\"
-.\" $FreeBSD$
-.\"
-.Dd August 16, 2019
-.Dt ZPOOL-FEATURES 7
-.Os
-.Sh NAME
-.Nm zpool-features
-.Nd ZFS pool feature descriptions
-.Sh DESCRIPTION
-ZFS pool on\-disk format versions are specified via "features" which replace
-the old on\-disk format numbers (the last supported on\-disk format number is
-28).
-To enable a feature on a pool use the
-.Cm upgrade
-subcommand of the
-.Xr zpool 8
-command, or set the
-.Sy feature@feature_name
-property to
-.Ar enabled .
-.Pp
-The pool format does not affect file system version compatibility or the ability
-to send file systems between pools.
-.Pp
-Since most features can be enabled independently of each other the on\-disk
-format of the pool is specified by the set of all features marked as
-.Sy active
-on the pool.
-If the pool was created by another software version this set may
-include unsupported features.
-.Ss Identifying features
-Every feature has a guid of the form
-.Sy com.example:feature_name .
-The reverse DNS name ensures that the feature's guid is unique across all ZFS
-implementations.
-When unsupported features are encountered on a pool they will
-be identified by their guids.
-Refer to the documentation for the ZFS implementation that created the pool
-for information about those features.
-.Pp
-Each supported feature also has a short name.
-By convention a feature's short name is the portion of its guid which follows
-the ':' (e.g.
-.Sy com.example:feature_name
-would have the short name
-.Sy feature_name ),
-however a feature's short name may differ across ZFS implementations if
-following the convention would result in name conflicts.
-.Ss Feature states
-Features can be in one of three states:
-.Bl -tag -width "XXXXXXXX"
-.It Sy active
-This feature's on\-disk format changes are in effect on the pool.
-Support for this feature is required to import the pool in read\-write mode.
-If this feature is not read-only compatible, support is also required to
-import the pool in read\-only mode (see "Read\-only compatibility").
-.It Sy enabled
-An administrator has marked this feature as enabled on the pool, but the
-feature's on\-disk format changes have not been made yet.
-The pool can still be imported by software that does not support this feature,
-but changes may be made to the on\-disk format at any time which will move
-the feature to the
-.Sy active
-state.
-Some features may support returning to the
-.Sy enabled
-state after becoming
-.Sy active .
-See feature\-specific documentation for details.
-.It Sy disabled
-This feature's on\-disk format changes have not been made and will not be made
-unless an administrator moves the feature to the
-.Sy enabled
-state.
-Features cannot be disabled once they have been enabled.
-.El
-.Pp
-The state of supported features is exposed through pool properties of the form
-.Sy feature@short_name .
-.Ss Read\-only compatibility
-Some features may make on\-disk format changes that do not interfere with other
-software's ability to read from the pool.
-These features are referred to as "read\-only compatible".
-If all unsupported features on a pool are read\-only compatible, the pool can
-be imported in read\-only mode by setting the
-.Sy readonly
-property during import (see
-.Xr zpool 8
-for details on importing pools).
-.Ss Unsupported features
-For each unsupported feature enabled on an imported pool a pool property
-named
-.Sy unsupported@feature_guid
-will indicate why the import was allowed despite the unsupported feature.
-Possible values for this property are:
-.Bl -tag -width "XXXXXXXX"
-.It Sy inactive
-The feature is in the
-.Sy enabled
-state and therefore the pool's on\-disk format is still compatible with
-software that does not support this feature.
-.It Sy readonly
-The feature is read\-only compatible and the pool has been imported in
-read\-only mode.
-.El
-.Ss Feature dependencies
-Some features depend on other features being enabled in order to function
-properly.
-Enabling a feature will automatically enable any features it depends on.
-.Sh FEATURES
-The following features are supported on this system:
-.Bl -tag -width "XXXXXXXX"
-.It Sy async_destroy
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:async_destroy"
-.It GUID Ta com.delphix:async_destroy
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta none
-.El
-.Pp
-Destroying a file system requires traversing all of its data in order to
-return its used space to the pool.
-Without
-.Sy async_destroy
-the file system is not fully removed until all space has been reclaimed.
-If the destroy operation is interrupted by a reboot or power outage the next
-attempt to open the pool will need to complete the destroy operation
-synchronously.
-.Pp
-When
-.Sy async_destroy
-is enabled the file system's data will be reclaimed by a background process,
-allowing the destroy operation to complete without traversing the entire file
-system.
-The background process is able to resume interrupted destroys after the pool
-has been opened, eliminating the need to finish interrupted destroys as part
-of the open operation.
-The amount of space remaining to be reclaimed by the background process is
-available through the
-.Sy freeing
-property.
-.Pp
-This feature is only
-.Sy active
-while
-.Sy freeing
-is non\-zero.
-.It Sy empty_bpobj
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:empty_bpobj"
-.It GUID Ta com.delphix:empty_bpobj
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature increases the performance of creating and using a large number
-of snapshots of a single filesystem or volume, and also reduces the disk
-space required.
-.Pp
-When there are many snapshots, each snapshot uses many Block Pointer Objects
-.Pq bpobj's
-to track blocks associated with that snapshot.
-However, in common use cases, most of these bpobj's are empty.
-This feature allows us to create each bpobj on-demand, thus eliminating the
-empty bpobjs.
-.Pp
-This feature is
-.Sy active
-while there are any filesystems, volumes, or snapshots which were created
-after enabling this feature.
-.It Sy filesystem_limits
-.Bl -column "READ\-ONLY COMPATIBLE" "com.joyent:filesystem_limits"
-.It GUID Ta com.joyent:filesystem_limits
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta extensible_dataset
-.El
-.Pp
-This feature enables filesystem and snapshot limits.
-These limits can be used
-to control how many filesystems and/or snapshots can be created at the point in
-the tree on which the limits are set.
-.Pp
-This feature is
-.Sy active
-once either of the limit properties has been
-set on a dataset.
-Once activated the feature is never deactivated.
-.It Sy lz4_compress
-.Bl -column "READ\-ONLY COMPATIBLE" "org.illumos:lz4_compress"
-.It GUID Ta org.illumos:lz4_compress
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta none
-.El
-.Pp
-.Sy lz4
-is a high-performance real-time compression algorithm that
-features significantly faster compression and decompression as well as a
-higher compression ratio than the older
-.Sy lzjb
-compression.
-Typically,
-.Sy lz4
-compression is approximately 50% faster on
-compressible data and 200% faster on incompressible data than
-.Sy lzjb .
-It is also approximately 80% faster on decompression, while
-giving approximately 10% better compression ratio.
-.Pp
-When the
-.Sy lz4_compress
-feature is set to
-.Sy enabled ,
-the
-administrator can turn on
-.Sy lz4
-compression on any dataset on the
-pool using the
-.Xr zfs 8
-command.
-Also, all newly written metadata
-will be compressed with
-.Sy lz4
-algorithm.
-Since this feature is not read-only compatible, this
-operation will render the pool unimportable on systems without support
-for the
-.Sy lz4_compress
-feature.
-Booting off of
-.Sy lz4
--compressed root pools is supported.
-.Pp
-This feature becomes
-.Sy active
-as soon as it is enabled and will
-never return to being
-.Sy enabled .
-.It Sy multi_vdev_crash_dump
-.Bl -column "READ\-ONLY COMPATIBLE" "com.joyent:multi_vdev_crash_dump"
-.It GUID Ta com.joyent:multi_vdev_crash_dump
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature allows a dump device to be configured with a pool comprised
-of multiple vdevs.
-Those vdevs may be arranged in any mirrored or raidz
-configuration.
-.\" TODO: this is not yet supported on FreeBSD.
-.\" .Pp
-.\" When the
-.\" .Sy multi_vdev_crash_dump
-.\" feature is set to
-.\" .Sy enabled ,
-.\" the administrator can use the
-.\" .Xr dumpon 8
-.\" command to configure a
-.\" dump device on a pool comprised of multiple vdevs.
-.It Sy spacemap_histogram
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:spacemap_histogram"
-.It GUID Ta com.delphix:spacemap_histogram
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature allows ZFS to maintain more information about how free space
-is organized within the pool.
-If this feature is
-.Sy enabled ,
-ZFS will
-set this feature to
-.Sy active
-when a new space map object is created or
-an existing space map is upgraded to the new format.
-Once the feature is
-.Sy active ,
-it will remain in that state until the pool is destroyed.
-.It Sy extensible_dataset
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:extensible_dataset"
-.It GUID Ta com.delphix:extensible_dataset
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature allows more flexible use of internal ZFS data structures,
-and exists for other features to depend on.
-.Pp
-This feature will be
-.Sy active
-when the first dependent feature uses it,
-and will be returned to the
-.Sy enabled
-state when all datasets that use
-this feature are destroyed.
-.It Sy bookmarks
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:bookmarks"
-.It GUID Ta com.delphix:bookmarks
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta extensible_dataset
-.El
-.Pp
-This feature enables use of the
-.Nm zfs
-.Cm bookmark
-subcommand.
-.Pp
-This feature is
-.Sy active
-while any bookmarks exist in the pool.
-All bookmarks in the pool can be listed by running
-.Nm zfs
-.Cm list
-.Fl t No bookmark Fl r Ar poolname .
-.It Sy enabled_txg
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:enabled_txg"
-.It GUID Ta com.delphix:enabled_txg
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta none
-.El
-.Pp
-Once this feature is enabled ZFS records the transaction group number
-in which new features are enabled.
-This has no user-visible impact,
-but other features may depend on this feature.
-.Pp
-This feature becomes
-.Sy active
-as soon as it is enabled and will
-never return to being
-.Sy enabled .
-.It Sy hole_birth
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:hole_birth"
-.It GUID Ta com.delphix:hole_birth
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta enabled_txg
-.El
-.Pp
-This feature improves performance of incremental sends
-.Pq Dq zfs send -i
-and receives for objects with many holes.
-The most common case of
-hole-filled objects is zvols.
-.Pp
-An incremental send stream from snapshot
-.Sy A
-to snapshot
-.Sy B
-contains information about every block that changed between
-.Sy A
-and
-.Sy B .
-Blocks which did not change between those snapshots can be
-identified and omitted from the stream using a piece of metadata called
-the 'block birth time', but birth times are not recorded for holes
-.Pq blocks filled only with zeroes .
-Since holes created after
-.Sy A
-cannot be
-distinguished from holes created before
-.Sy A ,
-information about every
-hole in the entire filesystem or zvol is included in the send stream.
-.Pp
-For workloads where holes are rare this is not a problem.
-However, when
-incrementally replicating filesystems or zvols with many holes
-.Pq for example a zvol formatted with another filesystem
-a lot of time will
-be spent sending and receiving unnecessary information about holes that
-already exist on the receiving side.
-.Pp
-Once the
-.Sy hole_birth
-feature has been enabled the block birth times
-of all new holes will be recorded.
-Incremental sends between snapshots
-created after this feature is enabled will use this new metadata to avoid
-sending information about holes that already exist on the receiving side.
-.Pp
-This feature becomes
-.Sy active
-as soon as it is enabled and will
-never return to being
-.Sy enabled .
-.It Sy embedded_data
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:embedded_data"
-.It GUID Ta com.delphix:embedded_data
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature improves the performance and compression ratio of
-highly-compressible blocks.
-Blocks whose contents can compress to 112 bytes
-or smaller can take advantage of this feature.
-.Pp
-When this feature is enabled, the contents of highly-compressible blocks are
-stored in the block "pointer" itself
-.Po a misnomer in this case, as it contains
-the compressed data, rather than a pointer to its location on disk
-.Pc .
-Thus
-the space of the block
-.Pq one sector, typically 512 bytes or 4KB
-is saved,
-and no additional i/o is needed to read and write the data block.
-.Pp
-This feature becomes
-.Sy active
-as soon as it is enabled and will
-never return to being
-.Sy enabled .
-.It Sy zpool_checkpoint
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:zpool_checkpoint"
-.It GUID Ta com.delphix:zpool_checkpoint
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature enables the "zpool checkpoint" subcommand that can
-checkpoint the state of the pool at the time it was issued and later
-rewind back to it or discard it.
-.Pp
-This feature becomes
-.Sy active
-when the "zpool checkpoint" command is used to checkpoint the pool.
-The feature will only return back to being
-.Sy enabled
-when the pool is rewound or the checkpoint has been discarded.
-.It Sy device_removal
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:device_removal"
-.It GUID Ta com.delphix:device_removal
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature enables the "zpool remove" subcommand to remove top-level
-vdevs, evacuating them to reduce the total size of the pool.
-.Pp
-This feature becomes
-.Sy active
-when the "zpool remove" command is used
-on a top-level vdev, and will never return to being
-.Sy enabled .
-.It Sy obsolete_counts
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:obsolete_counts"
-.It GUID Ta com.delphix:obsolete_counts
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta device_removal
-.El
-.Pp
-This feature is an enhancement of device_removal, which will over time
-reduce the memory used to track removed devices. When indirect blocks
-are freed or remapped, we note that their part of the indirect mapping
-is "obsolete", i.e. no longer needed. See also the "zfs remap"
-subcommand in
-.Xr zfs 8 .
-
-This feature becomes
-.Sy active
-when the "zpool remove" command is
-used on a top-level vdev, and will never return to being
-.Sy enabled .
-.It Sy spacemap_v2
-.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:spacemap_v2"
-.It GUID Ta com.delphix:spacemap_v2
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature enables the use of the new space map encoding which
-consists of two words (instead of one) whenever it is advantageous.
-The new encoding allows space maps to represent large regions of
-space more efficiently on-disk while also increasing their maximum
-addressable offset.
-.Pp
-This feature becomes
-.Sy active
-as soon as it is enabled and will
-never return to being
-.Sy enabled .
-.It Sy large_blocks
-.Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_block"
-.It GUID Ta org.open-zfs:large_block
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta extensible_dataset
-.El
-.Pp
-The
-.Sy large_block
-feature allows the record size on a dataset to be
-set larger than 128KB.
-.Pp
-This feature becomes
-.Sy active
-once a
-.Sy recordsize
-property has been set larger than 128KB, and will return to being
-.Sy enabled
-once all filesystems that have ever had their recordsize larger than 128KB
-are destroyed.
-.Pp
-Booting from datasets that use the
-.Sy large_block
-feature is supported by the
-.Fx
-boot loader.
-.It Sy large_dnode
-.Bl -column "READ\-ONLY COMPATIBLE" "org.zfsonlinux:large_dnode"
-.It GUID Ta org.zfsonlinux:large_dnode
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta extensible_dataset
-.El
-.Pp
-The
-.Sy large_dnode
-feature allows the size of dnodes in a dataset to be set larger than 512B.
-.Pp
-This feature becomes
-.Sy active
-once a dataset contains an object with a dnode larger than 512B,
-which occurs as a result of setting the
-.Sy dnodesize
-dataset property to a value other than
-.Sy legacy .
-The feature will return to being
-.Sy enabled
-once all filesystems that have ever contained a dnode larger than 512B are
-destroyed.
-Large dnodes allow more data to be stored in the bonus buffer, thus potentially
-improving performance by avoiding the use of spill blocks.
-.It Sy sha512
-.Bl -column "READ\-ONLY COMPATIBLE" "org.illumos:sha512"
-.It GUID Ta org.illumos:sha512
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta extensible_dataset
-.El
-.Pp
-The
-.Sy sha512
-feature enables the use of the SHA-512/256 truncated hash algorithm
-.Pq FIPS 180-4
-for checksum and dedup.
-The native 64-bit arithmetic of SHA-512 provides an approximate 50%
-performance boost over SHA-256 on 64-bit hardware and is thus a good
-minimum-change replacement candidate for systems where hash performance is
-important, but these systems cannot for whatever reason utilize the faster
-.Sy skein
-algorithms.
-.Pp
-When the
-.Sy sha512
-feature is set to
-.Sy enabled ,
-the administrator can turn on the
-.Sy sha512
-checksum on any dataset using the
-.Dl # zfs set checksum=sha512 Ar dataset
-command.
-This feature becomes
-.Sy active
-once a
-.Sy checksum
-property has been set to
-.Sy sha512 ,
-and will return to being
-.Sy enabled
-once all filesystems that have ever had their checksum set to
-.Sy sha512
-are destroyed.
-.It Sy skein
-.Bl -column "READ\-ONLY COMPATIBLE" "org.illumos:skein"
-.It GUID Ta org.illumos:skein
-.It READ\-ONLY COMPATIBLE Ta no
-.It DEPENDENCIES Ta extensible_dataset
-.El
-.Pp
-The
-.Sy skein
-feature enables the use of the Skein hash algorithm for checksum and dedup.
-Skein is a high-performance secure hash algorithm that was a finalist in the
-NIST SHA-3 competition.
-It provides a very high security margin and high performance on 64-bit hardware
-.Pq 80% faster than SHA-256 .
-This implementation also utilizes the new salted checksumming functionality in
-ZFS, which means that the checksum is pre-seeded with a secret 256-bit random
-key
-.Pq stored on the pool
-before being fed the data block to be checksummed.
-Thus the produced checksums are unique to a given pool, preventing hash
-collision attacks on systems with dedup.
-.Pp
-When the
-.Sy skein
-feature is set to
-.Sy enabled ,
-the administrator can turn on the
-.Sy skein
-checksum on any dataset using the
-.Dl # zfs set checksum=skein Ar dataset
-command.
-This feature becomes
-.Sy active
-once a
-.Sy checksum
-property has been set to
-.Sy skein ,
-and will return to being
-.Sy enabled
-once all filesystems that have ever had their checksum set to
-.Sy skein
-are destroyed.
-Booting off of pools using
-.Sy skein
-is supported.
-.It Sy allocation_classes
-.Bl -column "READ\-ONLY COMPATIBLE" "com.intel:allocation_classes"
-.It GUID Ta com.intel:allocation_classes
-.It READ\-ONLY COMPATIBLE Ta yes
-.It DEPENDENCIES Ta none
-.El
-.Pp
-This feature enables support for separate allocation classes.
-.Pp
-This feature becomes
-.Sy active
-when a dedicated allocation class vdev
-(dedup or special) is created with
-.Dq zpool create
-or
-.Dq zpool add .
-With device removal, it can be returned to the
-.Sy enabled
-state if all the top-level vdevs from an allocation class are removed.
-.El
-.Sh SEE ALSO
-.Xr zpool 8
-.Sh AUTHORS
-This manual page is a
-.Xr mdoc 7
-reimplementation of the
-.Tn illumos
-manual page
-.Em zpool-features(5) ,
-modified and customized for
-.Fx
-and licensed under the Common Development and Distribution License
-.Pq Tn CDDL .
-.Pp
-The
-.Xr mdoc 7
-implementation of this manual page was initially written by
-.An Martin Matuska Aq mm@FreeBSD.org .
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool.8
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool.8
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool.8
@@ -1,2485 +0,0 @@
-'\" te
-.\" Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>.
-.\" Copyright (c) 2013-2014, Xin Li <delphij@FreeBSD.org>.
-.\" All Rights Reserved.
-.\"
-.\" The contents of this file are subject to the terms of the
-.\" Common Development and Distribution License (the "License").
-.\" You may not use this file except in compliance with the License.
-.\"
-.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-.\" or http://www.opensolaris.org/os/licensing.
-.\" See the License for the specific language governing permissions
-.\" and limitations under the License.
-.\"
-.\" When distributing Covered Code, include this CDDL HEADER in each
-.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-.\" If applicable, add the following below this CDDL HEADER, with the
-.\" fields enclosed by brackets "[]" replaced with your own identifying
-.\" information: Portions Copyright [yyyy] [name of copyright owner]
-.\"
-.\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved.
-.\" Copyright (c) 2011, Justin T. Gibbs <gibbs@FreeBSD.org>
-.\" Copyright (c) 2012, Glen Barber <gjb@FreeBSD.org>
-.\" Copyright (c) 2012, 2017 by Delphix. All Rights Reserved.
-.\" Copyright 2017 Nexenta Systems, Inc.
-.\" Copyright (c) 2017 Datto Inc.
-.\" Copyright (c) 2017 George Melikov. All Rights Reserved.
-.\" Copyright 2019 Joyent, Inc.
-.\"
-.\" $FreeBSD$
-.\"
-.Dd February 25, 2020
-.Dt ZPOOL 8
-.Os
-.Sh NAME
-.Nm zpool
-.Nd configures ZFS storage pools
-.Sh SYNOPSIS
-.Nm
-.Op Fl \&?
-.Nm
-.Cm add
-.Op Fl fgLnP
-.Ar pool vdev ...
-.Nm
-.Cm attach
-.Op Fl f
-.Ar pool device new_device
-.Nm
-.Cm checkpoint
-.Op Fl d, -discard
-.Ar pool
-.Nm
-.Cm clear
-.Op Fl F Op Fl n
-.Ar pool
-.Op Ar device
-.Nm
-.Cm create
-.Op Fl fnd
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar ...
-.Op Fl O Ar file-system-property Ns = Ns Ar value
-.Ar ...
-.Op Fl m Ar mountpoint
-.Op Fl R Ar root
-.Op Fl t Ar tempname
-.Ar pool vdev ...
-.Nm
-.Cm destroy
-.Op Fl f
-.Ar pool
-.Nm
-.Cm detach
-.Ar pool device
-.Nm
-.Cm export
-.Op Fl f
-.Ar pool ...
-.Nm
-.Cm get
-.Op Fl Hp
-.Op Fl o Ar field Ns Op , Ns Ar ...
-.Ar all | property Ns Op , Ns Ar ...
-.Ar pool ...
-.Nm
-.Cm history
-.Op Fl il
-.Op Ar pool
-.Ar ...
-.Nm
-.Cm import
-.Op Fl d Ar dir | Fl c Ar cachefile
-.Op Fl D
-.Nm
-.Cm import
-.Op Fl o Ar mntopts
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar ...
-.Op Fl -rewind-to-checkpoint
-.Op Fl d Ar dir | Fl c Ar cachefile
-.Op Fl D
-.Op Fl f
-.Op Fl m
-.Op Fl N
-.Op Fl R Ar root
-.Op Fl F Op Fl n
-.Fl a
-.Nm
-.Cm import
-.Op Fl o Ar mntopts
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar ...
-.Op Fl -rewind-to-checkpoint
-.Op Fl d Ar dir | Fl c Ar cachefile
-.Op Fl D
-.Op Fl f
-.Op Fl m
-.Op Fl N
-.Op Fl R Ar root
-.Op Fl t
-.Op Fl F Op Fl n
-.Ar pool | id
-.Op Ar newpool
-.Nm
-.Cm initialize
-.Op Fl cs
-.Ar pool
-.Op Ar device Ns ...
-.Nm
-.Cm iostat
-.Op Fl v
-.Op Fl T Cm d Ns | Ns Cm u
-.Op Fl gLP
-.Op Ar pool
-.Ar ...
-.Op Ar interval Op Ar count
-.Nm
-.Cm labelclear
-.Op Fl f
-.Ar device
-.Nm
-.Cm list
-.Op Fl HgLpPv
-.Op Fl o Ar property Ns Op , Ns Ar ...
-.Op Fl T Cm d Ns | Ns Cm u
-.Op Ar pool
-.Ar ...
-.Op Ar interval Op Ar count
-.Nm
-.Cm offline
-.Op Fl t
-.Ar pool device ...
-.Nm
-.Cm online
-.Op Fl e
-.Ar pool device ...
-.Nm
-.Cm reguid
-.Ar pool
-.Nm
-.Cm remove
-.Op Fl np
-.Ar pool device ...
-.Nm
-.Cm remove
-.Fl s
-.Ar pool
-.Nm
-.Cm reopen
-.Ar pool
-.Nm
-.Cm replace
-.Op Fl f
-.Ar pool device
-.Op Ar new_device
-.Nm
-.Cm scrub
-.Op Fl s | Fl p
-.Ar pool ...
-.Nm
-.Cm set
-.Ar property Ns = Ns Ar value pool
-.Nm
-.Cm split
-.Op Fl gLnP
-.Op Fl R Ar altroot
-.Op Fl o Ar mntopts
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar pool newpool
-.Op Ar device ...
-.Nm
-.Cm status
-.Op Fl DgLPvx
-.Op Fl T Cm d Ns | Ns Cm u
-.Op Ar pool
-.Ar ...
-.Op Ar interval Op Ar count
-.Nm
-.Cm sync
-.Oo Ar pool Oc Ns ...
-.Nm
-.Cm upgrade
-.Op Fl v
-.Nm
-.Cm upgrade
-.Op Fl V Ar version
-.Fl a | Ar pool ...
-.Sh DESCRIPTION
-The
-.Nm
-command configures
-.Tn ZFS
-storage pools. A storage pool is a collection of devices that provides physical
-storage and data replication for
-.Tn ZFS
-datasets.
-.Pp
-All datasets within a storage pool share the same space. See
-.Xr zfs 8
-for information on managing datasets.
-.Ss Virtual Devices (vdevs)
-A
-.Qq virtual device
-.Pq No vdev
-describes a single device or a collection of devices organized according to
-certain performance and fault characteristics. The following virtual devices
-are supported:
-.Bl -tag -width "XXXXXX"
-.It Sy disk
-A block device, typically located under
-.Pa /dev .
-.Tn ZFS
-can use individual slices or partitions, though the recommended mode of
-operation is to use whole disks. A disk can be specified by a full path to the
-device or the
-.Xr geom 4
-provider name. When given a whole disk,
-.Tn ZFS
-automatically labels the disk, if necessary.
-.It Sy file
-A regular file. The use of files as a backing store is strongly discouraged. It
-is designed primarily for experimental purposes, as the fault tolerance of a
-file is only as good the file system of which it is a part. A file must be
-specified by a full path.
-.It Sy mirror
-A mirror of two or more devices. Data is replicated in an identical fashion
-across all components of a mirror. A mirror with
-.Em N
-disks of size
-.Em X
-can hold
-.Em X
-bytes and can withstand
-.Pq Em N-1
-devices failing before data integrity is compromised.
-.It Sy raidz
-(or
-.Sy raidz1 raidz2 raidz3 ) .
-A variation on
-.Sy RAID-5
-that allows for better distribution of parity and eliminates the
-.Qq Sy RAID-5
-write hole (in which data and parity become inconsistent after a power loss).
-Data and parity is striped across all disks within a
-.No raidz
-group.
-.Pp
-A
-.No raidz
-group can have single-, double- , or triple parity, meaning that the
-.No raidz
-group can sustain one, two, or three failures, respectively, without
-losing any data. The
-.Sy raidz1 No vdev
-type specifies a single-parity
-.No raidz
-group; the
-.Sy raidz2 No vdev
-type specifies a double-parity
-.No raidz
-group; and the
-.Sy raidz3 No vdev
-type specifies a triple-parity
-.No raidz
-group. The
-.Sy raidz No vdev
-type is an alias for
-.Sy raidz1 .
-.Pp
-A
-.No raidz
-group with
-.Em N
-disks of size
-.Em X
-with
-.Em P
-parity disks can hold approximately
-.Sm off
-.Pq Em N-P
-*X
-.Sm on
-bytes and can withstand
-.Em P
-device(s) failing before data integrity is compromised. The minimum number of
-devices in a
-.No raidz
-group is one more than the number of parity disks. The
-recommended number is between 3 and 9 to help increase performance.
-.It Sy spare
-A special
-.No pseudo- Ns No vdev
-which keeps track of available hot spares for a pool.
-For more information, see the
-.Qq Sx Hot Spares
-section.
-.It Sy log
-A separate-intent log device. If more than one log device is specified, then
-writes are load-balanced between devices. Log devices can be mirrored. However,
-.No raidz
-.No vdev
-types are not supported for the intent log. For more information,
-see the
-.Qq Sx Intent Log
-section.
-.It Sy dedup
-A device dedicated solely for allocating dedup data.
-The redundancy of this device should match the redundancy of the other normal
-devices in the pool.
-If more than one dedup device is specified, then allocations are load-balanced
-between devices.
-.It Sy special
-A device dedicated solely for allocating various kinds of internal metadata,
-and optionally small file data.
-The redundancy of this device should match the redundancy of the other normal
-devices in the pool.
-If more than one special device is specified, then allocations are
-load-balanced between devices.
-.Pp
-For more information on special allocations, see the
-.Sx Special Allocation Class
-section.
-.It Sy cache
-A device used to cache storage pool data.
-A cache device cannot be configured as a mirror or raidz group.
-For more information, see the
-.Qq Sx Cache Devices
-section.
-.El
-.Pp
-Virtual devices cannot be nested, so a mirror or
-.No raidz
-virtual device can only
-contain files or disks. Mirrors of mirrors (or other combinations) are not
-allowed.
-.Pp
-A pool can have any number of virtual devices at the top of the configuration
-(known as
-.Qq root
-.No vdev Ns s).
-Data is dynamically distributed across all top-level devices to balance data
-among devices. As new virtual devices are added,
-.Tn ZFS
-automatically places data on the newly available devices.
-.Pp
-Virtual devices are specified one at a time on the command line, separated by
-whitespace. The keywords
-.Qq mirror
-and
-.Qq raidz
-are used to distinguish where a group ends and another begins. For example, the
-following creates two root
-.No vdev Ns s,
-each a mirror of two disks:
-.Bd -literal -offset 2n
-.Li # Ic zpool create mypool mirror da0 da1 mirror da2 da3
-.Ed
-.Ss Device Failure and Recovery
-.Tn ZFS
-supports a rich set of mechanisms for handling device failure and data
-corruption. All metadata and data is checksummed, and
-.Tn ZFS
-automatically repairs bad data from a good copy when corruption is detected.
-.Pp
-In order to take advantage of these features, a pool must make use of some form
-of redundancy, using either mirrored or
-.No raidz
-groups. While
-.Tn ZFS
-supports running in a non-redundant configuration, where each root
-.No vdev
-is simply a disk or file, this is strongly discouraged. A single case of bit
-corruption can render some or all of your data unavailable.
-.Pp
-A pool's health status is described by one of three states: online, degraded,
-or faulted. An online pool has all devices operating normally. A degraded pool
-is one in which one or more devices have failed, but the data is still
-available due to a redundant configuration. A faulted pool has corrupted
-metadata, or one or more faulted devices, and insufficient replicas to continue
-functioning.
-.Pp
-The health of the top-level
-.No vdev ,
-such as mirror or
-.No raidz
-device, is
-potentially impacted by the state of its associated
-.No vdev Ns s,
-or component devices. A top-level
-.No vdev
-or component device is in one of the following states:
-.Bl -tag -width "DEGRADED"
-.It Sy DEGRADED
-One or more top-level
-.No vdev Ns s
-is in the degraded state because one or more
-component devices are offline. Sufficient replicas exist to continue
-functioning.
-.Pp
-One or more component devices is in the degraded or faulted state, but
-sufficient replicas exist to continue functioning. The underlying conditions
-are as follows:
-.Bl -bullet -offset 2n
-.It
-The number of checksum errors exceeds acceptable levels and the device is
-degraded as an indication that something may be wrong.
-.Tn ZFS
-continues to use the device as necessary.
-.It
-The number of
-.Tn I/O
-errors exceeds acceptable levels. The device could not be
-marked as faulted because there are insufficient replicas to continue
-functioning.
-.El
-.It Sy FAULTED
-One or more top-level
-.No vdev Ns s
-is in the faulted state because one or more
-component devices are offline. Insufficient replicas exist to continue
-functioning.
-.Pp
-One or more component devices is in the faulted state, and insufficient
-replicas exist to continue functioning. The underlying conditions are as
-follows:
-.Bl -bullet -offset 2n
-.It
-The device could be opened, but the contents did not match expected values.
-.It
-The number of
-.Tn I/O
-errors exceeds acceptable levels and the device is faulted to
-prevent further use of the device.
-.El
-.It Sy OFFLINE
-The device was explicitly taken offline by the
-.Qq Nm Cm offline
-command.
-.It Sy ONLINE
-The device is online and functioning.
-.It Sy REMOVED
-The device was physically removed while the system was running. Device removal
-detection is hardware-dependent and may not be supported on all platforms.
-.It Sy UNAVAIL
-The device could not be opened. If a pool is imported when a device was
-unavailable, then the device will be identified by a unique identifier instead
-of its path since the path was never correct in the first place.
-.El
-.Pp
-If a device is removed and later reattached to the system,
-.Tn ZFS
-attempts to put the device online automatically. Device attach detection is
-hardware-dependent and might not be supported on all platforms.
-.Ss Hot Spares
-.Tn ZFS
-allows devices to be associated with pools as
-.Qq hot spares .
-These devices are not actively used in the pool, but when an active device
-fails, it is automatically replaced by a hot spare. To create a pool with hot
-spares, specify a
-.Qq spare
-.No vdev
-with any number of devices. For example,
-.Bd -literal -offset 2n
-.Li # Ic zpool create pool mirror da0 da1 spare da2 da3
-.Ed
-.Pp
-Spares can be shared across multiple pools, and can be added with the
-.Qq Nm Cm add
-command and removed with the
-.Qq Nm Cm remove
-command. Once a spare replacement is initiated, a new "spare"
-.No vdev
-is created
-within the configuration that will remain there until the original device is
-replaced. At this point, the hot spare becomes available again if another
-device fails.
-.Pp
-If a pool has a shared spare that is currently being used, the pool can not be
-exported since other pools may use this shared spare, which may lead to
-potential data corruption.
-.Pp
-Shared spares add some risk.
-If the pools are imported on different hosts, and both pools suffer a device
-failure at the same time, both could attempt to use the spare at the same time.
-This may not be detected, resulting in data corruption.
-.Pp
-An in-progress spare replacement can be cancelled by detaching the hot spare.
-If the original faulted device is detached, then the hot spare assumes its
-place in the configuration, and is removed from the spare list of all active
-pools.
-.Pp
-Spares cannot replace log devices.
-.Pp
-This feature requires a userland helper.
-FreeBSD provides
-.Xr zfsd 8
-for this purpose.
-It must be manually enabled by adding
-.Va zfsd_enable="YES"
-to
-.Pa /etc/rc.conf .
-.Ss Intent Log
-The
-.Tn ZFS
-Intent Log
-.Pq Tn ZIL
-satisfies
-.Tn POSIX
-requirements for synchronous transactions. For instance, databases often
-require their transactions to be on stable storage devices when returning from
-a system call.
-.Tn NFS
-and other applications can also use
-.Xr fsync 2
-to ensure data stability. By default, the intent log is allocated from blocks
-within the main pool. However, it might be possible to get better performance
-using separate intent log devices such as
-.Tn NVRAM
-or a dedicated disk. For example:
-.Bd -literal -offset 2n
-.Li # Ic zpool create pool da0 da1 log da2
-.Ed
-.Pp
-Multiple log devices can also be specified, and they can be mirrored. See the
-.Sx EXAMPLES
-section for an example of mirroring multiple log devices.
-.Pp
-Log devices can be added, replaced, attached, detached, imported and exported
-as part of the larger pool.
-Mirrored devices can be removed by specifying the top-level mirror vdev.
-.Ss Cache devices
-Devices can be added to a storage pool as "cache devices." These devices
-provide an additional layer of caching between main memory and disk. For
-read-heavy workloads, where the working set size is much larger than what can
-be cached in main memory, using cache devices allows much more of this working
-set to be served from low latency media. Using cache devices provides the
-greatest performance improvement for random read-workloads of mostly static
-content.
-.Pp
-To create a pool with cache devices, specify a "cache"
-.No vdev
-with any number of devices. For example:
-.Bd -literal -offset 2n
-.Li # Ic zpool create pool da0 da1 cache da2 da3
-.Ed
-.Pp
-Cache devices cannot be mirrored or part of a
-.No raidz
-configuration. If a read
-error is encountered on a cache device, that read
-.Tn I/O
-is reissued to the original storage pool device, which might be part of a
-mirrored or
-.No raidz
-configuration.
-.Pp
-The content of the cache devices is considered volatile, as is the case with
-other system caches.
-.Ss Pool checkpoint
-Before starting critical procedures that include destructive actions (e.g.
-.Nm zfs Cm destroy
-), an administrator can checkpoint the pool's state and in the case of a
-mistake or failure, rewind the entire pool back to the checkpoint.
-Otherwise, the checkpoint can be discarded when the procedure has completed
-successfully.
-.Pp
-A pool checkpoint can be thought of as a pool-wide snapshot and should be used
-with care as it contains every part of the pool's state, from properties to vdev
-configuration.
-Thus, while a pool has a checkpoint certain operations are not allowed.
-Specifically, vdev removal/attach/detach, mirror splitting, and
-changing the pool's guid.
-Adding a new vdev is supported but in the case of a rewind it will have to be
-added again.
-Finally, users of this feature should keep in mind that scrubs in a pool that
-has a checkpoint do not repair checkpointed data.
-.Pp
-To create a checkpoint for a pool:
-.Bd -literal
-# zpool checkpoint pool
-.Ed
-.Pp
-To later rewind to its checkpointed state, you need to first export it and
-then rewind it during import:
-.Bd -literal
-# zpool export pool
-# zpool import --rewind-to-checkpoint pool
-.Ed
-.Pp
-To discard the checkpoint from a pool:
-.Bd -literal
-# zpool checkpoint -d pool
-.Ed
-.Pp
-Dataset reservations (controlled by the
-.Nm reservation
-or
-.Nm refreservation
-zfs properties) may be unenforceable while a checkpoint exists, because the
-checkpoint is allowed to consume the dataset's reservation.
-Finally, data that is part of the checkpoint but has been freed in the
-current state of the pool won't be scanned during a scrub.
-.Ss Special Allocation Class
-The allocations in the special class are dedicated to specific block types.
-By default this includes all metadata, the indirect blocks of user data, and
-any dedup data.
-The class can also be provisioned to accept a limited percentage of small file
-data blocks.
-.Pp
-A pool must always have at least one general (non-specified) vdev before
-other devices can be assigned to the special class.
-If the special class becomes full, then allocations intended for it will spill
-back into the normal class.
-.Pp
-Dedup data can be excluded from the special class by setting the
-.Sy vfs.zfs.ddt_data_is_special
-sysctl to false (0).
-.Pp
-Inclusion of small file blocks in the special class is opt-in.
-Each dataset can control the size of small file blocks allowed in the special
-class by setting the
-.Sy special_small_blocks
-dataset property.
-It defaults to zero so you must opt-in by setting it to a non-zero value.
-See
-.Xr zfs 1M
-for more info on setting this property.
-.Ss Properties
-Each pool has several properties associated with it. Some properties are
-read-only statistics while others are configurable and change the behavior of
-the pool. The following are read-only properties:
-.Bl -tag -width "dedupratio"
-.It Sy allocated
-Amount of storage space used within the pool.
-.It Sy capacity
-Percentage of pool space used. This property can also be referred to by its
-shortened column name, "cap".
-.It Sy dedupratio
-The deduplication ratio specified for a pool, expressed as a multiplier.
-For example, a
-.Sy dedupratio
-value of 1.76 indicates that 1.76 units of data were stored but only 1 unit of disk space was actually consumed. See
-.Xr zfs 8
-for a description of the deduplication feature.
-.It Sy expandsize
-Amount of uninitialized space within the pool or device that can be used to
-increase the total capacity of the pool.
-Uninitialized space consists of
-any space on an EFI labeled vdev which has not been brought online
-.Pq i.e. zpool online -e .
-This space occurs when a LUN is dynamically expanded.
-.It Sy fragmentation
-The amount of fragmentation in the pool.
-.It Sy free
-Number of blocks within the pool that are not allocated.
-.It Sy freeing
-After a file system or snapshot is destroyed, the space it was using is
-returned to the pool asynchronously.
-.Sy freeing
-is the amount of space remaining to be reclaimed.
-Over time
-.Sy freeing
-will decrease while
-.Sy free
-increases.
-.It Sy guid
-A unique identifier for the pool.
-.It Sy health
-The current health of the pool. Health can be
-.Qq Sy ONLINE ,
-.Qq Sy DEGRADED ,
-.Qq Sy FAULTED ,
-.Qq Sy OFFLINE ,
-.Qq Sy REMOVED ,
-or
-.Qq Sy UNAVAIL .
-.It Sy size
-Total size of the storage pool.
-.It Sy unsupported@ Ns Ar feature_guid
-Information about unsupported features that are enabled on the pool.
-See
-.Xr zpool-features 7
-for details.
-.El
-.Pp
-The space usage properties report actual physical space available to the
-storage pool. The physical space can be different from the total amount of
-space that any contained datasets can actually use. The amount of space used in
-a
-.No raidz
-configuration depends on the characteristics of the data being written.
-In addition,
-.Tn ZFS
-reserves some space for internal accounting that the
-.Xr zfs 8
-command takes into account, but the
-.Xr zpool 8
-command does not. For non-full pools of a reasonable size, these effects should
-be invisible. For small pools, or pools that are close to being completely
-full, these discrepancies may become more noticeable.
-.Pp
-The following property can be set at creation time and import time:
-.Bl -tag -width 2n
-.It Sy altroot
-Alternate root directory. If set, this directory is prepended to any mount
-points within the pool. This can be used when examining an unknown pool where
-the mount points cannot be trusted, or in an alternate boot environment, where
-the typical paths are not valid.
-.Sy altroot
-is not a persistent property. It is valid only while the system is up.
-Setting
-.Sy altroot
-defaults to using
-.Cm cachefile=none ,
-though this may be overridden using an explicit setting.
-.El
-.Pp
-The following property can only be set at import time:
-.Bl -tag -width 2n
-.It Sy readonly Ns = Ns Cm on No | Cm off
-If set to
-.Cm on ,
-pool will be imported in read-only mode with the following restrictions:
-.Bl -bullet -offset 2n
-.It
-Synchronous data in the intent log will not be accessible
-.It
-Properties of the pool can not be changed
-.It
-Datasets of this pool can only be mounted read-only
-.It
-To write to a read-only pool, an export and import of the pool is required.
-.El
-.Pp
-This property can also be referred to by its shortened column name,
-.Sy rdonly .
-.El
-.Pp
-The following properties can be set at creation time and import time, and later
-changed with the
-.Ic zpool set
-command:
-.Bl -tag -width 2n
-.It Sy autoexpand Ns = Ns Cm on No | Cm off
-Controls automatic pool expansion when the underlying LUN is grown. If set to
-.Qq Cm on ,
-the pool will be resized according to the size of the expanded
-device. If the device is part of a mirror or
-.No raidz
-then all devices within that
-.No mirror/ Ns No raidz
-group must be expanded before the new space is made available to
-the pool. The default behavior is
-.Qq off .
-This property can also be referred to by its shortened column name,
-.Sy expand .
-.It Sy autoreplace Ns = Ns Cm on No | Cm off
-Controls automatic device replacement. If set to
-.Qq Cm off ,
-device replacement must be initiated by the administrator by using the
-.Qq Nm Cm replace
-command. If set to
-.Qq Cm on ,
-any new device, found in the same
-physical location as a device that previously belonged to the pool, is
-automatically formatted and replaced. The default behavior is
-.Qq Cm off .
-This property can also be referred to by its shortened column name, "replace".
-.It Sy bootfs Ns = Ns Ar pool Ns / Ns Ar dataset
-Identifies the default bootable dataset for the root pool. This property is
-expected to be set mainly by the installation and upgrade programs.
-.It Sy cachefile Ns = Ns Ar path No | Cm none
-Controls the location of where the pool configuration is cached. Discovering
-all pools on system startup requires a cached copy of the configuration data
-that is stored on the root file system. All pools in this cache are
-automatically imported when the system boots. Some environments, such as
-install and clustering, need to cache this information in a different location
-so that pools are not automatically imported. Setting this property caches the
-pool configuration in a different location that can later be imported with
-.Qq Nm Cm import Fl c .
-Setting it to the special value
-.Qq Cm none
-creates a temporary pool that is never cached, and the special value
-.Cm ''
-(empty string) uses the default location.
-.It Sy comment Ns = Ns Ar text
-A text string consisting of printable ASCII characters that will be stored
-such that it is available even if the pool becomes faulted.
-An administrator can provide additional information about a pool using this
-property.
-.It Sy dedupditto Ns = Ns Ar number
-Threshold for the number of block ditto copies. If the reference count for a
-deduplicated block increases above this number, a new ditto copy of this block
-is automatically stored. Default setting is
-.Cm 0
-which causes no ditto copies to be created for deduplicated blocks.
-The minimum legal nonzero setting is 100.
-.It Sy delegation Ns = Ns Cm on No | Cm off
-Controls whether a non-privileged user is granted access based on the dataset
-permissions defined on the dataset. See
-.Xr zfs 8
-for more information on
-.Tn ZFS
-delegated administration.
-.It Sy failmode Ns = Ns Cm wait No | Cm continue No | Cm panic
-Controls the system behavior in the event of catastrophic pool failure. This
-condition is typically a result of a loss of connectivity to the underlying
-storage device(s) or a failure of all devices within the pool. The behavior of
-such an event is determined as follows:
-.Bl -tag -width indent
-.It Sy wait
-Blocks all
-.Tn I/O
-access until the device connectivity is recovered and the errors are cleared.
-This is the default behavior.
-.It Sy continue
-Returns
-.Em EIO
-to any new write
-.Tn I/O
-requests but allows reads to any of the remaining healthy devices. Any write
-requests that have yet to be committed to disk would be blocked.
-.It Sy panic
-Prints out a message to the console and generates a system crash dump.
-.El
-.It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled
-The value of this property is the current state of
-.Ar feature_name .
-The only valid value when setting this property is
-.Sy enabled
-which moves
-.Ar feature_name
-to the enabled state.
-See
-.Xr zpool-features 7
-for details on feature states.
-.It Sy listsnapshots Ns = Ns Cm on No | Cm off
-Controls whether information about snapshots associated with this pool is
-output when
-.Qq Nm zfs Cm list
-is run without the
-.Fl t
-option. The default value is
-.Cm off .
-This property can also be referred to by its shortened name,
-.Sy listsnaps .
-.It Sy multihost Ns = Ns Sy on No | Sy off
-Controls whether a pool activity check should be performed during
-.Nm zpool Cm import .
-When a pool is determined to be active it cannot be imported, even with the
-.Fl f
-option.
-This property is intended to be used in failover configurations
-where multiple hosts have access to a pool on shared storage.
-.Pp
-Multihost provides protection on import only.
-It does not protect against an
-individual device being used in multiple pools, regardless of the type of vdev.
-See the discussion under
-.Sy zpool create.
-.Pp
-When this property is on, periodic writes to storage occur to show the pool is
-in use.
-See
-.Sy vfs.zfs.multihost_interval
-sysctl.
-In order to enable this property each host must set a unique hostid.
-The default value is
-.Sy off .
-.It Sy version Ns = Ns Ar version
-The current on-disk version of the pool. This can be increased, but never
-decreased. The preferred method of updating pools is with the
-.Qq Nm Cm upgrade
-command, though this property can be used when a specific version is needed
-for backwards compatibility.
-Once feature flags is enabled on a pool this property will no longer have a
-value.
-.El
-.Sh SUBCOMMANDS
-All subcommands that modify state are logged persistently to the pool in their
-original form.
-.Pp
-The
-.Nm
-command provides subcommands to create and destroy storage pools, add capacity
-to storage pools, and provide information about the storage pools. The following
-subcommands are supported:
-.Bl -tag -width 2n
-.It Xo
-.Nm
-.Op Fl \&?
-.Xc
-.Pp
-Displays a help message.
-.It Xo
-.Nm
-.Cm add
-.Op Fl fgLnP
-.Ar pool vdev ...
-.Xc
-.Pp
-Adds the specified virtual devices to the given pool. The
-.No vdev
-specification is described in the
-.Qq Sx Virtual Devices
-section. The behavior of the
-.Fl f
-option, and the device checks performed are described in the
-.Qq Nm Cm create
-subcommand.
-.Bl -tag -width indent
-.It Fl f
-Forces use of
-.Ar vdev ,
-even if they appear in use or specify a conflicting replication level.
-Not all devices can be overridden in this manner.
-.It Fl g
-Display
-.Ar vdev
-GUIDs instead of the normal device names.
-These GUIDs can be used in place of
-device names for the zpool detach/offline/remove/replace commands.
-.It Fl L
-Display real paths for
-.Ar vdev Ns s
-resolving all symbolic links.
-This can be used to look up the current block
-device name regardless of the /dev/disk/ path used to open it.
-.It Fl n
-Displays the configuration that would be used without actually adding the
-.Ar vdev Ns s.
-The actual pool creation can still fail due to insufficient privileges or
-device sharing.
-.It Fl P
-Display real paths for
-.Ar vdev Ns s
-instead of only the last component of the path.
-This can be used in conjunction with the
-.Fl L
-flag.
-.El
-.It Xo
-.Nm
-.Cm attach
-.Op Fl f
-.Ar pool device new_device
-.Xc
-.Pp
-Attaches
-.Ar new_device
-to an existing
-.Sy zpool
-device. The existing device cannot be part of a
-.No raidz
-configuration. If
-.Ar device
-is not currently part of a mirrored configuration,
-.Ar device
-automatically transforms into a two-way mirror of
-.Ar device No and Ar new_device .
-If
-.Ar device
-is part of a two-way mirror, attaching
-.Ar new_device
-creates a three-way mirror, and so on. In either case,
-.Ar new_device
-begins to resilver immediately.
-.Bl -tag -width indent
-.It Fl f
-Forces use of
-.Ar new_device ,
-even if it appears to be in use. Not all devices can be overridden in this
-manner.
-.El
-.It Xo
-.Nm
-.Cm checkpoint
-.Op Fl d, -discard
-.Ar pool
-.Xc
-Checkpoints the current state of
-.Ar pool
-, which can be later restored by
-.Nm zpool Cm import --rewind-to-checkpoint .
-The existence of a checkpoint in a pool prohibits the following
-.Nm zpool
-commands:
-.Cm remove ,
-.Cm attach ,
-.Cm detach ,
-.Cm split ,
-and
-.Cm reguid .
-In addition, it may break reservation boundaries if the pool lacks free
-space.
-The
-.Nm zpool Cm status
-command indicates the existence of a checkpoint or the progress of discarding a
-checkpoint from a pool.
-The
-.Nm zpool Cm list
-command reports how much space the checkpoint takes from the pool.
-.Bl -tag -width Ds
-.It Fl d, -discard
-Discards an existing checkpoint from
-.Ar pool .
-.El
-.It Xo
-.Nm
-.Cm clear
-.Op Fl F Op Fl n
-.Ar pool
-.Op Ar device
-.Xc
-.Pp
-Clears device errors in a pool.
-If no arguments are specified, all device errors within the pool are cleared.
-If one or more devices is specified, only those errors associated with the
-specified device or devices are cleared.
-If multihost is enabled, and the pool has been suspended, this will not
-resume I/O.
-While the pool was suspended, it may have been imported on
-another host, and resuming I/O could result in pool damage.
-.Bl -tag -width indent
-.It Fl F
-Initiates recovery mode for an unopenable pool. Attempts to discard the last
-few transactions in the pool to return it to an openable state. Not all damaged
-pools can be recovered by using this option. If successful, the data from the
-discarded transactions is irretrievably lost.
-.It Fl n
-Used in combination with the
-.Fl F
-flag. Check whether discarding transactions would make the pool openable, but
-do not actually discard any transactions.
-.El
-.It Xo
-.Nm
-.Cm create
-.Op Fl fnd
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar ...
-.Op Fl O Ar file-system-property Ns = Ns Ar value
-.Ar ...
-.Op Fl m Ar mountpoint
-.Op Fl R Ar root
-.Op Fl t Ar tempname
-.Ar pool vdev ...
-.Xc
-.Pp
-Creates a new storage pool containing the virtual devices specified on the
-command line. The pool name must begin with a letter, and can only contain
-alphanumeric characters as well as underscore ("_"), dash ("-"), and period
-("."). The pool names "mirror", "raidz", "spare" and "log" are reserved, as are
-names beginning with the pattern "c[0-9]". The
-.No vdev
-specification is described in the
-.Qq Sx Virtual Devices
-section.
-.Pp
-The command attempts to verify that each device specified is accessible and not
-currently in use by another subsystem.
-However this check is not robust enough
-to detect simultaneous attempts to use a new device in different pools, even if
-.Sy multihost
-is
-.Sy enabled.
-The
-administrator must ensure that simultaneous invocations of any combination of
-.Sy zpool replace ,
-.Sy zpool create ,
-.Sy zpool add ,
-or
-.Sy zpool labelclear ,
-do not refer to the same device.
-Using the same device in two pools will
-result in pool corruption.
-.Pp
-There are some uses, such as being currently mounted, or specified as the
-dedicated dump device, that prevent a device from ever being used by ZFS.
-Other uses, such as having a preexisting UFS file system, can be overridden
-with the
-.Fl f
-option.
-.Pp
-The command also checks that the replication strategy for the pool is
-consistent. An attempt to combine redundant and non-redundant storage in a
-single pool, or to mix disks and files, results in an error unless
-.Fl f
-is specified. The use of differently sized devices within a single
-.No raidz
-or mirror group is also flagged as an error unless
-.Fl f
-is specified.
-.Pp
-Unless the
-.Fl R
-option is specified, the default mount point is
-.Qq Pa /pool .
-The mount point must not exist or must be empty, or else the
-root dataset cannot be mounted. This can be overridden with the
-.Fl m
-option.
-.Pp
-By default all supported features are enabled on the new pool unless the
-.Fl d
-option is specified.
-.Bl -tag -width indent
-.It Fl f
-Forces use of
-.Ar vdev Ns s,
-even if they appear in use or specify a conflicting replication level.
-Not all devices can be overridden in this manner.
-.It Fl n
-Displays the configuration that would be used without actually creating the
-pool. The actual pool creation can still fail due to insufficient privileges or
-device sharing.
-.It Fl d
-Do not enable any features on the new pool.
-Individual features can be enabled by setting their corresponding properties
-to
-.Sy enabled
-with the
-.Fl o
-option.
-See
-.Xr zpool-features 7
-for details about feature properties.
-.It Xo
-.Fl o Ar property Ns = Ns Ar value
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar ...
-.Xc
-Sets the given pool properties. See the
-.Qq Sx Properties
-section for a list of valid properties that can be set.
-.It Xo
-.Fl O
-.Ar file-system-property Ns = Ns Ar value
-.Op Fl O Ar file-system-property Ns = Ns Ar value
-.Ar ...
-.Xc
-Sets the given file system properties in the root file system of the pool. See
-.Xr zfs 8 Properties
-for a list of valid properties that
-can be set.
-.It Fl R Ar root
-Equivalent to
-.Qq Fl o Cm cachefile=none,altroot= Ns Pa root
-.It Fl m Ar mountpoint
-Sets the mount point for the root dataset. The default mount point is
-.Qq Pa /pool
-or
-.Qq Cm altroot Ns Pa /pool
-if
-.Sy altroot
-is specified. The mount point must be an absolute path,
-.Qq Cm legacy ,
-or
-.Qq Cm none .
-For more information on dataset mount points, see
-.Xr zfs 8 .
-.It Fl t Ar tempname
-Sets the in-core pool name to
-.Pa tempname
-while the on-disk name will be the name specified as the pool name
-.Pa pool .
-This will set the default
-.Sy cachefile
-property to
-.Sy none .
-This is intended to handle name space collisions when creating pools
-for other systems, such as virtual machines or physical machines
-whose pools live on network block devices.
-.El
-.It Xo
-.Nm
-.Cm destroy
-.Op Fl f
-.Ar pool
-.Xc
-.Pp
-Destroys the given pool, freeing up any devices for other use. This command
-tries to unmount any active datasets before destroying the pool.
-.Bl -tag -width indent
-.It Fl f
-Forces any active datasets contained within the pool to be unmounted.
-.El
-.It Xo
-.Nm
-.Cm detach
-.Ar pool device
-.Xc
-.Pp
-Detaches
-.Ar device
-from a mirror. The operation is refused if there are no other valid replicas
-of the data.
-.It Xo
-.Nm
-.Cm export
-.Op Fl f
-.Ar pool ...
-.Xc
-.Pp
-Exports the given pools from the system. All devices are marked as exported,
-but are still considered in use by other subsystems. The devices can be moved
-between systems (even those of different endianness) and imported as long as a
-sufficient number of devices are present.
-.Pp
-Before exporting the pool, all datasets within the pool are unmounted. A pool
-can not be exported if it has a shared spare that is currently being used.
-.Pp
-For pools to be portable, you must give the
-.Nm
-command whole disks, not just slices, so that
-.Tn ZFS
-can label the disks with portable
-.Sy EFI
-labels. Otherwise, disk drivers on platforms of different endianness will not
-recognize the disks.
-.Bl -tag -width indent
-.It Fl f
-Forcefully unmount all datasets, using the
-.Qq Nm unmount Fl f
-command.
-.Pp
-This command will forcefully export the pool even if it has a shared spare that
-is currently being used. This may lead to potential data corruption.
-.El
-.It Xo
-.Nm
-.Cm get
-.Op Fl Hp
-.Op Fl o Ar field Ns Op , Ns Ar ...
-.Ar all | property Ns Op , Ns Ar ...
-.Ar pool ...
-.Xc
-.Pp
-Retrieves the given list of properties (or all properties if
-.Qq Cm all
-is used) for the specified storage pool(s). These properties are displayed with
-the following fields:
-.Bl -column -offset indent "property"
-.It name Ta Name of storage pool
-.It property Ta Property name
-.It value Ta Property value
-.It source Ta Property source, either 'default' or 'local'.
-.El
-.Pp
-See the
-.Qq Sx Properties
-section for more information on the available pool properties.
-.It Fl H
-Scripted mode. Do not display headers, and separate fields by a single tab
-instead of arbitrary space.
-.It Fl p
-Display numbers in parsable (exact) values.
-.It Fl o Ar field
-A comma-separated list of columns to display.
-.Sy name Ns , Ns
-.Sy property Ns , Ns
-.Sy value Ns , Ns
-.Sy source
-is the default value.
-.It Xo
-.Nm
-.Cm history
-.Op Fl il
-.Op Ar pool
-.Ar ...
-.Xc
-.Pp
-Displays the command history of the specified pools or all pools if no pool is
-specified.
-.Bl -tag -width indent
-.It Fl i
-Displays internally logged
-.Tn ZFS
-events in addition to user initiated events.
-.It Fl l
-Displays log records in long format, which in addition to standard format
-includes, the user name, the hostname, and the zone in which the operation was
-performed.
-.El
-.It Xo
-.Nm
-.Cm import
-.Op Fl d Ar dir | Fl c Ar cachefile
-.Op Fl D
-.Xc
-.Pp
-Lists pools available to import. If the
-.Fl d
-option is not specified, this command searches for devices in
-.Qq Pa /dev .
-The
-.Fl d
-option can be specified multiple times, and all directories are searched. If
-the device appears to be part of an exported pool, this command displays a
-summary of the pool with the name of the pool, a numeric identifier, as well as
-the
-.No vdev
-layout and current health of the device for each device or file.
-Destroyed pools, pools that were previously destroyed with the
-.Qq Nm Cm destroy
-command, are not listed unless the
-.Fl D
-option is specified.
-.Pp
-The numeric identifier is unique, and can be used instead of the pool name when
-multiple exported pools of the same name are available.
-.Bl -tag -width indent
-.It Fl c Ar cachefile
-Reads configuration from the given
-.Ar cachefile
-that was created with the
-.Qq Sy cachefile
-pool property. This
-.Ar cachefile
-is used instead of searching for devices.
-.It Fl d Ar dir
-Searches for devices or files in
-.Ar dir .
-The
-.Fl d
-option can be specified multiple times.
-.It Fl D
-Lists destroyed pools only.
-.El
-.It Xo
-.Nm
-.Cm import
-.Op Fl o Ar mntopts
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar ...
-.Op Fl d Ar dir | Fl c Ar cachefile
-.Op Fl D
-.Op Fl f
-.Op Fl m
-.Op Fl N
-.Op Fl R Ar root
-.Op Fl F Op Fl n
-.Fl a
-.Xc
-.Pp
-Imports all pools found in the search directories. Identical to the previous
-command, except that all pools with a sufficient number of devices available
-are imported. Destroyed pools, pools that were previously destroyed with the
-.Qq Nm Cm destroy
-command, will not be imported unless the
-.Fl D
-option is specified.
-.Bl -tag -width indent
-.It Fl o Ar mntopts
-Comma-separated list of mount options to use when mounting datasets within the
-pool. See
-.Xr zfs 8
-for a description of dataset properties and mount options.
-.It Fl o Ar property Ns = Ns Ar value
-Sets the specified property on the imported pool. See the
-.Qq Sx Properties
-section for more information on the available pool properties.
-.It Fl c Ar cachefile
-Reads configuration from the given
-.Ar cachefile
-that was created with the
-.Qq Sy cachefile
-pool property. This
-.Ar cachefile
-is used instead of searching for devices.
-.It Fl d Ar dir
-Searches for devices or files in
-.Ar dir .
-The
-.Fl d
-option can be specified multiple times. This option is incompatible with the
-.Fl c
-option.
-.It Fl D
-Imports destroyed pools only. The
-.Fl f
-option is also required.
-.It Fl f
-Forces import, even if the pool appears to be potentially active.
-.It Fl m
-Allows a pool to import when there is a missing log device. Recent transactions
-can be lost because the log device will be discarded.
-.It Fl N
-Import the pool without mounting any file systems.
-.It Fl R Ar root
-Sets the
-.Qq Sy cachefile
-property to
-.Qq Cm none
-and the
-.Qq Sy altroot
-property to
-.Qq Ar root
-.It Fl F
-Recovery mode for a non-importable pool. Attempt to return the pool to an
-importable state by discarding the last few transactions. Not all damaged pools
-can be recovered by using this option. If successful, the data from the
-discarded transactions is irretrievably lost. This option is ignored if the
-pool is importable or already imported.
-.It Fl n
-Used with the
-.Fl F
-recovery option. Determines whether a non-importable pool can be made
-importable again, but does not actually perform the pool recovery. For more
-details about pool recovery mode, see the
-.Fl F
-option, above.
-.It Fl a
-Searches for and imports all pools found.
-.El
-.It Xo
-.Nm
-.Cm import
-.Op Fl o Ar mntopts
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar ...
-.Op Fl d Ar dir | Fl c Ar cachefile
-.Op Fl D
-.Op Fl f
-.Op Fl m
-.Op Fl N
-.Op Fl R Ar root
-.Op Fl t
-.Op Fl F Op Fl n
-.Ar pool | id
-.Op Ar newpool
-.Xc
-.Pp
-Imports a specific pool. A pool can be identified by its name or the numeric
-identifier. If
-.Ar newpool
-is specified, the pool is imported using the name
-.Ar newpool .
-Otherwise, it is imported with the same name as its exported name.
-.Pp
-If a device is removed from a system without running
-.Qq Nm Cm export
-first, the device appears as potentially active. It cannot be determined if
-this was a failed export, or whether the device is really in use from another
-host. To import a pool in this state, the
-.Fl f
-option is required.
-.Bl -tag -width indent
-.It Fl o Ar mntopts
-Comma-separated list of mount options to use when mounting datasets within the
-pool. See
-.Xr zfs 8
-for a description of dataset properties and mount options.
-.It Fl o Ar property Ns = Ns Ar value
-Sets the specified property on the imported pool. See the
-.Qq Sx Properties
-section for more information on the available pool properties.
-.It Fl c Ar cachefile
-Reads configuration from the given
-.Ar cachefile
-that was created with the
-.Qq Sy cachefile
-pool property. This
-.Ar cachefile
-is used instead of searching for devices.
-.It Fl d Ar dir
-Searches for devices or files in
-.Ar dir .
-The
-.Fl d
-option can be specified multiple times. This option is incompatible with the
-.Fl c
-option.
-.It Fl D
-Imports destroyed pools only. The
-.Fl f
-option is also required.
-.It Fl f
-Forces import, even if the pool appears to be potentially active.
-.It Fl m
-Allows a pool to import when there is a missing log device. Recent transactions
-can be lost because the log device will be discarded.
-.It Fl N
-Import the pool without mounting any file systems.
-.It Fl R Ar root
-Equivalent to
-.Qq Fl o Cm cachefile=none,altroot= Ns Pa root
-.It Fl t
-Used with
-.Ar newpool .
-Specifies that
-.Ar newpool
-is temporary.
-Temporary pool names last until export.
-Ensures that the original pool name will be used in all label updates and
-therefore is retained upon export.
-Will also set
-.Sy cachefile
-property to
-.Sy none
-when not explicitly specified.
-.It Fl F
-Recovery mode for a non-importable pool. Attempt to return the pool to an
-importable state by discarding the last few transactions. Not all damaged pools
-can be recovered by using this option. If successful, the data from the
-discarded transactions is irretrievably lost. This option is ignored if the
-pool is importable or already imported.
-.It Fl n
-Used with the
-.Fl F
-recovery option. Determines whether a non-importable pool can be made
-importable again, but does not actually perform the pool recovery. For more
-details about pool recovery mode, see the
-.Fl F
-option, above.
-.It Fl -rewind-to-checkpoint
-Rewinds pool to the checkpointed state.
-Once the pool is imported with this flag there is no way to undo the rewind.
-All changes and data that were written after the checkpoint are lost!
-The only exception is when the
-.Sy readonly
-mounting option is enabled.
-In this case, the checkpointed state of the pool is opened and an
-administrator can see how the pool would look like if they were
-to fully rewind.
-.El
-.It Xo
-.Nm
-.Cm initialize
-.Op Fl cs
-.Ar pool
-.Op Ar device Ns ...
-.Xc
-Begins initializing by writing to all unallocated regions on the specified
-devices, or all eligible devices in the pool if no individual devices are
-specified.
-Only leaf data or log devices may be initialized.
-.Bl -tag -width Ds
-.It Fl c, -cancel
-Cancel initializing on the specified devices, or all eligible devices if none
-are specified.
-If one or more target devices are invalid or are not currently being
-initialized, the command will fail and no cancellation will occur on any device.
-.It Fl s -suspend
-Suspend initializing on the specified devices, or all eligible devices if none
-are specified.
-If one or more target devices are invalid or are not currently being
-initialized, the command will fail and no suspension will occur on any device.
-Initializing can then be resumed by running
-.Nm zpool Cm initialize
-with no flags on the relevant target devices.
-.El
-.It Xo
-.Nm
-.Cm iostat
-.Op Fl T Cm d Ns | Ns Cm u
-.Op Fl gLPv
-.Op Ar pool
-.Ar ...
-.Op Ar interval Op Ar count
-.Xc
-.Pp
-Displays
-.Tn I/O
-statistics for the given pools. When given an interval, the statistics are
-printed every
-.Ar interval
-seconds until
-.Sy Ctrl-C
-is pressed. If no
-.Ar pools
-are specified, statistics for every pool in the system is shown. If
-.Ar count
-is specified, the command exits after
-.Ar count
-reports are printed.
-.Bl -tag -width indent
-.It Fl T Cm d Ns | Ns Cm u
-Print a timestamp.
-.Pp
-Use modifier
-.Cm d
-for standard date format. See
-.Xr date 1 .
-Use modifier
-.Cm u
-for unixtime
-.Pq equals Qq Ic date +%s .
-.It Fl g
-Display vdev GUIDs instead of the normal device names.
-These GUIDs can be used in place of device names for the zpool
-detach/offline/remove/replace commands.
-.It Fl L
-Display real paths for vdevs resolving all symbolic links.
-This can be used to look up the current block device name regardless of the
-.Pa /dev/disk/
-path used to open it.
-.It Fl P
-Display full paths for vdevs instead of only the last component of
-the path.
-This can be used in conjunction with the
-.Fl L
-flag.
-.It Fl v
-Verbose statistics.
-Reports usage statistics for individual vdevs within the
-pool, in addition to the pool-wide statistics.
-.El
-.It Xo
-.Nm
-.Cm labelclear
-.Op Fl f
-.Ar device
-.Xc
-.Pp
-Removes
-.Tn ZFS
-label information from the specified
-.Ar device .
-The
-.Ar device
-must not be part of an active pool configuration.
-.Bl -tag -width indent
-.It Fl f
-Treat exported or foreign devices as inactive.
-.El
-.It Xo
-.Nm
-.Cm list
-.Op Fl HgLpPv
-.Op Fl o Ar property Ns Op , Ns Ar ...
-.Op Fl T Cm d Ns | Ns Cm u
-.Op Ar pool
-.Ar ...
-.Op Ar interval Op Ar count
-.Xc
-.Pp
-Lists the given pools along with a health status and space usage. If no
-.Ar pools
-are specified, all pools in the system are listed.
-.Pp
-When given an interval, the output is printed every
-.Ar interval
-seconds until
-.Sy Ctrl-C
-is pressed. If
-.Ar count
-is specified, the command exits after
-.Ar count
-reports are printed.
-.Bl -tag -width indent
-.It Fl T Cm d Ns | Ns Cm u
-Print a timestamp.
-.Pp
-Use modifier
-.Cm d
-for standard date format. See
-.Xr date 1 .
-Use modifier
-.Cm u
-for unixtime
-.Pq equals Qq Ic date +%s .
-.It Fl g
-Display vdev GUIDs instead of the normal device names.
-These GUIDs can be used in place of device names for the zpool
-detach/offline/remove/replace commands.
-.It Fl H
-Scripted mode. Do not display headers, and separate fields by a single tab
-instead of arbitrary space.
-.It Fl L
-Display real paths for vdevs resolving all symbolic links.
-This can be used to look up the current block device name regardless of the
-/dev/disk/ path used to open it.
-.It Fl p
-Display numbers in parsable
-.Pq exact
-values.
-.It Fl P
-Display full paths for vdevs instead of only the last component of
-the path.
-This can be used in conjunction with the
-.Fl L
-flag.
-.It Fl v
-Verbose statistics. Reports usage statistics for individual
-.Em vdevs
-within
-the pool, in addition to the pool-wide statistics.
-.It Fl o Ar property Ns Op , Ns Ar ...
-Comma-separated list of properties to display. See the
-.Qq Sx Properties
-section for a list of valid properties. The default list is
-.Sy name ,
-.Sy size ,
-.Sy allocated ,
-.Sy free ,
-.Sy checkpoint ,
-.Sy expandsize ,
-.Sy fragmentation ,
-.Sy capacity ,
-.Sy dedupratio ,
-.Sy health ,
-.Sy altroot .
-.It Fl T Cm d Ns | Ns Cm u
-Print a timestamp.
-.Pp
-Use modifier
-.Cm d
-for standard date format. See
-.Xr date 1 .
-Use modifier
-.Cm u
-for unixtime
-.Pq equals Qq Ic date +%s .
-.El
-.It Xo
-.Nm
-.Cm offline
-.Op Fl t
-.Ar pool device ...
-.Xc
-.Pp
-Takes the specified physical device offline. While the
-.Ar device
-is offline, no attempt is made to read or write to the device.
-.Bl -tag -width indent
-.It Fl t
-Temporary. Upon reboot, the specified physical device reverts to its previous
-state.
-.El
-.It Xo
-.Nm
-.Cm online
-.Op Fl e
-.Ar pool device ...
-.Xc
-.Pp
-Brings the specified physical device online.
-.Pp
-This command is not applicable to spares or cache devices.
-.Bl -tag -width indent
-.It Fl e
-Expand the device to use all available space. If the device is part of a mirror
-or
-.No raidz
-then all devices must be expanded before the new space will become
-available to the pool.
-.El
-.It Xo
-.Nm
-.Cm reguid
-.Ar pool
-.Xc
-.Pp
-Generates a new unique identifier for the pool. You must ensure that all
-devices in this pool are online and healthy before performing this action.
-.It Xo
-.Nm
-.Cm remove
-.Op Fl np
-.Ar pool device ...
-.Xc
-.Pp
-Removes the specified device from the pool.
-This command currently only supports removing hot spares, cache, log
-devices and mirrored top-level vdevs (mirror of leaf devices); but not raidz.
-.Pp
-Removing a top-level vdev reduces the total amount of space in the storage pool.
-The specified device will be evacuated by copying all allocated space from it to
-the other devices in the pool.
-In this case, the
-.Nm zpool Cm remove
-command initiates the removal and returns, while the evacuation continues in
-the background.
-The removal progress can be monitored with
-.Nm zpool Cm status.
-This feature must be enabled to be used, see
-.Xr zpool-features 7
-.Pp
-A mirrored top-level device (log or data) can be removed by specifying the
-top-level mirror for the same.
-Non-log devices or data devices that are part of a mirrored configuration can
-be removed using the
-.Qq Nm Cm detach
-command.
-.Bl -tag -width Ds
-.It Fl n
-Do not actually perform the removal ("no-op").
-Instead, print the estimated amount of memory that will be used by the
-mapping table after the removal completes.
-This is nonzero only for top-level vdevs.
-.El
-.Bl -tag -width Ds
-.It Fl p
-Used in conjunction with the
-.Fl n
-flag, displays numbers as parsable (exact) values.
-.El
-.It Xo
-.Nm
-.Cm remove
-.Fl s
-.Ar pool
-.Xc
-.Pp
-Stops and cancels an in-progress removal of a top-level vdev.
-.It Xo
-.Nm
-.Cm reopen
-.Ar pool
-.Xc
-.Pp
-Reopen all the vdevs associated with the pool.
-.It Xo
-.Nm
-.Cm replace
-.Op Fl f
-.Ar pool device
-.Op Ar new_device
-.Xc
-.Pp
-Replaces
-.Ar old_device
-with
-.Ar new_device .
-This is equivalent to attaching
-.Ar new_device ,
-waiting for it to resilver, and then detaching
-.Ar old_device .
-.Pp
-The size of
-.Ar new_device
-must be greater than or equal to the minimum size
-of all the devices in a mirror or
-.No raidz
-configuration.
-.Pp
-.Ar new_device
-is required if the pool is not redundant. If
-.Ar new_device
-is not specified, it defaults to
-.Ar old_device .
-This form of replacement is useful after an existing disk has failed and has
-been physically replaced. In this case, the new disk may have the same
-.Pa /dev
-path as the old device, even though it is actually a different disk.
-.Tn ZFS
-recognizes this.
-.Bl -tag -width indent
-.It Fl f
-Forces use of
-.Ar new_device ,
-even if its appears to be in use. Not all devices can be overridden in this
-manner.
-.El
-.It Xo
-.Nm
-.Cm scrub
-.Op Fl s | Fl p
-.Ar pool ...
-.Xc
-.Pp
-Begins a scrub or resumes a paused scrub.
-The scrub examines all data in the specified pools to verify that it checksums
-correctly.
-For replicated
-.Pq mirror or raidz
-devices, ZFS automatically repairs any damage discovered during the scrub.
-The
-.Nm zpool Cm status
-command reports the progress of the scrub and summarizes the results of the
-scrub upon completion.
-.Pp
-Scrubbing and resilvering are very similar operations.
-The difference is that resilvering only examines data that ZFS knows to be out
-of date
-.Po
-for example, when attaching a new device to a mirror or replacing an existing
-device
-.Pc ,
-whereas scrubbing examines all data to discover silent errors due to hardware
-faults or disk failure.
-.Pp
-Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows
-one at a time.
-If a scrub is paused, the
-.Nm zpool Cm scrub
-resumes it.
-If a resilver is in progress, ZFS does not allow a scrub to be started until the
-resilver completes.
-.Bl -tag -width Ds
-.It Fl s
-Stop scrubbing.
-.El
-.Bl -tag -width Ds
-.It Fl p
-Pause scrubbing.
-Scrub pause state and progress are periodically synced to disk.
-If the system is restarted or pool is exported during a paused scrub,
-even after import, scrub will remain paused until it is resumed.
-Once resumed the scrub will pick up from the place where it was last
-checkpointed to disk.
-To resume a paused scrub issue
-.Nm zpool Cm scrub
-again.
-.El
-.It Xo
-.Nm
-.Cm set
-.Ar property Ns = Ns Ar value pool
-.Xc
-.Pp
-Sets the given property on the specified pool. See the
-.Qq Sx Properties
-section for more information on what properties can be set and acceptable
-values.
-.It Xo
-.Nm
-.Cm split
-.Op Fl gLnP
-.Op Fl R Ar altroot
-.Op Fl o Ar mntopts
-.Op Fl o Ar property Ns = Ns Ar value
-.Ar pool newpool
-.Op Ar device ...
-.Xc
-.Pp
-Splits off one disk from each mirrored top-level
-.No vdev
-in a pool and creates a new pool from the split-off disks. The original pool
-must be made up of one or more mirrors and must not be in the process of
-resilvering. The
-.Cm split
-subcommand chooses the last device in each mirror
-.No vdev
-unless overridden by a device specification on the command line.
-.Pp
-When using a
-.Ar device
-argument,
-.Cm split
-includes the specified device(s) in a new pool and, should any devices remain
-unspecified, assigns the last device in each mirror
-.No vdev
-to that pool, as it does normally. If you are uncertain about the outcome of a
-.Cm split
-command, use the
-.Fl n
-("dry-run") option to ensure your command will have the effect you intend.
-.Bl -tag -width indent
-.It Fl R Ar altroot
-Automatically import the newly created pool after splitting, using the
-specified
-.Ar altroot
-parameter for the new pool's alternate root. See the
-.Sy altroot
-description in the
-.Qq Sx Properties
-section, above.
-.It Fl g
-Display vdev GUIDs instead of the normal device names.
-These GUIDs can be used in place of device names for the zpool
-detach/offline/remove/replace commands.
-.It Fl L
-Display real paths for vdevs resolving all symbolic links.
-This can be used to look up the current block device name regardless of the
-.Pa /dev/disk/
-path used to open it.
-.It Fl n
-Displays the configuration that would be created without actually splitting the
-pool. The actual pool split could still fail due to insufficient privileges or
-device status.
-.It Fl o Ar mntopts
-Comma-separated list of mount options to use when mounting datasets within the
-pool. See
-.Xr zfs 8
-for a description of dataset properties and mount options. Valid only in
-conjunction with the
-.Fl R
-option.
-.It Fl o Ar property Ns = Ns Ar value
-Sets the specified property on the new pool. See the
-.Qq Sx Properties
-section, above, for more information on the available pool properties.
-.It Fl P
-Display full paths for vdevs instead of only the last component of
-the path.
-This can be used in conjunction with the
-.Fl L
-flag.
-.El
-.It Xo
-.Nm
-.Cm status
-.Op Fl DgLPvx
-.Op Fl T Cm d Ns | Ns Cm u
-.Op Ar pool
-.Ar ...
-.Op Ar interval Op Ar count
-.Xc
-.Pp
-Displays the detailed health status for the given pools. If no
-.Ar pool
-is specified, then the status of each pool in the system is displayed. For more
-information on pool and device health, see the
-.Qq Sx Device Failure and Recovery
-section.
-.Pp
-When given an interval, the output is printed every
-.Ar interval
-seconds until
-.Sy Ctrl-C
-is pressed. If
-.Ar count
-is specified, the command exits after
-.Ar count
-reports are printed.
-.Pp
-If a scrub or resilver is in progress, this command reports the percentage
-done and the estimated time to completion. Both of these are only approximate,
-because the amount of data in the pool and the other workloads on the system
-can change.
-.Bl -tag -width indent
-.It Fl D
-Display a histogram of deduplication statistics, showing the allocated
-.Pq physically present on disk
-and referenced
-.Pq logically referenced in the pool
-block counts and sizes by reference count.
-.It Fl g
-Display vdev GUIDs instead of the normal device names.
-These GUIDs can be used in place of device names for the zpool
-detach/offline/remove/replace commands.
-.It Fl L
-Display real paths for vdevs resolving all symbolic links.
-This can be used to look up the current block device name regardless of the
-.Pa /dev/disk/
-path used to open it.
-.It Fl P
-Display full paths for vdevs instead of only the last component of
-the path.
-This can be used in conjunction with the
-.Fl L
-flag.
-.It Fl T Cm d Ns | Ns Cm u
-Print a timestamp.
-.Pp
-Use modifier
-.Cm d
-for standard date format. See
-.Xr date 1 .
-Use modifier
-.Cm u
-for unixtime
-.Pq equals Qq Ic date +%s .
-.It Fl v
-Displays verbose data error information, printing out a complete list of all
-data errors since the last complete pool scrub.
-.It Fl x
-Only display status for pools that are exhibiting errors or are otherwise
-unavailable.
-Warnings about pools not using the latest on-disk format, having non-native
-block size or disabled features will not be included.
-.El
-.It Xo
-.Nm
-.Cm sync
-.Oo Ar pool Oc Ns ...
-.Xc
-Forces all in-core dirty data to be written to the primary pool storage and
-not the ZIL.
-It will also update administrative information including quota reporting.
-Without arguments,
-.Nm zpool Cm sync
-will sync all pools on the system.
-Otherwise, it will only sync the specified
-.Ar pool .
-.It Xo
-.Nm
-.Cm upgrade
-.Op Fl v
-.Xc
-.Pp
-Displays pools which do not have all supported features enabled and pools
-formatted using a legacy
-.Tn ZFS
-version number.
-These pools can continue to be used, but some features may not be available.
-Use
-.Nm Cm upgrade Fl a
-to enable all features on all pools.
-.Bl -tag -width indent
-.It Fl v
-Displays legacy
-.Tn ZFS
-versions supported by the current software.
-See
-.Xr zpool-features 7
-for a description of feature flags features supported by the current software.
-.El
-.It Xo
-.Nm
-.Cm upgrade
-.Op Fl V Ar version
-.Fl a | Ar pool ...
-.Xc
-.Pp
-Enables all supported features on the given pool.
-Once this is done, the pool will no longer be accessible on systems that do
-not support feature flags.
-See
-.Xr zpool-features 7
-for details on compatibility with systems that support feature flags, but do
-not support all features enabled on the pool.
-.Bl -tag -width indent
-.It Fl a
-Enables all supported features on all pools.
-.It Fl V Ar version
-Upgrade to the specified legacy version. If the
-.Fl V
-flag is specified, no features will be enabled on the pool.
-This option can only be used to increase version number up to the last
-supported legacy version number.
-.El
-.El
-.Sh EXIT STATUS
-The following exit values are returned:
-.Bl -tag -offset 2n -width 2n
-.It 0
-Successful completion.
-.It 1
-An error occurred.
-.It 2
-Invalid command line options were specified.
-.El
-.Sh ENVIRONMENT VARIABLES
-.Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS"
-.It Ev ZPOOL_VDEV_NAME_GUID
-Cause
-.Nm zpool
-subcommands to output vdev guids by default.
-This behavior is identical to the
-.Nm zpool status -g
-command line option.
-.It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS
-Cause
-.Nm zpool
-subcommands to follow links for vdev names by default.
-This behavior is identical to the
-.Nm zpool status -L
-command line option.
-.It Ev ZPOOL_VDEV_NAME_PATH
-Cause
-.Nm zpool
-subcommands to output full vdev path names by default.
-This behavior is identical to the
-.Nm zpool status -P
-command line option.
-.El
-.Sh EXAMPLES
-.Bl -tag -width 0n
-.It Sy Example 1 No Creating a RAID-Z Storage Pool
-.Pp
-The following command creates a pool with a single
-.No raidz
-root
-.No vdev
-that consists of six disks.
-.Bd -literal -offset 2n
-.Li # Ic zpool create tank raidz da0 da1 da2 da3 da4 da5
-.Ed
-.It Sy Example 2 No Creating a Mirrored Storage Pool
-.Pp
-The following command creates a pool with two mirrors, where each mirror
-contains two disks.
-.Bd -literal -offset 2n
-.Li # Ic zpool create tank mirror da0 da1 mirror da2 da3
-.Ed
-.It Sy Example 3 No Creating a Tn ZFS No Storage Pool by Using Partitions
-.Pp
-The following command creates an unmirrored pool using two GPT partitions.
-.Bd -literal -offset 2n
-.Li # Ic zpool create tank da0p3 da1p3
-.Ed
-.It Sy Example 4 No Creating a Tn ZFS No Storage Pool by Using Files
-.Pp
-The following command creates an unmirrored pool using files. While not
-recommended, a pool based on files can be useful for experimental purposes.
-.Bd -literal -offset 2n
-.Li # Ic zpool create tank /path/to/file/a /path/to/file/b
-.Ed
-.It Sy Example 5 No Adding a Mirror to a Tn ZFS No Storage Pool
-.Pp
-The following command adds two mirrored disks to the pool
-.Em tank ,
-assuming the pool is already made up of two-way mirrors. The additional space
-is immediately available to any datasets within the pool.
-.Bd -literal -offset 2n
-.Li # Ic zpool add tank mirror da2 da3
-.Ed
-.It Sy Example 6 No Listing Available Tn ZFS No Storage Pools
-.Pp
-The following command lists all available pools on the system.
-.Bd -literal -offset 2n
-.Li # Ic zpool list
-NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
-pool 2.70T 473G 2.24T 33% - 17% 1.00x ONLINE -
-test 1.98G 89.5K 1.98G 48% - 0% 1.00x ONLINE -
-.Ed
-.It Sy Example 7 No Listing All Properties for a Pool
-.Pp
-The following command lists all the properties for a pool.
-.Bd -literal -offset 2n
-.Li # Ic zpool get all pool
-pool size 2.70T -
-pool capacity 17% -
-pool altroot - default
-pool health ONLINE -
-pool guid 2501120270416322443 default
-pool version 28 default
-pool bootfs pool/root local
-pool delegation on default
-pool autoreplace off default
-pool cachefile - default
-pool failmode wait default
-pool listsnapshots off default
-pool autoexpand off default
-pool dedupditto 0 default
-pool dedupratio 1.00x -
-pool free 2.24T -
-pool allocated 473G -
-pool readonly off -
-.Ed
-.It Sy Example 8 No Destroying a Tn ZFS No Storage Pool
-.Pp
-The following command destroys the pool
-.Qq Em tank
-and any datasets contained within.
-.Bd -literal -offset 2n
-.Li # Ic zpool destroy -f tank
-.Ed
-.It Sy Example 9 No Exporting a Tn ZFS No Storage Pool
-.Pp
-The following command exports the devices in pool
-.Em tank
-so that they can be relocated or later imported.
-.Bd -literal -offset 2n
-.Li # Ic zpool export tank
-.Ed
-.It Sy Example 10 No Importing a Tn ZFS No Storage Pool
-.Pp
-The following command displays available pools, and then imports the pool
-.Qq Em tank
-for use on the system.
-.Pp
-The results from this command are similar to the following:
-.Bd -literal -offset 2n
-.Li # Ic zpool import
-
- pool: tank
- id: 15451357997522795478
- state: ONLINE
-action: The pool can be imported using its name or numeric identifier.
-config:
-
- tank ONLINE
- mirror ONLINE
- da0 ONLINE
- da1 ONLINE
-.Ed
-.It Xo
-.Sy Example 11
-Upgrading All
-.Tn ZFS
-Storage Pools to the Current Version
-.Xc
-.Pp
-The following command upgrades all
-.Tn ZFS
-Storage pools to the current version of
-the software.
-.Bd -literal -offset 2n
-.Li # Ic zpool upgrade -a
-This system is currently running ZFS pool version 28.
-.Ed
-.It Sy Example 12 No Managing Hot Spares
-.Pp
-The following command creates a new pool with an available hot spare:
-.Bd -literal -offset 2n
-.Li # Ic zpool create tank mirror da0 da1 spare da2
-.Ed
-.Pp
-If one of the disks were to fail, the pool would be reduced to the degraded
-state. The failed device can be replaced using the following command:
-.Bd -literal -offset 2n
-.Li # Ic zpool replace tank da0 da2
-.Ed
-.Pp
-Once the data has been resilvered, the spare is automatically removed and is
-made available should another device fails. The hot spare can be permanently
-removed from the pool using the following command:
-.Bd -literal -offset 2n
-.Li # Ic zpool remove tank da2
-.Ed
-.It Xo
-.Sy Example 13
-Creating a
-.Tn ZFS
-Pool with Mirrored Separate Intent Logs
-.Xc
-.Pp
-The following command creates a
-.Tn ZFS
-storage pool consisting of two, two-way
-mirrors and mirrored log devices:
-.Bd -literal -offset 2n
-.Li # Ic zpool create pool mirror da0 da1 mirror da2 da3 log mirror da4 da5
-.Ed
-.It Sy Example 14 No Adding Cache Devices to a Tn ZFS No Pool
-.Pp
-The following command adds two disks for use as cache devices to a
-.Tn ZFS
-storage pool:
-.Bd -literal -offset 2n
-.Li # Ic zpool add pool cache da2 da3
-.Ed
-.Pp
-Once added, the cache devices gradually fill with content from main memory.
-Depending on the size of your cache devices, it could take over an hour for
-them to fill. Capacity and reads can be monitored using the
-.Cm iostat
-subcommand as follows:
-.Bd -literal -offset 2n
-.Li # Ic zpool iostat -v pool 5
-.Ed
-.It Xo
-.Sy Example 15
-Displaying expanded space on a device
-.Xc
-.Pp
-The following command dipslays the detailed information for the
-.Em data
-pool.
-This pool is comprised of a single
-.Em raidz
-vdev where one of its
-devices increased its capacity by 10GB.
-In this example, the pool will not
-be able to utilized this extra capacity until all the devices under the
-.Em raidz
-vdev have been expanded.
-.Bd -literal -offset 2n
-.Li # Ic zpool list -v data
-NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
-data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE -
- raidz1 23.9G 14.6G 9.30G 48% -
- ada0 - - - - -
- ada1 - - - - 10G
- ada2 - - - - -
-.Ed
-.It Xo
-.Sy Example 16
-Removing a Mirrored top-level (Log or Data) Device
-.Xc
-.Pp
-The following commands remove the mirrored log device
-.Sy mirror-2
-and mirrored top-level data device
-.Sy mirror-1 .
-.Pp
-Given this configuration:
-.Bd -literal -offset 2n
- pool: tank
- state: ONLINE
- scrub: none requested
- config:
-
- NAME STATE READ WRITE CKSUM
- tank ONLINE 0 0 0
- mirror-0 ONLINE 0 0 0
- da0 ONLINE 0 0 0
- da1 ONLINE 0 0 0
- mirror-1 ONLINE 0 0 0
- da2 ONLINE 0 0 0
- da3 ONLINE 0 0 0
- logs
- mirror-2 ONLINE 0 0 0
- da4 ONLINE 0 0 0
- da5 ONLINE 0 0 0
-.Ed
-.Pp
-The command to remove the mirrored log
-.Em mirror-2
-is:
-.Bd -literal -offset 2n
-.Li # Ic zpool remove tank mirror-2
-.Ed
-.Pp
-The command to remove the mirrored data
-.Em mirror-1
-is:
-.Bd -literal -offset 2n
-.Li # Ic zpool remove tank mirror-1
-.Ed
-.It Xo
-.Sy Example 17
-Recovering a Faulted
-.Tn ZFS
-Pool
-.Xc
-.Pp
-If a pool is faulted but recoverable, a message indicating this state is
-provided by
-.Qq Nm Cm status
-if the pool was cached (see the
-.Fl c Ar cachefile
-argument above), or as part of the error output from a failed
-.Qq Nm Cm import
-of the pool.
-.Pp
-Recover a cached pool with the
-.Qq Nm Cm clear
-command:
-.Bd -literal -offset 2n
-.Li # Ic zpool clear -F data
-Pool data returned to its state as of Tue Sep 08 13:23:35 2009.
-Discarded approximately 29 seconds of transactions.
-.Ed
-.Pp
-If the pool configuration was not cached, use
-.Qq Nm Cm import
-with the recovery mode flag:
-.Bd -literal -offset 2n
-.Li # Ic zpool import -F data
-Pool data returned to its state as of Tue Sep 08 13:23:35 2009.
-Discarded approximately 29 seconds of transactions.
-.Ed
-.El
-.Sh SEE ALSO
-.Xr zpool-features 7 ,
-.Xr zfs 8 ,
-.Xr zfsd 8
-.Sh HISTORY
-The
-.Nm
-utility first appeared in
-.Fx 7.0 .
-.Sh AUTHORS
-This manual page is a
-.Xr mdoc 7
-reimplementation of the
-.Tn OpenSolaris
-manual page
-.Em zpool(1M) ,
-modified and customized for
-.Fx
-and licensed under the Common Development and Distribution License
-.Pq Tn CDDL .
-.Pp
-The
-.Xr mdoc 7
-implementation of this manual page was initially written by
-.An Martin Matuska Aq mm@FreeBSD.org .
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool_iter.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool_iter.c
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool_iter.c
@@ -1,255 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
- */
-
-#include <solaris.h>
-#include <libintl.h>
-#include <libuutil.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-
-#include <libzfs.h>
-
-#include "zpool_util.h"
-
-/*
- * Private interface for iterating over pools specified on the command line.
- * Most consumers will call for_each_pool, but in order to support iostat, we
- * allow fined grained control through the zpool_list_t interface.
- */
-
-typedef struct zpool_node {
- zpool_handle_t *zn_handle;
- uu_avl_node_t zn_avlnode;
- int zn_mark;
-} zpool_node_t;
-
-struct zpool_list {
- boolean_t zl_findall;
- uu_avl_t *zl_avl;
- uu_avl_pool_t *zl_pool;
- zprop_list_t **zl_proplist;
-};
-
-/* ARGSUSED */
-static int
-zpool_compare(const void *larg, const void *rarg, void *unused)
-{
- zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle;
- zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle;
- const char *lname = zpool_get_name(l);
- const char *rname = zpool_get_name(r);
-
- return (strcmp(lname, rname));
-}
-
-/*
- * Callback function for pool_list_get(). Adds the given pool to the AVL tree
- * of known pools.
- */
-static int
-add_pool(zpool_handle_t *zhp, void *data)
-{
- zpool_list_t *zlp = data;
- zpool_node_t *node = safe_malloc(sizeof (zpool_node_t));
- uu_avl_index_t idx;
-
- node->zn_handle = zhp;
- uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool);
- if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) {
- if (zlp->zl_proplist &&
- zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) {
- zpool_close(zhp);
- free(node);
- return (-1);
- }
- uu_avl_insert(zlp->zl_avl, node, idx);
- } else {
- zpool_close(zhp);
- free(node);
- return (-1);
- }
-
- return (0);
-}
-
-/*
- * Create a list of pools based on the given arguments. If we're given no
- * arguments, then iterate over all pools in the system and add them to the AVL
- * tree. Otherwise, add only those pool explicitly specified on the command
- * line.
- */
-zpool_list_t *
-pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err)
-{
- zpool_list_t *zlp;
-
- zlp = safe_malloc(sizeof (zpool_list_t));
-
- zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t),
- offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT);
-
- if (zlp->zl_pool == NULL)
- zpool_no_memory();
-
- if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL,
- UU_DEFAULT)) == NULL)
- zpool_no_memory();
-
- zlp->zl_proplist = proplist;
-
- if (argc == 0) {
- (void) zpool_iter(g_zfs, add_pool, zlp);
- zlp->zl_findall = B_TRUE;
- } else {
- int i;
-
- for (i = 0; i < argc; i++) {
- zpool_handle_t *zhp;
-
- if ((zhp = zpool_open_canfail(g_zfs, argv[i])) !=
- NULL) {
- if (add_pool(zhp, zlp) != 0)
- *err = B_TRUE;
- } else {
- *err = B_TRUE;
- }
- }
- }
-
- return (zlp);
-}
-
-/*
- * Search for any new pools, adding them to the list. We only add pools when no
- * options were given on the command line. Otherwise, we keep the list fixed as
- * those that were explicitly specified.
- */
-void
-pool_list_update(zpool_list_t *zlp)
-{
- if (zlp->zl_findall)
- (void) zpool_iter(g_zfs, add_pool, zlp);
-}
-
-/*
- * Iterate over all pools in the list, executing the callback for each
- */
-int
-pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,
- void *data)
-{
- zpool_node_t *node, *next_node;
- int ret = 0;
-
- for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) {
- next_node = uu_avl_next(zlp->zl_avl, node);
- if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL ||
- unavail)
- ret |= func(node->zn_handle, data);
- }
-
- return (ret);
-}
-
-/*
- * Remove the given pool from the list. When running iostat, we want to remove
- * those pools that no longer exist.
- */
-void
-pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp)
-{
- zpool_node_t search, *node;
-
- search.zn_handle = zhp;
- if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) {
- uu_avl_remove(zlp->zl_avl, node);
- zpool_close(node->zn_handle);
- free(node);
- }
-}
-
-/*
- * Free all the handles associated with this list.
- */
-void
-pool_list_free(zpool_list_t *zlp)
-{
- uu_avl_walk_t *walk;
- zpool_node_t *node;
-
- if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) {
- (void) fprintf(stderr,
- gettext("internal error: out of memory"));
- exit(1);
- }
-
- while ((node = uu_avl_walk_next(walk)) != NULL) {
- uu_avl_remove(zlp->zl_avl, node);
- zpool_close(node->zn_handle);
- free(node);
- }
-
- uu_avl_walk_end(walk);
- uu_avl_destroy(zlp->zl_avl);
- uu_avl_pool_destroy(zlp->zl_pool);
-
- free(zlp);
-}
-
-/*
- * Returns the number of elements in the pool list.
- */
-int
-pool_list_count(zpool_list_t *zlp)
-{
- return (uu_avl_numnodes(zlp->zl_avl));
-}
-
-/*
- * High level function which iterates over all pools given on the command line,
- * using the pool_list_* interfaces.
- */
-int
-for_each_pool(int argc, char **argv, boolean_t unavail,
- zprop_list_t **proplist, zpool_iter_f func, void *data)
-{
- zpool_list_t *list;
- int ret = 0;
-
- if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL)
- return (1);
-
- if (pool_list_iter(list, unavail, func, data) != 0)
- ret = 1;
-
- pool_list_free(list);
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
@@ -1,6742 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
- * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2016 Nexenta Systems, Inc.
- * Copyright (c) 2017 Datto Inc.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#include <solaris.h>
-#include <assert.h>
-#include <ctype.h>
-#include <dirent.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <getopt.h>
-#include <libgen.h>
-#include <libintl.h>
-#include <libuutil.h>
-#include <locale.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <unistd.h>
-#include <priv.h>
-#include <pwd.h>
-#include <zone.h>
-#include <sys/time.h>
-#include <zfs_prop.h>
-#include <sys/fs/zfs.h>
-#include <sys/stat.h>
-#include <sys/debug.h>
-
-#include <libzfs.h>
-
-#include "zpool_util.h"
-#include "zfs_comutil.h"
-#include "zfeature_common.h"
-
-#include "statcommon.h"
-
-libzfs_handle_t *g_zfs;
-
-static int zpool_do_create(int, char **);
-static int zpool_do_destroy(int, char **);
-
-static int zpool_do_add(int, char **);
-static int zpool_do_remove(int, char **);
-static int zpool_do_labelclear(int, char **);
-
-static int zpool_do_checkpoint(int, char **);
-
-static int zpool_do_list(int, char **);
-static int zpool_do_iostat(int, char **);
-static int zpool_do_status(int, char **);
-
-static int zpool_do_online(int, char **);
-static int zpool_do_offline(int, char **);
-static int zpool_do_clear(int, char **);
-static int zpool_do_reopen(int, char **);
-
-static int zpool_do_reguid(int, char **);
-
-static int zpool_do_attach(int, char **);
-static int zpool_do_detach(int, char **);
-static int zpool_do_replace(int, char **);
-static int zpool_do_split(int, char **);
-
-static int zpool_do_initialize(int, char **);
-static int zpool_do_scrub(int, char **);
-
-static int zpool_do_import(int, char **);
-static int zpool_do_export(int, char **);
-
-static int zpool_do_upgrade(int, char **);
-
-static int zpool_do_history(int, char **);
-
-static int zpool_do_get(int, char **);
-static int zpool_do_set(int, char **);
-
-static int zpool_do_sync(int, char **);
-
-/*
- * These libumem hooks provide a reasonable set of defaults for the allocator's
- * debugging facilities.
- */
-
-#ifdef DEBUG
-const char *
-_umem_debug_init(void)
-{
- return ("default,verbose"); /* $UMEM_DEBUG setting */
-}
-
-const char *
-_umem_logging_init(void)
-{
- return ("fail,contents"); /* $UMEM_LOGGING setting */
-}
-#endif
-
-typedef enum {
- HELP_ADD,
- HELP_ATTACH,
- HELP_CLEAR,
- HELP_CREATE,
- HELP_CHECKPOINT,
- HELP_DESTROY,
- HELP_DETACH,
- HELP_EXPORT,
- HELP_HISTORY,
- HELP_IMPORT,
- HELP_IOSTAT,
- HELP_LABELCLEAR,
- HELP_LIST,
- HELP_OFFLINE,
- HELP_ONLINE,
- HELP_REPLACE,
- HELP_REMOVE,
- HELP_INITIALIZE,
- HELP_SCRUB,
- HELP_STATUS,
- HELP_UPGRADE,
- HELP_GET,
- HELP_SET,
- HELP_SPLIT,
- HELP_SYNC,
- HELP_REGUID,
- HELP_REOPEN
-} zpool_help_t;
-
-
-typedef struct zpool_command {
- const char *name;
- int (*func)(int, char **);
- zpool_help_t usage;
-} zpool_command_t;
-
-/*
- * Master command table. Each ZFS command has a name, associated function, and
- * usage message. The usage messages need to be internationalized, so we have
- * to have a function to return the usage message based on a command index.
- *
- * These commands are organized according to how they are displayed in the usage
- * message. An empty command (one with a NULL name) indicates an empty line in
- * the generic usage message.
- */
-static zpool_command_t command_table[] = {
- { "create", zpool_do_create, HELP_CREATE },
- { "destroy", zpool_do_destroy, HELP_DESTROY },
- { NULL },
- { "add", zpool_do_add, HELP_ADD },
- { "remove", zpool_do_remove, HELP_REMOVE },
- { NULL },
- { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR },
- { NULL },
- { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT },
- { NULL },
- { "list", zpool_do_list, HELP_LIST },
- { "iostat", zpool_do_iostat, HELP_IOSTAT },
- { "status", zpool_do_status, HELP_STATUS },
- { NULL },
- { "online", zpool_do_online, HELP_ONLINE },
- { "offline", zpool_do_offline, HELP_OFFLINE },
- { "clear", zpool_do_clear, HELP_CLEAR },
- { "reopen", zpool_do_reopen, HELP_REOPEN },
- { NULL },
- { "attach", zpool_do_attach, HELP_ATTACH },
- { "detach", zpool_do_detach, HELP_DETACH },
- { "replace", zpool_do_replace, HELP_REPLACE },
- { "split", zpool_do_split, HELP_SPLIT },
- { NULL },
- { "initialize", zpool_do_initialize, HELP_INITIALIZE },
- { "scrub", zpool_do_scrub, HELP_SCRUB },
- { NULL },
- { "import", zpool_do_import, HELP_IMPORT },
- { "export", zpool_do_export, HELP_EXPORT },
- { "upgrade", zpool_do_upgrade, HELP_UPGRADE },
- { "reguid", zpool_do_reguid, HELP_REGUID },
- { NULL },
- { "history", zpool_do_history, HELP_HISTORY },
- { "get", zpool_do_get, HELP_GET },
- { "set", zpool_do_set, HELP_SET },
- { "sync", zpool_do_sync, HELP_SYNC },
-};
-
-#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
-
-#define VDEV_ALLOC_CLASS_LOGS "logs"
-
-static zpool_command_t *current_command;
-static char history_str[HIS_MAX_RECORD_LEN];
-static boolean_t log_history = B_TRUE;
-static uint_t timestamp_fmt = NODATE;
-
-static const char *
-get_usage(zpool_help_t idx)
-{
- switch (idx) {
- case HELP_ADD:
- return (gettext("\tadd [-fgLnP] <pool> <vdev> ...\n"));
- case HELP_ATTACH:
- return (gettext("\tattach [-f] <pool> <device> "
- "<new-device>\n"));
- case HELP_CLEAR:
- return (gettext("\tclear [-nF] <pool> [device]\n"));
- case HELP_CREATE:
- return (gettext("\tcreate [-fnd] [-B] "
- "[-o property=value] ... \n"
- "\t [-O file-system-property=value] ...\n"
- "\t [-m mountpoint] [-R root] [-t tempname] "
- "<pool> <vdev> ...\n"));
- case HELP_CHECKPOINT:
- return (gettext("\tcheckpoint [--discard] <pool> ...\n"));
- case HELP_DESTROY:
- return (gettext("\tdestroy [-f] <pool>\n"));
- case HELP_DETACH:
- return (gettext("\tdetach <pool> <device>\n"));
- case HELP_EXPORT:
- return (gettext("\texport [-f] <pool> ...\n"));
- case HELP_HISTORY:
- return (gettext("\thistory [-il] [<pool>] ...\n"));
- case HELP_IMPORT:
- return (gettext("\timport [-d dir] [-D]\n"
- "\timport [-o mntopts] [-o property=value] ... \n"
- "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
- "[-R root] [-F [-n]] -a\n"
- "\timport [-o mntopts] [-o property=value] ... \n"
- "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
- "[-R root] [-F [-n]] [-t]\n"
- "\t [--rewind-to-checkpoint] <pool | id> [newpool]\n"));
- case HELP_IOSTAT:
- return (gettext("\tiostat [-gLPv] [-T d|u] [pool] ... "
- "[interval [count]]\n"));
- case HELP_LABELCLEAR:
- return (gettext("\tlabelclear [-f] <vdev>\n"));
- case HELP_LIST:
- return (gettext("\tlist [-gHLpPv] [-o property[,...]] "
- "[-T d|u] [pool] ... [interval [count]]\n"));
- case HELP_OFFLINE:
- return (gettext("\toffline [-t] <pool> <device> ...\n"));
- case HELP_ONLINE:
- return (gettext("\tonline [-e] <pool> <device> ...\n"));
- case HELP_REPLACE:
- return (gettext("\treplace [-f] <pool> <device> "
- "[new-device]\n"));
- case HELP_REMOVE:
- return (gettext("\tremove [-nps] <pool> <device> ...\n"));
- case HELP_REOPEN:
- return (gettext("\treopen <pool>\n"));
- case HELP_INITIALIZE:
- return (gettext("\tinitialize [-cs] <pool> [<device> ...]\n"));
- case HELP_SCRUB:
- return (gettext("\tscrub [-s | -p] <pool> ...\n"));
- case HELP_STATUS:
- return (gettext("\tstatus [-DgLPvx] [-T d|u] [pool] ... "
- "[interval [count]]\n"));
- case HELP_UPGRADE:
- return (gettext("\tupgrade [-v]\n"
- "\tupgrade [-V version] <-a | pool ...>\n"));
- case HELP_GET:
- return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] "
- "<\"all\" | property[,...]> <pool> ...\n"));
- case HELP_SET:
- return (gettext("\tset <property=value> <pool> \n"));
- case HELP_SPLIT:
- return (gettext("\tsplit [-gLnP] [-R altroot] [-o mntopts]\n"
- "\t [-o property=value] <pool> <newpool> "
- "[<device> ...]\n"));
- case HELP_REGUID:
- return (gettext("\treguid <pool>\n"));
- case HELP_SYNC:
- return (gettext("\tsync [pool] ...\n"));
- }
-
- abort();
- /* NOTREACHED */
-}
-
-
-/*
- * Callback routine that will print out a pool property value.
- */
-static int
-print_prop_cb(int prop, void *cb)
-{
- FILE *fp = cb;
-
- (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop));
-
- if (zpool_prop_readonly(prop))
- (void) fprintf(fp, " NO ");
- else
- (void) fprintf(fp, " YES ");
-
- if (zpool_prop_values(prop) == NULL)
- (void) fprintf(fp, "-\n");
- else
- (void) fprintf(fp, "%s\n", zpool_prop_values(prop));
-
- return (ZPROP_CONT);
-}
-
-/*
- * Display usage message. If we're inside a command, display only the usage for
- * that command. Otherwise, iterate over the entire command table and display
- * a complete usage message.
- */
-void
-usage(boolean_t requested)
-{
- FILE *fp = requested ? stdout : stderr;
-
- if (current_command == NULL) {
- int i;
-
- (void) fprintf(fp, gettext("usage: zpool command args ...\n"));
- (void) fprintf(fp,
- gettext("where 'command' is one of the following:\n\n"));
-
- for (i = 0; i < NCOMMAND; i++) {
- if (command_table[i].name == NULL)
- (void) fprintf(fp, "\n");
- else
- (void) fprintf(fp, "%s",
- get_usage(command_table[i].usage));
- }
- } else {
- (void) fprintf(fp, gettext("usage:\n"));
- (void) fprintf(fp, "%s", get_usage(current_command->usage));
- }
-
- if (current_command != NULL &&
- ((strcmp(current_command->name, "set") == 0) ||
- (strcmp(current_command->name, "get") == 0) ||
- (strcmp(current_command->name, "list") == 0))) {
-
- (void) fprintf(fp,
- gettext("\nthe following properties are supported:\n"));
-
- (void) fprintf(fp, "\n\t%-19s %s %s\n\n",
- "PROPERTY", "EDIT", "VALUES");
-
- /* Iterate over all properties */
- (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE,
- ZFS_TYPE_POOL);
-
- (void) fprintf(fp, "\t%-19s ", "feature@...");
- (void) fprintf(fp, "YES disabled | enabled | active\n");
-
- (void) fprintf(fp, gettext("\nThe feature@ properties must be "
- "appended with a feature name.\nSee zpool-features(7).\n"));
- }
-
- /*
- * See comments at end of main().
- */
- if (getenv("ZFS_ABORT") != NULL) {
- (void) printf("dumping core by request\n");
- abort();
- }
-
- exit(requested ? 0 : 2);
-}
-
-/*
- * print a pool vdev config for dry runs
- */
-static void
-print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
- const char *match, int name_flags)
-{
- nvlist_t **child;
- uint_t c, children;
- char *vname;
- boolean_t printed = B_FALSE;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
- if (name != NULL)
- (void) printf("\t%*s%s\n", indent, "", name);
- return;
- }
-
- for (c = 0; c < children; c++) {
- uint64_t is_log = B_FALSE;
- char *class = "";
-
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
- if (is_log)
- class = VDEV_ALLOC_BIAS_LOG;
- (void) nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_ALLOCATION_BIAS, &class);
- if (strcmp(match, class) != 0)
- continue;
-
- if (!printed && name != NULL) {
- (void) printf("\t%*s%s\n", indent, "", name);
- printed = B_TRUE;
- }
- vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags);
- print_vdev_tree(zhp, vname, child[c], indent + 2, "",
- name_flags);
- free(vname);
- }
-}
-
-static boolean_t
-prop_list_contains_feature(nvlist_t *proplist)
-{
- nvpair_t *nvp;
- for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp;
- nvp = nvlist_next_nvpair(proplist, nvp)) {
- if (zpool_prop_feature(nvpair_name(nvp)))
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Add a property pair (name, string-value) into a property nvlist.
- */
-static int
-add_prop_list(const char *propname, char *propval, nvlist_t **props,
- boolean_t poolprop)
-{
- zpool_prop_t prop = ZPROP_INVAL;
- zfs_prop_t fprop;
- nvlist_t *proplist;
- const char *normnm;
- char *strval;
-
- if (*props == NULL &&
- nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) {
- (void) fprintf(stderr,
- gettext("internal error: out of memory\n"));
- return (1);
- }
-
- proplist = *props;
-
- if (poolprop) {
- const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);
-
- if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL &&
- !zpool_prop_feature(propname)) {
- (void) fprintf(stderr, gettext("property '%s' is "
- "not a valid pool property\n"), propname);
- return (2);
- }
-
- /*
- * feature@ properties and version should not be specified
- * at the same time.
- */
- if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) &&
- nvlist_exists(proplist, vname)) ||
- (prop == ZPOOL_PROP_VERSION &&
- prop_list_contains_feature(proplist))) {
- (void) fprintf(stderr, gettext("'feature@' and "
- "'version' properties cannot be specified "
- "together\n"));
- return (2);
- }
-
-
- if (zpool_prop_feature(propname))
- normnm = propname;
- else
- normnm = zpool_prop_to_name(prop);
- } else {
- if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
- normnm = zfs_prop_to_name(fprop);
- } else {
- normnm = propname;
- }
- }
-
- if (nvlist_lookup_string(proplist, normnm, &strval) == 0 &&
- prop != ZPOOL_PROP_CACHEFILE) {
- (void) fprintf(stderr, gettext("property '%s' "
- "specified multiple times\n"), propname);
- return (2);
- }
-
- if (nvlist_add_string(proplist, normnm, propval) != 0) {
- (void) fprintf(stderr, gettext("internal "
- "error: out of memory\n"));
- return (1);
- }
-
- return (0);
-}
-
-/*
- * Set a default property pair (name, string-value) in a property nvlist
- */
-static int
-add_prop_list_default(const char *propname, char *propval, nvlist_t **props,
- boolean_t poolprop)
-{
- char *pval;
-
- if (nvlist_lookup_string(*props, propname, &pval) == 0)
- return (0);
-
- return (add_prop_list(propname, propval, props, poolprop));
-}
-
-/*
- * zpool add [-fgLnP] [-o property=value] <pool> <vdev> ...
- *
- * -f Force addition of devices, even if they appear in use
- * -g Display guid for individual vdev name.
- * -L Follow links when resolving vdev path name.
- * -n Do not add the devices, but display the resulting layout if
- * they were to be added.
- * -P Display full path for vdev name.
- *
- * Adds the given vdevs to 'pool'. As with create, the bulk of this work is
- * handled by get_vdev_spec(), which constructs the nvlist needed to pass to
- * libzfs.
- */
-int
-zpool_do_add(int argc, char **argv)
-{
- boolean_t force = B_FALSE;
- boolean_t dryrun = B_FALSE;
- int name_flags = 0;
- int c;
- nvlist_t *nvroot;
- char *poolname;
- zpool_boot_label_t boot_type;
- uint64_t boot_size;
- int ret;
- zpool_handle_t *zhp;
- nvlist_t *config;
-
- /* check options */
- while ((c = getopt(argc, argv, "fgLnP")) != -1) {
- switch (c) {
- case 'f':
- force = B_TRUE;
- break;
- case 'g':
- name_flags |= VDEV_NAME_GUID;
- break;
- case 'L':
- name_flags |= VDEV_NAME_FOLLOW_LINKS;
- break;
- case 'n':
- dryrun = B_TRUE;
- break;
- case 'P':
- name_flags |= VDEV_NAME_PATH;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name argument\n"));
- usage(B_FALSE);
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing vdev specification\n"));
- usage(B_FALSE);
- }
-
- poolname = argv[0];
-
- argc--;
- argv++;
-
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
- return (1);
-
- if ((config = zpool_get_config(zhp, NULL)) == NULL) {
- (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
- poolname);
- zpool_close(zhp);
- return (1);
- }
-
- if (zpool_is_bootable(zhp))
- boot_type = ZPOOL_COPY_BOOT_LABEL;
- else
- boot_type = ZPOOL_NO_BOOT_LABEL;
-
- /* pass off to get_vdev_spec for processing */
- boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL);
- nvroot = make_root_vdev(zhp, force, !force, B_FALSE, dryrun,
- boot_type, boot_size, argc, argv);
- if (nvroot == NULL) {
- zpool_close(zhp);
- return (1);
- }
-
- if (dryrun) {
- nvlist_t *poolnvroot;
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &poolnvroot) == 0);
-
- (void) printf(gettext("would update '%s' to the following "
- "configuration:\n"), zpool_get_name(zhp));
-
- /* print original main pool and new tree */
- print_vdev_tree(zhp, poolname, poolnvroot, 0, "",
- name_flags | VDEV_NAME_TYPE_ID);
- print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags);
-
- /* print other classes: 'dedup', 'special', and 'log' */
- print_vdev_tree(zhp, "dedup", poolnvroot, 0,
- VDEV_ALLOC_BIAS_DEDUP, name_flags);
- print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP,
- name_flags);
-
- print_vdev_tree(zhp, "special", poolnvroot, 0,
- VDEV_ALLOC_BIAS_SPECIAL, name_flags);
- print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL,
- name_flags);
-
- print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG,
- name_flags);
- print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG,
- name_flags);
-
- ret = 0;
- } else {
- ret = (zpool_add(zhp, nvroot) != 0);
- }
-
- nvlist_free(nvroot);
- zpool_close(zhp);
-
- return (ret);
-}
-
-/*
- * zpool remove <pool> <vdev> ...
- *
- * Removes the given vdev from the pool.
- */
-int
-zpool_do_remove(int argc, char **argv)
-{
- char *poolname;
- int i, ret = 0;
- zpool_handle_t *zhp;
- boolean_t stop = B_FALSE;
- boolean_t noop = B_FALSE;
- boolean_t parsable = B_FALSE;
- char c;
-
- /* check options */
- while ((c = getopt(argc, argv, "nps")) != -1) {
- switch (c) {
- case 'n':
- noop = B_TRUE;
- break;
- case 'p':
- parsable = B_TRUE;
- break;
- case 's':
- stop = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name argument\n"));
- usage(B_FALSE);
- }
-
- poolname = argv[0];
-
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
- return (1);
-
- if (stop && noop) {
- (void) fprintf(stderr, gettext("stop request ignored\n"));
- return (0);
- }
-
- if (stop) {
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
- if (zpool_vdev_remove_cancel(zhp) != 0)
- ret = 1;
- } else {
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing device\n"));
- usage(B_FALSE);
- }
-
- for (i = 1; i < argc; i++) {
- if (noop) {
- uint64_t size;
-
- if (zpool_vdev_indirect_size(zhp, argv[i],
- &size) != 0) {
- ret = 1;
- break;
- }
- if (parsable) {
- (void) printf("%s %llu\n",
- argv[i], size);
- } else {
- char valstr[32];
- zfs_nicenum(size, valstr,
- sizeof (valstr));
- (void) printf("Memory that will be "
- "used after removing %s: %s\n",
- argv[i], valstr);
- }
- } else {
- if (zpool_vdev_remove(zhp, argv[i]) != 0)
- ret = 1;
- }
- }
- }
-
- return (ret);
-}
-
-/*
- * zpool labelclear [-f] <vdev>
- *
- * -f Force clearing the label for the vdevs which are members of
- * the exported or foreign pools.
- *
- * Verifies that the vdev is not active and zeros out the label information
- * on the device.
- */
-int
-zpool_do_labelclear(int argc, char **argv)
-{
- char vdev[MAXPATHLEN];
- char *name = NULL;
- struct stat st;
- int c, fd, ret = 0;
- nvlist_t *config;
- pool_state_t state;
- boolean_t inuse = B_FALSE;
- boolean_t force = B_FALSE;
-
- /* check options */
- while ((c = getopt(argc, argv, "f")) != -1) {
- switch (c) {
- case 'f':
- force = B_TRUE;
- break;
- default:
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get vdev name */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing vdev name\n"));
- usage(B_FALSE);
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- /*
- * Check if we were given absolute path and use it as is.
- * Otherwise if the provided vdev name doesn't point to a file,
- * try prepending dsk path and appending s0.
- */
- (void) strlcpy(vdev, argv[0], sizeof (vdev));
- if (vdev[0] != '/' && stat(vdev, &st) != 0) {
- char *s;
-
- (void) snprintf(vdev, sizeof (vdev), "%s/%s",
-#ifdef illumos
- ZFS_DISK_ROOT, argv[0]);
- if ((s = strrchr(argv[0], 's')) == NULL ||
- !isdigit(*(s + 1)))
- (void) strlcat(vdev, "s0", sizeof (vdev));
-#else
- "/dev", argv[0]);
-#endif
- if (stat(vdev, &st) != 0) {
- (void) fprintf(stderr, gettext(
- "failed to find device %s, try specifying absolute "
- "path instead\n"), argv[0]);
- return (1);
- }
- }
-
- if ((fd = open(vdev, O_RDWR)) < 0) {
- (void) fprintf(stderr, gettext("failed to open %s: %s\n"),
- vdev, strerror(errno));
- return (1);
- }
-
- if (zpool_read_label(fd, &config) != 0) {
- (void) fprintf(stderr,
- gettext("failed to read label from %s\n"), vdev);
- return (1);
- }
- nvlist_free(config);
-
- ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse);
- if (ret != 0) {
- (void) fprintf(stderr,
- gettext("failed to check state for %s\n"), vdev);
- return (1);
- }
-
- if (!inuse)
- goto wipe_label;
-
- switch (state) {
- default:
- case POOL_STATE_ACTIVE:
- case POOL_STATE_SPARE:
- case POOL_STATE_L2CACHE:
- (void) fprintf(stderr, gettext(
- "%s is a member (%s) of pool \"%s\"\n"),
- vdev, zpool_pool_state_to_name(state), name);
- ret = 1;
- goto errout;
-
- case POOL_STATE_EXPORTED:
- if (force)
- break;
- (void) fprintf(stderr, gettext(
- "use '-f' to override the following error:\n"
- "%s is a member of exported pool \"%s\"\n"),
- vdev, name);
- ret = 1;
- goto errout;
-
- case POOL_STATE_POTENTIALLY_ACTIVE:
- if (force)
- break;
- (void) fprintf(stderr, gettext(
- "use '-f' to override the following error:\n"
- "%s is a member of potentially active pool \"%s\"\n"),
- vdev, name);
- ret = 1;
- goto errout;
-
- case POOL_STATE_DESTROYED:
- /* inuse should never be set for a destroyed pool */
- assert(0);
- break;
- }
-
-wipe_label:
- ret = zpool_clear_label(fd);
- if (ret != 0) {
- (void) fprintf(stderr,
- gettext("failed to clear label for %s\n"), vdev);
- }
-
-errout:
- free(name);
- (void) close(fd);
-
- return (ret);
-}
-
-/*
- * zpool create [-fnd] [-B] [-o property=value] ...
- * [-O file-system-property=value] ...
- * [-R root] [-m mountpoint] [-t tempname] <pool> <dev> ...
- *
- * -B Create boot partition.
- * -f Force creation, even if devices appear in use
- * -n Do not create the pool, but display the resulting layout if it
- * were to be created.
- * -R Create a pool under an alternate root
- * -m Set default mountpoint for the root dataset. By default it's
- * '/<pool>'
- * -t Use the temporary name until the pool is exported.
- * -o Set property=value.
- * -d Don't automatically enable all supported pool features
- * (individual features can be enabled with -o).
- * -O Set fsproperty=value in the pool's root file system
- *
- * Creates the named pool according to the given vdev specification. The
- * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once
- * we get the nvlist back from get_vdev_spec(), we either print out the contents
- * (if '-n' was specified), or pass it to libzfs to do the creation.
- */
-
-#define SYSTEM256 (256 * 1024 * 1024)
-int
-zpool_do_create(int argc, char **argv)
-{
- boolean_t force = B_FALSE;
- boolean_t dryrun = B_FALSE;
- boolean_t enable_all_pool_feat = B_TRUE;
- zpool_boot_label_t boot_type = ZPOOL_NO_BOOT_LABEL;
- uint64_t boot_size = 0;
- int c;
- nvlist_t *nvroot = NULL;
- char *poolname;
- char *tname = NULL;
- int ret = 1;
- char *altroot = NULL;
- char *mountpoint = NULL;
- nvlist_t *fsprops = NULL;
- nvlist_t *props = NULL;
- char *propval;
-
- /* check options */
- while ((c = getopt(argc, argv, ":fndBR:m:o:O:t:")) != -1) {
- switch (c) {
- case 'f':
- force = B_TRUE;
- break;
- case 'n':
- dryrun = B_TRUE;
- break;
- case 'd':
- enable_all_pool_feat = B_FALSE;
- break;
- case 'B':
-#ifdef illumos
- /*
- * We should create the system partition.
- * Also make sure the size is set.
- */
- boot_type = ZPOOL_CREATE_BOOT_LABEL;
- if (boot_size == 0)
- boot_size = SYSTEM256;
- break;
-#else
- (void) fprintf(stderr,
- gettext("option '%c' is not supported\n"),
- optopt);
- goto badusage;
-#endif
- case 'R':
- altroot = optarg;
- if (add_prop_list(zpool_prop_to_name(
- ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
- goto errout;
- if (add_prop_list_default(zpool_prop_to_name(
- ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
- goto errout;
- break;
- case 'm':
- /* Equivalent to -O mountpoint=optarg */
- mountpoint = optarg;
- break;
- case 'o':
- if ((propval = strchr(optarg, '=')) == NULL) {
- (void) fprintf(stderr, gettext("missing "
- "'=' for -o option\n"));
- goto errout;
- }
- *propval = '\0';
- propval++;
-
- if (add_prop_list(optarg, propval, &props, B_TRUE))
- goto errout;
-
- /*
- * Get bootsize value for make_root_vdev().
- */
- if (zpool_name_to_prop(optarg) == ZPOOL_PROP_BOOTSIZE) {
- if (zfs_nicestrtonum(g_zfs, propval,
- &boot_size) < 0 || boot_size == 0) {
- (void) fprintf(stderr,
- gettext("bad boot partition size "
- "'%s': %s\n"), propval,
- libzfs_error_description(g_zfs));
- goto errout;
- }
- }
-
- /*
- * If the user is creating a pool that doesn't support
- * feature flags, don't enable any features.
- */
- if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) {
- char *end;
- u_longlong_t ver;
-
- ver = strtoull(propval, &end, 10);
- if (*end == '\0' &&
- ver < SPA_VERSION_FEATURES) {
- enable_all_pool_feat = B_FALSE;
- }
- }
- if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT)
- altroot = propval;
- break;
- case 'O':
- if ((propval = strchr(optarg, '=')) == NULL) {
- (void) fprintf(stderr, gettext("missing "
- "'=' for -O option\n"));
- goto errout;
- }
- *propval = '\0';
- propval++;
-
- /*
- * Mountpoints are checked and then added later.
- * Uniquely among properties, they can be specified
- * more than once, to avoid conflict with -m.
- */
- if (0 == strcmp(optarg,
- zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) {
- mountpoint = propval;
- } else if (add_prop_list(optarg, propval, &fsprops,
- B_FALSE)) {
- goto errout;
- }
- break;
- case 't':
- /*
- * Sanity check temporary pool name.
- */
- if (strchr(optarg, '/') != NULL) {
- (void) fprintf(stderr, gettext("cannot create "
- "'%s': invalid character '/' in temporary "
- "name\n"), optarg);
- (void) fprintf(stderr, gettext("use 'zfs "
- "create' to create a dataset\n"));
- goto errout;
- }
-
- if (add_prop_list(zpool_prop_to_name(
- ZPOOL_PROP_TNAME), optarg, &props, B_TRUE))
- goto errout;
- if (add_prop_list_default(zpool_prop_to_name(
- ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
- goto errout;
- tname = optarg;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- goto badusage;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- goto badusage;
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name argument\n"));
- goto badusage;
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing vdev specification\n"));
- goto badusage;
- }
-
- poolname = argv[0];
-
- /*
- * As a special case, check for use of '/' in the name, and direct the
- * user to use 'zfs create' instead.
- */
- if (strchr(poolname, '/') != NULL) {
- (void) fprintf(stderr, gettext("cannot create '%s': invalid "
- "character '/' in pool name\n"), poolname);
- (void) fprintf(stderr, gettext("use 'zfs create' to "
- "create a dataset\n"));
- goto errout;
- }
-
- /*
- * Make sure the bootsize is set when ZPOOL_CREATE_BOOT_LABEL is used,
- * and not set otherwise.
- */
- if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
- const char *propname;
- char *strptr, *buf = NULL;
- int rv;
-
- propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE);
- if (nvlist_lookup_string(props, propname, &strptr) != 0) {
- (void) asprintf(&buf, "%" PRIu64, boot_size);
- if (buf == NULL) {
- (void) fprintf(stderr,
- gettext("internal error: out of memory\n"));
- goto errout;
- }
- rv = add_prop_list(propname, buf, &props, B_TRUE);
- free(buf);
- if (rv != 0)
- goto errout;
- }
- } else {
- const char *propname;
- char *strptr;
-
- propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE);
- if (nvlist_lookup_string(props, propname, &strptr) == 0) {
- (void) fprintf(stderr, gettext("error: setting boot "
- "partition size requires option '-B'\n"));
- goto errout;
- }
- }
-
- /* pass off to get_vdev_spec for bulk processing */
- nvroot = make_root_vdev(NULL, force, !force, B_FALSE, dryrun,
- boot_type, boot_size, argc - 1, argv + 1);
- if (nvroot == NULL)
- goto errout;
-
- /* make_root_vdev() allows 0 toplevel children if there are spares */
- if (!zfs_allocatable_devs(nvroot)) {
- (void) fprintf(stderr, gettext("invalid vdev "
- "specification: at least one toplevel vdev must be "
- "specified\n"));
- goto errout;
- }
-
- if (altroot != NULL && altroot[0] != '/') {
- (void) fprintf(stderr, gettext("invalid alternate root '%s': "
- "must be an absolute path\n"), altroot);
- goto errout;
- }
-
- /*
- * Check the validity of the mountpoint and direct the user to use the
- * '-m' mountpoint option if it looks like its in use.
- * Ignore the checks if the '-f' option is given.
- */
- if (!force && (mountpoint == NULL ||
- (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
- strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0))) {
- char buf[MAXPATHLEN];
- DIR *dirp;
-
- if (mountpoint && mountpoint[0] != '/') {
- (void) fprintf(stderr, gettext("invalid mountpoint "
- "'%s': must be an absolute path, 'legacy', or "
- "'none'\n"), mountpoint);
- goto errout;
- }
-
- if (mountpoint == NULL) {
- if (altroot != NULL)
- (void) snprintf(buf, sizeof (buf), "%s/%s",
- altroot, poolname);
- else
- (void) snprintf(buf, sizeof (buf), "/%s",
- poolname);
- } else {
- if (altroot != NULL)
- (void) snprintf(buf, sizeof (buf), "%s%s",
- altroot, mountpoint);
- else
- (void) snprintf(buf, sizeof (buf), "%s",
- mountpoint);
- }
-
- if ((dirp = opendir(buf)) == NULL && errno != ENOENT) {
- (void) fprintf(stderr, gettext("mountpoint '%s' : "
- "%s\n"), buf, strerror(errno));
- (void) fprintf(stderr, gettext("use '-m' "
- "option to provide a different default\n"));
- goto errout;
- } else if (dirp) {
- int count = 0;
-
- while (count < 3 && readdir(dirp) != NULL)
- count++;
- (void) closedir(dirp);
-
- if (count > 2) {
- (void) fprintf(stderr, gettext("mountpoint "
- "'%s' exists and is not empty\n"), buf);
- (void) fprintf(stderr, gettext("use '-m' "
- "option to provide a "
- "different default\n"));
- goto errout;
- }
- }
- }
-
- /*
- * Now that the mountpoint's validity has been checked, ensure that
- * the property is set appropriately prior to creating the pool.
- */
- if (mountpoint != NULL) {
- ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
- mountpoint, &fsprops, B_FALSE);
- if (ret != 0)
- goto errout;
- }
-
- ret = 1;
- if (dryrun) {
- /*
- * For a dry run invocation, print out a basic message and run
- * through all the vdevs in the list and print out in an
- * appropriate hierarchy.
- */
- (void) printf(gettext("would create '%s' with the "
- "following layout:\n\n"), poolname);
-
- print_vdev_tree(NULL, poolname, nvroot, 0, "", 0);
- print_vdev_tree(NULL, "dedup", nvroot, 0,
- VDEV_ALLOC_BIAS_DEDUP, 0);
- print_vdev_tree(NULL, "special", nvroot, 0,
- VDEV_ALLOC_BIAS_SPECIAL, 0);
- print_vdev_tree(NULL, "logs", nvroot, 0,
- VDEV_ALLOC_BIAS_LOG, 0);
-
- ret = 0;
- } else {
- /*
- * Hand off to libzfs.
- */
- if (enable_all_pool_feat) {
- spa_feature_t i;
- for (i = 0; i < SPA_FEATURES; i++) {
- char propname[MAXPATHLEN];
- zfeature_info_t *feat = &spa_feature_table[i];
-
- (void) snprintf(propname, sizeof (propname),
- "feature@%s", feat->fi_uname);
-
- /*
- * Skip feature if user specified it manually
- * on the command line.
- */
- if (nvlist_exists(props, propname))
- continue;
-
- ret = add_prop_list(propname,
- ZFS_FEATURE_ENABLED, &props, B_TRUE);
- if (ret != 0)
- goto errout;
- }
- }
-
- ret = 1;
- if (zpool_create(g_zfs, poolname,
- nvroot, props, fsprops) == 0) {
- zfs_handle_t *pool = zfs_open(g_zfs,
- tname ? tname : poolname, ZFS_TYPE_FILESYSTEM);
- if (pool != NULL) {
- if (zfs_mount(pool, NULL, 0) == 0)
- ret = zfs_shareall(pool);
- zfs_close(pool);
- }
- } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) {
- (void) fprintf(stderr, gettext("pool name may have "
- "been omitted\n"));
- }
- }
-
-errout:
- nvlist_free(nvroot);
- nvlist_free(fsprops);
- nvlist_free(props);
- return (ret);
-badusage:
- nvlist_free(fsprops);
- nvlist_free(props);
- usage(B_FALSE);
- return (2);
-}
-
-/*
- * zpool destroy <pool>
- *
- * -f Forcefully unmount any datasets
- *
- * Destroy the given pool. Automatically unmounts any datasets in the pool.
- */
-int
-zpool_do_destroy(int argc, char **argv)
-{
- boolean_t force = B_FALSE;
- int c;
- char *pool;
- zpool_handle_t *zhp;
- int ret;
-
- /* check options */
- while ((c = getopt(argc, argv, "f")) != -1) {
- switch (c) {
- case 'f':
- force = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool argument\n"));
- usage(B_FALSE);
- }
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- pool = argv[0];
-
- if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
- /*
- * As a special case, check for use of '/' in the name, and
- * direct the user to use 'zfs destroy' instead.
- */
- if (strchr(pool, '/') != NULL)
- (void) fprintf(stderr, gettext("use 'zfs destroy' to "
- "destroy a dataset\n"));
- return (1);
- }
-
- if (zpool_disable_datasets(zhp, force) != 0) {
- (void) fprintf(stderr, gettext("could not destroy '%s': "
- "could not unmount datasets\n"), zpool_get_name(zhp));
- return (1);
- }
-
- /* The history must be logged as part of the export */
- log_history = B_FALSE;
-
- ret = (zpool_destroy(zhp, history_str) != 0);
-
- zpool_close(zhp);
-
- return (ret);
-}
-
-/*
- * zpool export [-f] <pool> ...
- *
- * -f Forcefully unmount datasets
- *
- * Export the given pools. By default, the command will attempt to cleanly
- * unmount any active datasets within the pool. If the '-f' flag is specified,
- * then the datasets will be forcefully unmounted.
- */
-int
-zpool_do_export(int argc, char **argv)
-{
- boolean_t force = B_FALSE;
- boolean_t hardforce = B_FALSE;
- int c;
- zpool_handle_t *zhp;
- int ret;
- int i;
-
- /* check options */
- while ((c = getopt(argc, argv, "fF")) != -1) {
- switch (c) {
- case 'f':
- force = B_TRUE;
- break;
- case 'F':
- hardforce = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* check arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool argument\n"));
- usage(B_FALSE);
- }
-
- ret = 0;
- for (i = 0; i < argc; i++) {
- if ((zhp = zpool_open_canfail(g_zfs, argv[i])) == NULL) {
- ret = 1;
- continue;
- }
-
- if (zpool_disable_datasets(zhp, force) != 0) {
- ret = 1;
- zpool_close(zhp);
- continue;
- }
-
- /* The history must be logged as part of the export */
- log_history = B_FALSE;
-
- if (hardforce) {
- if (zpool_export_force(zhp, history_str) != 0)
- ret = 1;
- } else if (zpool_export(zhp, force, history_str) != 0) {
- ret = 1;
- }
-
- zpool_close(zhp);
- }
-
- return (ret);
-}
-
-/*
- * Given a vdev configuration, determine the maximum width needed for the device
- * name column.
- */
-static int
-max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max,
- int name_flags)
-{
- char *name;
- nvlist_t **child;
- uint_t c, children;
- int ret;
-
- name = zpool_vdev_name(g_zfs, zhp, nv, name_flags | VDEV_NAME_TYPE_ID);
- if (strlen(name) + depth > max)
- max = strlen(name) + depth;
-
- free(name);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- if ((ret = max_width(zhp, child[c], depth + 2,
- max, name_flags)) > max)
- max = ret;
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- if ((ret = max_width(zhp, child[c], depth + 2,
- max, name_flags)) > max)
- max = ret;
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- if ((ret = max_width(zhp, child[c], depth + 2,
- max, name_flags)) > max)
- max = ret;
- }
-
- return (max);
-}
-
-typedef struct spare_cbdata {
- uint64_t cb_guid;
- zpool_handle_t *cb_zhp;
-} spare_cbdata_t;
-
-static boolean_t
-find_vdev(nvlist_t *nv, uint64_t search)
-{
- uint64_t guid;
- nvlist_t **child;
- uint_t c, children;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
- search == guid)
- return (B_TRUE);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- if (find_vdev(child[c], search))
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-static int
-find_spare(zpool_handle_t *zhp, void *data)
-{
- spare_cbdata_t *cbp = data;
- nvlist_t *config, *nvroot;
-
- config = zpool_get_config(zhp, NULL);
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
-
- if (find_vdev(nvroot, cbp->cb_guid)) {
- cbp->cb_zhp = zhp;
- return (1);
- }
-
- zpool_close(zhp);
- return (0);
-}
-
-typedef struct status_cbdata {
- int cb_count;
- int cb_name_flags;
- int cb_namewidth;
- boolean_t cb_allpools;
- boolean_t cb_verbose;
- boolean_t cb_explain;
- boolean_t cb_first;
- boolean_t cb_dedup_stats;
- boolean_t cb_print_status;
-} status_cbdata_t;
-
-/*
- * Print out configuration state as requested by status_callback.
- */
-static void
-print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
- nvlist_t *nv, int depth, boolean_t isspare)
-{
- nvlist_t **child;
- uint_t c, vsc, children;
- pool_scan_stat_t *ps = NULL;
- vdev_stat_t *vs;
- char rbuf[6], wbuf[6], cbuf[6];
- char *vname;
- uint64_t notpresent;
- uint64_t ashift;
- spare_cbdata_t spare_cb;
- const char *state;
- char *type;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0)
- children = 0;
-
- verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &vsc) == 0);
-
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
-
- if (strcmp(type, VDEV_TYPE_INDIRECT) == 0)
- return;
-
- state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
- if (isspare) {
- /*
- * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
- * online drives.
- */
- if (vs->vs_aux == VDEV_AUX_SPARED)
- state = "INUSE";
- else if (vs->vs_state == VDEV_STATE_HEALTHY)
- state = "AVAIL";
- }
-
- (void) printf("\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
- name, state);
-
- if (!isspare) {
- zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
- zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
- zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
- (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
- }
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
- &notpresent) == 0 ||
- vs->vs_state <= VDEV_STATE_CANT_OPEN) {
- char *path;
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0)
- (void) printf(" was %s", path);
- } else if (vs->vs_aux != 0) {
- (void) printf(" ");
-
- switch (vs->vs_aux) {
- case VDEV_AUX_OPEN_FAILED:
- (void) printf(gettext("cannot open"));
- break;
-
- case VDEV_AUX_BAD_GUID_SUM:
- (void) printf(gettext("missing device"));
- break;
-
- case VDEV_AUX_NO_REPLICAS:
- (void) printf(gettext("insufficient replicas"));
- break;
-
- case VDEV_AUX_VERSION_NEWER:
- (void) printf(gettext("newer version"));
- break;
-
- case VDEV_AUX_UNSUP_FEAT:
- (void) printf(gettext("unsupported feature(s)"));
- break;
-
- case VDEV_AUX_ASHIFT_TOO_BIG:
- (void) printf(gettext("unsupported minimum blocksize"));
- break;
-
- case VDEV_AUX_SPARED:
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
- &spare_cb.cb_guid) == 0);
- if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) {
- if (strcmp(zpool_get_name(spare_cb.cb_zhp),
- zpool_get_name(zhp)) == 0)
- (void) printf(gettext("currently in "
- "use"));
- else
- (void) printf(gettext("in use by "
- "pool '%s'"),
- zpool_get_name(spare_cb.cb_zhp));
- zpool_close(spare_cb.cb_zhp);
- } else {
- (void) printf(gettext("currently in use"));
- }
- break;
-
- case VDEV_AUX_ERR_EXCEEDED:
- (void) printf(gettext("too many errors"));
- break;
-
- case VDEV_AUX_IO_FAILURE:
- (void) printf(gettext("experienced I/O failures"));
- break;
-
- case VDEV_AUX_BAD_LOG:
- (void) printf(gettext("bad intent log"));
- break;
-
- case VDEV_AUX_EXTERNAL:
- (void) printf(gettext("external device fault"));
- break;
-
- case VDEV_AUX_SPLIT_POOL:
- (void) printf(gettext("split into new pool"));
- break;
-
- case VDEV_AUX_ACTIVE:
- (void) printf(gettext("currently in use"));
- break;
-
- case VDEV_AUX_CHILDREN_OFFLINE:
- (void) printf(gettext("all children offline"));
- break;
-
- default:
- (void) printf(gettext("corrupted data"));
- break;
- }
- } else if (children == 0 && !isspare &&
- VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
- vs->vs_configured_ashift < vs->vs_physical_ashift) {
- (void) printf(
- gettext(" block size: %dB configured, %dB native"),
- 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift);
- }
-
- (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
- (uint64_t **)&ps, &c);
-
- if (ps != NULL && ps->pss_state == DSS_SCANNING &&
- vs->vs_scan_processed != 0 && children == 0) {
- (void) printf(gettext(" (%s)"),
- (ps->pss_func == POOL_SCAN_RESILVER) ?
- "resilvering" : "repairing");
- }
-
- if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
- vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
- vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) &&
- !vs->vs_scan_removing) {
- char zbuf[1024];
- char tbuf[256];
- struct tm zaction_ts;
-
- time_t t = vs->vs_initialize_action_time;
- int initialize_pct = 100;
- if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) {
- initialize_pct = (vs->vs_initialize_bytes_done * 100 /
- (vs->vs_initialize_bytes_est + 1));
- }
-
- (void) localtime_r(&t, &zaction_ts);
- (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
-
- switch (vs->vs_initialize_state) {
- case VDEV_INITIALIZE_SUSPENDED:
- (void) snprintf(zbuf, sizeof (zbuf),
- ", suspended, started at %s", tbuf);
- break;
- case VDEV_INITIALIZE_ACTIVE:
- (void) snprintf(zbuf, sizeof (zbuf),
- ", started at %s", tbuf);
- break;
- case VDEV_INITIALIZE_COMPLETE:
- (void) snprintf(zbuf, sizeof (zbuf),
- ", completed at %s", tbuf);
- break;
- }
-
- (void) printf(gettext(" (%d%% initialized%s)"),
- initialize_pct, zbuf);
- }
-
- (void) printf("\n");
-
- for (c = 0; c < children; c++) {
- uint64_t islog = B_FALSE, ishole = B_FALSE;
-
- /* Don't print logs or holes here */
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &islog);
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
- &ishole);
- if (islog || ishole)
- continue;
- /* Only print normal classes here */
- if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
- continue;
-
- vname = zpool_vdev_name(g_zfs, zhp, child[c],
- cb->cb_name_flags | VDEV_NAME_TYPE_ID);
- print_status_config(zhp, cb, vname, child[c], depth + 2,
- isspare);
- free(vname);
- }
-}
-
-/*
- * Print the configuration of an exported pool. Iterate over all vdevs in the
- * pool, printing out the name and status for each one.
- */
-static void
-print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv,
- int depth)
-{
- nvlist_t **child;
- uint_t c, children;
- vdev_stat_t *vs;
- char *type, *vname;
-
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
- if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
- strcmp(type, VDEV_TYPE_HOLE) == 0)
- return;
-
- verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) == 0);
-
- (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name);
- (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux));
-
- if (vs->vs_aux != 0) {
- (void) printf(" ");
-
- switch (vs->vs_aux) {
- case VDEV_AUX_OPEN_FAILED:
- (void) printf(gettext("cannot open"));
- break;
-
- case VDEV_AUX_BAD_GUID_SUM:
- (void) printf(gettext("missing device"));
- break;
-
- case VDEV_AUX_NO_REPLICAS:
- (void) printf(gettext("insufficient replicas"));
- break;
-
- case VDEV_AUX_VERSION_NEWER:
- (void) printf(gettext("newer version"));
- break;
-
- case VDEV_AUX_UNSUP_FEAT:
- (void) printf(gettext("unsupported feature(s)"));
- break;
-
- case VDEV_AUX_ERR_EXCEEDED:
- (void) printf(gettext("too many errors"));
- break;
-
- case VDEV_AUX_ACTIVE:
- (void) printf(gettext("currently in use"));
- break;
-
- case VDEV_AUX_CHILDREN_OFFLINE:
- (void) printf(gettext("all children offline"));
- break;
-
- default:
- (void) printf(gettext("corrupted data"));
- break;
- }
- }
- (void) printf("\n");
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0)
- return;
-
- for (c = 0; c < children; c++) {
- uint64_t is_log = B_FALSE;
-
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
- if (is_log)
- continue;
- if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
- continue;
-
- vname = zpool_vdev_name(g_zfs, NULL, child[c],
- cb->cb_name_flags | VDEV_NAME_TYPE_ID);
- print_import_config(cb, vname, child[c], depth + 2);
- free(vname);
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
- &child, &children) == 0) {
- (void) printf(gettext("\tcache\n"));
- for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, NULL, child[c],
- cb->cb_name_flags);
- (void) printf("\t %s\n", vname);
- free(vname);
- }
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
- &child, &children) == 0) {
- (void) printf(gettext("\tspares\n"));
- for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, NULL, child[c],
- cb->cb_name_flags);
- (void) printf("\t %s\n", vname);
- free(vname);
- }
- }
-}
-
-/*
- * Print specialized class vdevs.
- *
- * These are recorded as top level vdevs in the main pool child array
- * but with "is_log" set to 1 or an "alloc_bias" string. We use either
- * print_status_config() or print_import_config() to print the top level
- * class vdevs then any of their children (eg mirrored slogs) are printed
- * recursively - which works because only the top level vdev is marked.
- */
-static void
-print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
- const char *class)
-{
- uint_t c, children;
- nvlist_t **child;
- boolean_t printed = B_FALSE;
-
- assert(zhp != NULL || !cb->cb_verbose);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
- &children) != 0)
- return;
-
- for (c = 0; c < children; c++) {
- uint64_t is_log = B_FALSE;
- char *bias = NULL;
- char *type = NULL;
-
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
-
- if (is_log) {
- bias = VDEV_ALLOC_CLASS_LOGS;
- } else {
- (void) nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
- (void) nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_TYPE, &type);
- }
-
- if (bias == NULL || strcmp(bias, class) != 0)
- continue;
- if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
- continue;
-
- if (!printed) {
- (void) printf("\t%s\t\n", gettext(class));
- printed = B_TRUE;
- }
-
- char *name = zpool_vdev_name(g_zfs, zhp, child[c],
- cb->cb_name_flags | VDEV_NAME_TYPE_ID);
- if (cb->cb_print_status)
- print_status_config(zhp, cb, name, child[c], 2,
- B_FALSE);
- else
- print_import_config(cb, name, child[c], 2);
- free(name);
- }
-}
-
-/*
- * Display the status for the given pool.
- */
-static void
-show_import(nvlist_t *config)
-{
- uint64_t pool_state;
- vdev_stat_t *vs;
- char *name;
- uint64_t guid;
- uint64_t hostid = 0;
- char *msgid;
- char *hostname = "unknown";
- nvlist_t *nvroot, *nvinfo;
- int reason;
- const char *health;
- uint_t vsc;
- char *comment;
- status_cbdata_t cb = { 0 };
-
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &name) == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &guid) == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &pool_state) == 0);
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
-
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &vsc) == 0);
- health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
-
- reason = zpool_import_status(config, &msgid);
-
- (void) printf(gettext(" pool: %s\n"), name);
- (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid);
- (void) printf(gettext(" state: %s"), health);
- if (pool_state == POOL_STATE_DESTROYED)
- (void) printf(gettext(" (DESTROYED)"));
- (void) printf("\n");
-
- switch (reason) {
- case ZPOOL_STATUS_MISSING_DEV_R:
- case ZPOOL_STATUS_MISSING_DEV_NR:
- case ZPOOL_STATUS_BAD_GUID_SUM:
- (void) printf(gettext(" status: One or more devices are "
- "missing from the system.\n"));
- break;
-
- case ZPOOL_STATUS_CORRUPT_LABEL_R:
- case ZPOOL_STATUS_CORRUPT_LABEL_NR:
- (void) printf(gettext(" status: One or more devices contains "
- "corrupted data.\n"));
- break;
-
- case ZPOOL_STATUS_CORRUPT_DATA:
- (void) printf(
- gettext(" status: The pool data is corrupted.\n"));
- break;
-
- case ZPOOL_STATUS_OFFLINE_DEV:
- (void) printf(gettext(" status: One or more devices "
- "are offlined.\n"));
- break;
-
- case ZPOOL_STATUS_CORRUPT_POOL:
- (void) printf(gettext(" status: The pool metadata is "
- "corrupted.\n"));
- break;
-
- case ZPOOL_STATUS_VERSION_OLDER:
- (void) printf(gettext(" status: The pool is formatted using a "
- "legacy on-disk version.\n"));
- break;
-
- case ZPOOL_STATUS_VERSION_NEWER:
- (void) printf(gettext(" status: The pool is formatted using an "
- "incompatible version.\n"));
- break;
-
- case ZPOOL_STATUS_FEAT_DISABLED:
- (void) printf(gettext(" status: Some supported features are "
- "not enabled on the pool.\n"));
- break;
-
- case ZPOOL_STATUS_UNSUP_FEAT_READ:
- (void) printf(gettext("status: The pool uses the following "
- "feature(s) not supported on this system:\n"));
- zpool_print_unsup_feat(config);
- break;
-
- case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
- (void) printf(gettext("status: The pool can only be accessed "
- "in read-only mode on this system. It\n\tcannot be "
- "accessed in read-write mode because it uses the "
- "following\n\tfeature(s) not supported on this system:\n"));
- zpool_print_unsup_feat(config);
- break;
-
- case ZPOOL_STATUS_HOSTID_ACTIVE:
- (void) printf(gettext(" status: The pool is currently "
- "imported by another system.\n"));
- break;
-
- case ZPOOL_STATUS_HOSTID_REQUIRED:
- (void) printf(gettext(" status: The pool has the "
- "multihost property on. It cannot\n\tbe safely imported "
- "when the system hostid is not set.\n"));
- break;
-
- case ZPOOL_STATUS_HOSTID_MISMATCH:
- (void) printf(gettext(" status: The pool was last accessed by "
- "another system.\n"));
- break;
-
- case ZPOOL_STATUS_FAULTED_DEV_R:
- case ZPOOL_STATUS_FAULTED_DEV_NR:
- (void) printf(gettext(" status: One or more devices are "
- "faulted.\n"));
- break;
-
- case ZPOOL_STATUS_BAD_LOG:
- (void) printf(gettext(" status: An intent log record cannot be "
- "read.\n"));
- break;
-
- case ZPOOL_STATUS_RESILVERING:
- (void) printf(gettext(" status: One or more devices were being "
- "resilvered.\n"));
- break;
-
- case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
- (void) printf(gettext("status: One or more devices were "
- "configured to use a non-native block size.\n"
- "\tExpect reduced performance.\n"));
- break;
-
- default:
- /*
- * No other status can be seen when importing pools.
- */
- assert(reason == ZPOOL_STATUS_OK);
- }
-
- /*
- * Print out an action according to the overall state of the pool.
- */
- if (vs->vs_state == VDEV_STATE_HEALTHY) {
- if (reason == ZPOOL_STATUS_VERSION_OLDER ||
- reason == ZPOOL_STATUS_FEAT_DISABLED) {
- (void) printf(gettext(" action: The pool can be "
- "imported using its name or numeric identifier, "
- "though\n\tsome features will not be available "
- "without an explicit 'zpool upgrade'.\n"));
- } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
- (void) printf(gettext(" action: The pool can be "
- "imported using its name or numeric "
- "identifier and\n\tthe '-f' flag.\n"));
- } else {
- (void) printf(gettext(" action: The pool can be "
- "imported using its name or numeric "
- "identifier.\n"));
- }
- } else if (vs->vs_state == VDEV_STATE_DEGRADED) {
- (void) printf(gettext(" action: The pool can be imported "
- "despite missing or damaged devices. The\n\tfault "
- "tolerance of the pool may be compromised if imported.\n"));
- } else {
- switch (reason) {
- case ZPOOL_STATUS_VERSION_NEWER:
- (void) printf(gettext(" action: The pool cannot be "
- "imported. Access the pool on a system running "
- "newer\n\tsoftware, or recreate the pool from "
- "backup.\n"));
- break;
- case ZPOOL_STATUS_UNSUP_FEAT_READ:
- (void) printf(gettext("action: The pool cannot be "
- "imported. Access the pool on a system that "
- "supports\n\tthe required feature(s), or recreate "
- "the pool from backup.\n"));
- break;
- case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
- (void) printf(gettext("action: The pool cannot be "
- "imported in read-write mode. Import the pool "
- "with\n"
- "\t\"-o readonly=on\", access the pool on a system "
- "that supports the\n\trequired feature(s), or "
- "recreate the pool from backup.\n"));
- break;
- case ZPOOL_STATUS_MISSING_DEV_R:
- case ZPOOL_STATUS_MISSING_DEV_NR:
- case ZPOOL_STATUS_BAD_GUID_SUM:
- (void) printf(gettext(" action: The pool cannot be "
- "imported. Attach the missing\n\tdevices and try "
- "again.\n"));
- break;
- case ZPOOL_STATUS_HOSTID_ACTIVE:
- VERIFY0(nvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_LOAD_INFO, &nvinfo));
-
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME))
- hostname = fnvlist_lookup_string(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTNAME);
-
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID))
- hostid = fnvlist_lookup_uint64(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTID);
-
- (void) printf(gettext(" action: The pool must be "
- "exported from %s (hostid=%lx)\n\tbefore it "
- "can be safely imported.\n"), hostname,
- (unsigned long) hostid);
- break;
- case ZPOOL_STATUS_HOSTID_REQUIRED:
- (void) printf(gettext(" action: Check the SMF "
- "svc:/system/hostid service.\n"));
- break;
- default:
- (void) printf(gettext(" action: The pool cannot be "
- "imported due to damaged devices or data.\n"));
- }
- }
-
- /* Print the comment attached to the pool. */
- if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
- (void) printf(gettext("comment: %s\n"), comment);
-
- /*
- * If the state is "closed" or "can't open", and the aux state
- * is "corrupt data":
- */
- if (((vs->vs_state == VDEV_STATE_CLOSED) ||
- (vs->vs_state == VDEV_STATE_CANT_OPEN)) &&
- (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) {
- if (pool_state == POOL_STATE_DESTROYED)
- (void) printf(gettext("\tThe pool was destroyed, "
- "but can be imported using the '-Df' flags.\n"));
- else if (pool_state != POOL_STATE_EXPORTED)
- (void) printf(gettext("\tThe pool may be active on "
- "another system, but can be imported using\n\t"
- "the '-f' flag.\n"));
- }
-
- if (msgid != NULL)
- (void) printf(gettext(" see: http://illumos.org/msg/%s\n"),
- msgid);
-
- (void) printf(gettext(" config:\n\n"));
-
- cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, 0);
- if (cb.cb_namewidth < 10)
- cb.cb_namewidth = 10;
-
- print_import_config(&cb, name, nvroot, 0);
-
- print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP);
- print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
- print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS);
-
- if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
- (void) printf(gettext("\n\tAdditional devices are known to "
- "be part of this pool, though their\n\texact "
- "configuration cannot be determined.\n"));
- }
-}
-
-static boolean_t
-zfs_force_import_required(nvlist_t *config)
-{
- uint64_t state;
- uint64_t hostid = 0;
- nvlist_t *nvinfo;
-
- state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
-
- if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid())
- return (B_TRUE);
-
- nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) {
- mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo,
- ZPOOL_CONFIG_MMP_STATE);
-
- if (mmp_state != MMP_STATE_INACTIVE)
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * Perform the import for the given configuration. This passes the heavy
- * lifting off to zpool_import_props(), and then mounts the datasets contained
- * within the pool.
- */
-static int
-do_import(nvlist_t *config, const char *newname, const char *mntopts,
- nvlist_t *props, int flags)
-{
- zpool_handle_t *zhp;
- char *name;
- uint64_t version;
-
- name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
- version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION);
-
- if (!SPA_VERSION_IS_SUPPORTED(version)) {
- (void) fprintf(stderr, gettext("cannot import '%s': pool "
- "is formatted using an unsupported ZFS version\n"), name);
- return (1);
- } else if (zfs_force_import_required(config) &&
- !(flags & ZFS_IMPORT_ANY_HOST)) {
- mmp_state_t mmp_state = MMP_STATE_INACTIVE;
- nvlist_t *nvinfo;
-
- nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE))
- mmp_state = fnvlist_lookup_uint64(nvinfo,
- ZPOOL_CONFIG_MMP_STATE);
-
- if (mmp_state == MMP_STATE_ACTIVE) {
- char *hostname = "<unknown>";
- uint64_t hostid = 0;
-
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME))
- hostname = fnvlist_lookup_string(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTNAME);
-
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID))
- hostid = fnvlist_lookup_uint64(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTID);
-
- (void) fprintf(stderr, gettext("cannot import '%s': "
- "pool is imported on %s (hostid: "
- "0x%lx)\nExport the pool on the other system, "
- "then run 'zpool import'.\n"),
- name, hostname, (unsigned long) hostid);
- } else if (mmp_state == MMP_STATE_NO_HOSTID) {
- (void) fprintf(stderr, gettext("Cannot import '%s': "
- "pool has the multihost property on and the\n"
- "system's hostid is not set.\n"), name);
- } else {
- char *hostname = "<unknown>";
- uint64_t timestamp = 0;
- uint64_t hostid = 0;
-
- if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
- hostname = fnvlist_lookup_string(config,
- ZPOOL_CONFIG_HOSTNAME);
-
- if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP))
- timestamp = fnvlist_lookup_uint64(config,
- ZPOOL_CONFIG_TIMESTAMP);
-
- if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
- hostid = fnvlist_lookup_uint64(config,
- ZPOOL_CONFIG_HOSTID);
-
- (void) fprintf(stderr, gettext("cannot import '%s': "
- "pool was previously in use from another system.\n"
- "Last accessed by %s (hostid=%lx) at %s"
- "The pool can be imported, use 'zpool import -f' "
- "to import the pool.\n"), name, hostname,
- (unsigned long)hostid, ctime((time_t *)&timestamp));
-
- }
-
- return (1);
- }
-
- if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
- return (1);
-
- if (newname != NULL)
- name = (char *)newname;
-
- if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
- return (1);
-
- if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
- !(flags & ZFS_IMPORT_ONLY) &&
- zpool_enable_datasets(zhp, mntopts, 0) != 0) {
- zpool_close(zhp);
- return (1);
- }
-
- zpool_close(zhp);
- return (0);
-}
-
-/*
- * zpool checkpoint <pool>
- * checkpoint --discard <pool>
- *
- * -d Discard the checkpoint from a checkpointed
- * --discard pool.
- *
- * Checkpoints the specified pool, by taking a "snapshot" of its
- * current state. A pool can only have one checkpoint at a time.
- */
-int
-zpool_do_checkpoint(int argc, char **argv)
-{
- boolean_t discard;
- char *pool;
- zpool_handle_t *zhp;
- int c, err;
-
- struct option long_options[] = {
- {"discard", no_argument, NULL, 'd'},
- {0, 0, 0, 0}
- };
-
- discard = B_FALSE;
- while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) {
- switch (c) {
- case 'd':
- discard = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool argument\n"));
- usage(B_FALSE);
- }
-
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- pool = argv[0];
-
- if ((zhp = zpool_open(g_zfs, pool)) == NULL) {
- /* As a special case, check for use of '/' in the name */
- if (strchr(pool, '/') != NULL)
- (void) fprintf(stderr, gettext("'zpool checkpoint' "
- "doesn't work on datasets. To save the state "
- "of a dataset from a specific point in time "
- "please use 'zfs snapshot'\n"));
- return (1);
- }
-
- if (discard)
- err = (zpool_discard_checkpoint(zhp) != 0);
- else
- err = (zpool_checkpoint(zhp) != 0);
-
- zpool_close(zhp);
-
- return (err);
-}
-
-#define CHECKPOINT_OPT 1024
-
-/*
- * zpool import [-d dir] [-D]
- * import [-o mntopts] [-o prop=value] ... [-R root] [-D]
- * [-d dir | -c cachefile] [-f] -a
- * import [-o mntopts] [-o prop=value] ... [-R root] [-D]
- * [-d dir | -c cachefile] [-f] [-n] [-F] [-t]
- * <pool | id> [newpool]
- *
- * -c Read pool information from a cachefile instead of searching
- * devices.
- *
- * -d Scan in a specific directory, other than /dev/dsk. More than
- * one directory can be specified using multiple '-d' options.
- *
- * -D Scan for previously destroyed pools or import all or only
- * specified destroyed pools.
- *
- * -R Temporarily import the pool, with all mountpoints relative to
- * the given root. The pool will remain exported when the machine
- * is rebooted.
- *
- * -V Import even in the presence of faulted vdevs. This is an
- * intentionally undocumented option for testing purposes, and
- * treats the pool configuration as complete, leaving any bad
- * vdevs in the FAULTED state. In other words, it does verbatim
- * import.
- *
- * -f Force import, even if it appears that the pool is active.
- *
- * -F Attempt rewind if necessary.
- *
- * -n See if rewind would work, but don't actually rewind.
- *
- * -N Import the pool but don't mount datasets.
- *
- * -t Use newpool as a temporary pool name instead of renaming
- * the pool.
- *
- * -T Specify a starting txg to use for import. This option is
- * intentionally undocumented option for testing purposes.
- *
- * -a Import all pools found.
- *
- * -o Set property=value and/or temporary mount options (without '=').
- *
- * --rewind-to-checkpoint
- * Import the pool and revert back to the checkpoint.
- *
- * The import command scans for pools to import, and import pools based on pool
- * name and GUID. The pool can also be renamed as part of the import process.
- */
-int
-zpool_do_import(int argc, char **argv)
-{
- char **searchdirs = NULL;
- int nsearch = 0;
- int c;
- int err = 0;
- nvlist_t *pools = NULL;
- boolean_t do_all = B_FALSE;
- boolean_t do_destroyed = B_FALSE;
- char *mntopts = NULL;
- nvpair_t *elem;
- nvlist_t *config;
- uint64_t searchguid = 0;
- char *searchname = NULL;
- char *propval;
- nvlist_t *found_config;
- nvlist_t *policy = NULL;
- nvlist_t *props = NULL;
- boolean_t first;
- int flags = ZFS_IMPORT_NORMAL;
- uint32_t rewind_policy = ZPOOL_NO_REWIND;
- boolean_t dryrun = B_FALSE;
- boolean_t do_rewind = B_FALSE;
- boolean_t xtreme_rewind = B_FALSE;
- uint64_t pool_state, txg = -1ULL;
- char *cachefile = NULL;
- importargs_t idata = { 0 };
- char *endptr;
-
-
- struct option long_options[] = {
- {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
- {0, 0, 0, 0}
- };
-
- /* check options */
- while ((c = getopt_long(argc, argv, ":aCc:d:DEfFmnNo:rR:tT:VX",
- long_options, NULL)) != -1) {
- switch (c) {
- case 'a':
- do_all = B_TRUE;
- break;
- case 'c':
- cachefile = optarg;
- break;
- case 'd':
- if (searchdirs == NULL) {
- searchdirs = safe_malloc(sizeof (char *));
- } else {
- char **tmp = safe_malloc((nsearch + 1) *
- sizeof (char *));
- bcopy(searchdirs, tmp, nsearch *
- sizeof (char *));
- free(searchdirs);
- searchdirs = tmp;
- }
- searchdirs[nsearch++] = optarg;
- break;
- case 'D':
- do_destroyed = B_TRUE;
- break;
- case 'f':
- flags |= ZFS_IMPORT_ANY_HOST;
- break;
- case 'F':
- do_rewind = B_TRUE;
- break;
- case 'm':
- flags |= ZFS_IMPORT_MISSING_LOG;
- break;
- case 'n':
- dryrun = B_TRUE;
- break;
- case 'N':
- flags |= ZFS_IMPORT_ONLY;
- break;
- case 'o':
- if ((propval = strchr(optarg, '=')) != NULL) {
- *propval = '\0';
- propval++;
- if (add_prop_list(optarg, propval,
- &props, B_TRUE))
- goto error;
- } else {
- mntopts = optarg;
- }
- break;
- case 'R':
- if (add_prop_list(zpool_prop_to_name(
- ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
- goto error;
- if (add_prop_list_default(zpool_prop_to_name(
- ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
- goto error;
- break;
- case 't':
- flags |= ZFS_IMPORT_TEMP_NAME;
- if (add_prop_list_default(zpool_prop_to_name(
- ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
- goto error;
- break;
- case 'T':
- errno = 0;
- txg = strtoull(optarg, &endptr, 0);
- if (errno != 0 || *endptr != '\0') {
- (void) fprintf(stderr,
- gettext("invalid txg value\n"));
- usage(B_FALSE);
- }
- rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
- break;
- case 'V':
- flags |= ZFS_IMPORT_VERBATIM;
- break;
- case 'X':
- xtreme_rewind = B_TRUE;
- break;
- case CHECKPOINT_OPT:
- flags |= ZFS_IMPORT_CHECKPOINT;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (cachefile && nsearch != 0) {
- (void) fprintf(stderr, gettext("-c is incompatible with -d\n"));
- usage(B_FALSE);
- }
-
- if ((dryrun || xtreme_rewind) && !do_rewind) {
- (void) fprintf(stderr,
- gettext("-n or -X only meaningful with -F\n"));
- usage(B_FALSE);
- }
- if (dryrun)
- rewind_policy = ZPOOL_TRY_REWIND;
- else if (do_rewind)
- rewind_policy = ZPOOL_DO_REWIND;
- if (xtreme_rewind)
- rewind_policy |= ZPOOL_EXTREME_REWIND;
-
- /* In the future, we can capture further policy and include it here */
- if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
- nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 ||
- nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
- rewind_policy) != 0)
- goto error;
-
- if (searchdirs == NULL) {
- searchdirs = safe_malloc(sizeof (char *));
- searchdirs[0] = "/dev";
- nsearch = 1;
- }
-
- /* check argument count */
- if (do_all) {
- if (argc != 0) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
- } else {
- if (argc > 2) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- /*
- * Check for the SYS_CONFIG privilege. We do this explicitly
- * here because otherwise any attempt to discover pools will
- * silently fail.
- */
- if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) {
- (void) fprintf(stderr, gettext("cannot "
- "discover pools: permission denied\n"));
- free(searchdirs);
- nvlist_free(policy);
- return (1);
- }
- }
-
- /*
- * Depending on the arguments given, we do one of the following:
- *
- * <none> Iterate through all pools and display information about
- * each one.
- *
- * -a Iterate through all pools and try to import each one.
- *
- * <id> Find the pool that corresponds to the given GUID/pool
- * name and import that one.
- *
- * -D Above options applies only to destroyed pools.
- */
- if (argc != 0) {
- char *endptr;
-
- errno = 0;
- searchguid = strtoull(argv[0], &endptr, 10);
- if (errno != 0 || *endptr != '\0') {
- searchname = argv[0];
- searchguid = 0;
- }
- found_config = NULL;
-
- /*
- * User specified a name or guid. Ensure it's unique.
- */
- idata.unique = B_TRUE;
- }
-
-
- idata.path = searchdirs;
- idata.paths = nsearch;
- idata.poolname = searchname;
- idata.guid = searchguid;
- idata.cachefile = cachefile;
- idata.policy = policy;
-
- pools = zpool_search_import(g_zfs, &idata);
-
- if (pools != NULL && idata.exists &&
- (argc == 1 || strcmp(argv[0], argv[1]) == 0)) {
- (void) fprintf(stderr, gettext("cannot import '%s': "
- "a pool with that name already exists\n"),
- argv[0]);
- (void) fprintf(stderr, gettext("use the form 'zpool import "
- "[-t] <pool | id> <newpool>' to give it a new temporary "
- "or permanent name\n"));
- err = 1;
- } else if (pools == NULL && idata.exists) {
- (void) fprintf(stderr, gettext("cannot import '%s': "
- "a pool with that name is already created/imported,\n"),
- argv[0]);
- (void) fprintf(stderr, gettext("and no additional pools "
- "with that name were found\n"));
- err = 1;
- } else if (pools == NULL) {
- if (argc != 0) {
- (void) fprintf(stderr, gettext("cannot import '%s': "
- "no such pool available\n"), argv[0]);
- }
- err = 1;
- }
-
- if (err == 1) {
- free(searchdirs);
- nvlist_free(policy);
- return (1);
- }
-
- /*
- * At this point we have a list of import candidate configs. Even if
- * we were searching by pool name or guid, we still need to
- * post-process the list to deal with pool state and possible
- * duplicate names.
- */
- err = 0;
- elem = NULL;
- first = B_TRUE;
- while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
-
- verify(nvpair_value_nvlist(elem, &config) == 0);
-
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &pool_state) == 0);
- if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
- continue;
- if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
- continue;
-
- verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
- policy) == 0);
-
- if (argc == 0) {
- if (first)
- first = B_FALSE;
- else if (!do_all)
- (void) printf("\n");
-
- if (do_all) {
- err |= do_import(config, NULL, mntopts,
- props, flags);
- } else {
- show_import(config);
- }
- } else if (searchname != NULL) {
- char *name;
-
- /*
- * We are searching for a pool based on name.
- */
- verify(nvlist_lookup_string(config,
- ZPOOL_CONFIG_POOL_NAME, &name) == 0);
-
- if (strcmp(name, searchname) == 0) {
- if (found_config != NULL) {
- (void) fprintf(stderr, gettext(
- "cannot import '%s': more than "
- "one matching pool\n"), searchname);
- (void) fprintf(stderr, gettext(
- "import by numeric ID instead\n"));
- err = B_TRUE;
- }
- found_config = config;
- }
- } else {
- uint64_t guid;
-
- /*
- * Search for a pool by guid.
- */
- verify(nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
-
- if (guid == searchguid)
- found_config = config;
- }
- }
-
- /*
- * If we were searching for a specific pool, verify that we found a
- * pool, and then do the import.
- */
- if (argc != 0 && err == 0) {
- if (found_config == NULL) {
- (void) fprintf(stderr, gettext("cannot import '%s': "
- "no such pool available\n"), argv[0]);
- err = B_TRUE;
- } else {
- err |= do_import(found_config, argc == 1 ? NULL :
- argv[1], mntopts, props, flags);
- }
- }
-
- /*
- * If we were just looking for pools, report an error if none were
- * found.
- */
- if (argc == 0 && first)
- (void) fprintf(stderr,
- gettext("no pools available to import\n"));
-
-error:
- nvlist_free(props);
- nvlist_free(pools);
- nvlist_free(policy);
- free(searchdirs);
-
- return (err ? 1 : 0);
-}
-
-/*
- * zpool sync [-f] [pool] ...
- *
- * -f (undocumented) force uberblock (and config including zpool cache file)
- * update.
- *
- * Sync the specified pool(s).
- * Without arguments "zpool sync" will sync all pools.
- * This command initiates TXG sync(s) and will return after the TXG(s) commit.
- *
- */
-static int
-zpool_do_sync(int argc, char **argv)
-{
- int ret;
- boolean_t force = B_FALSE;
-
- /* check options */
- while ((ret = getopt(argc, argv, "f")) != -1) {
- switch (ret) {
- case 'f':
- force = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* if argc == 0 we will execute zpool_sync_one on all pools */
- ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force);
-
- return (ret);
-}
-
-typedef struct iostat_cbdata {
- boolean_t cb_verbose;
- int cb_name_flags;
- int cb_namewidth;
- int cb_iteration;
- boolean_t cb_scripted;
- zpool_list_t *cb_list;
-} iostat_cbdata_t;
-
-static void
-print_iostat_separator(iostat_cbdata_t *cb)
-{
- int i = 0;
-
- for (i = 0; i < cb->cb_namewidth; i++)
- (void) printf("-");
- (void) printf(" ----- ----- ----- ----- ----- -----\n");
-}
-
-static void
-print_iostat_header(iostat_cbdata_t *cb)
-{
- (void) printf("%*s capacity operations bandwidth\n",
- cb->cb_namewidth, "");
- (void) printf("%-*s alloc free read write read write\n",
- cb->cb_namewidth, "pool");
- print_iostat_separator(cb);
-}
-
-/*
- * Display a single statistic.
- */
-static void
-print_one_stat(uint64_t value)
-{
- char buf[64];
-
- zfs_nicenum(value, buf, sizeof (buf));
- (void) printf(" %5s", buf);
-}
-
-static const char *class_name[] = {
- VDEV_ALLOC_BIAS_DEDUP,
- VDEV_ALLOC_BIAS_SPECIAL,
- VDEV_ALLOC_CLASS_LOGS
-};
-
-/*
- * Print out all the statistics for the given vdev. This can either be the
- * toplevel configuration, or called recursively. If 'name' is NULL, then this
- * is a verbose output, and we don't want to display the toplevel pool stats.
- *
- * Returns the number of stat lines printed.
- */
-static unsigned int
-print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
- nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
-{
- nvlist_t **oldchild, **newchild;
- uint_t c, children;
- vdev_stat_t *oldvs, *newvs;
- vdev_stat_t zerovs = { 0 };
- char *vname;
- int ret = 0;
- uint64_t tdelta;
- double scale;
-
- if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
- return (ret);
-
- if (oldnv != NULL) {
- verify(nvlist_lookup_uint64_array(oldnv,
- ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
- } else {
- oldvs = &zerovs;
- }
-
- verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&newvs, &c) == 0);
-
- if (strlen(name) + depth > cb->cb_namewidth)
- (void) printf("%*s%s", depth, "", name);
- else
- (void) printf("%*s%s%*s", depth, "", name,
- (int)(cb->cb_namewidth - strlen(name) - depth), "");
-
- tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;
-
- if (tdelta == 0)
- scale = 1.0;
- else
- scale = (double)NANOSEC / tdelta;
-
- /* only toplevel vdevs have capacity stats */
- if (newvs->vs_space == 0) {
- (void) printf(" - -");
- } else {
- print_one_stat(newvs->vs_alloc);
- print_one_stat(newvs->vs_space - newvs->vs_alloc);
- }
-
- print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] -
- oldvs->vs_ops[ZIO_TYPE_READ])));
-
- print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] -
- oldvs->vs_ops[ZIO_TYPE_WRITE])));
-
- print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] -
- oldvs->vs_bytes[ZIO_TYPE_READ])));
-
- print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] -
- oldvs->vs_bytes[ZIO_TYPE_WRITE])));
-
- (void) printf("\n");
-
- if (!cb->cb_verbose)
- return (ret);
-
- if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
- &newchild, &children) != 0)
- return (ret);
-
- if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
- &oldchild, &c) != 0)
- return (ret);
-
- /*
- * print normal top-level devices
- */
- for (c = 0; c < children; c++) {
- uint64_t ishole = B_FALSE, islog = B_FALSE;
-
- (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE,
- &ishole);
-
- (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG,
- &islog);
-
- if (ishole || islog)
- continue;
-
- if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
- continue;
-
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
- cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
- newchild[c], cb, depth + 2);
- free(vname);
- }
-
- /*
- * print all other top-level devices
- */
- for (uint_t n = 0; n < 3; n++) {
- for (c = 0; c < children; c++) {
- uint64_t islog = B_FALSE;
- char *bias = NULL;
- char *type = NULL;
-
- (void) nvlist_lookup_uint64(newchild[c],
- ZPOOL_CONFIG_IS_LOG, &islog);
- if (islog) {
- bias = VDEV_ALLOC_CLASS_LOGS;
- } else {
- (void) nvlist_lookup_string(newchild[c],
- ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
- (void) nvlist_lookup_string(newchild[c],
- ZPOOL_CONFIG_TYPE, &type);
- }
- if (bias == NULL || strcmp(bias, class_name[n]) != 0)
- continue;
- if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
- continue;
-
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
- cb->cb_name_flags);
- ret += print_vdev_stats(zhp, vname, oldnv ?
- oldchild[c] : NULL, newchild[c], cb, depth + 2);
- free(vname);
- }
-
- }
-
- /*
- * Include level 2 ARC devices in iostat output
- */
- if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
- &newchild, &children) != 0)
- return (ret);
-
- if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
- &oldchild, &c) != 0)
- return (ret);
-
- if (children > 0) {
- (void) printf("%-*s - - - - - "
- "-\n", cb->cb_namewidth, "cache");
- for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
- cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
- newchild[c], cb, depth + 2);
- free(vname);
- }
- }
-
- return (ret);
-}
-
-static int
-refresh_iostat(zpool_handle_t *zhp, void *data)
-{
- iostat_cbdata_t *cb = data;
- boolean_t missing;
-
- /*
- * If the pool has disappeared, remove it from the list and continue.
- */
- if (zpool_refresh_stats(zhp, &missing) != 0)
- return (-1);
-
- if (missing)
- pool_list_remove(cb->cb_list, zhp);
-
- return (0);
-}
-
-/*
- * Callback to print out the iostats for the given pool.
- */
-int
-print_iostat(zpool_handle_t *zhp, void *data)
-{
- iostat_cbdata_t *cb = data;
- nvlist_t *oldconfig, *newconfig;
- nvlist_t *oldnvroot, *newnvroot;
-
- newconfig = zpool_get_config(zhp, &oldconfig);
-
- if (cb->cb_iteration == 1)
- oldconfig = NULL;
-
- verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE,
- &newnvroot) == 0);
-
- if (oldconfig == NULL)
- oldnvroot = NULL;
- else
- verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE,
- &oldnvroot) == 0);
-
- /*
- * Print out the statistics for the pool.
- */
- print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0);
-
- if (cb->cb_verbose)
- print_iostat_separator(cb);
-
- return (0);
-}
-
-int
-get_namewidth(zpool_handle_t *zhp, void *data)
-{
- iostat_cbdata_t *cb = data;
- nvlist_t *config, *nvroot;
-
- if ((config = zpool_get_config(zhp, NULL)) != NULL) {
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (!cb->cb_verbose)
- cb->cb_namewidth = strlen(zpool_get_name(zhp));
- else
- cb->cb_namewidth = max_width(zhp, nvroot, 0,
- cb->cb_namewidth, cb->cb_name_flags);
- }
-
- /*
- * The width must fall into the range [10,38]. The upper limit is the
- * maximum we can have and still fit in 80 columns.
- */
- if (cb->cb_namewidth < 10)
- cb->cb_namewidth = 10;
- if (cb->cb_namewidth > 38)
- cb->cb_namewidth = 38;
-
- return (0);
-}
-
-/*
- * Parse the input string, get the 'interval' and 'count' value if there is one.
- */
-static void
-get_interval_count(int *argcp, char **argv, unsigned long *iv,
- unsigned long *cnt)
-{
- unsigned long interval = 0, count = 0;
- int argc = *argcp, errno;
-
- /*
- * Determine if the last argument is an integer or a pool name
- */
- if (argc > 0 && isdigit(argv[argc - 1][0])) {
- char *end;
-
- errno = 0;
- interval = strtoul(argv[argc - 1], &end, 10);
-
- if (*end == '\0' && errno == 0) {
- if (interval == 0) {
- (void) fprintf(stderr, gettext("interval "
- "cannot be zero\n"));
- usage(B_FALSE);
- }
- /*
- * Ignore the last parameter
- */
- argc--;
- } else {
- /*
- * If this is not a valid number, just plow on. The
- * user will get a more informative error message later
- * on.
- */
- interval = 0;
- }
- }
-
- /*
- * If the last argument is also an integer, then we have both a count
- * and an interval.
- */
- if (argc > 0 && isdigit(argv[argc - 1][0])) {
- char *end;
-
- errno = 0;
- count = interval;
- interval = strtoul(argv[argc - 1], &end, 10);
-
- if (*end == '\0' && errno == 0) {
- if (interval == 0) {
- (void) fprintf(stderr, gettext("interval "
- "cannot be zero\n"));
- usage(B_FALSE);
- }
-
- /*
- * Ignore the last parameter
- */
- argc--;
- } else {
- interval = 0;
- }
- }
-
- *iv = interval;
- *cnt = count;
- *argcp = argc;
-}
-
-static void
-get_timestamp_arg(char c)
-{
- if (c == 'u')
- timestamp_fmt = UDATE;
- else if (c == 'd')
- timestamp_fmt = DDATE;
- else
- usage(B_FALSE);
-}
-
-/*
- * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]]
- *
- * -g Display guid for individual vdev name.
- * -L Follow links when resolving vdev path name.
- * -P Display full path for vdev name.
- * -v Display statistics for individual vdevs
- * -T Display a timestamp in date(1) or Unix format
- *
- * This command can be tricky because we want to be able to deal with pool
- * creation/destruction as well as vdev configuration changes. The bulk of this
- * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
- * on pool_list_update() to detect the addition of new pools. Configuration
- * changes are all handled within libzfs.
- */
-int
-zpool_do_iostat(int argc, char **argv)
-{
- int c;
- int ret;
- int npools;
- unsigned long interval = 0, count = 0;
- zpool_list_t *list;
- boolean_t verbose = B_FALSE;
- boolean_t guid = B_FALSE;
- boolean_t follow_links = B_FALSE;
- boolean_t full_name = B_FALSE;
- iostat_cbdata_t cb = { 0 };
-
- /* check options */
- while ((c = getopt(argc, argv, "gLPT:v")) != -1) {
- switch (c) {
- case 'g':
- guid = B_TRUE;
- break;
- case 'L':
- follow_links = B_TRUE;
- break;
- case 'P':
- full_name = B_TRUE;
- break;
- case 'T':
- get_timestamp_arg(*optarg);
- break;
- case 'v':
- verbose = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- get_interval_count(&argc, argv, &interval, &count);
-
- /*
- * Construct the list of all interesting pools.
- */
- ret = 0;
- if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
- return (1);
-
- if (pool_list_count(list) == 0 && argc != 0) {
- pool_list_free(list);
- return (1);
- }
-
- if (pool_list_count(list) == 0 && interval == 0) {
- pool_list_free(list);
- (void) fprintf(stderr, gettext("no pools available\n"));
- return (1);
- }
-
- /*
- * Enter the main iostat loop.
- */
- cb.cb_list = list;
- cb.cb_verbose = verbose;
- if (guid)
- cb.cb_name_flags |= VDEV_NAME_GUID;
- if (follow_links)
- cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
- if (full_name)
- cb.cb_name_flags |= VDEV_NAME_PATH;
- cb.cb_iteration = 0;
- cb.cb_namewidth = 0;
-
- for (;;) {
- pool_list_update(list);
-
- if ((npools = pool_list_count(list)) == 0)
- break;
-
- /*
- * Refresh all statistics. This is done as an explicit step
- * before calculating the maximum name width, so that any
- * configuration changes are properly accounted for.
- */
- (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb);
-
- /*
- * Iterate over all pools to determine the maximum width
- * for the pool / device name column across all pools.
- */
- cb.cb_namewidth = 0;
- (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
-
- if (timestamp_fmt != NODATE)
- print_timestamp(timestamp_fmt);
-
- /*
- * If it's the first time, or verbose mode, print the header.
- */
- if (++cb.cb_iteration == 1 || verbose)
- print_iostat_header(&cb);
-
- (void) pool_list_iter(list, B_FALSE, print_iostat, &cb);
-
- /*
- * If there's more than one pool, and we're not in verbose mode
- * (which prints a separator for us), then print a separator.
- */
- if (npools > 1 && !verbose)
- print_iostat_separator(&cb);
-
- if (verbose)
- (void) printf("\n");
-
- /*
- * Flush the output so that redirection to a file isn't buffered
- * indefinitely.
- */
- (void) fflush(stdout);
-
- if (interval == 0)
- break;
-
- if (count != 0 && --count == 0)
- break;
-
- (void) sleep(interval);
- }
-
- pool_list_free(list);
-
- return (ret);
-}
-
-typedef struct list_cbdata {
- boolean_t cb_verbose;
- int cb_name_flags;
- int cb_namewidth;
- boolean_t cb_scripted;
- zprop_list_t *cb_proplist;
- boolean_t cb_literal;
-} list_cbdata_t;
-
-
-/*
- * Given a list of columns to display, output appropriate headers for each one.
- */
-static void
-print_header(list_cbdata_t *cb)
-{
- zprop_list_t *pl = cb->cb_proplist;
- char headerbuf[ZPOOL_MAXPROPLEN];
- const char *header;
- boolean_t first = B_TRUE;
- boolean_t right_justify;
- size_t width = 0;
-
- for (; pl != NULL; pl = pl->pl_next) {
- width = pl->pl_width;
- if (first && cb->cb_verbose) {
- /*
- * Reset the width to accommodate the verbose listing
- * of devices.
- */
- width = cb->cb_namewidth;
- }
-
- if (!first)
- (void) printf(" ");
- else
- first = B_FALSE;
-
- right_justify = B_FALSE;
- if (pl->pl_prop != ZPROP_INVAL) {
- header = zpool_prop_column_name(pl->pl_prop);
- right_justify = zpool_prop_align_right(pl->pl_prop);
- } else {
- int i;
-
- for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
- headerbuf[i] = toupper(pl->pl_user_prop[i]);
- headerbuf[i] = '\0';
- header = headerbuf;
- }
-
- if (pl->pl_next == NULL && !right_justify)
- (void) printf("%s", header);
- else if (right_justify)
- (void) printf("%*s", width, header);
- else
- (void) printf("%-*s", width, header);
-
- }
-
- (void) printf("\n");
-}
-
-/*
- * Given a pool and a list of properties, print out all the properties according
- * to the described layout. Used by zpool_do_list().
- */
-static void
-print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
-{
- zprop_list_t *pl = cb->cb_proplist;
- boolean_t first = B_TRUE;
- char property[ZPOOL_MAXPROPLEN];
- char *propstr;
- boolean_t right_justify;
- size_t width;
-
- for (; pl != NULL; pl = pl->pl_next) {
-
- width = pl->pl_width;
- if (first && cb->cb_verbose) {
- /*
- * Reset the width to accommodate the verbose listing
- * of devices.
- */
- width = cb->cb_namewidth;
- }
-
- if (!first) {
- if (cb->cb_scripted)
- (void) printf("\t");
- else
- (void) printf(" ");
- } else {
- first = B_FALSE;
- }
-
- right_justify = B_FALSE;
- if (pl->pl_prop != ZPROP_INVAL) {
- if (zpool_get_prop(zhp, pl->pl_prop, property,
- sizeof (property), NULL, cb->cb_literal) != 0)
- propstr = "-";
- else
- propstr = property;
-
- right_justify = zpool_prop_align_right(pl->pl_prop);
- } else if ((zpool_prop_feature(pl->pl_user_prop) ||
- zpool_prop_unsupported(pl->pl_user_prop)) &&
- zpool_prop_get_feature(zhp, pl->pl_user_prop, property,
- sizeof (property)) == 0) {
- propstr = property;
- } else {
- propstr = "-";
- }
-
-
- /*
- * If this is being called in scripted mode, or if this is the
- * last column and it is left-justified, don't include a width
- * format specifier.
- */
- if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
- (void) printf("%s", propstr);
- else if (right_justify)
- (void) printf("%*s", width, propstr);
- else
- (void) printf("%-*s", width, propstr);
- }
-
- (void) printf("\n");
-}
-
-static void
-print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted,
- boolean_t valid)
-{
- char propval[64];
- boolean_t fixed;
- size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);
-
- switch (prop) {
- case ZPOOL_PROP_EXPANDSZ:
- case ZPOOL_PROP_CHECKPOINT:
- if (value == 0)
- (void) strlcpy(propval, "-", sizeof (propval));
- else
- zfs_nicenum(value, propval, sizeof (propval));
- break;
- case ZPOOL_PROP_FRAGMENTATION:
- if (value == ZFS_FRAG_INVALID) {
- (void) strlcpy(propval, "-", sizeof (propval));
- } else {
- (void) snprintf(propval, sizeof (propval), "%llu%%",
- value);
- }
- break;
- case ZPOOL_PROP_CAPACITY:
- (void) snprintf(propval, sizeof (propval),
- value < 1000 ? "%1.2f%%" : value < 10000 ?
- "%2.1f%%" : "%3.0f%%", value / 100.0);
- break;
- default:
- zfs_nicenum(value, propval, sizeof (propval));
- }
-
- if (!valid)
- (void) strlcpy(propval, "-", sizeof (propval));
-
- if (scripted)
- (void) printf("\t%s", propval);
- else
- (void) printf(" %*s", width, propval);
-}
-
-/*
- * print static default line per vdev
- */
-void
-print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
- list_cbdata_t *cb, int depth)
-{
- nvlist_t **child;
- vdev_stat_t *vs;
- uint_t c, children;
- char *vname;
- boolean_t scripted = cb->cb_scripted;
- uint64_t islog = B_FALSE;
- char *dashes = "%-*s - - - - - -\n";
-
- verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) == 0);
-
- if (name != NULL) {
- boolean_t toplevel = (vs->vs_space != 0);
- uint64_t cap;
-
- if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
- return;
-
- if (scripted)
- (void) printf("\t%s", name);
- else if (strlen(name) + depth > cb->cb_namewidth)
- (void) printf("%*s%s", depth, "", name);
- else
- (void) printf("%*s%s%*s", depth, "", name,
- (int)(cb->cb_namewidth - strlen(name) - depth), "");
-
- /*
- * Print the properties for the individual vdevs. Some
- * properties are only applicable to toplevel vdevs. The
- * 'toplevel' boolean value is passed to the print_one_column()
- * to indicate that the value is valid.
- */
- print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted,
- toplevel);
- print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted,
- toplevel);
- print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
- scripted, toplevel);
- print_one_column(ZPOOL_PROP_CHECKPOINT,
- vs->vs_checkpoint_space, scripted, toplevel);
- print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted,
- B_TRUE);
- print_one_column(ZPOOL_PROP_FRAGMENTATION,
- vs->vs_fragmentation, scripted,
- (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel));
- cap = (vs->vs_space == 0) ? 0 :
- (vs->vs_alloc * 10000 / vs->vs_space);
- print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel);
- (void) printf("\n");
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0)
- return;
-
- /* list the normal vdevs first */
- for (c = 0; c < children; c++) {
- uint64_t ishole = B_FALSE;
-
- if (nvlist_lookup_uint64(child[c],
- ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
- continue;
-
- if (nvlist_lookup_uint64(child[c],
- ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog)
- continue;
-
- if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
- continue;
-
- vname = zpool_vdev_name(g_zfs, zhp, child[c],
- cb->cb_name_flags);
- print_list_stats(zhp, vname, child[c], cb, depth + 2);
- free(vname);
- }
-
- /* list the classes: 'logs', 'dedup', and 'special' */
- for (uint_t n = 0; n < 3; n++) {
- boolean_t printed = B_FALSE;
-
- for (c = 0; c < children; c++) {
- char *bias = NULL;
- char *type = NULL;
-
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &islog) == 0 && islog) {
- bias = VDEV_ALLOC_CLASS_LOGS;
- } else {
- (void) nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
- (void) nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_TYPE, &type);
- }
- if (bias == NULL || strcmp(bias, class_name[n]) != 0)
- continue;
- if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
- continue;
-
- if (!printed) {
- /* LINTED E_SEC_PRINTF_VAR_FMT */
- (void) printf(dashes, cb->cb_namewidth,
- class_name[n]);
- printed = B_TRUE;
- }
- vname = zpool_vdev_name(g_zfs, zhp, child[c],
- cb->cb_name_flags);
- print_list_stats(zhp, vname, child[c], cb, depth + 2);
- free(vname);
- }
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
- &child, &children) == 0 && children > 0) {
- /* LINTED E_SEC_PRINTF_VAR_FMT */
- (void) printf(dashes, cb->cb_namewidth, "cache");
- for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, zhp, child[c],
- cb->cb_name_flags);
- print_list_stats(zhp, vname, child[c], cb, depth + 2);
- free(vname);
- }
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child,
- &children) == 0 && children > 0) {
- /* LINTED E_SEC_PRINTF_VAR_FMT */
- (void) printf(dashes, cb->cb_namewidth, "spare");
- for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, zhp, child[c],
- cb->cb_name_flags);
- print_list_stats(zhp, vname, child[c], cb, depth + 2);
- free(vname);
- }
- }
-}
-
-/*
- * Generic callback function to list a pool.
- */
-int
-list_callback(zpool_handle_t *zhp, void *data)
-{
- list_cbdata_t *cbp = data;
- nvlist_t *config;
- nvlist_t *nvroot;
-
- config = zpool_get_config(zhp, NULL);
-
- if (cbp->cb_verbose) {
- config = zpool_get_config(zhp, NULL);
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- }
-
- if (cbp->cb_verbose)
- cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
- cbp->cb_name_flags);
-
- print_pool(zhp, cbp);
-
- if (cbp->cb_verbose)
- print_list_stats(zhp, NULL, nvroot, cbp, 0);
-
- return (0);
-}
-
-/*
- * zpool list [-gHLP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
- *
- * -g Display guid for individual vdev name.
- * -H Scripted mode. Don't display headers, and separate properties
- * by a single tab.
- * -L Follow links when resolving vdev path name.
- * -o List of properties to display. Defaults to
- * "name,size,allocated,free,expandsize,fragmentation,capacity,"
- * "dedupratio,health,altroot"
- * -p Diplay values in parsable (exact) format.
- * -P Display full path for vdev name.
- * -T Display a timestamp in date(1) or Unix format
- *
- * List all pools in the system, whether or not they're healthy. Output space
- * statistics for each one, as well as health status summary.
- */
-int
-zpool_do_list(int argc, char **argv)
-{
- int c;
- int ret;
- list_cbdata_t cb = { 0 };
- static char default_props[] =
- "name,size,allocated,free,checkpoint,expandsize,fragmentation,"
- "capacity,dedupratio,health,altroot";
- char *props = default_props;
- unsigned long interval = 0, count = 0;
- zpool_list_t *list;
- boolean_t first = B_TRUE;
-
- /* check options */
- while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) {
- switch (c) {
- case 'g':
- cb.cb_name_flags |= VDEV_NAME_GUID;
- break;
- case 'H':
- cb.cb_scripted = B_TRUE;
- break;
- case 'L':
- cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
- break;
- case 'o':
- props = optarg;
- break;
- case 'P':
- cb.cb_name_flags |= VDEV_NAME_PATH;
- break;
- case 'p':
- cb.cb_literal = B_TRUE;
- break;
- case 'T':
- get_timestamp_arg(*optarg);
- break;
- case 'v':
- cb.cb_verbose = B_TRUE;
- cb.cb_namewidth = 8; /* 8 until precalc is avail */
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- get_interval_count(&argc, argv, &interval, &count);
-
- if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
- usage(B_FALSE);
-
- for (;;) {
- if ((list = pool_list_get(argc, argv, &cb.cb_proplist,
- &ret)) == NULL)
- return (1);
-
- if (pool_list_count(list) == 0)
- break;
-
- cb.cb_namewidth = 0;
- (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
-
- if (timestamp_fmt != NODATE)
- print_timestamp(timestamp_fmt);
-
- if (!cb.cb_scripted && (first || cb.cb_verbose)) {
- print_header(&cb);
- first = B_FALSE;
- }
- ret = pool_list_iter(list, B_TRUE, list_callback, &cb);
-
- if (interval == 0)
- break;
-
- if (count != 0 && --count == 0)
- break;
-
- pool_list_free(list);
- (void) sleep(interval);
- }
-
- if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
- (void) printf(gettext("no pools available\n"));
- ret = 0;
- }
-
- pool_list_free(list);
- zprop_free_list(cb.cb_proplist);
- return (ret);
-}
-
-static int
-zpool_do_attach_or_replace(int argc, char **argv, int replacing)
-{
- boolean_t force = B_FALSE;
- int c;
- nvlist_t *nvroot;
- char *poolname, *old_disk, *new_disk;
- zpool_handle_t *zhp;
- zpool_boot_label_t boot_type;
- uint64_t boot_size;
- int ret;
-
- /* check options */
- while ((c = getopt(argc, argv, "f")) != -1) {
- switch (c) {
- case 'f':
- force = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name argument\n"));
- usage(B_FALSE);
- }
-
- poolname = argv[0];
-
- if (argc < 2) {
- (void) fprintf(stderr,
- gettext("missing <device> specification\n"));
- usage(B_FALSE);
- }
-
- old_disk = argv[1];
-
- if (argc < 3) {
- if (!replacing) {
- (void) fprintf(stderr,
- gettext("missing <new_device> specification\n"));
- usage(B_FALSE);
- }
- new_disk = old_disk;
- argc -= 1;
- argv += 1;
- } else {
- new_disk = argv[2];
- argc -= 2;
- argv += 2;
- }
-
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
- return (1);
-
- if (zpool_get_config(zhp, NULL) == NULL) {
- (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
- poolname);
- zpool_close(zhp);
- return (1);
- }
-
- if (zpool_is_bootable(zhp))
- boot_type = ZPOOL_COPY_BOOT_LABEL;
- else
- boot_type = ZPOOL_NO_BOOT_LABEL;
-
- boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL);
- nvroot = make_root_vdev(zhp, force, B_FALSE, replacing, B_FALSE,
- boot_type, boot_size, argc, argv);
- if (nvroot == NULL) {
- zpool_close(zhp);
- return (1);
- }
-
- ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);
-
- nvlist_free(nvroot);
- zpool_close(zhp);
-
- return (ret);
-}
-
-/*
- * zpool replace [-f] <pool> <device> <new_device>
- *
- * -f Force attach, even if <new_device> appears to be in use.
- *
- * Replace <device> with <new_device>.
- */
-/* ARGSUSED */
-int
-zpool_do_replace(int argc, char **argv)
-{
- return (zpool_do_attach_or_replace(argc, argv, B_TRUE));
-}
-
-/*
- * zpool attach [-f] <pool> <device> <new_device>
- *
- * -f Force attach, even if <new_device> appears to be in use.
- *
- * Attach <new_device> to the mirror containing <device>. If <device> is not
- * part of a mirror, then <device> will be transformed into a mirror of
- * <device> and <new_device>. In either case, <new_device> will begin life
- * with a DTL of [0, now], and will immediately begin to resilver itself.
- */
-int
-zpool_do_attach(int argc, char **argv)
-{
- return (zpool_do_attach_or_replace(argc, argv, B_FALSE));
-}
-
-/*
- * zpool detach [-f] <pool> <device>
- *
- * -f Force detach of <device>, even if DTLs argue against it
- * (not supported yet)
- *
- * Detach a device from a mirror. The operation will be refused if <device>
- * is the last device in the mirror, or if the DTLs indicate that this device
- * has the only valid copy of some data.
- */
-/* ARGSUSED */
-int
-zpool_do_detach(int argc, char **argv)
-{
- int c;
- char *poolname, *path;
- zpool_handle_t *zhp;
- int ret;
-
- /* check options */
- while ((c = getopt(argc, argv, "f")) != -1) {
- switch (c) {
- case 'f':
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name argument\n"));
- usage(B_FALSE);
- }
-
- if (argc < 2) {
- (void) fprintf(stderr,
- gettext("missing <device> specification\n"));
- usage(B_FALSE);
- }
-
- poolname = argv[0];
- path = argv[1];
-
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
- return (1);
-
- ret = zpool_vdev_detach(zhp, path);
-
- zpool_close(zhp);
-
- return (ret);
-}
-
-/*
- * zpool split [-gLnP] [-o prop=val] ...
- * [-o mntopt] ...
- * [-R altroot] <pool> <newpool> [<device> ...]
- *
- * -g Display guid for individual vdev name.
- * -L Follow links when resolving vdev path name.
- * -n Do not split the pool, but display the resulting layout if
- * it were to be split.
- * -o Set property=value, or set mount options.
- * -P Display full path for vdev name.
- * -R Mount the split-off pool under an alternate root.
- *
- * Splits the named pool and gives it the new pool name. Devices to be split
- * off may be listed, provided that no more than one device is specified
- * per top-level vdev mirror. The newly split pool is left in an exported
- * state unless -R is specified.
- *
- * Restrictions: the top-level of the pool pool must only be made up of
- * mirrors; all devices in the pool must be healthy; no device may be
- * undergoing a resilvering operation.
- */
-int
-zpool_do_split(int argc, char **argv)
-{
- char *srcpool, *newpool, *propval;
- char *mntopts = NULL;
- splitflags_t flags;
- int c, ret = 0;
- zpool_handle_t *zhp;
- nvlist_t *config, *props = NULL;
-
- flags.dryrun = B_FALSE;
- flags.import = B_FALSE;
- flags.name_flags = 0;
-
- /* check options */
- while ((c = getopt(argc, argv, ":gLR:no:P")) != -1) {
- switch (c) {
- case 'g':
- flags.name_flags |= VDEV_NAME_GUID;
- break;
- case 'L':
- flags.name_flags |= VDEV_NAME_FOLLOW_LINKS;
- break;
- case 'R':
- flags.import = B_TRUE;
- if (add_prop_list(
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
- &props, B_TRUE) != 0) {
- nvlist_free(props);
- usage(B_FALSE);
- }
- break;
- case 'n':
- flags.dryrun = B_TRUE;
- break;
- case 'o':
- if ((propval = strchr(optarg, '=')) != NULL) {
- *propval = '\0';
- propval++;
- if (add_prop_list(optarg, propval,
- &props, B_TRUE) != 0) {
- nvlist_free(props);
- usage(B_FALSE);
- }
- } else {
- mntopts = optarg;
- }
- break;
- case 'P':
- flags.name_flags |= VDEV_NAME_PATH;
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- break;
- }
- }
-
- if (!flags.import && mntopts != NULL) {
- (void) fprintf(stderr, gettext("setting mntopts is only "
- "valid when importing the pool\n"));
- usage(B_FALSE);
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("Missing pool name\n"));
- usage(B_FALSE);
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("Missing new pool name\n"));
- usage(B_FALSE);
- }
-
- srcpool = argv[0];
- newpool = argv[1];
-
- argc -= 2;
- argv += 2;
-
- if ((zhp = zpool_open(g_zfs, srcpool)) == NULL)
- return (1);
-
- config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
- if (config == NULL) {
- ret = 1;
- } else {
- if (flags.dryrun) {
- (void) printf(gettext("would create '%s' with the "
- "following layout:\n\n"), newpool);
- print_vdev_tree(NULL, newpool, config, 0, "",
- flags.name_flags);
- }
- nvlist_free(config);
- }
-
- zpool_close(zhp);
-
- if (ret != 0 || flags.dryrun || !flags.import)
- return (ret);
-
- /*
- * The split was successful. Now we need to open the new
- * pool and import it.
- */
- if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL)
- return (1);
- if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
- zpool_enable_datasets(zhp, mntopts, 0) != 0) {
- ret = 1;
- (void) fprintf(stderr, gettext("Split was successful, but "
- "the datasets could not all be mounted\n"));
- (void) fprintf(stderr, gettext("Try doing '%s' with a "
- "different altroot\n"), "zpool import");
- }
- zpool_close(zhp);
-
- return (ret);
-}
-
-
-
-/*
- * zpool online <pool> <device> ...
- */
-int
-zpool_do_online(int argc, char **argv)
-{
- int c, i;
- char *poolname;
- zpool_handle_t *zhp;
- int ret = 0;
- vdev_state_t newstate;
- int flags = 0;
-
- /* check options */
- while ((c = getopt(argc, argv, "et")) != -1) {
- switch (c) {
- case 'e':
- flags |= ZFS_ONLINE_EXPAND;
- break;
- case 't':
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name\n"));
- usage(B_FALSE);
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing device name\n"));
- usage(B_FALSE);
- }
-
- poolname = argv[0];
-
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
- return (1);
-
- for (i = 1; i < argc; i++) {
- if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
- if (newstate != VDEV_STATE_HEALTHY) {
- (void) printf(gettext("warning: device '%s' "
- "onlined, but remains in faulted state\n"),
- argv[i]);
- if (newstate == VDEV_STATE_FAULTED)
- (void) printf(gettext("use 'zpool "
- "clear' to restore a faulted "
- "device\n"));
- else
- (void) printf(gettext("use 'zpool "
- "replace' to replace devices "
- "that are no longer present\n"));
- }
- } else {
- ret = 1;
- }
- }
-
- zpool_close(zhp);
-
- return (ret);
-}
-
-/*
- * zpool offline [-ft] <pool> <device> ...
- *
- * -f Force the device into the offline state, even if doing
- * so would appear to compromise pool availability.
- * (not supported yet)
- *
- * -t Only take the device off-line temporarily. The offline
- * state will not be persistent across reboots.
- */
-/* ARGSUSED */
-int
-zpool_do_offline(int argc, char **argv)
-{
- int c, i;
- char *poolname;
- zpool_handle_t *zhp;
- int ret = 0;
- boolean_t istmp = B_FALSE;
-
- /* check options */
- while ((c = getopt(argc, argv, "ft")) != -1) {
- switch (c) {
- case 't':
- istmp = B_TRUE;
- break;
- case 'f':
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name\n"));
- usage(B_FALSE);
- }
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing device name\n"));
- usage(B_FALSE);
- }
-
- poolname = argv[0];
-
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
- return (1);
-
- for (i = 1; i < argc; i++) {
- if (zpool_vdev_offline(zhp, argv[i], istmp) != 0)
- ret = 1;
- }
-
- zpool_close(zhp);
-
- return (ret);
-}
-
-/*
- * zpool clear <pool> [device]
- *
- * Clear all errors associated with a pool or a particular device.
- */
-int
-zpool_do_clear(int argc, char **argv)
-{
- int c;
- int ret = 0;
- boolean_t dryrun = B_FALSE;
- boolean_t do_rewind = B_FALSE;
- boolean_t xtreme_rewind = B_FALSE;
- uint32_t rewind_policy = ZPOOL_NO_REWIND;
- nvlist_t *policy = NULL;
- zpool_handle_t *zhp;
- char *pool, *device;
-
- /* check options */
- while ((c = getopt(argc, argv, "FnX")) != -1) {
- switch (c) {
- case 'F':
- do_rewind = B_TRUE;
- break;
- case 'n':
- dryrun = B_TRUE;
- break;
- case 'X':
- xtreme_rewind = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name\n"));
- usage(B_FALSE);
- }
-
- if (argc > 2) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- if ((dryrun || xtreme_rewind) && !do_rewind) {
- (void) fprintf(stderr,
- gettext("-n or -X only meaningful with -F\n"));
- usage(B_FALSE);
- }
- if (dryrun)
- rewind_policy = ZPOOL_TRY_REWIND;
- else if (do_rewind)
- rewind_policy = ZPOOL_DO_REWIND;
- if (xtreme_rewind)
- rewind_policy |= ZPOOL_EXTREME_REWIND;
-
- /* In future, further rewind policy choices can be passed along here */
- if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
- nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
- rewind_policy) != 0) {
- return (1);
- }
-
- pool = argv[0];
- device = argc == 2 ? argv[1] : NULL;
-
- if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
- nvlist_free(policy);
- return (1);
- }
-
- if (zpool_clear(zhp, device, policy) != 0)
- ret = 1;
-
- zpool_close(zhp);
-
- nvlist_free(policy);
-
- return (ret);
-}
-
-/*
- * zpool reguid <pool>
- */
-int
-zpool_do_reguid(int argc, char **argv)
-{
- int c;
- char *poolname;
- zpool_handle_t *zhp;
- int ret = 0;
-
- /* check options */
- while ((c = getopt(argc, argv, "")) != -1) {
- switch (c) {
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- /* get pool name and check number of arguments */
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name\n"));
- usage(B_FALSE);
- }
-
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- poolname = argv[0];
- if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
- return (1);
-
- ret = zpool_reguid(zhp);
-
- zpool_close(zhp);
- return (ret);
-}
-
-
-/*
- * zpool reopen <pool>
- *
- * Reopen the pool so that the kernel can update the sizes of all vdevs.
- */
-int
-zpool_do_reopen(int argc, char **argv)
-{
- int c;
- int ret = 0;
- zpool_handle_t *zhp;
- char *pool;
-
- /* check options */
- while ((c = getopt(argc, argv, "")) != -1) {
- switch (c) {
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc--;
- argv++;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name\n"));
- usage(B_FALSE);
- }
-
- if (argc > 1) {
- (void) fprintf(stderr, gettext("too many arguments\n"));
- usage(B_FALSE);
- }
-
- pool = argv[0];
- if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
- return (1);
-
- ret = zpool_reopen(zhp);
- zpool_close(zhp);
- return (ret);
-}
-
-typedef struct scrub_cbdata {
- int cb_type;
- int cb_argc;
- char **cb_argv;
- pool_scrub_cmd_t cb_scrub_cmd;
-} scrub_cbdata_t;
-
-static boolean_t
-zpool_has_checkpoint(zpool_handle_t *zhp)
-{
- nvlist_t *config, *nvroot;
-
- config = zpool_get_config(zhp, NULL);
-
- if (config != NULL) {
- pool_checkpoint_stat_t *pcs = NULL;
- uint_t c;
-
- nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
- (void) nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
-
- if (pcs == NULL || pcs->pcs_state == CS_NONE)
- return (B_FALSE);
-
- assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS ||
- pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-int
-scrub_callback(zpool_handle_t *zhp, void *data)
-{
- scrub_cbdata_t *cb = data;
- int err;
-
- /*
- * Ignore faulted pools.
- */
- if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- (void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
- "currently unavailable\n"), zpool_get_name(zhp));
- return (1);
- }
-
- err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
-
- if (err == 0 && zpool_has_checkpoint(zhp) &&
- cb->cb_type == POOL_SCAN_SCRUB) {
- (void) printf(gettext("warning: will not scrub state that "
- "belongs to the checkpoint of pool '%s'\n"),
- zpool_get_name(zhp));
- }
-
- return (err != 0);
-}
-
-/*
- * zpool scrub [-s | -p] <pool> ...
- *
- * -s Stop. Stops any in-progress scrub.
- * -p Pause. Pause in-progress scrub.
- */
-int
-zpool_do_scrub(int argc, char **argv)
-{
- int c;
- scrub_cbdata_t cb;
-
- cb.cb_type = POOL_SCAN_SCRUB;
- cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
-
- /* check options */
- while ((c = getopt(argc, argv, "sp")) != -1) {
- switch (c) {
- case 's':
- cb.cb_type = POOL_SCAN_NONE;
- break;
- case 'p':
- cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- if (cb.cb_type == POOL_SCAN_NONE &&
- cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
- (void) fprintf(stderr, gettext("invalid option combination: "
- "-s and -p are mutually exclusive\n"));
- usage(B_FALSE);
- }
-
- cb.cb_argc = argc;
- cb.cb_argv = argv;
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name argument\n"));
- usage(B_FALSE);
- }
-
- return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
-}
-
-static void
-zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
-{
- uint_t children = 0;
- nvlist_t **child;
- uint_t i;
-
- (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children);
-
- if (children == 0) {
- char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE);
- fnvlist_add_boolean(res, path);
- free(path);
- return;
- }
-
- for (i = 0; i < children; i++) {
- zpool_collect_leaves(zhp, child[i], res);
- }
-}
-
-/*
- * zpool initialize [-cs] <pool> [<vdev> ...]
- * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool
- * if none specified.
- *
- * -c Cancel. Ends active initializing.
- * -s Suspend. Initializing can then be restarted with no flags.
- */
-int
-zpool_do_initialize(int argc, char **argv)
-{
- int c;
- char *poolname;
- zpool_handle_t *zhp;
- nvlist_t *vdevs;
- int err = 0;
-
- struct option long_options[] = {
- {"cancel", no_argument, NULL, 'c'},
- {"suspend", no_argument, NULL, 's'},
- {0, 0, 0, 0}
- };
-
- pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO;
- while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) {
- switch (c) {
- case 'c':
- if (cmd_type != POOL_INITIALIZE_DO) {
- (void) fprintf(stderr, gettext("-c cannot be "
- "combined with other options\n"));
- usage(B_FALSE);
- }
- cmd_type = POOL_INITIALIZE_CANCEL;
- break;
- case 's':
- if (cmd_type != POOL_INITIALIZE_DO) {
- (void) fprintf(stderr, gettext("-s cannot be "
- "combined with other options\n"));
- usage(B_FALSE);
- }
- cmd_type = POOL_INITIALIZE_SUSPEND;
- break;
- case '?':
- if (optopt != 0) {
- (void) fprintf(stderr,
- gettext("invalid option '%c'\n"), optopt);
- } else {
- (void) fprintf(stderr,
- gettext("invalid option '%s'\n"),
- argv[optind - 1]);
- }
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing pool name argument\n"));
- usage(B_FALSE);
- return (-1);
- }
-
- poolname = argv[0];
- zhp = zpool_open(g_zfs, poolname);
- if (zhp == NULL)
- return (-1);
-
- vdevs = fnvlist_alloc();
- if (argc == 1) {
- /* no individual leaf vdevs specified, so add them all */
- nvlist_t *config = zpool_get_config(zhp, NULL);
- nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_VDEV_TREE);
- zpool_collect_leaves(zhp, nvroot, vdevs);
- } else {
- int i;
- for (i = 1; i < argc; i++) {
- fnvlist_add_boolean(vdevs, argv[i]);
- }
- }
-
- err = zpool_initialize(zhp, cmd_type, vdevs);
-
- fnvlist_free(vdevs);
- zpool_close(zhp);
-
- return (err);
-}
-
-/*
- * Print out detailed scrub status.
- */
-static void
-print_scan_status(pool_scan_stat_t *ps)
-{
- time_t start, end, pause;
- uint64_t total_secs_left;
- uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
- uint64_t pass_scanned, scanned, pass_issued, issued, total;
- uint_t scan_rate, issue_rate;
- double fraction_done;
- char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
- char srate_buf[7], irate_buf[7];
-
- (void) printf(gettext(" scan: "));
-
- /* If there's never been a scan, there's not much to say. */
- if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
- ps->pss_func >= POOL_SCAN_FUNCS) {
- (void) printf(gettext("none requested\n"));
- return;
- }
-
- start = ps->pss_start_time;
- end = ps->pss_end_time;
- pause = ps->pss_pass_scrub_pause;
-
- zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
-
- assert(ps->pss_func == POOL_SCAN_SCRUB ||
- ps->pss_func == POOL_SCAN_RESILVER);
-
- /* Scan is finished or canceled. */
- if (ps->pss_state == DSS_FINISHED) {
- total_secs_left = end - start;
- days_left = total_secs_left / 60 / 60 / 24;
- hours_left = (total_secs_left / 60 / 60) % 24;
- mins_left = (total_secs_left / 60) % 60;
- secs_left = (total_secs_left % 60);
-
- if (ps->pss_func == POOL_SCAN_SCRUB) {
- (void) printf(gettext("scrub repaired %s "
- "in %llu days %02llu:%02llu:%02llu "
- "with %llu errors on %s"), processed_buf,
- (u_longlong_t)days_left, (u_longlong_t)hours_left,
- (u_longlong_t)mins_left, (u_longlong_t)secs_left,
- (u_longlong_t)ps->pss_errors, ctime(&end));
- } else if (ps->pss_func == POOL_SCAN_RESILVER) {
- (void) printf(gettext("resilvered %s "
- "in %llu days %02llu:%02llu:%02llu "
- "with %llu errors on %s"), processed_buf,
- (u_longlong_t)days_left, (u_longlong_t)hours_left,
- (u_longlong_t)mins_left, (u_longlong_t)secs_left,
- (u_longlong_t)ps->pss_errors, ctime(&end));
-
- }
-
- return;
- } else if (ps->pss_state == DSS_CANCELED) {
- if (ps->pss_func == POOL_SCAN_SCRUB) {
- (void) printf(gettext("scrub canceled on %s"),
- ctime(&end));
- } else if (ps->pss_func == POOL_SCAN_RESILVER) {
- (void) printf(gettext("resilver canceled on %s"),
- ctime(&end));
- }
- return;
- }
-
- assert(ps->pss_state == DSS_SCANNING);
-
- /* Scan is in progress. Resilvers can't be paused. */
- if (ps->pss_func == POOL_SCAN_SCRUB) {
- if (pause == 0) {
- (void) printf(gettext("scrub in progress since %s"),
- ctime(&start));
- } else {
- (void) printf(gettext("scrub paused since %s"),
- ctime(&pause));
- (void) printf(gettext("\tscrub started on %s"),
- ctime(&start));
- }
- } else if (ps->pss_func == POOL_SCAN_RESILVER) {
- (void) printf(gettext("resilver in progress since %s"),
- ctime(&start));
- }
-
- scanned = ps->pss_examined;
- pass_scanned = ps->pss_pass_exam;
- issued = ps->pss_issued;
- pass_issued = ps->pss_pass_issued;
- total = ps->pss_to_examine;
-
- /* we are only done with a block once we have issued the IO for it */
- fraction_done = (double)issued / total;
-
- /* elapsed time for this pass, rounding up to 1 if it's 0 */
- elapsed = time(NULL) - ps->pss_pass_start;
- elapsed -= ps->pss_pass_scrub_spent_paused;
- elapsed = (elapsed != 0) ? elapsed : 1;
-
- scan_rate = pass_scanned / elapsed;
- issue_rate = pass_issued / elapsed;
- total_secs_left = (issue_rate != 0) ?
- ((total - issued) / issue_rate) : UINT64_MAX;
-
- days_left = total_secs_left / 60 / 60 / 24;
- hours_left = (total_secs_left / 60 / 60) % 24;
- mins_left = (total_secs_left / 60) % 60;
- secs_left = (total_secs_left % 60);
-
- /* format all of the numbers we will be reporting */
- zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf));
- zfs_nicenum(issued, issued_buf, sizeof (issued_buf));
- zfs_nicenum(total, total_buf, sizeof (total_buf));
- zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf));
- zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf));
-
- /* doo not print estimated time if we have a paused scrub */
- if (pause == 0) {
- (void) printf(gettext("\t%s scanned at %s/s, "
- "%s issued at %s/s, %s total\n"),
- scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
- } else {
- (void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
- scanned_buf, issued_buf, total_buf);
- }
-
- if (ps->pss_func == POOL_SCAN_RESILVER) {
- (void) printf(gettext("\t%s resilvered, %.2f%% done"),
- processed_buf, 100 * fraction_done);
- } else if (ps->pss_func == POOL_SCAN_SCRUB) {
- (void) printf(gettext("\t%s repaired, %.2f%% done"),
- processed_buf, 100 * fraction_done);
- }
-
- if (pause == 0) {
- if (issue_rate >= 10 * 1024 * 1024) {
- (void) printf(gettext(", %llu days "
- "%02llu:%02llu:%02llu to go\n"),
- (u_longlong_t)days_left, (u_longlong_t)hours_left,
- (u_longlong_t)mins_left, (u_longlong_t)secs_left);
- } else {
- (void) printf(gettext(", no estimated "
- "completion time\n"));
- }
- } else {
- (void) printf(gettext("\n"));
- }
-}
-
-/*
- * As we don't scrub checkpointed blocks, we want to warn the
- * user that we skipped scanning some blocks if a checkpoint exists
- * or existed at any time during the scan.
- */
-static void
-print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
-{
- if (ps == NULL || pcs == NULL)
- return;
-
- if (pcs->pcs_state == CS_NONE ||
- pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
- return;
-
- assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS);
-
- if (ps->pss_state == DSS_NONE)
- return;
-
- if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) &&
- ps->pss_end_time < pcs->pcs_start_time)
- return;
-
- if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) {
- (void) printf(gettext(" scan warning: skipped blocks "
- "that are only referenced by the checkpoint.\n"));
- } else {
- assert(ps->pss_state == DSS_SCANNING);
- (void) printf(gettext(" scan warning: skipping blocks "
- "that are only referenced by the checkpoint.\n"));
- }
-}
-
-/*
- * Print out detailed removal status.
- */
-static void
-print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
-{
- char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
- time_t start, end;
- nvlist_t *config, *nvroot;
- nvlist_t **child;
- uint_t children;
- char *vdev_name;
-
- if (prs == NULL || prs->prs_state == DSS_NONE)
- return;
-
- /*
- * Determine name of vdev.
- */
- config = zpool_get_config(zhp, NULL);
- nvroot = fnvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_VDEV_TREE);
- verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
- assert(prs->prs_removing_vdev < children);
- vdev_name = zpool_vdev_name(g_zfs, zhp,
- child[prs->prs_removing_vdev], B_TRUE);
-
- (void) printf(gettext("remove: "));
-
- start = prs->prs_start_time;
- end = prs->prs_end_time;
- zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf));
-
- /*
- * Removal is finished or canceled.
- */
- if (prs->prs_state == DSS_FINISHED) {
- uint64_t minutes_taken = (end - start) / 60;
-
- (void) printf(gettext("Removal of vdev %llu copied %s "
- "in %lluh%um, completed on %s"),
- (longlong_t)prs->prs_removing_vdev,
- copied_buf,
- (u_longlong_t)(minutes_taken / 60),
- (uint_t)(minutes_taken % 60),
- ctime((time_t *)&end));
- } else if (prs->prs_state == DSS_CANCELED) {
- (void) printf(gettext("Removal of %s canceled on %s"),
- vdev_name, ctime(&end));
- } else {
- uint64_t copied, total, elapsed, mins_left, hours_left;
- double fraction_done;
- uint_t rate;
-
- assert(prs->prs_state == DSS_SCANNING);
-
- /*
- * Removal is in progress.
- */
- (void) printf(gettext(
- "Evacuation of %s in progress since %s"),
- vdev_name, ctime(&start));
-
- copied = prs->prs_copied > 0 ? prs->prs_copied : 1;
- total = prs->prs_to_copy;
- fraction_done = (double)copied / total;
-
- /* elapsed time for this pass */
- elapsed = time(NULL) - prs->prs_start_time;
- elapsed = elapsed > 0 ? elapsed : 1;
- rate = copied / elapsed;
- rate = rate > 0 ? rate : 1;
- mins_left = ((total - copied) / rate) / 60;
- hours_left = mins_left / 60;
-
- zfs_nicenum(copied, examined_buf, sizeof (examined_buf));
- zfs_nicenum(total, total_buf, sizeof (total_buf));
- zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
-
- /*
- * do not print estimated time if hours_left is more than
- * 30 days
- */
- (void) printf(gettext(" %s copied out of %s at %s/s, "
- "%.2f%% done"),
- examined_buf, total_buf, rate_buf, 100 * fraction_done);
- if (hours_left < (30 * 24)) {
- (void) printf(gettext(", %lluh%um to go\n"),
- (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
- } else {
- (void) printf(gettext(
- ", (copy is slow, no estimated time)\n"));
- }
- }
-
- if (prs->prs_mapping_memory > 0) {
- char mem_buf[7];
- zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf));
- (void) printf(gettext(" %s memory used for "
- "removed device mappings\n"),
- mem_buf);
- }
-}
-
-static void
-print_checkpoint_status(pool_checkpoint_stat_t *pcs)
-{
- time_t start;
- char space_buf[7];
-
- if (pcs == NULL || pcs->pcs_state == CS_NONE)
- return;
-
- (void) printf(gettext("checkpoint: "));
-
- start = pcs->pcs_start_time;
- zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf));
-
- if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) {
- char *date = ctime(&start);
-
- /*
- * ctime() adds a newline at the end of the generated
- * string, thus the weird format specifier and the
- * strlen() call used to chop it off from the output.
- */
- (void) printf(gettext("created %.*s, consumes %s\n"),
- strlen(date) - 1, date, space_buf);
- return;
- }
-
- assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
-
- (void) printf(gettext("discarding, %s remaining.\n"),
- space_buf);
-}
-
-static void
-print_error_log(zpool_handle_t *zhp)
-{
- nvlist_t *nverrlist = NULL;
- nvpair_t *elem;
- char *pathname;
- size_t len = MAXPATHLEN * 2;
-
- if (zpool_get_errlog(zhp, &nverrlist) != 0) {
- (void) printf("errors: List of errors unavailable "
- "(insufficient privileges)\n");
- return;
- }
-
- (void) printf("errors: Permanent errors have been "
- "detected in the following files:\n\n");
-
- pathname = safe_malloc(len);
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) {
- nvlist_t *nv;
- uint64_t dsobj, obj;
-
- verify(nvpair_value_nvlist(elem, &nv) == 0);
- verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET,
- &dsobj) == 0);
- verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT,
- &obj) == 0);
- zpool_obj_to_path(zhp, dsobj, obj, pathname, len);
- (void) printf("%7s %s\n", "", pathname);
- }
- free(pathname);
- nvlist_free(nverrlist);
-}
-
-static void
-print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares,
- uint_t nspares)
-{
- uint_t i;
- char *name;
-
- if (nspares == 0)
- return;
-
- (void) printf(gettext("\tspares\n"));
-
- for (i = 0; i < nspares; i++) {
- name = zpool_vdev_name(g_zfs, zhp, spares[i],
- cb->cb_name_flags);
- print_status_config(zhp, cb, name, spares[i], 2, B_TRUE);
- free(name);
- }
-}
-
-static void
-print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache,
- uint_t nl2cache)
-{
- uint_t i;
- char *name;
-
- if (nl2cache == 0)
- return;
-
- (void) printf(gettext("\tcache\n"));
-
- for (i = 0; i < nl2cache; i++) {
- name = zpool_vdev_name(g_zfs, zhp, l2cache[i],
- cb->cb_name_flags);
- print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE);
- free(name);
- }
-}
-
-static void
-print_dedup_stats(nvlist_t *config)
-{
- ddt_histogram_t *ddh;
- ddt_stat_t *dds;
- ddt_object_t *ddo;
- uint_t c;
-
- /*
- * If the pool was faulted then we may not have been able to
- * obtain the config. Otherwise, if we have anything in the dedup
- * table continue processing the stats.
- */
- if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
- (uint64_t **)&ddo, &c) != 0)
- return;
-
- (void) printf("\n");
- (void) printf(gettext(" dedup: "));
- if (ddo->ddo_count == 0) {
- (void) printf(gettext("no DDT entries\n"));
- return;
- }
-
- (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n",
- (u_longlong_t)ddo->ddo_count,
- (u_longlong_t)ddo->ddo_dspace,
- (u_longlong_t)ddo->ddo_mspace);
-
- verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS,
- (uint64_t **)&dds, &c) == 0);
- verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM,
- (uint64_t **)&ddh, &c) == 0);
- zpool_dump_ddt(dds, ddh);
-}
-
-/*
- * Display a summary of pool status. Displays a summary such as:
- *
- * pool: tank
- * status: DEGRADED
- * reason: One or more devices ...
- * see: http://illumos.org/msg/ZFS-xxxx-01
- * config:
- * mirror DEGRADED
- * c1t0d0 OK
- * c2t0d0 UNAVAIL
- *
- * When given the '-v' option, we print out the complete config. If the '-e'
- * option is specified, then we print out error rate information as well.
- */
-int
-status_callback(zpool_handle_t *zhp, void *data)
-{
- status_cbdata_t *cbp = data;
- nvlist_t *config, *nvroot;
- char *msgid;
- int reason;
- const char *health;
- uint_t c;
- vdev_stat_t *vs;
-
- config = zpool_get_config(zhp, NULL);
- reason = zpool_get_status(zhp, &msgid);
-
- cbp->cb_count++;
-
- /*
- * If we were given 'zpool status -x', only report those pools with
- * problems.
- */
- if (cbp->cb_explain &&
- (reason == ZPOOL_STATUS_OK ||
- reason == ZPOOL_STATUS_VERSION_OLDER ||
- reason == ZPOOL_STATUS_NON_NATIVE_ASHIFT ||
- reason == ZPOOL_STATUS_FEAT_DISABLED)) {
- if (!cbp->cb_allpools) {
- (void) printf(gettext("pool '%s' is healthy\n"),
- zpool_get_name(zhp));
- if (cbp->cb_first)
- cbp->cb_first = B_FALSE;
- }
- return (0);
- }
-
- if (cbp->cb_first)
- cbp->cb_first = B_FALSE;
- else
- (void) printf("\n");
-
- nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) == 0);
- health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
-
- (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp));
- (void) printf(gettext(" state: %s\n"), health);
-
- switch (reason) {
- case ZPOOL_STATUS_MISSING_DEV_R:
- (void) printf(gettext("status: One or more devices could not "
- "be opened. Sufficient replicas exist for\n\tthe pool to "
- "continue functioning in a degraded state.\n"));
- (void) printf(gettext("action: Attach the missing device and "
- "online it using 'zpool online'.\n"));
- break;
-
- case ZPOOL_STATUS_MISSING_DEV_NR:
- (void) printf(gettext("status: One or more devices could not "
- "be opened. There are insufficient\n\treplicas for the "
- "pool to continue functioning.\n"));
- (void) printf(gettext("action: Attach the missing device and "
- "online it using 'zpool online'.\n"));
- break;
-
- case ZPOOL_STATUS_CORRUPT_LABEL_R:
- (void) printf(gettext("status: One or more devices could not "
- "be used because the label is missing or\n\tinvalid. "
- "Sufficient replicas exist for the pool to continue\n\t"
- "functioning in a degraded state.\n"));
- (void) printf(gettext("action: Replace the device using "
- "'zpool replace'.\n"));
- break;
-
- case ZPOOL_STATUS_CORRUPT_LABEL_NR:
- (void) printf(gettext("status: One or more devices could not "
- "be used because the label is missing \n\tor invalid. "
- "There are insufficient replicas for the pool to "
- "continue\n\tfunctioning.\n"));
- zpool_explain_recover(zpool_get_handle(zhp),
- zpool_get_name(zhp), reason, config);
- break;
-
- case ZPOOL_STATUS_FAILING_DEV:
- (void) printf(gettext("status: One or more devices has "
- "experienced an unrecoverable error. An\n\tattempt was "
- "made to correct the error. Applications are "
- "unaffected.\n"));
- (void) printf(gettext("action: Determine if the device needs "
- "to be replaced, and clear the errors\n\tusing "
- "'zpool clear' or replace the device with 'zpool "
- "replace'.\n"));
- break;
-
- case ZPOOL_STATUS_OFFLINE_DEV:
- (void) printf(gettext("status: One or more devices has "
- "been taken offline by the administrator.\n\tSufficient "
- "replicas exist for the pool to continue functioning in "
- "a\n\tdegraded state.\n"));
- (void) printf(gettext("action: Online the device using "
- "'zpool online' or replace the device with\n\t'zpool "
- "replace'.\n"));
- break;
-
- case ZPOOL_STATUS_REMOVED_DEV:
- (void) printf(gettext("status: One or more devices has "
- "been removed by the administrator.\n\tSufficient "
- "replicas exist for the pool to continue functioning in "
- "a\n\tdegraded state.\n"));
- (void) printf(gettext("action: Online the device using "
- "'zpool online' or replace the device with\n\t'zpool "
- "replace'.\n"));
- break;
-
- case ZPOOL_STATUS_RESILVERING:
- (void) printf(gettext("status: One or more devices is "
- "currently being resilvered. The pool will\n\tcontinue "
- "to function, possibly in a degraded state.\n"));
- (void) printf(gettext("action: Wait for the resilver to "
- "complete.\n"));
- break;
-
- case ZPOOL_STATUS_CORRUPT_DATA:
- (void) printf(gettext("status: One or more devices has "
- "experienced an error resulting in data\n\tcorruption. "
- "Applications may be affected.\n"));
- (void) printf(gettext("action: Restore the file in question "
- "if possible. Otherwise restore the\n\tentire pool from "
- "backup.\n"));
- break;
-
- case ZPOOL_STATUS_CORRUPT_POOL:
- (void) printf(gettext("status: The pool metadata is corrupted "
- "and the pool cannot be opened.\n"));
- zpool_explain_recover(zpool_get_handle(zhp),
- zpool_get_name(zhp), reason, config);
- break;
-
- case ZPOOL_STATUS_VERSION_OLDER:
- (void) printf(gettext("status: The pool is formatted using a "
- "legacy on-disk format. The pool can\n\tstill be used, "
- "but some features are unavailable.\n"));
- (void) printf(gettext("action: Upgrade the pool using 'zpool "
- "upgrade'. Once this is done, the\n\tpool will no longer "
- "be accessible on software that does not support feature\n"
- "\tflags.\n"));
- break;
-
- case ZPOOL_STATUS_VERSION_NEWER:
- (void) printf(gettext("status: The pool has been upgraded to a "
- "newer, incompatible on-disk version.\n\tThe pool cannot "
- "be accessed on this system.\n"));
- (void) printf(gettext("action: Access the pool from a system "
- "running more recent software, or\n\trestore the pool from "
- "backup.\n"));
- break;
-
- case ZPOOL_STATUS_FEAT_DISABLED:
- (void) printf(gettext("status: Some supported features are not "
- "enabled on the pool. The pool can\n\tstill be used, but "
- "some features are unavailable.\n"));
- (void) printf(gettext("action: Enable all features using "
- "'zpool upgrade'. Once this is done,\n\tthe pool may no "
- "longer be accessible by software that does not support\n\t"
- "the features. See zpool-features(7) for details.\n"));
- break;
-
- case ZPOOL_STATUS_UNSUP_FEAT_READ:
- (void) printf(gettext("status: The pool cannot be accessed on "
- "this system because it uses the\n\tfollowing feature(s) "
- "not supported on this system:\n"));
- zpool_print_unsup_feat(config);
- (void) printf("\n");
- (void) printf(gettext("action: Access the pool from a system "
- "that supports the required feature(s),\n\tor restore the "
- "pool from backup.\n"));
- break;
-
- case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
- (void) printf(gettext("status: The pool can only be accessed "
- "in read-only mode on this system. It\n\tcannot be "
- "accessed in read-write mode because it uses the "
- "following\n\tfeature(s) not supported on this system:\n"));
- zpool_print_unsup_feat(config);
- (void) printf("\n");
- (void) printf(gettext("action: The pool cannot be accessed in "
- "read-write mode. Import the pool with\n"
- "\t\"-o readonly=on\", access the pool from a system that "
- "supports the\n\trequired feature(s), or restore the "
- "pool from backup.\n"));
- break;
-
- case ZPOOL_STATUS_FAULTED_DEV_R:
- (void) printf(gettext("status: One or more devices are "
- "faulted in response to persistent errors.\n\tSufficient "
- "replicas exist for the pool to continue functioning "
- "in a\n\tdegraded state.\n"));
- (void) printf(gettext("action: Replace the faulted device, "
- "or use 'zpool clear' to mark the device\n\trepaired.\n"));
- break;
-
- case ZPOOL_STATUS_FAULTED_DEV_NR:
- (void) printf(gettext("status: One or more devices are "
- "faulted in response to persistent errors. There are "
- "insufficient replicas for the pool to\n\tcontinue "
- "functioning.\n"));
- (void) printf(gettext("action: Destroy and re-create the pool "
- "from a backup source. Manually marking the device\n"
- "\trepaired using 'zpool clear' may allow some data "
- "to be recovered.\n"));
- break;
-
- case ZPOOL_STATUS_IO_FAILURE_MMP:
- (void) printf(gettext("status: The pool is suspended because "
- "multihost writes failed or were delayed;\n\tanother "
- "system could import the pool undetected.\n"));
- (void) printf(gettext("action: Make sure the pool's devices "
- "are connected, then reboot your system and\n\timport the "
- "pool.\n"));
- break;
-
- case ZPOOL_STATUS_IO_FAILURE_WAIT:
- case ZPOOL_STATUS_IO_FAILURE_CONTINUE:
- (void) printf(gettext("status: One or more devices are "
- "faulted in response to IO failures.\n"));
- (void) printf(gettext("action: Make sure the affected devices "
- "are connected, then run 'zpool clear'.\n"));
- break;
-
- case ZPOOL_STATUS_BAD_LOG:
- (void) printf(gettext("status: An intent log record "
- "could not be read.\n"
- "\tWaiting for adminstrator intervention to fix the "
- "faulted pool.\n"));
- (void) printf(gettext("action: Either restore the affected "
- "device(s) and run 'zpool online',\n"
- "\tor ignore the intent log records by running "
- "'zpool clear'.\n"));
- break;
-
- case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
- (void) printf(gettext("status: One or more devices are "
- "configured to use a non-native block size.\n"
- "\tExpect reduced performance.\n"));
- (void) printf(gettext("action: Replace affected devices with "
- "devices that support the\n\tconfigured block size, or "
- "migrate data to a properly configured\n\tpool.\n"));
- break;
-
- default:
- /*
- * The remaining errors can't actually be generated, yet.
- */
- assert(reason == ZPOOL_STATUS_OK);
- }
-
- if (msgid != NULL)
- (void) printf(gettext(" see: http://illumos.org/msg/%s\n"),
- msgid);
-
- if (config != NULL) {
- uint64_t nerr;
- nvlist_t **spares, **l2cache;
- uint_t nspares, nl2cache;
- pool_checkpoint_stat_t *pcs = NULL;
- pool_scan_stat_t *ps = NULL;
- pool_removal_stat_t *prs = NULL;
-
- (void) nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
- (void) nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
- (void) nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
-
- print_scan_status(ps);
- print_checkpoint_scan_warning(ps, pcs);
- print_removal_status(zhp, prs);
- print_checkpoint_status(pcs);
-
- cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
- cbp->cb_name_flags);
- if (cbp->cb_namewidth < 10)
- cbp->cb_namewidth = 10;
-
- (void) printf(gettext("config:\n\n"));
- (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"),
- cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
- "CKSUM");
-
- print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
- B_FALSE);
-
- print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP);
- print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
- print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS);
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache) == 0)
- print_l2cache(zhp, cbp, l2cache, nl2cache);
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0)
- print_spares(zhp, cbp, spares, nspares);
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
- &nerr) == 0) {
- nvlist_t *nverrlist = NULL;
-
- /*
- * If the approximate error count is small, get a
- * precise count by fetching the entire log and
- * uniquifying the results.
- */
- if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
- zpool_get_errlog(zhp, &nverrlist) == 0) {
- nvpair_t *elem;
-
- elem = NULL;
- nerr = 0;
- while ((elem = nvlist_next_nvpair(nverrlist,
- elem)) != NULL) {
- nerr++;
- }
- }
- nvlist_free(nverrlist);
-
- (void) printf("\n");
-
- if (nerr == 0)
- (void) printf(gettext("errors: No known data "
- "errors\n"));
- else if (!cbp->cb_verbose)
- (void) printf(gettext("errors: %llu data "
- "errors, use '-v' for a list\n"),
- (u_longlong_t)nerr);
- else
- print_error_log(zhp);
- }
-
- if (cbp->cb_dedup_stats)
- print_dedup_stats(config);
- } else {
- (void) printf(gettext("config: The configuration cannot be "
- "determined.\n"));
- }
-
- return (0);
-}
-
-/*
- * zpool status [-gLPvx] [-T d|u] [pool] ... [interval [count]]
- *
- * -g Display guid for individual vdev name.
- * -L Follow links when resolving vdev path name.
- * -P Display full path for vdev name.
- * -v Display complete error logs
- * -x Display only pools with potential problems
- * -D Display dedup status (undocumented)
- * -T Display a timestamp in date(1) or Unix format
- *
- * Describes the health status of all pools or some subset.
- */
-int
-zpool_do_status(int argc, char **argv)
-{
- int c;
- int ret;
- unsigned long interval = 0, count = 0;
- status_cbdata_t cb = { 0 };
-
- /* check options */
- while ((c = getopt(argc, argv, "gLPvxDT:")) != -1) {
- switch (c) {
- case 'g':
- cb.cb_name_flags |= VDEV_NAME_GUID;
- break;
- case 'L':
- cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
- break;
- case 'P':
- cb.cb_name_flags |= VDEV_NAME_PATH;
- break;
- case 'v':
- cb.cb_verbose = B_TRUE;
- break;
- case 'x':
- cb.cb_explain = B_TRUE;
- break;
- case 'D':
- cb.cb_dedup_stats = B_TRUE;
- break;
- case 'T':
- get_timestamp_arg(*optarg);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- get_interval_count(&argc, argv, &interval, &count);
-
- if (argc == 0)
- cb.cb_allpools = B_TRUE;
-
- cb.cb_first = B_TRUE;
- cb.cb_print_status = B_TRUE;
-
- for (;;) {
- if (timestamp_fmt != NODATE)
- print_timestamp(timestamp_fmt);
-
- ret = for_each_pool(argc, argv, B_TRUE, NULL,
- status_callback, &cb);
-
- if (argc == 0 && cb.cb_count == 0)
- (void) printf(gettext("no pools available\n"));
- else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
- (void) printf(gettext("all pools are healthy\n"));
-
- if (ret != 0)
- return (ret);
-
- if (interval == 0)
- break;
-
- if (count != 0 && --count == 0)
- break;
-
- (void) sleep(interval);
- }
-
- return (0);
-}
-
-typedef struct upgrade_cbdata {
- boolean_t cb_first;
- boolean_t cb_unavail;
- char cb_poolname[ZFS_MAX_DATASET_NAME_LEN];
- int cb_argc;
- uint64_t cb_version;
- char **cb_argv;
-} upgrade_cbdata_t;
-
-#ifdef __FreeBSD__
-static int
-is_root_pool(zpool_handle_t *zhp)
-{
- static struct statfs sfs;
- static char *poolname = NULL;
- static boolean_t stated = B_FALSE;
- char *slash;
-
- if (!stated) {
- stated = B_TRUE;
- if (statfs("/", &sfs) == -1) {
- (void) fprintf(stderr,
- "Unable to stat root file system: %s.\n",
- strerror(errno));
- return (0);
- }
- if (strcmp(sfs.f_fstypename, "zfs") != 0)
- return (0);
- poolname = sfs.f_mntfromname;
- if ((slash = strchr(poolname, '/')) != NULL)
- *slash = '\0';
- }
- return (poolname != NULL && strcmp(poolname, zpool_get_name(zhp)) == 0);
-}
-
-static void
-root_pool_upgrade_check(zpool_handle_t *zhp, char *poolname, int size)
-{
-
- if (poolname[0] == '\0' && is_root_pool(zhp))
- (void) strlcpy(poolname, zpool_get_name(zhp), size);
-}
-#endif /* FreeBSD */
-
-static int
-upgrade_version(zpool_handle_t *zhp, uint64_t version)
-{
- int ret;
- nvlist_t *config;
- uint64_t oldversion;
-
- config = zpool_get_config(zhp, NULL);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &oldversion) == 0);
-
- assert(SPA_VERSION_IS_SUPPORTED(oldversion));
- assert(oldversion < version);
-
- ret = zpool_upgrade(zhp, version);
- if (ret != 0)
- return (ret);
-
- if (version >= SPA_VERSION_FEATURES) {
- (void) printf(gettext("Successfully upgraded "
- "'%s' from version %llu to feature flags.\n"),
- zpool_get_name(zhp), oldversion);
- } else {
- (void) printf(gettext("Successfully upgraded "
- "'%s' from version %llu to version %llu.\n"),
- zpool_get_name(zhp), oldversion, version);
- }
-
- return (0);
-}
-
-static int
-upgrade_enable_all(zpool_handle_t *zhp, int *countp)
-{
- int i, ret, count;
- boolean_t firstff = B_TRUE;
- nvlist_t *enabled = zpool_get_features(zhp);
-
- count = 0;
- for (i = 0; i < SPA_FEATURES; i++) {
- const char *fname = spa_feature_table[i].fi_uname;
- const char *fguid = spa_feature_table[i].fi_guid;
- if (!nvlist_exists(enabled, fguid)) {
- char *propname;
- verify(-1 != asprintf(&propname, "feature@%s", fname));
- ret = zpool_set_prop(zhp, propname,
- ZFS_FEATURE_ENABLED);
- if (ret != 0) {
- free(propname);
- return (ret);
- }
- count++;
-
- if (firstff) {
- (void) printf(gettext("Enabled the "
- "following features on '%s':\n"),
- zpool_get_name(zhp));
- firstff = B_FALSE;
- }
- (void) printf(gettext(" %s\n"), fname);
- free(propname);
- }
- }
-
- if (countp != NULL)
- *countp = count;
- return (0);
-}
-
-static int
-upgrade_cb(zpool_handle_t *zhp, void *arg)
-{
- upgrade_cbdata_t *cbp = arg;
- nvlist_t *config;
- uint64_t version;
- boolean_t printnl = B_FALSE;
- int ret;
-
- if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- (void) fprintf(stderr, gettext("cannot upgrade '%s': pool is "
- "currently unavailable.\n\n"), zpool_get_name(zhp));
- cbp->cb_unavail = B_TRUE;
- /* Allow iteration to continue. */
- return (0);
- }
-
- config = zpool_get_config(zhp, NULL);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) == 0);
-
- assert(SPA_VERSION_IS_SUPPORTED(version));
-
- if (version < cbp->cb_version) {
- cbp->cb_first = B_FALSE;
- ret = upgrade_version(zhp, cbp->cb_version);
- if (ret != 0)
- return (ret);
-#ifdef __FreeBSD__
- root_pool_upgrade_check(zhp, cbp->cb_poolname,
- sizeof(cbp->cb_poolname));
-#endif /* __FreeBSD__ */
- printnl = B_TRUE;
-
-#ifdef illumos
- /*
- * If they did "zpool upgrade -a", then we could
- * be doing ioctls to different pools. We need
- * to log this history once to each pool, and bypass
- * the normal history logging that happens in main().
- */
- (void) zpool_log_history(g_zfs, history_str);
- log_history = B_FALSE;
-#endif
- }
-
- if (cbp->cb_version >= SPA_VERSION_FEATURES) {
- int count;
- ret = upgrade_enable_all(zhp, &count);
- if (ret != 0)
- return (ret);
-
- if (count > 0) {
- cbp->cb_first = B_FALSE;
- printnl = B_TRUE;
-#ifdef __FreeBSD__
- root_pool_upgrade_check(zhp, cbp->cb_poolname,
- sizeof(cbp->cb_poolname));
-#endif /* __FreeBSD__ */
- /*
- * If they did "zpool upgrade -a", then we could
- * be doing ioctls to different pools. We need
- * to log this history once to each pool, and bypass
- * the normal history logging that happens in main().
- */
- (void) zpool_log_history(g_zfs, history_str);
- log_history = B_FALSE;
- }
- }
-
- if (printnl) {
- (void) printf(gettext("\n"));
- }
-
- return (0);
-}
-
-static int
-upgrade_list_unavail(zpool_handle_t *zhp, void *arg)
-{
- upgrade_cbdata_t *cbp = arg;
-
- if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- if (cbp->cb_first) {
- (void) fprintf(stderr, gettext("The following pools "
- "are unavailable and cannot be upgraded as this "
- "time.\n\n"));
- (void) fprintf(stderr, gettext("POOL\n"));
- (void) fprintf(stderr, gettext("------------\n"));
- cbp->cb_first = B_FALSE;
- }
- (void) printf(gettext("%s\n"), zpool_get_name(zhp));
- cbp->cb_unavail = B_TRUE;
- }
- return (0);
-}
-
-static int
-upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
-{
- upgrade_cbdata_t *cbp = arg;
- nvlist_t *config;
- uint64_t version;
-
- if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- /*
- * This will have been reported by upgrade_list_unavail so
- * just allow iteration to continue.
- */
- cbp->cb_unavail = B_TRUE;
- return (0);
- }
-
- config = zpool_get_config(zhp, NULL);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) == 0);
-
- assert(SPA_VERSION_IS_SUPPORTED(version));
-
- if (version < SPA_VERSION_FEATURES) {
- if (cbp->cb_first) {
- (void) printf(gettext("The following pools are "
- "formatted with legacy version numbers and can\n"
- "be upgraded to use feature flags. After "
- "being upgraded, these pools\nwill no "
- "longer be accessible by software that does not "
- "support feature\nflags.\n\n"));
- (void) printf(gettext("VER POOL\n"));
- (void) printf(gettext("--- ------------\n"));
- cbp->cb_first = B_FALSE;
- }
-
- (void) printf("%2llu %s\n", (u_longlong_t)version,
- zpool_get_name(zhp));
- }
-
- return (0);
-}
-
-static int
-upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
-{
- upgrade_cbdata_t *cbp = arg;
- nvlist_t *config;
- uint64_t version;
-
- if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- /*
- * This will have been reported by upgrade_list_unavail so
- * just allow iteration to continue.
- */
- cbp->cb_unavail = B_TRUE;
- return (0);
- }
-
- config = zpool_get_config(zhp, NULL);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) == 0);
-
- if (version >= SPA_VERSION_FEATURES) {
- int i;
- boolean_t poolfirst = B_TRUE;
- nvlist_t *enabled = zpool_get_features(zhp);
-
- for (i = 0; i < SPA_FEATURES; i++) {
- const char *fguid = spa_feature_table[i].fi_guid;
- const char *fname = spa_feature_table[i].fi_uname;
- if (!nvlist_exists(enabled, fguid)) {
- if (cbp->cb_first) {
- (void) printf(gettext("\nSome "
- "supported features are not "
- "enabled on the following pools. "
- "Once a\nfeature is enabled the "
- "pool may become incompatible with "
- "software\nthat does not support "
- "the feature. See "
- "zpool-features(7) for "
- "details.\n\n"));
- (void) printf(gettext("POOL "
- "FEATURE\n"));
- (void) printf(gettext("------"
- "---------\n"));
- cbp->cb_first = B_FALSE;
- }
-
- if (poolfirst) {
- (void) printf(gettext("%s\n"),
- zpool_get_name(zhp));
- poolfirst = B_FALSE;
- }
-
- (void) printf(gettext(" %s\n"), fname);
- }
- }
- }
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-upgrade_one(zpool_handle_t *zhp, void *data)
-{
- boolean_t printnl = B_FALSE;
- upgrade_cbdata_t *cbp = data;
- uint64_t cur_version;
- int ret;
-
- if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- (void) fprintf(stderr, gettext("cannot upgrade '%s': pool is "
- "is currently unavailable.\n\n"), zpool_get_name(zhp));
- cbp->cb_unavail = B_TRUE;
- return (1);
- }
-
- if (strcmp("log", zpool_get_name(zhp)) == 0) {
- (void) printf(gettext("'log' is now a reserved word\n"
- "Pool 'log' must be renamed using export and import"
- " to upgrade.\n\n"));
- return (1);
- }
-
- cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
- if (cur_version > cbp->cb_version) {
- (void) printf(gettext("Pool '%s' is already formatted "
- "using more current version '%llu'.\n\n"),
- zpool_get_name(zhp), cur_version);
- return (0);
- }
-
- if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) {
- (void) printf(gettext("Pool '%s' is already formatted "
- "using version %llu.\n\n"), zpool_get_name(zhp),
- cbp->cb_version);
- return (0);
- }
-
- if (cur_version != cbp->cb_version) {
- printnl = B_TRUE;
- ret = upgrade_version(zhp, cbp->cb_version);
- if (ret != 0)
- return (ret);
-#ifdef __FreeBSD__
- root_pool_upgrade_check(zhp, cbp->cb_poolname,
- sizeof(cbp->cb_poolname));
-#endif /* __FreeBSD__ */
- }
-
- if (cbp->cb_version >= SPA_VERSION_FEATURES) {
- int count = 0;
- ret = upgrade_enable_all(zhp, &count);
- if (ret != 0)
- return (ret);
-
- if (count != 0) {
- printnl = B_TRUE;
-#ifdef __FreeBSD__
- root_pool_upgrade_check(zhp, cbp->cb_poolname,
- sizeof(cbp->cb_poolname));
-#endif /* __FreeBSD __*/
- } else if (cur_version == SPA_VERSION) {
- (void) printf(gettext("Pool '%s' already has all "
- "supported features enabled.\n\n"),
- zpool_get_name(zhp));
- }
- }
-
- if (printnl) {
- (void) printf(gettext("\n"));
- }
-
- return (0);
-}
-
-/*
- * zpool upgrade
- * zpool upgrade -v
- * zpool upgrade [-V version] <-a | pool ...>
- *
- * With no arguments, display downrev'd ZFS pool available for upgrade.
- * Individual pools can be upgraded by specifying the pool, and '-a' will
- * upgrade all pools.
- */
-int
-zpool_do_upgrade(int argc, char **argv)
-{
- int c;
- upgrade_cbdata_t cb = { 0 };
- int ret = 0;
- boolean_t showversions = B_FALSE;
- boolean_t upgradeall = B_FALSE;
- char *end;
-
-
- /* check options */
- while ((c = getopt(argc, argv, ":avV:")) != -1) {
- switch (c) {
- case 'a':
- upgradeall = B_TRUE;
- break;
- case 'v':
- showversions = B_TRUE;
- break;
- case 'V':
- cb.cb_version = strtoll(optarg, &end, 10);
- if (*end != '\0' ||
- !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) {
- (void) fprintf(stderr,
- gettext("invalid version '%s'\n"), optarg);
- usage(B_FALSE);
- }
- break;
- case ':':
- (void) fprintf(stderr, gettext("missing argument for "
- "'%c' option\n"), optopt);
- usage(B_FALSE);
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- cb.cb_argc = argc;
- cb.cb_argv = argv;
- argc -= optind;
- argv += optind;
-
- if (cb.cb_version == 0) {
- cb.cb_version = SPA_VERSION;
- } else if (!upgradeall && argc == 0) {
- (void) fprintf(stderr, gettext("-V option is "
- "incompatible with other arguments\n"));
- usage(B_FALSE);
- }
-
- if (showversions) {
- if (upgradeall || argc != 0) {
- (void) fprintf(stderr, gettext("-v option is "
- "incompatible with other arguments\n"));
- usage(B_FALSE);
- }
- } else if (upgradeall) {
- if (argc != 0) {
- (void) fprintf(stderr, gettext("-a option should not "
- "be used along with a pool name\n"));
- usage(B_FALSE);
- }
- }
-
- (void) printf(gettext("This system supports ZFS pool feature "
- "flags.\n\n"));
- if (showversions) {
- int i;
-
- (void) printf(gettext("The following features are "
- "supported:\n\n"));
- (void) printf(gettext("FEAT DESCRIPTION\n"));
- (void) printf("----------------------------------------------"
- "---------------\n");
- for (i = 0; i < SPA_FEATURES; i++) {
- zfeature_info_t *fi = &spa_feature_table[i];
- const char *ro =
- (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
- " (read-only compatible)" : "";
-
- (void) printf("%-37s%s\n", fi->fi_uname, ro);
- (void) printf(" %s\n", fi->fi_desc);
- }
- (void) printf("\n");
-
- (void) printf(gettext("The following legacy versions are also "
- "supported:\n\n"));
- (void) printf(gettext("VER DESCRIPTION\n"));
- (void) printf("--- -----------------------------------------"
- "---------------\n");
- (void) printf(gettext(" 1 Initial ZFS version\n"));
- (void) printf(gettext(" 2 Ditto blocks "
- "(replicated metadata)\n"));
- (void) printf(gettext(" 3 Hot spares and double parity "
- "RAID-Z\n"));
- (void) printf(gettext(" 4 zpool history\n"));
- (void) printf(gettext(" 5 Compression using the gzip "
- "algorithm\n"));
- (void) printf(gettext(" 6 bootfs pool property\n"));
- (void) printf(gettext(" 7 Separate intent log devices\n"));
- (void) printf(gettext(" 8 Delegated administration\n"));
- (void) printf(gettext(" 9 refquota and refreservation "
- "properties\n"));
- (void) printf(gettext(" 10 Cache devices\n"));
- (void) printf(gettext(" 11 Improved scrub performance\n"));
- (void) printf(gettext(" 12 Snapshot properties\n"));
- (void) printf(gettext(" 13 snapused property\n"));
- (void) printf(gettext(" 14 passthrough-x aclinherit\n"));
- (void) printf(gettext(" 15 user/group space accounting\n"));
- (void) printf(gettext(" 16 stmf property support\n"));
- (void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
- (void) printf(gettext(" 18 Snapshot user holds\n"));
- (void) printf(gettext(" 19 Log device removal\n"));
- (void) printf(gettext(" 20 Compression using zle "
- "(zero-length encoding)\n"));
- (void) printf(gettext(" 21 Deduplication\n"));
- (void) printf(gettext(" 22 Received properties\n"));
- (void) printf(gettext(" 23 Slim ZIL\n"));
- (void) printf(gettext(" 24 System attributes\n"));
- (void) printf(gettext(" 25 Improved scrub stats\n"));
- (void) printf(gettext(" 26 Improved snapshot deletion "
- "performance\n"));
- (void) printf(gettext(" 27 Improved snapshot creation "
- "performance\n"));
- (void) printf(gettext(" 28 Multiple vdev replacements\n"));
- (void) printf(gettext("\nFor more information on a particular "
- "version, including supported releases,\n"));
- (void) printf(gettext("see the ZFS Administration Guide.\n\n"));
- } else if (argc == 0 && upgradeall) {
- cb.cb_first = B_TRUE;
- ret = zpool_iter(g_zfs, upgrade_cb, &cb);
- if (ret == 0 && cb.cb_first) {
- if (cb.cb_version == SPA_VERSION) {
- (void) printf(gettext("All %spools are already "
- "formatted using feature flags.\n\n"),
- cb.cb_unavail ? gettext("available ") : "");
- (void) printf(gettext("Every %sfeature flags "
- "pool already has all supported features "
- "enabled.\n"),
- cb.cb_unavail ? gettext("available ") : "");
- } else {
- (void) printf(gettext("All pools are already "
- "formatted with version %llu or higher.\n"),
- cb.cb_version);
- }
- }
- } else if (argc == 0) {
- cb.cb_first = B_TRUE;
- ret = zpool_iter(g_zfs, upgrade_list_unavail, &cb);
- assert(ret == 0);
-
- if (!cb.cb_first) {
- (void) fprintf(stderr, "\n");
- }
-
- cb.cb_first = B_TRUE;
- ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb);
- assert(ret == 0);
-
- if (cb.cb_first) {
- (void) printf(gettext("All %spools are formatted using "
- "feature flags.\n\n"), cb.cb_unavail ?
- gettext("available ") : "");
- } else {
- (void) printf(gettext("\nUse 'zpool upgrade -v' "
- "for a list of available legacy versions.\n"));
- }
-
- cb.cb_first = B_TRUE;
- ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb);
- assert(ret == 0);
-
- if (cb.cb_first) {
- (void) printf(gettext("Every %sfeature flags pool has "
- "all supported features enabled.\n"),
- cb.cb_unavail ? gettext("available ") : "");
- } else {
- (void) printf(gettext("\n"));
- }
- } else {
- ret = for_each_pool(argc, argv, B_TRUE, NULL,
- upgrade_one, &cb);
- }
-
- if (cb.cb_poolname[0] != '\0') {
- (void) printf(
- "If you boot from pool '%s', don't forget to update boot code.\n"
- "Assuming you use GPT partitioning and da0 is your boot disk\n"
- "the following command will do it:\n"
- "\n"
- "\tgpart bootcode -b /boot/pmbr -p /boot/gptzfsboot -i 1 da0\n\n",
- cb.cb_poolname);
- }
-
- return (ret);
-}
-
-typedef struct hist_cbdata {
- boolean_t first;
- boolean_t longfmt;
- boolean_t internal;
-} hist_cbdata_t;
-
-static void
-print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb)
-{
- nvlist_t **records;
- uint_t numrecords;
- int i;
-
- verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD,
- &records, &numrecords) == 0);
- for (i = 0; i < numrecords; i++) {
- nvlist_t *rec = records[i];
- char tbuf[30] = "";
-
- if (nvlist_exists(rec, ZPOOL_HIST_TIME)) {
- time_t tsec;
- struct tm t;
-
- tsec = fnvlist_lookup_uint64(records[i],
- ZPOOL_HIST_TIME);
- (void) localtime_r(&tsec, &t);
- (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
- }
-
- if (nvlist_exists(rec, ZPOOL_HIST_CMD)) {
- (void) printf("%s %s", tbuf,
- fnvlist_lookup_string(rec, ZPOOL_HIST_CMD));
- } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) {
- int ievent =
- fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT);
- if (!cb->internal)
- continue;
- if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) {
- (void) printf("%s unrecognized record:\n",
- tbuf);
- dump_nvlist(rec, 4);
- continue;
- }
- (void) printf("%s [internal %s txg:%lld] %s", tbuf,
- zfs_history_event_names[ievent],
- fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG),
- fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR));
- } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) {
- if (!cb->internal)
- continue;
- (void) printf("%s [txg:%lld] %s", tbuf,
- fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG),
- fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME));
- if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) {
- (void) printf(" %s (%llu)",
- fnvlist_lookup_string(rec,
- ZPOOL_HIST_DSNAME),
- fnvlist_lookup_uint64(rec,
- ZPOOL_HIST_DSID));
- }
- (void) printf(" %s", fnvlist_lookup_string(rec,
- ZPOOL_HIST_INT_STR));
- } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) {
- if (!cb->internal)
- continue;
- (void) printf("%s ioctl %s\n", tbuf,
- fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL));
- if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) {
- (void) printf(" input:\n");
- dump_nvlist(fnvlist_lookup_nvlist(rec,
- ZPOOL_HIST_INPUT_NVL), 8);
- }
- if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) {
- (void) printf(" output:\n");
- dump_nvlist(fnvlist_lookup_nvlist(rec,
- ZPOOL_HIST_OUTPUT_NVL), 8);
- }
- if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) {
- (void) printf(" errno: %lld\n",
- fnvlist_lookup_int64(rec,
- ZPOOL_HIST_ERRNO));
- }
- } else {
- if (!cb->internal)
- continue;
- (void) printf("%s unrecognized record:\n", tbuf);
- dump_nvlist(rec, 4);
- }
-
- if (!cb->longfmt) {
- (void) printf("\n");
- continue;
- }
- (void) printf(" [");
- if (nvlist_exists(rec, ZPOOL_HIST_WHO)) {
- uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO);
- struct passwd *pwd = getpwuid(who);
- (void) printf("user %d ", (int)who);
- if (pwd != NULL)
- (void) printf("(%s) ", pwd->pw_name);
- }
- if (nvlist_exists(rec, ZPOOL_HIST_HOST)) {
- (void) printf("on %s",
- fnvlist_lookup_string(rec, ZPOOL_HIST_HOST));
- }
- if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) {
- (void) printf(":%s",
- fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE));
- }
- (void) printf("]");
- (void) printf("\n");
- }
-}
-
-/*
- * Print out the command history for a specific pool.
- */
-static int
-get_history_one(zpool_handle_t *zhp, void *data)
-{
- nvlist_t *nvhis;
- int ret;
- hist_cbdata_t *cb = (hist_cbdata_t *)data;
- uint64_t off = 0;
- boolean_t eof = B_FALSE;
-
- cb->first = B_FALSE;
-
- (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp));
-
- while (!eof) {
- if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0)
- return (ret);
-
- print_history_records(nvhis, cb);
- nvlist_free(nvhis);
- }
- (void) printf("\n");
-
- return (ret);
-}
-
-/*
- * zpool history <pool>
- *
- * Displays the history of commands that modified pools.
- */
-int
-zpool_do_history(int argc, char **argv)
-{
- hist_cbdata_t cbdata = { 0 };
- int ret;
- int c;
-
- cbdata.first = B_TRUE;
- /* check options */
- while ((c = getopt(argc, argv, "li")) != -1) {
- switch (c) {
- case 'l':
- cbdata.longfmt = B_TRUE;
- break;
- case 'i':
- cbdata.internal = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
- argc -= optind;
- argv += optind;
-
- ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one,
- &cbdata);
-
- if (argc == 0 && cbdata.first == B_TRUE) {
- (void) printf(gettext("no pools available\n"));
- return (0);
- }
-
- return (ret);
-}
-
-static int
-get_callback(zpool_handle_t *zhp, void *data)
-{
- zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data;
- char value[MAXNAMELEN];
- zprop_source_t srctype;
- zprop_list_t *pl;
-
- for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
-
- /*
- * Skip the special fake placeholder. This will also skip
- * over the name property when 'all' is specified.
- */
- if (pl->pl_prop == ZPOOL_PROP_NAME &&
- pl == cbp->cb_proplist)
- continue;
-
- if (pl->pl_prop == ZPROP_INVAL &&
- (zpool_prop_feature(pl->pl_user_prop) ||
- zpool_prop_unsupported(pl->pl_user_prop))) {
- srctype = ZPROP_SRC_LOCAL;
-
- if (zpool_prop_get_feature(zhp, pl->pl_user_prop,
- value, sizeof (value)) == 0) {
- zprop_print_one_property(zpool_get_name(zhp),
- cbp, pl->pl_user_prop, value, srctype,
- NULL, NULL);
- }
- } else {
- if (zpool_get_prop(zhp, pl->pl_prop, value,
- sizeof (value), &srctype, cbp->cb_literal) != 0)
- continue;
-
- zprop_print_one_property(zpool_get_name(zhp), cbp,
- zpool_prop_to_name(pl->pl_prop), value, srctype,
- NULL, NULL);
- }
- }
- return (0);
-}
-
-/*
- * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ...
- *
- * -H Scripted mode. Don't display headers, and separate properties
- * by a single tab.
- * -o List of columns to display. Defaults to
- * "name,property,value,source".
- * -p Diplay values in parsable (exact) format.
- *
- * Get properties of pools in the system. Output space statistics
- * for each one as well as other attributes.
- */
-int
-zpool_do_get(int argc, char **argv)
-{
- zprop_get_cbdata_t cb = { 0 };
- zprop_list_t fake_name = { 0 };
- int ret;
- int c, i;
- char *value;
-
- cb.cb_first = B_TRUE;
-
- /*
- * Set up default columns and sources.
- */
- cb.cb_sources = ZPROP_SRC_ALL;
- cb.cb_columns[0] = GET_COL_NAME;
- cb.cb_columns[1] = GET_COL_PROPERTY;
- cb.cb_columns[2] = GET_COL_VALUE;
- cb.cb_columns[3] = GET_COL_SOURCE;
- cb.cb_type = ZFS_TYPE_POOL;
-
- /* check options */
- while ((c = getopt(argc, argv, ":Hpo:")) != -1) {
- switch (c) {
- case 'p':
- cb.cb_literal = B_TRUE;
- break;
- case 'H':
- cb.cb_scripted = B_TRUE;
- break;
- case 'o':
- bzero(&cb.cb_columns, sizeof (cb.cb_columns));
- i = 0;
- while (*optarg != '\0') {
- static char *col_subopts[] =
- { "name", "property", "value", "source",
- "all", NULL };
-
- if (i == ZFS_GET_NCOLS) {
- (void) fprintf(stderr, gettext("too "
- "many fields given to -o "
- "option\n"));
- usage(B_FALSE);
- }
-
- switch (getsubopt(&optarg, col_subopts,
- &value)) {
- case 0:
- cb.cb_columns[i++] = GET_COL_NAME;
- break;
- case 1:
- cb.cb_columns[i++] = GET_COL_PROPERTY;
- break;
- case 2:
- cb.cb_columns[i++] = GET_COL_VALUE;
- break;
- case 3:
- cb.cb_columns[i++] = GET_COL_SOURCE;
- break;
- case 4:
- if (i > 0) {
- (void) fprintf(stderr,
- gettext("\"all\" conflicts "
- "with specific fields "
- "given to -o option\n"));
- usage(B_FALSE);
- }
- cb.cb_columns[0] = GET_COL_NAME;
- cb.cb_columns[1] = GET_COL_PROPERTY;
- cb.cb_columns[2] = GET_COL_VALUE;
- cb.cb_columns[3] = GET_COL_SOURCE;
- i = ZFS_GET_NCOLS;
- break;
- default:
- (void) fprintf(stderr,
- gettext("invalid column name "
- "'%s'\n"), suboptarg);
- usage(B_FALSE);
- }
- }
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
-
- if (argc < 1) {
- (void) fprintf(stderr, gettext("missing property "
- "argument\n"));
- usage(B_FALSE);
- }
-
- if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist,
- ZFS_TYPE_POOL) != 0)
- usage(B_FALSE);
-
- argc--;
- argv++;
-
- if (cb.cb_proplist != NULL) {
- fake_name.pl_prop = ZPOOL_PROP_NAME;
- fake_name.pl_width = strlen(gettext("NAME"));
- fake_name.pl_next = cb.cb_proplist;
- cb.cb_proplist = &fake_name;
- }
-
- ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
- get_callback, &cb);
-
- if (cb.cb_proplist == &fake_name)
- zprop_free_list(fake_name.pl_next);
- else
- zprop_free_list(cb.cb_proplist);
-
- return (ret);
-}
-
-typedef struct set_cbdata {
- char *cb_propname;
- char *cb_value;
- boolean_t cb_any_successful;
-} set_cbdata_t;
-
-int
-set_callback(zpool_handle_t *zhp, void *data)
-{
- int error;
- set_cbdata_t *cb = (set_cbdata_t *)data;
-
- error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value);
-
- if (!error)
- cb->cb_any_successful = B_TRUE;
-
- return (error);
-}
-
-int
-zpool_do_set(int argc, char **argv)
-{
- set_cbdata_t cb = { 0 };
- int error;
-
- if (argc > 1 && argv[1][0] == '-') {
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- argv[1][1]);
- usage(B_FALSE);
- }
-
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing property=value "
- "argument\n"));
- usage(B_FALSE);
- }
-
- if (argc < 3) {
- (void) fprintf(stderr, gettext("missing pool name\n"));
- usage(B_FALSE);
- }
-
- if (argc > 3) {
- (void) fprintf(stderr, gettext("too many pool names\n"));
- usage(B_FALSE);
- }
-
- cb.cb_propname = argv[1];
- cb.cb_value = strchr(cb.cb_propname, '=');
- if (cb.cb_value == NULL) {
- (void) fprintf(stderr, gettext("missing value in "
- "property=value argument\n"));
- usage(B_FALSE);
- }
-
- *(cb.cb_value) = '\0';
- cb.cb_value++;
-
- error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL,
- set_callback, &cb);
-
- return (error);
-}
-
-static int
-find_command_idx(char *command, int *idx)
-{
- int i;
-
- for (i = 0; i < NCOMMAND; i++) {
- if (command_table[i].name == NULL)
- continue;
-
- if (strcmp(command, command_table[i].name) == 0) {
- *idx = i;
- return (0);
- }
- }
- return (1);
-}
-
-int
-main(int argc, char **argv)
-{
- int ret = 0;
- int i;
- char *cmdname;
-
- (void) setlocale(LC_ALL, "");
- (void) textdomain(TEXT_DOMAIN);
-
- if ((g_zfs = libzfs_init()) == NULL) {
- (void) fprintf(stderr, gettext("internal error: failed to "
- "initialize ZFS library\n"));
- return (1);
- }
-
- libzfs_print_on_error(g_zfs, B_TRUE);
-
- opterr = 0;
-
- /*
- * Make sure the user has specified some command.
- */
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing command\n"));
- usage(B_FALSE);
- }
-
- cmdname = argv[1];
-
- /*
- * Special case '-?'
- */
- if (strcmp(cmdname, "-?") == 0)
- usage(B_TRUE);
-
- zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
-
- /*
- * Run the appropriate command.
- */
- if (find_command_idx(cmdname, &i) == 0) {
- current_command = &command_table[i];
- ret = command_table[i].func(argc - 1, argv + 1);
- } else if (strchr(cmdname, '=')) {
- verify(find_command_idx("set", &i) == 0);
- current_command = &command_table[i];
- ret = command_table[i].func(argc, argv);
- } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
- /*
- * 'freeze' is a vile debugging abomination, so we treat
- * it as such.
- */
- zfs_cmd_t zc = { 0 };
- (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name));
- return (!!zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc));
- } else {
- (void) fprintf(stderr, gettext("unrecognized "
- "command '%s'\n"), cmdname);
- usage(B_FALSE);
- }
-
- if (ret == 0 && log_history)
- (void) zpool_log_history(g_zfs, history_str);
-
- libzfs_fini(g_zfs);
-
- /*
- * The 'ZFS_ABORT' environment variable causes us to dump core on exit
- * for the purposes of running ::findleaks.
- */
- if (getenv("ZFS_ABORT") != NULL) {
- (void) printf("dumping core by request\n");
- abort();
- }
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h
@@ -1,73 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef ZPOOL_UTIL_H
-#define ZPOOL_UTIL_H
-
-#include <libnvpair.h>
-#include <libzfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Basic utility functions
- */
-void *safe_malloc(size_t);
-void zpool_no_memory(void);
-uint_t num_logs(nvlist_t *nv);
-
-/*
- * Virtual device functions
- */
-
-nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
- boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type,
- uint64_t boot_size, int argc, char **argv);
-nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
- nvlist_t *props, splitflags_t flags, int argc, char **argv);
-
-/*
- * Pool list functions
- */
-int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **,
- zpool_iter_f, void *);
-
-typedef struct zpool_list zpool_list_t;
-
-zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *);
-void pool_list_update(zpool_list_t *);
-int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
-void pool_list_free(zpool_list_t *);
-int pool_list_count(zpool_list_t *);
-void pool_list_remove(zpool_list_t *, zpool_handle_t *);
-
-extern libzfs_handle_t *g_zfs;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* ZPOOL_UTIL_H */
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c
@@ -1,86 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <errno.h>
-#include <libgen.h>
-#include <libintl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-
-#include "zpool_util.h"
-
-/*
- * Utility function to guarantee malloc() success.
- */
-void *
-safe_malloc(size_t size)
-{
- void *data;
-
- if ((data = calloc(1, size)) == NULL) {
- (void) fprintf(stderr, "internal error: out of memory\n");
- exit(1);
- }
-
- return (data);
-}
-
-/*
- * Display an out of memory error message and abort the current program.
- */
-void
-zpool_no_memory(void)
-{
- assert(errno == ENOMEM);
- (void) fprintf(stderr,
- gettext("internal error: out of memory\n"));
- exit(1);
-}
-
-/*
- * Return the number of logs in supplied nvlist
- */
-uint_t
-num_logs(nvlist_t *nv)
-{
- uint_t nlogs = 0;
- uint_t c, children;
- nvlist_t **child;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0)
- return (0);
-
- for (c = 0; c < children; c++) {
- uint64_t is_log = B_FALSE;
-
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
- if (is_log)
- nlogs++;
- }
- return (nlogs);
-}
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c
@@ -1,1729 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2016, 2017 Intel Corporation.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
- */
-
-/*
- * Functions to convert between a list of vdevs and an nvlist representing the
- * configuration. Each entry in the list can be one of:
- *
- * Device vdevs
- * disk=(path=..., devid=...)
- * file=(path=...)
- *
- * Group vdevs
- * raidz[1|2]=(...)
- * mirror=(...)
- *
- * Hot spares
- *
- * While the underlying implementation supports it, group vdevs cannot contain
- * other group vdevs. All userland verification of devices is contained within
- * this file. If successful, the nvlist returned can be passed directly to the
- * kernel; we've done as much verification as possible in userland.
- *
- * Hot spares are a special case, and passed down as an array of disk vdevs, at
- * the same level as the root of the vdev tree.
- *
- * The only function exported by this file is 'make_root_vdev'. The
- * function performs several passes:
- *
- * 1. Construct the vdev specification. Performs syntax validation and
- * makes sure each device is valid.
- * 2. Check for devices in use. Using libdiskmgt, makes sure that no
- * devices are also in use. Some can be overridden using the 'force'
- * flag, others cannot.
- * 3. Check for replication errors if the 'force' flag is not specified.
- * validates that the replication level is consistent across the
- * entire pool.
- * 4. Call libzfs to label any whole disks with an EFI label.
- */
-
-#include <assert.h>
-#include <devid.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <libintl.h>
-#include <libnvpair.h>
-#include <limits.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <paths.h>
-#include <sys/stat.h>
-#include <sys/disk.h>
-#include <sys/mntent.h>
-#include <libgeom.h>
-
-#include "zpool_util.h"
-
-#define BACKUP_SLICE "s2"
-
-/*
- * For any given vdev specification, we can have multiple errors. The
- * vdev_error() function keeps track of whether we have seen an error yet, and
- * prints out a header if its the first error we've seen.
- */
-boolean_t error_seen;
-boolean_t is_force;
-
-/*PRINTFLIKE1*/
-static void
-vdev_error(const char *fmt, ...)
-{
- va_list ap;
-
- if (!error_seen) {
- (void) fprintf(stderr, gettext("invalid vdev specification\n"));
- if (!is_force)
- (void) fprintf(stderr, gettext("use '-f' to override "
- "the following errors:\n"));
- else
- (void) fprintf(stderr, gettext("the following errors "
- "must be manually repaired:\n"));
- error_seen = B_TRUE;
- }
-
- va_start(ap, fmt);
- (void) vfprintf(stderr, fmt, ap);
- va_end(ap);
-}
-
-#ifdef illumos
-static void
-libdiskmgt_error(int error)
-{
- /*
- * ENXIO/ENODEV is a valid error message if the device doesn't live in
- * /dev/dsk. Don't bother printing an error message in this case.
- */
- if (error == ENXIO || error == ENODEV)
- return;
-
- (void) fprintf(stderr, gettext("warning: device in use checking "
- "failed: %s\n"), strerror(error));
-}
-
-/*
- * Validate a device, passing the bulk of the work off to libdiskmgt.
- */
-static int
-check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
-{
- char *msg;
- int error = 0;
- dm_who_type_t who;
-
- if (force)
- who = DM_WHO_ZPOOL_FORCE;
- else if (isspare)
- who = DM_WHO_ZPOOL_SPARE;
- else
- who = DM_WHO_ZPOOL;
-
- if (dm_inuse((char *)path, &msg, who, &error) || error) {
- if (error != 0) {
- libdiskmgt_error(error);
- return (0);
- } else {
- vdev_error("%s", msg);
- free(msg);
- return (-1);
- }
- }
-
- /*
- * If we're given a whole disk, ignore overlapping slices since we're
- * about to label it anyway.
- */
- error = 0;
- if (!wholedisk && !force &&
- (dm_isoverlapping((char *)path, &msg, &error) || error)) {
- if (error == 0) {
- /* dm_isoverlapping returned -1 */
- vdev_error(gettext("%s overlaps with %s\n"), path, msg);
- free(msg);
- return (-1);
- } else if (error != ENODEV) {
- /* libdiskmgt's devcache only handles physical drives */
- libdiskmgt_error(error);
- return (0);
- }
- }
-
- return (0);
-}
-
-
-/*
- * Validate a whole disk. Iterate over all slices on the disk and make sure
- * that none is in use by calling check_slice().
- */
-static int
-check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
-{
- dm_descriptor_t *drive, *media, *slice;
- int err = 0;
- int i;
- int ret;
-
- /*
- * Get the drive associated with this disk. This should never fail,
- * because we already have an alias handle open for the device.
- */
- if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
- &err)) == NULL || *drive == NULL) {
- if (err)
- libdiskmgt_error(err);
- return (0);
- }
-
- if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
- &err)) == NULL) {
- dm_free_descriptors(drive);
- if (err)
- libdiskmgt_error(err);
- return (0);
- }
-
- dm_free_descriptors(drive);
-
- /*
- * It is possible that the user has specified a removable media drive,
- * and the media is not present.
- */
- if (*media == NULL) {
- dm_free_descriptors(media);
- vdev_error(gettext("'%s' has no media in drive\n"), name);
- return (-1);
- }
-
- if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
- &err)) == NULL) {
- dm_free_descriptors(media);
- if (err)
- libdiskmgt_error(err);
- return (0);
- }
-
- dm_free_descriptors(media);
-
- ret = 0;
-
- /*
- * Iterate over all slices and report any errors. We don't care about
- * overlapping slices because we are using the whole disk.
- */
- for (i = 0; slice[i] != NULL; i++) {
- char *name = dm_get_name(slice[i], &err);
-
- if (check_slice(name, force, B_TRUE, isspare) != 0)
- ret = -1;
-
- dm_free_name(name);
- }
-
- dm_free_descriptors(slice);
- return (ret);
-}
-
-/*
- * Validate a device.
- */
-static int
-check_device(const char *path, boolean_t force, boolean_t isspare)
-{
- dm_descriptor_t desc;
- int err;
- char *dev;
-
- /*
- * For whole disks, libdiskmgt does not include the leading dev path.
- */
- dev = strrchr(path, '/');
- assert(dev != NULL);
- dev++;
- if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
- err = check_disk(path, desc, force, isspare);
- dm_free_descriptor(desc);
- return (err);
- }
-
- return (check_slice(path, force, B_FALSE, isspare));
-}
-#endif /* illumos */
-
-/*
- * Check that a file is valid. All we can do in this case is check that it's
- * not in use by another pool, and not in use by swap.
- */
-static int
-check_file(const char *file, boolean_t force, boolean_t isspare)
-{
- char *name;
- int fd;
- int ret = 0;
- int err;
- pool_state_t state;
- boolean_t inuse;
-
-#ifdef illumos
- if (dm_inuse_swap(file, &err)) {
- if (err)
- libdiskmgt_error(err);
- else
- vdev_error(gettext("%s is currently used by swap. "
- "Please see swap(1M).\n"), file);
- return (-1);
- }
-#endif
-
- if ((fd = open(file, O_RDONLY)) < 0)
- return (0);
-
- if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
- const char *desc;
-
- switch (state) {
- case POOL_STATE_ACTIVE:
- desc = gettext("active");
- break;
-
- case POOL_STATE_EXPORTED:
- desc = gettext("exported");
- break;
-
- case POOL_STATE_POTENTIALLY_ACTIVE:
- desc = gettext("potentially active");
- break;
-
- default:
- desc = gettext("unknown");
- break;
- }
-
- /*
- * Allow hot spares to be shared between pools.
- */
- if (state == POOL_STATE_SPARE && isspare)
- return (0);
-
- if (state == POOL_STATE_ACTIVE ||
- state == POOL_STATE_SPARE || !force) {
- switch (state) {
- case POOL_STATE_SPARE:
- vdev_error(gettext("%s is reserved as a hot "
- "spare for pool %s\n"), file, name);
- break;
- default:
- vdev_error(gettext("%s is part of %s pool "
- "'%s'\n"), file, desc, name);
- break;
- }
- ret = -1;
- }
-
- free(name);
- }
-
- (void) close(fd);
- return (ret);
-}
-
-static int
-check_device(const char *name, boolean_t force, boolean_t isspare)
-{
- char path[MAXPATHLEN];
-
- if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0)
- snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name);
- else
- strlcpy(path, name, sizeof(path));
-
- return (check_file(path, force, isspare));
-}
-
-/*
- * By "whole disk" we mean an entire physical disk (something we can
- * label, toggle the write cache on, etc.) as opposed to the full
- * capacity of a pseudo-device such as lofi or did. We act as if we
- * are labeling the disk, which should be a pretty good test of whether
- * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
- * it isn't.
- */
-static boolean_t
-is_whole_disk(const char *arg)
-{
-#ifdef illumos
- struct dk_gpt *label;
- int fd;
- char path[MAXPATHLEN];
-
- (void) snprintf(path, sizeof (path), "%s%s%s",
- ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
- if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
- return (B_FALSE);
- if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
- (void) close(fd);
- return (B_FALSE);
- }
- efi_free(label);
- (void) close(fd);
- return (B_TRUE);
-#else
- int fd;
-
- fd = g_open(arg, 0);
- if (fd >= 0) {
- g_close(fd);
- return (B_TRUE);
- }
- return (B_FALSE);
-#endif
-}
-
-/*
- * Create a leaf vdev. Determine if this is a file or a device. If it's a
- * device, fill in the device id to make a complete nvlist. Valid forms for a
- * leaf vdev are:
- *
- * /dev/dsk/xxx Complete disk path
- * /xxx Full path to file
- * xxx Shorthand for /dev/dsk/xxx
- */
-static nvlist_t *
-make_leaf_vdev(const char *arg, uint64_t is_log)
-{
- char path[MAXPATHLEN];
- struct stat64 statbuf;
- nvlist_t *vdev = NULL;
- char *type = NULL;
- boolean_t wholedisk = B_FALSE;
-
- /*
- * Determine what type of vdev this is, and put the full path into
- * 'path'. We detect whether this is a device of file afterwards by
- * checking the st_mode of the file.
- */
- if (arg[0] == '/') {
- /*
- * Complete device or file path. Exact type is determined by
- * examining the file descriptor afterwards.
- */
- wholedisk = is_whole_disk(arg);
- if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
- (void) fprintf(stderr,
- gettext("cannot open '%s': %s\n"),
- arg, strerror(errno));
- return (NULL);
- }
-
- (void) strlcpy(path, arg, sizeof (path));
- } else {
- /*
- * This may be a short path for a device, or it could be total
- * gibberish. Check to see if it's a known device in
- * /dev/dsk/. As part of this check, see if we've been given a
- * an entire disk (minus the slice number).
- */
- if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
- strlcpy(path, arg, sizeof (path));
- else
- snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg);
- wholedisk = is_whole_disk(path);
- if (!wholedisk && (stat64(path, &statbuf) != 0)) {
- /*
- * If we got ENOENT, then the user gave us
- * gibberish, so try to direct them with a
- * reasonable error message. Otherwise,
- * regurgitate strerror() since it's the best we
- * can do.
- */
- if (errno == ENOENT) {
- (void) fprintf(stderr,
- gettext("cannot open '%s': no such "
- "GEOM provider\n"), arg);
- (void) fprintf(stderr,
- gettext("must be a full path or "
- "shorthand device name\n"));
- return (NULL);
- } else {
- (void) fprintf(stderr,
- gettext("cannot open '%s': %s\n"),
- path, strerror(errno));
- return (NULL);
- }
- }
- }
-
-#ifdef __FreeBSD__
- if (S_ISCHR(statbuf.st_mode)) {
- statbuf.st_mode &= ~S_IFCHR;
- statbuf.st_mode |= S_IFBLK;
- wholedisk = B_FALSE;
- }
-#endif
-
- /*
- * Determine whether this is a device or a file.
- */
- if (wholedisk || S_ISBLK(statbuf.st_mode)) {
- type = VDEV_TYPE_DISK;
- } else if (S_ISREG(statbuf.st_mode)) {
- type = VDEV_TYPE_FILE;
- } else {
- (void) fprintf(stderr, gettext("cannot use '%s': must be a "
- "GEOM provider or regular file\n"), path);
- return (NULL);
- }
-
- /*
- * Finally, we have the complete device or file, and we know that it is
- * acceptable to use. Construct the nvlist to describe this vdev. All
- * vdevs have a 'path' element, and devices also have a 'devid' element.
- */
- verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
- verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
- verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
- verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
- if (is_log)
- verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
- VDEV_ALLOC_BIAS_LOG) == 0);
- if (strcmp(type, VDEV_TYPE_DISK) == 0)
- verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
- (uint64_t)wholedisk) == 0);
-
-#ifdef have_devid
- /*
- * For a whole disk, defer getting its devid until after labeling it.
- */
- if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
- /*
- * Get the devid for the device.
- */
- int fd;
- ddi_devid_t devid;
- char *minor = NULL, *devid_str = NULL;
-
- if ((fd = open(path, O_RDONLY)) < 0) {
- (void) fprintf(stderr, gettext("cannot open '%s': "
- "%s\n"), path, strerror(errno));
- nvlist_free(vdev);
- return (NULL);
- }
-
- if (devid_get(fd, &devid) == 0) {
- if (devid_get_minor_name(fd, &minor) == 0 &&
- (devid_str = devid_str_encode(devid, minor)) !=
- NULL) {
- verify(nvlist_add_string(vdev,
- ZPOOL_CONFIG_DEVID, devid_str) == 0);
- }
- if (devid_str != NULL)
- devid_str_free(devid_str);
- if (minor != NULL)
- devid_str_free(minor);
- devid_free(devid);
- }
-
- (void) close(fd);
- }
-#endif
-
- return (vdev);
-}
-
-/*
- * Go through and verify the replication level of the pool is consistent.
- * Performs the following checks:
- *
- * For the new spec, verifies that devices in mirrors and raidz are the
- * same size.
- *
- * If the current configuration already has inconsistent replication
- * levels, ignore any other potential problems in the new spec.
- *
- * Otherwise, make sure that the current spec (if there is one) and the new
- * spec have consistent replication levels.
- *
- * If there is no current spec (create), make sure new spec has at least
- * one general purpose vdev.
- */
-typedef struct replication_level {
- char *zprl_type;
- uint64_t zprl_children;
- uint64_t zprl_parity;
-} replication_level_t;
-
-#define ZPOOL_FUZZ (16 * 1024 * 1024)
-
-static boolean_t
-is_raidz_mirror(replication_level_t *a, replication_level_t *b,
- replication_level_t **raidz, replication_level_t **mirror)
-{
- if (strcmp(a->zprl_type, "raidz") == 0 &&
- strcmp(b->zprl_type, "mirror") == 0) {
- *raidz = a;
- *mirror = b;
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Given a list of toplevel vdevs, return the current replication level. If
- * the config is inconsistent, then NULL is returned. If 'fatal' is set, then
- * an error message will be displayed for each self-inconsistent vdev.
- */
-static replication_level_t *
-get_replication(nvlist_t *nvroot, boolean_t fatal)
-{
- nvlist_t **top;
- uint_t t, toplevels;
- nvlist_t **child;
- uint_t c, children;
- nvlist_t *nv;
- char *type;
- replication_level_t lastrep = {0};
- replication_level_t rep;
- replication_level_t *ret;
- replication_level_t *raidz, *mirror;
- boolean_t dontreport;
-
- ret = safe_malloc(sizeof (replication_level_t));
-
- verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &top, &toplevels) == 0);
-
- for (t = 0; t < toplevels; t++) {
- uint64_t is_log = B_FALSE;
-
- nv = top[t];
-
- /*
- * For separate logs we ignore the top level vdev replication
- * constraints.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
- if (is_log)
- continue;
-
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
- &type) == 0);
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
- /*
- * This is a 'file' or 'disk' vdev.
- */
- rep.zprl_type = type;
- rep.zprl_children = 1;
- rep.zprl_parity = 0;
- } else {
- uint64_t vdev_size;
-
- /*
- * This is a mirror or RAID-Z vdev. Go through and make
- * sure the contents are all the same (files vs. disks),
- * keeping track of the number of elements in the
- * process.
- *
- * We also check that the size of each vdev (if it can
- * be determined) is the same.
- */
- rep.zprl_type = type;
- rep.zprl_children = 0;
-
- if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
- verify(nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_NPARITY,
- &rep.zprl_parity) == 0);
- assert(rep.zprl_parity != 0);
- } else {
- rep.zprl_parity = 0;
- }
-
- /*
- * The 'dontreport' variable indicates that we've
- * already reported an error for this spec, so don't
- * bother doing it again.
- */
- type = NULL;
- dontreport = 0;
- vdev_size = -1ULL;
- for (c = 0; c < children; c++) {
- boolean_t is_replacing, is_spare;
- nvlist_t *cnv = child[c];
- char *path;
- struct stat64 statbuf;
- uint64_t size = -1ULL;
- char *childtype;
- int fd, err;
-
- rep.zprl_children++;
-
- verify(nvlist_lookup_string(cnv,
- ZPOOL_CONFIG_TYPE, &childtype) == 0);
-
- /*
- * If this is a replacing or spare vdev, then
- * get the real first child of the vdev.
- */
- is_replacing = strcmp(childtype,
- VDEV_TYPE_REPLACING) == 0;
- is_spare = strcmp(childtype,
- VDEV_TYPE_SPARE) == 0;
- if (is_replacing || is_spare) {
- nvlist_t **rchild;
- uint_t rchildren;
-
- verify(nvlist_lookup_nvlist_array(cnv,
- ZPOOL_CONFIG_CHILDREN, &rchild,
- &rchildren) == 0);
- assert((is_replacing && rchildren == 2)
- || (is_spare && rchildren >= 2));
- cnv = rchild[0];
-
- verify(nvlist_lookup_string(cnv,
- ZPOOL_CONFIG_TYPE,
- &childtype) == 0);
- if (strcmp(childtype,
- VDEV_TYPE_SPARE) == 0) {
- /* We have a replacing vdev with
- * a spare child. Get the first
- * real child of the spare
- */
- verify(
- nvlist_lookup_nvlist_array(
- cnv,
- ZPOOL_CONFIG_CHILDREN,
- &rchild,
- &rchildren) == 0);
- assert(rchildren >= 2);
- cnv = rchild[0];
- }
- }
-
- verify(nvlist_lookup_string(cnv,
- ZPOOL_CONFIG_PATH, &path) == 0);
-
- /*
- * If we have a raidz/mirror that combines disks
- * with files, report it as an error.
- */
- if (!dontreport && type != NULL &&
- strcmp(type, childtype) != 0) {
- if (ret != NULL)
- free(ret);
- ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "mismatched replication "
- "level: %s contains both "
- "files and devices\n"),
- rep.zprl_type);
- else
- return (NULL);
- dontreport = B_TRUE;
- }
-
- /*
- * According to stat(2), the value of 'st_size'
- * is undefined for block devices and character
- * devices. But there is no effective way to
- * determine the real size in userland.
- *
- * Instead, we'll take advantage of an
- * implementation detail of spec_size(). If the
- * device is currently open, then we (should)
- * return a valid size.
- *
- * If we still don't get a valid size (indicated
- * by a size of 0 or MAXOFFSET_T), then ignore
- * this device altogether.
- */
- if ((fd = open(path, O_RDONLY)) >= 0) {
- err = fstat64(fd, &statbuf);
- (void) close(fd);
- } else {
- err = stat64(path, &statbuf);
- }
-
- if (err != 0 ||
- statbuf.st_size == 0 ||
- statbuf.st_size == MAXOFFSET_T)
- continue;
-
- size = statbuf.st_size;
-
- /*
- * Also make sure that devices and
- * slices have a consistent size. If
- * they differ by a significant amount
- * (~16MB) then report an error.
- */
- if (!dontreport &&
- (vdev_size != -1ULL &&
- (labs(size - vdev_size) >
- ZPOOL_FUZZ))) {
- if (ret != NULL)
- free(ret);
- ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "%s contains devices of "
- "different sizes\n"),
- rep.zprl_type);
- else
- return (NULL);
- dontreport = B_TRUE;
- }
-
- type = childtype;
- vdev_size = size;
- }
- }
-
- /*
- * At this point, we have the replication of the last toplevel
- * vdev in 'rep'. Compare it to 'lastrep' to see if it is
- * different.
- */
- if (lastrep.zprl_type != NULL) {
- if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
- is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
- /*
- * Accepted raidz and mirror when they can
- * handle the same number of disk failures.
- */
- if (raidz->zprl_parity !=
- mirror->zprl_children - 1) {
- if (ret != NULL)
- free(ret);
- ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "mismatched replication "
- "level: "
- "%s and %s vdevs with "
- "different redundancy, "
- "%llu vs. %llu (%llu-way) "
- "are present\n"),
- raidz->zprl_type,
- mirror->zprl_type,
- raidz->zprl_parity,
- mirror->zprl_children - 1,
- mirror->zprl_children);
- else
- return (NULL);
- }
- } else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
- 0) {
- if (ret != NULL)
- free(ret);
- ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "mismatched replication level: "
- "both %s and %s vdevs are "
- "present\n"),
- lastrep.zprl_type, rep.zprl_type);
- else
- return (NULL);
- } else if (lastrep.zprl_parity != rep.zprl_parity) {
- if (ret)
- free(ret);
- ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "mismatched replication level: "
- "both %llu and %llu device parity "
- "%s vdevs are present\n"),
- lastrep.zprl_parity,
- rep.zprl_parity,
- rep.zprl_type);
- else
- return (NULL);
- } else if (lastrep.zprl_children != rep.zprl_children) {
- if (ret)
- free(ret);
- ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "mismatched replication level: "
- "both %llu-way and %llu-way %s "
- "vdevs are present\n"),
- lastrep.zprl_children,
- rep.zprl_children,
- rep.zprl_type);
- else
- return (NULL);
- }
- }
- lastrep = rep;
- }
-
- if (ret != NULL)
- *ret = rep;
-
- return (ret);
-}
-
-/*
- * Check the replication level of the vdev spec against the current pool. Calls
- * get_replication() to make sure the new spec is self-consistent. If the pool
- * has a consistent replication level, then we ignore any errors. Otherwise,
- * report any difference between the two.
- */
-static int
-check_replication(nvlist_t *config, nvlist_t *newroot)
-{
- nvlist_t **child;
- uint_t children;
- replication_level_t *current = NULL, *new;
- replication_level_t *raidz, *mirror;
- int ret;
-
- /*
- * If we have a current pool configuration, check to see if it's
- * self-consistent. If not, simply return success.
- */
- if (config != NULL) {
- nvlist_t *nvroot;
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if ((current = get_replication(nvroot, B_FALSE)) == NULL)
- return (0);
- }
- /*
- * for spares there may be no children, and therefore no
- * replication level to check
- */
- if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) || (children == 0)) {
- free(current);
- return (0);
- }
-
- /*
- * If all we have is logs then there's no replication level to check.
- */
- if (num_logs(newroot) == children) {
- free(current);
- return (0);
- }
-
- /*
- * Get the replication level of the new vdev spec, reporting any
- * inconsistencies found.
- */
- if ((new = get_replication(newroot, B_TRUE)) == NULL) {
- free(current);
- return (-1);
- }
-
- /*
- * Check to see if the new vdev spec matches the replication level of
- * the current pool.
- */
- ret = 0;
- if (current != NULL) {
- if (is_raidz_mirror(current, new, &raidz, &mirror) ||
- is_raidz_mirror(new, current, &raidz, &mirror)) {
- if (raidz->zprl_parity != mirror->zprl_children - 1) {
- vdev_error(gettext(
- "mismatched replication level: pool and "
- "new vdev with different redundancy, %s "
- "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
- raidz->zprl_type,
- mirror->zprl_type,
- raidz->zprl_parity,
- mirror->zprl_children - 1,
- mirror->zprl_children);
- ret = -1;
- }
- } else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
- vdev_error(gettext(
- "mismatched replication level: pool uses %s "
- "and new vdev is %s\n"),
- current->zprl_type, new->zprl_type);
- ret = -1;
- } else if (current->zprl_parity != new->zprl_parity) {
- vdev_error(gettext(
- "mismatched replication level: pool uses %llu "
- "device parity and new vdev uses %llu\n"),
- current->zprl_parity, new->zprl_parity);
- ret = -1;
- } else if (current->zprl_children != new->zprl_children) {
- vdev_error(gettext(
- "mismatched replication level: pool uses %llu-way "
- "%s and new vdev uses %llu-way %s\n"),
- current->zprl_children, current->zprl_type,
- new->zprl_children, new->zprl_type);
- ret = -1;
- }
- }
-
- free(new);
- if (current != NULL)
- free(current);
-
- return (ret);
-}
-
-#ifdef illumos
-/*
- * Go through and find any whole disks in the vdev specification, labelling them
- * as appropriate. When constructing the vdev spec, we were unable to open this
- * device in order to provide a devid. Now that we have labelled the disk and
- * know the pool slice is valid, we can construct the devid now.
- *
- * If the disk was already labeled with an EFI label, we will have gotten the
- * devid already (because we were able to open the whole disk). Otherwise, we
- * need to get the devid after we label the disk.
- */
-static int
-make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type,
- uint64_t boot_size)
-{
- nvlist_t **child;
- uint_t c, children;
- char *type, *path, *diskname;
- char buf[MAXPATHLEN];
- uint64_t wholedisk;
- int fd;
- int ret;
- int slice;
- ddi_devid_t devid;
- char *minor = NULL, *devid_str = NULL;
-
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
-
- if (strcmp(type, VDEV_TYPE_DISK) != 0)
- return (0);
-
- /*
- * We have a disk device. Get the path to the device
- * and see if it's a whole disk by appending the backup
- * slice and stat()ing the device.
- */
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
-
- diskname = strrchr(path, '/');
- assert(diskname != NULL);
- diskname++;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &wholedisk) != 0 || !wholedisk) {
- /*
- * This is not whole disk, return error if
- * boot partition creation was requested
- */
- if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
- (void) fprintf(stderr,
- gettext("creating boot partition is only "
- "supported on whole disk vdevs: %s\n"),
- diskname);
- return (-1);
- }
- return (0);
- }
-
- ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type,
- boot_size, &slice);
- if (ret == -1)
- return (ret);
-
- /*
- * Fill in the devid, now that we've labeled the disk.
- */
- (void) snprintf(buf, sizeof (buf), "%ss%d", path, slice);
- if ((fd = open(buf, O_RDONLY)) < 0) {
- (void) fprintf(stderr,
- gettext("cannot open '%s': %s\n"),
- buf, strerror(errno));
- return (-1);
- }
-
- if (devid_get(fd, &devid) == 0) {
- if (devid_get_minor_name(fd, &minor) == 0 &&
- (devid_str = devid_str_encode(devid, minor)) !=
- NULL) {
- verify(nvlist_add_string(nv,
- ZPOOL_CONFIG_DEVID, devid_str) == 0);
- }
- if (devid_str != NULL)
- devid_str_free(devid_str);
- if (minor != NULL)
- devid_str_free(minor);
- devid_free(devid);
- }
-
- /*
- * Update the path to refer to the pool slice. The presence of
- * the 'whole_disk' field indicates to the CLI that we should
- * chop off the slice number when displaying the device in
- * future output.
- */
- verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
-
- (void) close(fd);
-
- return (0);
- }
-
- /* illumos kernel does not support booting from multi-vdev pools. */
- if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) {
- if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) {
- (void) fprintf(stderr, gettext("boot pool "
- "can not have more than one vdev\n"));
- return (-1);
- }
- }
-
- for (c = 0; c < children; c++) {
- ret = make_disks(zhp, child[c], boot_type, boot_size);
- if (ret != 0)
- return (ret);
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
- &child, &children) == 0)
- for (c = 0; c < children; c++) {
- ret = make_disks(zhp, child[c], boot_type, boot_size);
- if (ret != 0)
- return (ret);
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
- &child, &children) == 0)
- for (c = 0; c < children; c++) {
- ret = make_disks(zhp, child[c], boot_type, boot_size);
- if (ret != 0)
- return (ret);
- }
-
- return (0);
-}
-#endif /* illumos */
-
-/*
- * Determine if the given path is a hot spare within the given configuration.
- */
-static boolean_t
-is_spare(nvlist_t *config, const char *path)
-{
- int fd;
- pool_state_t state;
- char *name = NULL;
- nvlist_t *label;
- uint64_t guid, spareguid;
- nvlist_t *nvroot;
- nvlist_t **spares;
- uint_t i, nspares;
- boolean_t inuse;
-
- if ((fd = open(path, O_RDONLY)) < 0)
- return (B_FALSE);
-
- if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
- !inuse ||
- state != POOL_STATE_SPARE ||
- zpool_read_label(fd, &label) != 0) {
- free(name);
- (void) close(fd);
- return (B_FALSE);
- }
- free(name);
- (void) close(fd);
-
- verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
- nvlist_free(label);
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- verify(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &spareguid) == 0);
- if (spareguid == guid)
- return (B_TRUE);
- }
- }
-
- return (B_FALSE);
-}
-
-/*
- * Go through and find any devices that are in use. We rely on libdiskmgt for
- * the majority of this task.
- */
-static boolean_t
-is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
- boolean_t replacing, boolean_t isspare)
-{
- nvlist_t **child;
- uint_t c, children;
- char *type, *path;
- int ret = 0;
- char buf[MAXPATHLEN];
- uint64_t wholedisk;
- boolean_t anyinuse = B_FALSE;
-
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
-
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
-
- /*
- * As a generic check, we look to see if this is a replace of a
- * hot spare within the same pool. If so, we allow it
- * regardless of what libdiskmgt or zpool_in_use() says.
- */
- if (replacing) {
-#ifdef illumos
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &wholedisk) == 0 && wholedisk)
- (void) snprintf(buf, sizeof (buf), "%ss0",
- path);
- else
-#endif
- (void) strlcpy(buf, path, sizeof (buf));
-
- if (is_spare(config, buf))
- return (B_FALSE);
- }
-
- if (strcmp(type, VDEV_TYPE_DISK) == 0)
- ret = check_device(path, force, isspare);
- else if (strcmp(type, VDEV_TYPE_FILE) == 0)
- ret = check_file(path, force, isspare);
-
- return (ret != 0);
- }
-
- for (c = 0; c < children; c++)
- if (is_device_in_use(config, child[c], force, replacing,
- B_FALSE))
- anyinuse = B_TRUE;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
- &child, &children) == 0)
- for (c = 0; c < children; c++)
- if (is_device_in_use(config, child[c], force, replacing,
- B_TRUE))
- anyinuse = B_TRUE;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
- &child, &children) == 0)
- for (c = 0; c < children; c++)
- if (is_device_in_use(config, child[c], force, replacing,
- B_FALSE))
- anyinuse = B_TRUE;
-
- return (anyinuse);
-}
-
-static const char *
-is_grouping(const char *type, int *mindev, int *maxdev)
-{
- if (strncmp(type, "raidz", 5) == 0) {
- const char *p = type + 5;
- char *end;
- long nparity;
-
- if (*p == '\0') {
- nparity = 1;
- } else if (*p == '0') {
- return (NULL); /* no zero prefixes allowed */
- } else {
- errno = 0;
- nparity = strtol(p, &end, 10);
- if (errno != 0 || nparity < 1 || nparity >= 255 ||
- *end != '\0')
- return (NULL);
- }
-
- if (mindev != NULL)
- *mindev = nparity + 1;
- if (maxdev != NULL)
- *maxdev = 255;
- return (VDEV_TYPE_RAIDZ);
- }
-
- if (maxdev != NULL)
- *maxdev = INT_MAX;
-
- if (strcmp(type, "mirror") == 0) {
- if (mindev != NULL)
- *mindev = 2;
- return (VDEV_TYPE_MIRROR);
- }
-
- if (strcmp(type, "spare") == 0) {
- if (mindev != NULL)
- *mindev = 1;
- return (VDEV_TYPE_SPARE);
- }
-
- if (strcmp(type, "log") == 0) {
- if (mindev != NULL)
- *mindev = 1;
- return (VDEV_TYPE_LOG);
- }
-
- if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
- strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
- if (mindev != NULL)
- *mindev = 1;
- return (type);
- }
-
- if (strcmp(type, "cache") == 0) {
- if (mindev != NULL)
- *mindev = 1;
- return (VDEV_TYPE_L2CACHE);
- }
-
- return (NULL);
-}
-
-/*
- * Construct a syntactically valid vdev specification,
- * and ensure that all devices and files exist and can be opened.
- * Note: we don't bother freeing anything in the error paths
- * because the program is just going to exit anyway.
- */
-nvlist_t *
-construct_spec(int argc, char **argv)
-{
- nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
- int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
- const char *type;
- uint64_t is_log, is_special, is_dedup;
- boolean_t seen_logs;
-
- top = NULL;
- toplevels = 0;
- spares = NULL;
- l2cache = NULL;
- nspares = 0;
- nlogs = 0;
- nl2cache = 0;
- is_log = is_special = is_dedup = B_FALSE;
- seen_logs = B_FALSE;
-
- while (argc > 0) {
- nv = NULL;
-
- /*
- * If it's a mirror or raidz, the subsequent arguments are
- * its leaves -- until we encounter the next mirror or raidz.
- */
- if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
- nvlist_t **child = NULL;
- int c, children = 0;
-
- if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
- if (spares != NULL) {
- (void) fprintf(stderr,
- gettext("invalid vdev "
- "specification: 'spare' can be "
- "specified only once\n"));
- return (NULL);
- }
- is_log = is_special = is_dedup = B_FALSE;
- }
-
- if (strcmp(type, VDEV_TYPE_LOG) == 0) {
- if (seen_logs) {
- (void) fprintf(stderr,
- gettext("invalid vdev "
- "specification: 'log' can be "
- "specified only once\n"));
- return (NULL);
- }
- seen_logs = B_TRUE;
- is_log = B_TRUE;
- is_special = B_FALSE;
- is_dedup = B_FALSE;
- argc--;
- argv++;
- /*
- * A log is not a real grouping device.
- * We just set is_log and continue.
- */
- continue;
- }
-
- if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
- is_special = B_TRUE;
- is_log = B_FALSE;
- is_dedup = B_FALSE;
- argc--;
- argv++;
- continue;
- }
-
- if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
- is_dedup = B_TRUE;
- is_log = B_FALSE;
- is_special = B_FALSE;
- argc--;
- argv++;
- continue;
- }
-
- if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
- if (l2cache != NULL) {
- (void) fprintf(stderr,
- gettext("invalid vdev "
- "specification: 'cache' can be "
- "specified only once\n"));
- return (NULL);
- }
- is_log = is_special = is_dedup = B_FALSE;
- }
-
- if (is_log || is_special || is_dedup) {
- if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
- (void) fprintf(stderr,
- gettext("invalid vdev "
- "specification: unsupported '%s' "
- "device: %s\n"), is_log ? "log" :
- "special", type);
- return (NULL);
- }
- nlogs++;
- }
-
- for (c = 1; c < argc; c++) {
- if (is_grouping(argv[c], NULL, NULL) != NULL)
- break;
- children++;
- child = realloc(child,
- children * sizeof (nvlist_t *));
- if (child == NULL)
- zpool_no_memory();
- if ((nv = make_leaf_vdev(argv[c], B_FALSE))
- == NULL)
- return (NULL);
- child[children - 1] = nv;
- }
-
- if (children < mindev) {
- (void) fprintf(stderr, gettext("invalid vdev "
- "specification: %s requires at least %d "
- "devices\n"), argv[0], mindev);
- return (NULL);
- }
-
- if (children > maxdev) {
- (void) fprintf(stderr, gettext("invalid vdev "
- "specification: %s supports no more than "
- "%d devices\n"), argv[0], maxdev);
- return (NULL);
- }
-
- argc -= c;
- argv += c;
-
- if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
- spares = child;
- nspares = children;
- continue;
- } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
- l2cache = child;
- nl2cache = children;
- continue;
- } else {
- /* create a top-level vdev with children */
- verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
- 0) == 0);
- verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
- type) == 0);
- verify(nvlist_add_uint64(nv,
- ZPOOL_CONFIG_IS_LOG, is_log) == 0);
- if (is_log)
- verify(nvlist_add_string(nv,
- ZPOOL_CONFIG_ALLOCATION_BIAS,
- VDEV_ALLOC_BIAS_LOG) == 0);
- if (is_special) {
- verify(nvlist_add_string(nv,
- ZPOOL_CONFIG_ALLOCATION_BIAS,
- VDEV_ALLOC_BIAS_SPECIAL) == 0);
- }
- if (is_dedup) {
- verify(nvlist_add_string(nv,
- ZPOOL_CONFIG_ALLOCATION_BIAS,
- VDEV_ALLOC_BIAS_DEDUP) == 0);
- }
- if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
- verify(nvlist_add_uint64(nv,
- ZPOOL_CONFIG_NPARITY,
- mindev - 1) == 0);
- }
- verify(nvlist_add_nvlist_array(nv,
- ZPOOL_CONFIG_CHILDREN, child,
- children) == 0);
-
- for (c = 0; c < children; c++)
- nvlist_free(child[c]);
- free(child);
- }
- } else {
- /*
- * We have a device. Pass off to make_leaf_vdev() to
- * construct the appropriate nvlist describing the vdev.
- */
- if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL)
- return (NULL);
- if (is_log)
- nlogs++;
- if (is_special) {
- verify(nvlist_add_string(nv,
- ZPOOL_CONFIG_ALLOCATION_BIAS,
- VDEV_ALLOC_BIAS_SPECIAL) == 0);
- }
- if (is_dedup) {
- verify(nvlist_add_string(nv,
- ZPOOL_CONFIG_ALLOCATION_BIAS,
- VDEV_ALLOC_BIAS_DEDUP) == 0);
- }
- argc--;
- argv++;
- }
-
- toplevels++;
- top = realloc(top, toplevels * sizeof (nvlist_t *));
- if (top == NULL)
- zpool_no_memory();
- top[toplevels - 1] = nv;
- }
-
- if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
- (void) fprintf(stderr, gettext("invalid vdev "
- "specification: at least one toplevel vdev must be "
- "specified\n"));
- return (NULL);
- }
-
- if (seen_logs && nlogs == 0) {
- (void) fprintf(stderr, gettext("invalid vdev specification: "
- "log requires at least 1 device\n"));
- return (NULL);
- }
-
- /*
- * Finally, create nvroot and add all top-level vdevs to it.
- */
- verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
- verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_ROOT) == 0);
- verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- top, toplevels) == 0);
- if (nspares != 0)
- verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- spares, nspares) == 0);
- if (nl2cache != 0)
- verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
- l2cache, nl2cache) == 0);
-
- for (t = 0; t < toplevels; t++)
- nvlist_free(top[t]);
- for (t = 0; t < nspares; t++)
- nvlist_free(spares[t]);
- for (t = 0; t < nl2cache; t++)
- nvlist_free(l2cache[t]);
- if (spares)
- free(spares);
- if (l2cache)
- free(l2cache);
- free(top);
-
- return (nvroot);
-}
-
-nvlist_t *
-split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
- splitflags_t flags, int argc, char **argv)
-{
- nvlist_t *newroot = NULL, **child;
- uint_t c, children;
-#ifdef illumos
- zpool_boot_label_t boot_type;
-#endif
-
- if (argc > 0) {
- if ((newroot = construct_spec(argc, argv)) == NULL) {
- (void) fprintf(stderr, gettext("Unable to build a "
- "pool from the specified devices\n"));
- return (NULL);
- }
-
-#ifdef illumos
- if (zpool_is_bootable(zhp))
- boot_type = ZPOOL_COPY_BOOT_LABEL;
- else
- boot_type = ZPOOL_NO_BOOT_LABEL;
-
- if (!flags.dryrun &&
- make_disks(zhp, newroot, boot_type, 0) != 0) {
- nvlist_free(newroot);
- return (NULL);
- }
-#endif
-
- /* avoid any tricks in the spec */
- verify(nvlist_lookup_nvlist_array(newroot,
- ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
- for (c = 0; c < children; c++) {
- char *path;
- const char *type;
- int min, max;
-
- verify(nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_PATH, &path) == 0);
- if ((type = is_grouping(path, &min, &max)) != NULL) {
- (void) fprintf(stderr, gettext("Cannot use "
- "'%s' as a device for splitting\n"), type);
- nvlist_free(newroot);
- return (NULL);
- }
- }
- }
-
- if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
- nvlist_free(newroot);
- return (NULL);
- }
-
- return (newroot);
-}
-
-static int
-num_normal_vdevs(nvlist_t *nvroot)
-{
- nvlist_t **top;
- uint_t t, toplevels, normal = 0;
-
- verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &top, &toplevels) == 0);
-
- for (t = 0; t < toplevels; t++) {
- uint64_t log = B_FALSE;
-
- (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
- if (log)
- continue;
- if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
- continue;
-
- normal++;
- }
-
- return (normal);
-}
-
-/*
- * Get and validate the contents of the given vdev specification. This ensures
- * that the nvlist returned is well-formed, that all the devices exist, and that
- * they are not currently in use by any other known consumer. The 'poolconfig'
- * parameter is the current configuration of the pool when adding devices
- * existing pool, and is used to perform additional checks, such as changing the
- * replication level of the pool. It can be 'NULL' to indicate that this is a
- * new pool. The 'force' flag controls whether devices should be forcefully
- * added, even if they appear in use.
- */
-nvlist_t *
-make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
- boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type,
- uint64_t boot_size, int argc, char **argv)
-{
- nvlist_t *newroot;
- nvlist_t *poolconfig = NULL;
- is_force = force;
-
- /*
- * Construct the vdev specification. If this is successful, we know
- * that we have a valid specification, and that all devices can be
- * opened.
- */
- if ((newroot = construct_spec(argc, argv)) == NULL)
- return (NULL);
-
- if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
- return (NULL);
-
- /*
- * Validate each device to make sure that its not shared with another
- * subsystem. We do this even if 'force' is set, because there are some
- * uses (such as a dedicated dump device) that even '-f' cannot
- * override.
- */
- if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
- nvlist_free(newroot);
- return (NULL);
- }
-
- /*
- * Check the replication level of the given vdevs and report any errors
- * found. We include the existing pool spec, if any, as we need to
- * catch changes against the existing replication level.
- */
- if (check_rep && check_replication(poolconfig, newroot) != 0) {
- nvlist_free(newroot);
- return (NULL);
- }
-
-#ifdef illumos
- /*
- * On pool create the new vdev spec must have one normal vdev.
- */
- if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
- vdev_error(gettext("at least one general top-level vdev must "
- "be specified\n"));
- nvlist_free(newroot);
- return (NULL);
- }
-
- /*
- * Run through the vdev specification and label any whole disks found.
- */
- if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) {
- nvlist_free(newroot);
- return (NULL);
- }
-#endif
-
- return (newroot);
-}
Index: head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1
+++ head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1
@@ -1,76 +0,0 @@
-'\" te
-.\" Copyright (c) 2011, Martin Matuska <mm@FreeBSD.org>.
-.\" All Rights Reserved.
-.\"
-.\" The contents of this file are subject to the terms of the
-.\" Common Development and Distribution License (the "License").
-.\" You may not use this file except in compliance with the License.
-.\"
-.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-.\" or http://www.opensolaris.org/os/licensing.
-.\" See the License for the specific language governing permissions
-.\" and limitations under the License.
-.\"
-.\" When distributing Covered Code, include this CDDL HEADER in each
-.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-.\" If applicable, add the following below this CDDL HEADER, with the
-.\" fields enclosed by brackets "[]" replaced with your own identifying
-.\" information: Portions Copyright [yyyy] [name of copyright owner]
-.\"
-.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved.
-.\" Copyright (c) 2013, Delphix. All Rights Reserved.
-.\"
-.\" $FreeBSD$
-.\"
-.Dd February 25, 2020
-.Dt ZSTREAMDUMP 8
-.Os
-.Sh NAME
-.Nm zstreamdump
-.Nd filter data in zfs send stream
-.Sh SYNOPSIS
-.Nm
-.Op Fl C
-.Op Fl d
-.Op Fl v
-.Sh DESCRIPTION
-The
-.Nm
-utility reads from the output of the
-.Qq Nm zfs Cm send
-command, then displays headers and some statistics from that output. See
-.Xr zfs 8 .
-.Pp
-The following options are supported:
-.Bl -tag -width indent
-.It Fl C
-Suppress the validation of checksums.
-.It Fl d
-Dump contents of blocks modified, implies verbose.
-.It Fl v
-Verbose. Dump all headers, not only begin and end headers.
-.El
-.Sh SEE ALSO
-.Xr zfs 8
-.Sh HISTORY
-The
-.Nm
-utility first appeared in
-.Fx 7.0 .
-.Sh AUTHORS
-This manual page is a
-.Xr mdoc 7
-reimplementation of the
-.Tn OpenSolaris
-manual page
-.Em zstreamdump(1M) ,
-modified and customized for
-.Fx
-and licensed under the
-.Tn Common Development and Distribution License
-.Pq Tn CDDL .
-.Pp
-The
-.Xr mdoc 7
-implementation of this manual page was initially written by
-.An Martin Matuska Aq mm@FreeBSD.org .
Index: head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
+++ head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
@@ -1,644 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- */
-
-#include <ctype.h>
-#include <libnvpair.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <stddef.h>
-
-#include <sys/dmu.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zio.h>
-#include <zfs_fletcher.h>
-
-/*
- * If dump mode is enabled, the number of bytes to print per line
- */
-#define BYTES_PER_LINE 16
-/*
- * If dump mode is enabled, the number of bytes to group together, separated
- * by newlines or spaces
- */
-#define DUMP_GROUPING 4
-
-uint64_t total_write_size = 0;
-uint64_t total_stream_len = 0;
-FILE *send_stream = 0;
-boolean_t do_byteswap = B_FALSE;
-boolean_t do_cksum = B_TRUE;
-
-static void
-usage(void)
-{
- (void) fprintf(stderr, "usage: zstreamdump [-v] [-C] [-d] < file\n");
- (void) fprintf(stderr, "\t -v -- verbose\n");
- (void) fprintf(stderr, "\t -C -- suppress checksum verification\n");
- (void) fprintf(stderr, "\t -d -- dump contents of blocks modified, "
- "implies verbose\n");
- exit(1);
-}
-
-static void *
-safe_malloc(size_t size)
-{
- void *rv = malloc(size);
- if (rv == NULL) {
- (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
- size);
- abort();
- }
- return (rv);
-}
-
-/*
- * ssread - send stream read.
- *
- * Read while computing incremental checksum
- */
-static size_t
-ssread(void *buf, size_t len, zio_cksum_t *cksum)
-{
- size_t outlen;
-
- if ((outlen = fread(buf, len, 1, send_stream)) == 0)
- return (0);
-
- if (do_cksum) {
- if (do_byteswap)
- fletcher_4_incremental_byteswap(buf, len, cksum);
- else
- fletcher_4_incremental_native(buf, len, cksum);
- }
- total_stream_len += len;
- return (outlen);
-}
-
-static size_t
-read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
-{
- ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum);
- if (r == 0)
- return (0);
- zio_cksum_t saved_cksum = *cksum;
- r = ssread(&drr->drr_u.drr_checksum.drr_checksum,
- sizeof (zio_cksum_t), cksum);
- if (r == 0)
- return (0);
- if (!ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) &&
- !ZIO_CHECKSUM_EQUAL(saved_cksum,
- drr->drr_u.drr_checksum.drr_checksum)) {
- fprintf(stderr, "invalid checksum\n");
- (void) printf("Incorrect checksum in record header.\n");
- (void) printf("Expected checksum = %llx/%llx/%llx/%llx\n",
- saved_cksum.zc_word[0],
- saved_cksum.zc_word[1],
- saved_cksum.zc_word[2],
- saved_cksum.zc_word[3]);
- return (0);
- }
- return (sizeof (*drr));
-}
-
-/*
- * Print part of a block in ASCII characters
- */
-static void
-print_ascii_block(char *subbuf, int length)
-{
- int i;
-
- for (i = 0; i < length; i++) {
- char char_print = isprint(subbuf[i]) ? subbuf[i] : '.';
- if (i != 0 && i % DUMP_GROUPING == 0) {
- (void) printf(" ");
- }
- (void) printf("%c", char_print);
- }
- (void) printf("\n");
-}
-
-/*
- * print_block - Dump the contents of a modified block to STDOUT
- *
- * Assume that buf has capacity evenly divisible by BYTES_PER_LINE
- */
-static void
-print_block(char *buf, int length)
-{
- int i;
- /*
- * Start printing ASCII characters at a constant offset, after
- * the hex prints. Leave 3 characters per byte on a line (2 digit
- * hex number plus 1 space) plus spaces between characters and
- * groupings.
- */
- int ascii_start = BYTES_PER_LINE * 3 +
- BYTES_PER_LINE / DUMP_GROUPING + 2;
-
- for (i = 0; i < length; i += BYTES_PER_LINE) {
- int j;
- int this_line_length = MIN(BYTES_PER_LINE, length - i);
- int print_offset = 0;
-
- for (j = 0; j < this_line_length; j++) {
- int buf_offset = i + j;
-
- /*
- * Separate every DUMP_GROUPING bytes by a space.
- */
- if (buf_offset % DUMP_GROUPING == 0) {
- print_offset += printf(" ");
- }
-
- /*
- * Print the two-digit hex value for this byte.
- */
- unsigned char hex_print = buf[buf_offset];
- print_offset += printf("%02x ", hex_print);
- }
-
- (void) printf("%*s", ascii_start - print_offset, " ");
-
- print_ascii_block(buf + i, this_line_length);
- }
-}
-
-int
-main(int argc, char *argv[])
-{
- char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
- uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
- uint64_t total_records = 0;
- dmu_replay_record_t thedrr;
- dmu_replay_record_t *drr = &thedrr;
- struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
- struct drr_end *drre = &thedrr.drr_u.drr_end;
- struct drr_object *drro = &thedrr.drr_u.drr_object;
- struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects;
- struct drr_write *drrw = &thedrr.drr_u.drr_write;
- struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
- struct drr_free *drrf = &thedrr.drr_u.drr_free;
- struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
- struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
- struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum;
- char c;
- boolean_t verbose = B_FALSE;
- boolean_t very_verbose = B_FALSE;
- boolean_t first = B_TRUE;
- /*
- * dump flag controls whether the contents of any modified data blocks
- * are printed to the console during processing of the stream. Warning:
- * for large streams, this can obviously lead to massive prints.
- */
- boolean_t dump = B_FALSE;
- int err;
- zio_cksum_t zc = { 0 };
- zio_cksum_t pcksum = { 0 };
-
- while ((c = getopt(argc, argv, ":vCd")) != -1) {
- switch (c) {
- case 'C':
- do_cksum = B_FALSE;
- break;
- case 'v':
- if (verbose)
- very_verbose = B_TRUE;
- verbose = B_TRUE;
- break;
- case 'd':
- dump = B_TRUE;
- verbose = B_TRUE;
- very_verbose = B_TRUE;
- break;
- case ':':
- (void) fprintf(stderr,
- "missing argument for '%c' option\n", optopt);
- usage();
- break;
- case '?':
- (void) fprintf(stderr, "invalid option '%c'\n",
- optopt);
- usage();
- break;
- }
- }
-
- if (isatty(STDIN_FILENO)) {
- (void) fprintf(stderr,
- "Error: Backup stream can not be read "
- "from a terminal.\n"
- "You must redirect standard input.\n");
- exit(1);
- }
-
- send_stream = stdin;
- pcksum = zc;
- while (read_hdr(drr, &zc)) {
-
- /*
- * If this is the first DMU record being processed, check for
- * the magic bytes and figure out the endian-ness based on them.
- */
- if (first) {
- if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
- do_byteswap = B_TRUE;
- if (do_cksum) {
- ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
- /*
- * recalculate header checksum now
- * that we know it needs to be
- * byteswapped.
- */
- fletcher_4_incremental_byteswap(drr,
- sizeof (dmu_replay_record_t), &zc);
- }
- } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) {
- (void) fprintf(stderr, "Invalid stream "
- "(bad magic number)\n");
- exit(1);
- }
- first = B_FALSE;
- }
- if (do_byteswap) {
- drr->drr_type = BSWAP_32(drr->drr_type);
- drr->drr_payloadlen =
- BSWAP_32(drr->drr_payloadlen);
- }
-
- /*
- * At this point, the leading fields of the replay record
- * (drr_type and drr_payloadlen) have been byte-swapped if
- * necessary, but the rest of the data structure (the
- * union of type-specific structures) is still in its
- * original state.
- */
- if (drr->drr_type >= DRR_NUMTYPES) {
- (void) printf("INVALID record found: type 0x%x\n",
- drr->drr_type);
- (void) printf("Aborting.\n");
- exit(1);
- }
-
- drr_record_count[drr->drr_type]++;
- total_records++;
-
- switch (drr->drr_type) {
- case DRR_BEGIN:
- if (do_byteswap) {
- drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_versioninfo =
- BSWAP_64(drrb->drr_versioninfo);
- drrb->drr_creation_time =
- BSWAP_64(drrb->drr_creation_time);
- drrb->drr_type = BSWAP_32(drrb->drr_type);
- drrb->drr_flags = BSWAP_32(drrb->drr_flags);
- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
- drrb->drr_fromguid =
- BSWAP_64(drrb->drr_fromguid);
- }
-
- (void) printf("BEGIN record\n");
- (void) printf("\thdrtype = %lld\n",
- DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo));
- (void) printf("\tfeatures = %llx\n",
- DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo));
- (void) printf("\tmagic = %llx\n",
- (u_longlong_t)drrb->drr_magic);
- (void) printf("\tcreation_time = %llx\n",
- (u_longlong_t)drrb->drr_creation_time);
- (void) printf("\ttype = %u\n", drrb->drr_type);
- (void) printf("\tflags = 0x%x\n", drrb->drr_flags);
- (void) printf("\ttoguid = %llx\n",
- (u_longlong_t)drrb->drr_toguid);
- (void) printf("\tfromguid = %llx\n",
- (u_longlong_t)drrb->drr_fromguid);
- (void) printf("\ttoname = %s\n", drrb->drr_toname);
- if (verbose)
- (void) printf("\n");
-
- if (drr->drr_payloadlen != 0) {
- nvlist_t *nv;
- int sz = drr->drr_payloadlen;
-
- if (sz > SPA_MAXBLOCKSIZE) {
- free(buf);
- buf = safe_malloc(sz);
- }
- (void) ssread(buf, sz, &zc);
- if (ferror(send_stream))
- perror("fread");
- err = nvlist_unpack(buf, sz, &nv, 0);
- if (err)
- perror(strerror(err));
- nvlist_print(stdout, nv);
- nvlist_free(nv);
- }
- break;
-
- case DRR_END:
- if (do_byteswap) {
- drre->drr_checksum.zc_word[0] =
- BSWAP_64(drre->drr_checksum.zc_word[0]);
- drre->drr_checksum.zc_word[1] =
- BSWAP_64(drre->drr_checksum.zc_word[1]);
- drre->drr_checksum.zc_word[2] =
- BSWAP_64(drre->drr_checksum.zc_word[2]);
- drre->drr_checksum.zc_word[3] =
- BSWAP_64(drre->drr_checksum.zc_word[3]);
- }
- /*
- * We compare against the *previous* checksum
- * value, because the stored checksum is of
- * everything before the DRR_END record.
- */
- if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum,
- pcksum)) {
- (void) printf("Expected checksum differs from "
- "checksum in stream.\n");
- (void) printf("Expected checksum = "
- "%llx/%llx/%llx/%llx\n",
- pcksum.zc_word[0],
- pcksum.zc_word[1],
- pcksum.zc_word[2],
- pcksum.zc_word[3]);
- }
- (void) printf("END checksum = %llx/%llx/%llx/%llx\n",
- drre->drr_checksum.zc_word[0],
- drre->drr_checksum.zc_word[1],
- drre->drr_checksum.zc_word[2],
- drre->drr_checksum.zc_word[3]);
-
- ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
- break;
-
- case DRR_OBJECT:
- if (do_byteswap) {
- drro->drr_object = BSWAP_64(drro->drr_object);
- drro->drr_type = BSWAP_32(drro->drr_type);
- drro->drr_bonustype =
- BSWAP_32(drro->drr_bonustype);
- drro->drr_blksz = BSWAP_32(drro->drr_blksz);
- drro->drr_bonuslen =
- BSWAP_32(drro->drr_bonuslen);
- drro->drr_toguid = BSWAP_64(drro->drr_toguid);
- }
- if (verbose) {
- (void) printf("OBJECT object = %" PRIu64
- " type = %u bonustype = %u blksz = %u"
- " bonuslen = %u dn_slots = %u\n",
- drro->drr_object,
- drro->drr_type,
- drro->drr_bonustype,
- drro->drr_blksz,
- drro->drr_bonuslen,
- drro->drr_dn_slots);
- }
- if (drro->drr_bonuslen > 0) {
- (void) ssread(buf,
- P2ROUNDUP(drro->drr_bonuslen, 8), &zc);
- if (dump) {
- print_block(buf,
- P2ROUNDUP(drro->drr_bonuslen, 8));
- }
- }
- break;
-
- case DRR_FREEOBJECTS:
- if (do_byteswap) {
- drrfo->drr_firstobj =
- BSWAP_64(drrfo->drr_firstobj);
- drrfo->drr_numobjs =
- BSWAP_64(drrfo->drr_numobjs);
- drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid);
- }
- if (verbose) {
- (void) printf("FREEOBJECTS firstobj = %llu "
- "numobjs = %llu\n",
- (u_longlong_t)drrfo->drr_firstobj,
- (u_longlong_t)drrfo->drr_numobjs);
- }
- break;
-
- case DRR_WRITE:
- if (do_byteswap) {
- drrw->drr_object = BSWAP_64(drrw->drr_object);
- drrw->drr_type = BSWAP_32(drrw->drr_type);
- drrw->drr_offset = BSWAP_64(drrw->drr_offset);
- drrw->drr_logical_size =
- BSWAP_64(drrw->drr_logical_size);
- drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
- drrw->drr_key.ddk_prop =
- BSWAP_64(drrw->drr_key.ddk_prop);
- drrw->drr_compressed_size =
- BSWAP_64(drrw->drr_compressed_size);
- }
-
- uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
-
- /*
- * If this is verbose and/or dump output,
- * print info on the modified block
- */
- if (verbose) {
- (void) printf("WRITE object = %llu type = %u "
- "checksum type = %u compression type = %u\n"
- " offset = %llu logical_size = %llu "
- "compressed_size = %llu "
- "payload_size = %llu "
- "props = %llx\n",
- (u_longlong_t)drrw->drr_object,
- drrw->drr_type,
- drrw->drr_checksumtype,
- drrw->drr_compressiontype,
- (u_longlong_t)drrw->drr_offset,
- (u_longlong_t)drrw->drr_logical_size,
- (u_longlong_t)drrw->drr_compressed_size,
- (u_longlong_t)payload_size,
- (u_longlong_t)drrw->drr_key.ddk_prop);
- }
-
- /*
- * Read the contents of the block in from STDIN to buf
- */
- (void) ssread(buf, payload_size, &zc);
- /*
- * If in dump mode
- */
- if (dump) {
- print_block(buf, payload_size);
- }
- total_write_size += payload_size;
- break;
-
- case DRR_WRITE_BYREF:
- if (do_byteswap) {
- drrwbr->drr_object =
- BSWAP_64(drrwbr->drr_object);
- drrwbr->drr_offset =
- BSWAP_64(drrwbr->drr_offset);
- drrwbr->drr_length =
- BSWAP_64(drrwbr->drr_length);
- drrwbr->drr_toguid =
- BSWAP_64(drrwbr->drr_toguid);
- drrwbr->drr_refguid =
- BSWAP_64(drrwbr->drr_refguid);
- drrwbr->drr_refobject =
- BSWAP_64(drrwbr->drr_refobject);
- drrwbr->drr_refoffset =
- BSWAP_64(drrwbr->drr_refoffset);
- drrwbr->drr_key.ddk_prop =
- BSWAP_64(drrwbr->drr_key.ddk_prop);
- }
- if (verbose) {
- (void) printf("WRITE_BYREF object = %llu "
- "checksum type = %u props = %llx\n"
- " offset = %llu length = %llu\n"
- "toguid = %llx refguid = %llx\n"
- " refobject = %llu refoffset = %llu\n",
- (u_longlong_t)drrwbr->drr_object,
- drrwbr->drr_checksumtype,
- (u_longlong_t)drrwbr->drr_key.ddk_prop,
- (u_longlong_t)drrwbr->drr_offset,
- (u_longlong_t)drrwbr->drr_length,
- (u_longlong_t)drrwbr->drr_toguid,
- (u_longlong_t)drrwbr->drr_refguid,
- (u_longlong_t)drrwbr->drr_refobject,
- (u_longlong_t)drrwbr->drr_refoffset);
- }
- break;
-
- case DRR_FREE:
- if (do_byteswap) {
- drrf->drr_object = BSWAP_64(drrf->drr_object);
- drrf->drr_offset = BSWAP_64(drrf->drr_offset);
- drrf->drr_length = BSWAP_64(drrf->drr_length);
- }
- if (verbose) {
- (void) printf("FREE object = %llu "
- "offset = %llu length = %lld\n",
- (u_longlong_t)drrf->drr_object,
- (u_longlong_t)drrf->drr_offset,
- (longlong_t)drrf->drr_length);
- }
- break;
- case DRR_SPILL:
- if (do_byteswap) {
- drrs->drr_object = BSWAP_64(drrs->drr_object);
- drrs->drr_length = BSWAP_64(drrs->drr_length);
- }
- if (verbose) {
- (void) printf("SPILL block for object = %llu "
- "length = %llu\n", drrs->drr_object,
- drrs->drr_length);
- }
- (void) ssread(buf, drrs->drr_length, &zc);
- if (dump) {
- print_block(buf, drrs->drr_length);
- }
- break;
- case DRR_WRITE_EMBEDDED:
- if (do_byteswap) {
- drrwe->drr_object =
- BSWAP_64(drrwe->drr_object);
- drrwe->drr_offset =
- BSWAP_64(drrwe->drr_offset);
- drrwe->drr_length =
- BSWAP_64(drrwe->drr_length);
- drrwe->drr_toguid =
- BSWAP_64(drrwe->drr_toguid);
- drrwe->drr_lsize =
- BSWAP_32(drrwe->drr_lsize);
- drrwe->drr_psize =
- BSWAP_32(drrwe->drr_psize);
- }
- if (verbose) {
- (void) printf("WRITE_EMBEDDED object = %llu "
- "offset = %llu length = %llu\n"
- " toguid = %llx comp = %u etype = %u "
- "lsize = %u psize = %u\n",
- (u_longlong_t)drrwe->drr_object,
- (u_longlong_t)drrwe->drr_offset,
- (u_longlong_t)drrwe->drr_length,
- (u_longlong_t)drrwe->drr_toguid,
- drrwe->drr_compression,
- drrwe->drr_etype,
- drrwe->drr_lsize,
- drrwe->drr_psize);
- }
- (void) ssread(buf,
- P2ROUNDUP(drrwe->drr_psize, 8), &zc);
- break;
- }
- if (drr->drr_type != DRR_BEGIN && very_verbose) {
- (void) printf(" checksum = %llx/%llx/%llx/%llx\n",
- (longlong_t)drrc->drr_checksum.zc_word[0],
- (longlong_t)drrc->drr_checksum.zc_word[1],
- (longlong_t)drrc->drr_checksum.zc_word[2],
- (longlong_t)drrc->drr_checksum.zc_word[3]);
- }
- pcksum = zc;
- }
- free(buf);
-
- /* Print final summary */
-
- (void) printf("SUMMARY:\n");
- (void) printf("\tTotal DRR_BEGIN records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_BEGIN]);
- (void) printf("\tTotal DRR_END records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_END]);
- (void) printf("\tTotal DRR_OBJECT records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_OBJECT]);
- (void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
- (void) printf("\tTotal DRR_WRITE records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_WRITE]);
- (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
- (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]);
- (void) printf("\tTotal DRR_FREE records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_FREE]);
- (void) printf("\tTotal DRR_SPILL records = %lld\n",
- (u_longlong_t)drr_record_count[DRR_SPILL]);
- (void) printf("\tTotal records = %lld\n",
- (u_longlong_t)total_records);
- (void) printf("\tTotal write size = %lld (0x%llx)\n",
- (u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
- (void) printf("\tTotal stream length = %lld (0x%llx)\n",
- (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len);
- return (0);
-}
Index: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -1,7135 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Joyent, Inc.
- * Copyright (c) 2017, Intel Corporation.
- * Copyright 2017 RackTop Systems.
- */
-
-/*
- * The objective of this program is to provide a DMU/ZAP/SPA stress test
- * that runs entirely in userland, is easy to use, and easy to extend.
- *
- * The overall design of the ztest program is as follows:
- *
- * (1) For each major functional area (e.g. adding vdevs to a pool,
- * creating and destroying datasets, reading and writing objects, etc)
- * we have a simple routine to test that functionality. These
- * individual routines do not have to do anything "stressful".
- *
- * (2) We turn these simple functionality tests into a stress test by
- * running them all in parallel, with as many threads as desired,
- * and spread across as many datasets, objects, and vdevs as desired.
- *
- * (3) While all this is happening, we inject faults into the pool to
- * verify that self-healing data really works.
- *
- * (4) Every time we open a dataset, we change its checksum and compression
- * functions. Thus even individual objects vary from block to block
- * in which checksum they use and whether they're compressed.
- *
- * (5) To verify that we never lose on-disk consistency after a crash,
- * we run the entire test in a child of the main process.
- * At random times, the child self-immolates with a SIGKILL.
- * This is the software equivalent of pulling the power cord.
- * The parent then runs the test again, using the existing
- * storage pool, as many times as desired. If backwards compatibility
- * testing is enabled ztest will sometimes run the "older" version
- * of ztest after a SIGKILL.
- *
- * (6) To verify that we don't have future leaks or temporal incursions,
- * many of the functional tests record the transaction group number
- * as part of their data. When reading old data, they verify that
- * the transaction group number is less than the current, open txg.
- * If you add a new test, please do this if applicable.
- *
- * When run with no arguments, ztest runs for about five minutes and
- * produces no output if successful. To get a little bit of information,
- * specify -V. To get more information, specify -VV, and so on.
- *
- * To turn this into an overnight stress test, use -T to specify run time.
- *
- * You can ask more more vdevs [-v], datasets [-d], or threads [-t]
- * to increase the pool capacity, fanout, and overall stress level.
- *
- * Use the -k option to set the desired frequency of kills.
- *
- * When ztest invokes itself it passes all relevant information through a
- * temporary file which is mmap-ed in the child process. This allows shared
- * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
- * stored at offset 0 of this file and contains information on the size and
- * number of shared structures in the file. The information stored in this file
- * must remain backwards compatible with older versions of ztest so that
- * ztest can invoke them during backwards compatibility testing (-B).
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/txg.h>
-#include <sys/dbuf.h>
-#include <sys/zap.h>
-#include <sys/dmu_objset.h>
-#include <sys/poll.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/resource.h>
-#include <sys/zio.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_file.h>
-#include <sys/vdev_initialize.h>
-#include <sys/spa_impl.h>
-#include <sys/metaslab_impl.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_scan.h>
-#include <sys/zio_checksum.h>
-#include <sys/refcount.h>
-#include <sys/zfeature.h>
-#include <sys/dsl_userhold.h>
-#include <sys/abd.h>
-#include <stdio.h>
-#include <stdio_ext.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <signal.h>
-#include <umem.h>
-#include <dlfcn.h>
-#include <ctype.h>
-#include <math.h>
-#include <errno.h>
-#include <sys/fs/zfs.h>
-#include <libnvpair.h>
-#include <libzfs.h>
-#include <libcmdutils.h>
-
-static int ztest_fd_data = -1;
-static int ztest_fd_rand = -1;
-
-typedef struct ztest_shared_hdr {
- uint64_t zh_hdr_size;
- uint64_t zh_opts_size;
- uint64_t zh_size;
- uint64_t zh_stats_size;
- uint64_t zh_stats_count;
- uint64_t zh_ds_size;
- uint64_t zh_ds_count;
-} ztest_shared_hdr_t;
-
-static ztest_shared_hdr_t *ztest_shared_hdr;
-
-enum ztest_class_state {
- ZTEST_VDEV_CLASS_OFF,
- ZTEST_VDEV_CLASS_ON,
- ZTEST_VDEV_CLASS_RND
-};
-
-typedef struct ztest_shared_opts {
- char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
- char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
- char zo_alt_ztest[MAXNAMELEN];
- char zo_alt_libpath[MAXNAMELEN];
- uint64_t zo_vdevs;
- uint64_t zo_vdevtime;
- size_t zo_vdev_size;
- int zo_ashift;
- int zo_mirrors;
- int zo_raidz;
- int zo_raidz_parity;
- int zo_datasets;
- int zo_threads;
- uint64_t zo_passtime;
- uint64_t zo_killrate;
- int zo_verbose;
- int zo_init;
- uint64_t zo_time;
- uint64_t zo_maxloops;
- uint64_t zo_metaslab_force_ganging;
- int zo_mmp_test;
- int zo_special_vdevs;
-} ztest_shared_opts_t;
-
-static const ztest_shared_opts_t ztest_opts_defaults = {
- .zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
- .zo_dir = { '/', 't', 'm', 'p', '\0' },
- .zo_alt_ztest = { '\0' },
- .zo_alt_libpath = { '\0' },
- .zo_vdevs = 5,
- .zo_ashift = SPA_MINBLOCKSHIFT,
- .zo_mirrors = 2,
- .zo_raidz = 4,
- .zo_raidz_parity = 1,
- .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
- .zo_datasets = 7,
- .zo_threads = 23,
- .zo_passtime = 60, /* 60 seconds */
- .zo_killrate = 70, /* 70% kill rate */
- .zo_verbose = 0,
- .zo_mmp_test = 0,
- .zo_init = 1,
- .zo_time = 300, /* 5 minutes */
- .zo_maxloops = 50, /* max loops during spa_freeze() */
- .zo_metaslab_force_ganging = 32 << 10,
- .zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
-};
-
-extern uint64_t metaslab_force_ganging;
-extern uint64_t metaslab_df_alloc_threshold;
-extern uint64_t zfs_deadman_synctime_ms;
-extern int metaslab_preload_limit;
-extern boolean_t zfs_compressed_arc_enabled;
-extern boolean_t zfs_abd_scatter_enabled;
-extern int dmu_object_alloc_chunk_shift;
-extern boolean_t zfs_force_some_double_word_sm_entries;
-extern unsigned long zfs_reconstruct_indirect_damage_fraction;
-
-static ztest_shared_opts_t *ztest_shared_opts;
-static ztest_shared_opts_t ztest_opts;
-
-typedef struct ztest_shared_ds {
- uint64_t zd_seq;
-} ztest_shared_ds_t;
-
-static ztest_shared_ds_t *ztest_shared_ds;
-#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
-
-#define BT_MAGIC 0x123456789abcdefULL
-#define MAXFAULTS() \
- (MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
-
-enum ztest_io_type {
- ZTEST_IO_WRITE_TAG,
- ZTEST_IO_WRITE_PATTERN,
- ZTEST_IO_WRITE_ZEROES,
- ZTEST_IO_TRUNCATE,
- ZTEST_IO_SETATTR,
- ZTEST_IO_REWRITE,
- ZTEST_IO_TYPES
-};
-
-typedef struct ztest_block_tag {
- uint64_t bt_magic;
- uint64_t bt_objset;
- uint64_t bt_object;
- uint64_t bt_dnodesize;
- uint64_t bt_offset;
- uint64_t bt_gen;
- uint64_t bt_txg;
- uint64_t bt_crtxg;
-} ztest_block_tag_t;
-
-typedef struct bufwad {
- uint64_t bw_index;
- uint64_t bw_txg;
- uint64_t bw_data;
-} bufwad_t;
-
-/*
- * It would be better to use a rangelock_t per object. Unfortunately
- * the rangelock_t is not a drop-in replacement for rl_t, because we
- * still need to map from object ID to rangelock_t.
- */
-typedef enum {
- RL_READER,
- RL_WRITER,
- RL_APPEND
-} rl_type_t;
-
-typedef struct rll {
- void *rll_writer;
- int rll_readers;
- kmutex_t rll_lock;
- kcondvar_t rll_cv;
-} rll_t;
-
-typedef struct rl {
- uint64_t rl_object;
- uint64_t rl_offset;
- uint64_t rl_size;
- rll_t *rl_lock;
-} rl_t;
-
-#define ZTEST_RANGE_LOCKS 64
-#define ZTEST_OBJECT_LOCKS 64
-
-/*
- * Object descriptor. Used as a template for object lookup/create/remove.
- */
-typedef struct ztest_od {
- uint64_t od_dir;
- uint64_t od_object;
- dmu_object_type_t od_type;
- dmu_object_type_t od_crtype;
- uint64_t od_blocksize;
- uint64_t od_crblocksize;
- uint64_t od_crdnodesize;
- uint64_t od_gen;
- uint64_t od_crgen;
- char od_name[ZFS_MAX_DATASET_NAME_LEN];
-} ztest_od_t;
-
-/*
- * Per-dataset state.
- */
-typedef struct ztest_ds {
- ztest_shared_ds_t *zd_shared;
- objset_t *zd_os;
- krwlock_t zd_zilog_lock;
- zilog_t *zd_zilog;
- ztest_od_t *zd_od; /* debugging aid */
- char zd_name[ZFS_MAX_DATASET_NAME_LEN];
- kmutex_t zd_dirobj_lock;
- rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
- rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
-} ztest_ds_t;
-
-/*
- * Per-iteration state.
- */
-typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
-
-typedef struct ztest_info {
- ztest_func_t *zi_func; /* test function */
- uint64_t zi_iters; /* iterations per execution */
- uint64_t *zi_interval; /* execute every <interval> seconds */
-} ztest_info_t;
-
-typedef struct ztest_shared_callstate {
- uint64_t zc_count; /* per-pass count */
- uint64_t zc_time; /* per-pass time */
- uint64_t zc_next; /* next time to call this function */
-} ztest_shared_callstate_t;
-
-static ztest_shared_callstate_t *ztest_shared_callstate;
-#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
-
-/*
- * Note: these aren't static because we want dladdr() to work.
- */
-ztest_func_t ztest_dmu_read_write;
-ztest_func_t ztest_dmu_write_parallel;
-ztest_func_t ztest_dmu_object_alloc_free;
-ztest_func_t ztest_dmu_object_next_chunk;
-ztest_func_t ztest_dmu_commit_callbacks;
-ztest_func_t ztest_zap;
-ztest_func_t ztest_zap_parallel;
-ztest_func_t ztest_zil_commit;
-ztest_func_t ztest_zil_remount;
-ztest_func_t ztest_dmu_read_write_zcopy;
-ztest_func_t ztest_dmu_objset_create_destroy;
-ztest_func_t ztest_dmu_prealloc;
-ztest_func_t ztest_fzap;
-ztest_func_t ztest_dmu_snapshot_create_destroy;
-ztest_func_t ztest_dsl_prop_get_set;
-ztest_func_t ztest_spa_prop_get_set;
-ztest_func_t ztest_spa_create_destroy;
-ztest_func_t ztest_fault_inject;
-ztest_func_t ztest_ddt_repair;
-ztest_func_t ztest_dmu_snapshot_hold;
-ztest_func_t ztest_mmp_enable_disable;
-ztest_func_t ztest_scrub;
-ztest_func_t ztest_dsl_dataset_promote_busy;
-ztest_func_t ztest_vdev_attach_detach;
-ztest_func_t ztest_vdev_LUN_growth;
-ztest_func_t ztest_vdev_add_remove;
-ztest_func_t ztest_vdev_class_add;
-ztest_func_t ztest_vdev_aux_add_remove;
-ztest_func_t ztest_split_pool;
-ztest_func_t ztest_reguid;
-ztest_func_t ztest_spa_upgrade;
-ztest_func_t ztest_device_removal;
-ztest_func_t ztest_remap_blocks;
-ztest_func_t ztest_spa_checkpoint_create_discard;
-ztest_func_t ztest_initialize;
-ztest_func_t ztest_verify_dnode_bt;
-
-uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
-uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
-uint64_t zopt_often = 1ULL * NANOSEC; /* every second */
-uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */
-uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */
-
-ztest_info_t ztest_info[] = {
- { ztest_dmu_read_write, 1, &zopt_always },
- { ztest_dmu_write_parallel, 10, &zopt_always },
- { ztest_dmu_object_alloc_free, 1, &zopt_always },
- { ztest_dmu_object_next_chunk, 1, &zopt_sometimes },
- { ztest_dmu_commit_callbacks, 1, &zopt_always },
- { ztest_zap, 30, &zopt_always },
- { ztest_zap_parallel, 100, &zopt_always },
- { ztest_split_pool, 1, &zopt_always },
- { ztest_zil_commit, 1, &zopt_incessant },
- { ztest_zil_remount, 1, &zopt_sometimes },
- { ztest_dmu_read_write_zcopy, 1, &zopt_often },
- { ztest_dmu_objset_create_destroy, 1, &zopt_often },
- { ztest_dsl_prop_get_set, 1, &zopt_often },
- { ztest_spa_prop_get_set, 1, &zopt_sometimes },
-#if 0
- { ztest_dmu_prealloc, 1, &zopt_sometimes },
-#endif
- { ztest_fzap, 1, &zopt_sometimes },
- { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
- { ztest_spa_create_destroy, 1, &zopt_sometimes },
- { ztest_fault_inject, 1, &zopt_incessant },
- { ztest_ddt_repair, 1, &zopt_sometimes },
- { ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
- { ztest_mmp_enable_disable, 1, &zopt_sometimes },
- { ztest_reguid, 1, &zopt_rarely },
- { ztest_scrub, 1, &zopt_often },
- { ztest_spa_upgrade, 1, &zopt_rarely },
- { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
- { ztest_vdev_attach_detach, 1, &zopt_incessant },
- { ztest_vdev_LUN_growth, 1, &zopt_rarely },
- { ztest_vdev_add_remove, 1,
- &ztest_opts.zo_vdevtime },
- { ztest_vdev_class_add, 1,
- &ztest_opts.zo_vdevtime },
- { ztest_vdev_aux_add_remove, 1,
- &ztest_opts.zo_vdevtime },
- { ztest_device_removal, 1, &zopt_sometimes },
- { ztest_remap_blocks, 1, &zopt_sometimes },
- { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely },
- { ztest_initialize, 1, &zopt_sometimes },
- { ztest_verify_dnode_bt, 1, &zopt_sometimes }
-};
-
-#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
-
-/*
- * The following struct is used to hold a list of uncalled commit callbacks.
- * The callbacks are ordered by txg number.
- */
-typedef struct ztest_cb_list {
- kmutex_t zcl_callbacks_lock;
- list_t zcl_callbacks;
-} ztest_cb_list_t;
-
-/*
- * Stuff we need to share writably between parent and child.
- */
-typedef struct ztest_shared {
- boolean_t zs_do_init;
- hrtime_t zs_proc_start;
- hrtime_t zs_proc_stop;
- hrtime_t zs_thread_start;
- hrtime_t zs_thread_stop;
- hrtime_t zs_thread_kill;
- uint64_t zs_enospc_count;
- uint64_t zs_vdev_next_leaf;
- uint64_t zs_vdev_aux;
- uint64_t zs_alloc;
- uint64_t zs_space;
- uint64_t zs_splits;
- uint64_t zs_mirrors;
- uint64_t zs_metaslab_sz;
- uint64_t zs_metaslab_df_alloc_threshold;
- uint64_t zs_guid;
-} ztest_shared_t;
-
-#define ID_PARALLEL -1ULL
-
-static char ztest_dev_template[] = "%s/%s.%llua";
-static char ztest_aux_template[] = "%s/%s.%s.%llu";
-ztest_shared_t *ztest_shared;
-
-static spa_t *ztest_spa = NULL;
-static ztest_ds_t *ztest_ds;
-
-static kmutex_t ztest_vdev_lock;
-static boolean_t ztest_device_removal_active = B_FALSE;
-static kmutex_t ztest_checkpoint_lock;
-
-/*
- * The ztest_name_lock protects the pool and dataset namespace used by
- * the individual tests. To modify the namespace, consumers must grab
- * this lock as writer. Grabbing the lock as reader will ensure that the
- * namespace does not change while the lock is held.
- */
-static krwlock_t ztest_name_lock;
-
-static boolean_t ztest_dump_core = B_TRUE;
-static boolean_t ztest_exiting;
-
-/* Global commit callback list */
-static ztest_cb_list_t zcl;
-
-enum ztest_object {
- ZTEST_META_DNODE = 0,
- ZTEST_DIROBJ,
- ZTEST_OBJECTS
-};
-
-static void usage(boolean_t) __NORETURN;
-
-/*
- * These libumem hooks provide a reasonable set of defaults for the allocator's
- * debugging facilities.
- */
-const char *
-_umem_debug_init()
-{
- return ("default,verbose"); /* $UMEM_DEBUG setting */
-}
-
-const char *
-_umem_logging_init(void)
-{
- return ("fail,contents"); /* $UMEM_LOGGING setting */
-}
-
-#define FATAL_MSG_SZ 1024
-
-char *fatal_msg;
-
-static void
-fatal(int do_perror, char *message, ...)
-{
- va_list args;
- int save_errno = errno;
- char buf[FATAL_MSG_SZ];
-
- (void) fflush(stdout);
-
- va_start(args, message);
- (void) sprintf(buf, "ztest: ");
- /* LINTED */
- (void) vsprintf(buf + strlen(buf), message, args);
- va_end(args);
- if (do_perror) {
- (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
- ": %s", strerror(save_errno));
- }
- (void) fprintf(stderr, "%s\n", buf);
- fatal_msg = buf; /* to ease debugging */
- if (ztest_dump_core)
- abort();
- exit(3);
-}
-
-static int
-str2shift(const char *buf)
-{
- const char *ends = "BKMGTPEZ";
- int i;
-
- if (buf[0] == '\0')
- return (0);
- for (i = 0; i < strlen(ends); i++) {
- if (toupper(buf[0]) == ends[i])
- break;
- }
- if (i == strlen(ends)) {
- (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
- buf);
- usage(B_FALSE);
- }
- if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
- return (10*i);
- }
- (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
- usage(B_FALSE);
- /* NOTREACHED */
-}
-
-static uint64_t
-nicenumtoull(const char *buf)
-{
- char *end;
- uint64_t val;
-
- val = strtoull(buf, &end, 0);
- if (end == buf) {
- (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
- usage(B_FALSE);
- } else if (end[0] == '.') {
- double fval = strtod(buf, &end);
- fval *= pow(2, str2shift(end));
- if (fval > UINT64_MAX) {
- (void) fprintf(stderr, "ztest: value too large: %s\n",
- buf);
- usage(B_FALSE);
- }
- val = (uint64_t)fval;
- } else {
- int shift = str2shift(end);
- if (shift >= 64 || (val << shift) >> shift != val) {
- (void) fprintf(stderr, "ztest: value too large: %s\n",
- buf);
- usage(B_FALSE);
- }
- val <<= shift;
- }
- return (val);
-}
-
-static void
-usage(boolean_t requested)
-{
- const ztest_shared_opts_t *zo = &ztest_opts_defaults;
-
- char nice_vdev_size[NN_NUMBUF_SZ];
- char nice_force_ganging[NN_NUMBUF_SZ];
- FILE *fp = requested ? stdout : stderr;
-
- nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size));
- nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging,
- sizeof (nice_force_ganging));
-
- (void) fprintf(fp, "Usage: %s\n"
- "\t[-v vdevs (default: %llu)]\n"
- "\t[-s size_of_each_vdev (default: %s)]\n"
- "\t[-a alignment_shift (default: %d)] use 0 for random\n"
- "\t[-m mirror_copies (default: %d)]\n"
- "\t[-r raidz_disks (default: %d)]\n"
- "\t[-R raidz_parity (default: %d)]\n"
- "\t[-d datasets (default: %d)]\n"
- "\t[-t threads (default: %d)]\n"
- "\t[-g gang_block_threshold (default: %s)]\n"
- "\t[-i init_count (default: %d)] initialize pool i times\n"
- "\t[-k kill_percentage (default: %llu%%)]\n"
- "\t[-p pool_name (default: %s)]\n"
- "\t[-f dir (default: %s)] file directory for vdev files\n"
- "\t[-M] Multi-host simulate pool imported on remote host\n"
- "\t[-V] verbose (use multiple times for ever more blather)\n"
- "\t[-E] use existing pool instead of creating new one\n"
- "\t[-T time (default: %llu sec)] total run time\n"
- "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
- "\t[-P passtime (default: %llu sec)] time per pass\n"
- "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
- "\t[-C vdev class state (default: random)] special=on|off|random\n"
- "\t[-o variable=value] ... set global variable to an unsigned\n"
- "\t 32-bit integer value\n"
- "\t[-h] (print help)\n"
- "",
- zo->zo_pool,
- (u_longlong_t)zo->zo_vdevs, /* -v */
- nice_vdev_size, /* -s */
- zo->zo_ashift, /* -a */
- zo->zo_mirrors, /* -m */
- zo->zo_raidz, /* -r */
- zo->zo_raidz_parity, /* -R */
- zo->zo_datasets, /* -d */
- zo->zo_threads, /* -t */
- nice_force_ganging, /* -g */
- zo->zo_init, /* -i */
- (u_longlong_t)zo->zo_killrate, /* -k */
- zo->zo_pool, /* -p */
- zo->zo_dir, /* -f */
- (u_longlong_t)zo->zo_time, /* -T */
- (u_longlong_t)zo->zo_maxloops, /* -F */
- (u_longlong_t)zo->zo_passtime);
- exit(requested ? 0 : 1);
-}
-
-
-static void
-ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
-{
- char name[32];
- char *value;
- int state = ZTEST_VDEV_CLASS_RND;
-
- (void) strlcpy(name, input, sizeof (name));
-
- value = strchr(name, '=');
- if (value == NULL) {
- (void) fprintf(stderr, "missing value in property=value "
- "'-C' argument (%s)\n", input);
- usage(B_FALSE);
- }
- *(value) = '\0';
- value++;
-
- if (strcmp(value, "on") == 0) {
- state = ZTEST_VDEV_CLASS_ON;
- } else if (strcmp(value, "off") == 0) {
- state = ZTEST_VDEV_CLASS_OFF;
- } else if (strcmp(value, "random") == 0) {
- state = ZTEST_VDEV_CLASS_RND;
- } else {
- (void) fprintf(stderr, "invalid property value '%s'\n", value);
- usage(B_FALSE);
- }
-
- if (strcmp(name, "special") == 0) {
- zo->zo_special_vdevs = state;
- } else {
- (void) fprintf(stderr, "invalid property name '%s'\n", name);
- usage(B_FALSE);
- }
- if (zo->zo_verbose >= 3)
- (void) printf("%s vdev state is '%s'\n", name, value);
-}
-
-static void
-process_options(int argc, char **argv)
-{
- char *path;
- ztest_shared_opts_t *zo = &ztest_opts;
-
- int opt;
- uint64_t value;
- char altdir[MAXNAMELEN] = { 0 };
-
- bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
-
- while ((opt = getopt(argc, argv,
- "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:")) != EOF) {
- value = 0;
- switch (opt) {
- case 'v':
- case 's':
- case 'a':
- case 'm':
- case 'r':
- case 'R':
- case 'd':
- case 't':
- case 'g':
- case 'i':
- case 'k':
- case 'T':
- case 'P':
- case 'F':
- value = nicenumtoull(optarg);
- }
- switch (opt) {
- case 'v':
- zo->zo_vdevs = value;
- break;
- case 's':
- zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
- break;
- case 'a':
- zo->zo_ashift = value;
- break;
- case 'm':
- zo->zo_mirrors = value;
- break;
- case 'r':
- zo->zo_raidz = MAX(1, value);
- break;
- case 'R':
- zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
- break;
- case 'd':
- zo->zo_datasets = MAX(1, value);
- break;
- case 't':
- zo->zo_threads = MAX(1, value);
- break;
- case 'g':
- zo->zo_metaslab_force_ganging =
- MAX(SPA_MINBLOCKSIZE << 1, value);
- break;
- case 'i':
- zo->zo_init = value;
- break;
- case 'k':
- zo->zo_killrate = value;
- break;
- case 'p':
- (void) strlcpy(zo->zo_pool, optarg,
- sizeof (zo->zo_pool));
- break;
- case 'f':
- path = realpath(optarg, NULL);
- if (path == NULL) {
- (void) fprintf(stderr, "error: %s: %s\n",
- optarg, strerror(errno));
- usage(B_FALSE);
- } else {
- (void) strlcpy(zo->zo_dir, path,
- sizeof (zo->zo_dir));
- }
- break;
- case 'M':
- zo->zo_mmp_test = 1;
- break;
- case 'V':
- zo->zo_verbose++;
- break;
- case 'E':
- zo->zo_init = 0;
- break;
- case 'T':
- zo->zo_time = value;
- break;
- case 'P':
- zo->zo_passtime = MAX(1, value);
- break;
- case 'F':
- zo->zo_maxloops = MAX(1, value);
- break;
- case 'B':
- (void) strlcpy(altdir, optarg, sizeof (altdir));
- break;
- case 'C':
- ztest_parse_name_value(optarg, zo);
- break;
- case 'o':
- if (set_global_var(optarg) != 0)
- usage(B_FALSE);
- break;
- case 'h':
- usage(B_TRUE);
- break;
- case '?':
- default:
- usage(B_FALSE);
- break;
- }
- }
-
- zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
-
- zo->zo_vdevtime =
- (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
- UINT64_MAX >> 2);
-
- if (strlen(altdir) > 0) {
- char *cmd;
- char *realaltdir;
- char *bin;
- char *ztest;
- char *isa;
- int isalen;
-
- cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
- realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
-
- VERIFY(NULL != realpath(getexecname(), cmd));
- if (0 != access(altdir, F_OK)) {
- ztest_dump_core = B_FALSE;
- fatal(B_TRUE, "invalid alternate ztest path: %s",
- altdir);
- }
- VERIFY(NULL != realpath(altdir, realaltdir));
-
- /*
- * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
- * We want to extract <isa> to determine if we should use
- * 32 or 64 bit binaries.
- */
- bin = strstr(cmd, "/usr/bin/");
- ztest = strstr(bin, "/ztest");
- isa = bin + 9;
- isalen = ztest - isa;
- (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
- "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
- (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
- "%s/usr/lib/%.*s", realaltdir, isalen, isa);
-
- if (0 != access(zo->zo_alt_ztest, X_OK)) {
- ztest_dump_core = B_FALSE;
- fatal(B_TRUE, "invalid alternate ztest: %s",
- zo->zo_alt_ztest);
- } else if (0 != access(zo->zo_alt_libpath, X_OK)) {
- ztest_dump_core = B_FALSE;
- fatal(B_TRUE, "invalid alternate lib directory %s",
- zo->zo_alt_libpath);
- }
-
- umem_free(cmd, MAXPATHLEN);
- umem_free(realaltdir, MAXPATHLEN);
- }
-}
-
-static void
-ztest_kill(ztest_shared_t *zs)
-{
- zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
- zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
-
- /*
- * Before we kill off ztest, make sure that the config is updated.
- * See comment above spa_write_cachefile().
- */
- mutex_enter(&spa_namespace_lock);
- spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE);
- mutex_exit(&spa_namespace_lock);
-
- zfs_dbgmsg_print(FTAG);
- (void) kill(getpid(), SIGKILL);
-}
-
-static uint64_t
-ztest_random(uint64_t range)
-{
- uint64_t r;
-
- ASSERT3S(ztest_fd_rand, >=, 0);
-
- if (range == 0)
- return (0);
-
- if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
- fatal(1, "short read from /dev/urandom");
-
- return (r % range);
-}
-
-/* ARGSUSED */
-static void
-ztest_record_enospc(const char *s)
-{
- ztest_shared->zs_enospc_count++;
-}
-
-static uint64_t
-ztest_get_ashift(void)
-{
- if (ztest_opts.zo_ashift == 0)
- return (SPA_MINBLOCKSHIFT + ztest_random(5));
- return (ztest_opts.zo_ashift);
-}
-
-static nvlist_t *
-make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
-{
- char pathbuf[MAXPATHLEN];
- uint64_t vdev;
- nvlist_t *file;
-
- if (ashift == 0)
- ashift = ztest_get_ashift();
-
- if (path == NULL) {
- path = pathbuf;
-
- if (aux != NULL) {
- vdev = ztest_shared->zs_vdev_aux;
- (void) snprintf(path, sizeof (pathbuf),
- ztest_aux_template, ztest_opts.zo_dir,
- pool == NULL ? ztest_opts.zo_pool : pool,
- aux, vdev);
- } else {
- vdev = ztest_shared->zs_vdev_next_leaf++;
- (void) snprintf(path, sizeof (pathbuf),
- ztest_dev_template, ztest_opts.zo_dir,
- pool == NULL ? ztest_opts.zo_pool : pool, vdev);
- }
- }
-
- if (size != 0) {
- int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
- if (fd == -1)
- fatal(1, "can't open %s", path);
- if (ftruncate(fd, size) != 0)
- fatal(1, "can't ftruncate %s", path);
- (void) close(fd);
- }
-
- VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
- VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
- VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
-
- return (file);
-}
-
-static nvlist_t *
-make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
- uint64_t ashift, int r)
-{
- nvlist_t *raidz, **child;
- int c;
-
- if (r < 2)
- return (make_vdev_file(path, aux, pool, size, ashift));
- child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
-
- for (c = 0; c < r; c++)
- child[c] = make_vdev_file(path, aux, pool, size, ashift);
-
- VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_RAIDZ) == 0);
- VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
- ztest_opts.zo_raidz_parity) == 0);
- VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
- child, r) == 0);
-
- for (c = 0; c < r; c++)
- nvlist_free(child[c]);
-
- umem_free(child, r * sizeof (nvlist_t *));
-
- return (raidz);
-}
-
-static nvlist_t *
-make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
- uint64_t ashift, int r, int m)
-{
- nvlist_t *mirror, **child;
- int c;
-
- if (m < 1)
- return (make_vdev_raidz(path, aux, pool, size, ashift, r));
-
- child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
-
- for (c = 0; c < m; c++)
- child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
-
- VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_MIRROR) == 0);
- VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
- child, m) == 0);
-
- for (c = 0; c < m; c++)
- nvlist_free(child[c]);
-
- umem_free(child, m * sizeof (nvlist_t *));
-
- return (mirror);
-}
-
-static nvlist_t *
-make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
- const char *class, int r, int m, int t)
-{
- nvlist_t *root, **child;
- int c;
- boolean_t log;
-
- ASSERT(t > 0);
-
- log = (class != NULL && strcmp(class, "log") == 0);
-
- child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
-
- for (c = 0; c < t; c++) {
- child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
- r, m);
- VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- log) == 0);
-
- if (class != NULL && class[0] != '\0') {
- ASSERT(m > 1 || log); /* expecting a mirror */
- VERIFY(nvlist_add_string(child[c],
- ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0);
- }
- }
-
- VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
- VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
- child, t) == 0);
-
- for (c = 0; c < t; c++)
- nvlist_free(child[c]);
-
- umem_free(child, t * sizeof (nvlist_t *));
-
- return (root);
-}
-
-/*
- * Find a random spa version. Returns back a random spa version in the
- * range [initial_version, SPA_VERSION_FEATURES].
- */
-static uint64_t
-ztest_random_spa_version(uint64_t initial_version)
-{
- uint64_t version = initial_version;
-
- if (version <= SPA_VERSION_BEFORE_FEATURES) {
- version = version +
- ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
- }
-
- if (version > SPA_VERSION_BEFORE_FEATURES)
- version = SPA_VERSION_FEATURES;
-
- ASSERT(SPA_VERSION_IS_SUPPORTED(version));
- return (version);
-}
-
-static int
-ztest_random_blocksize(void)
-{
- uint64_t block_shift;
-
- ASSERT(ztest_spa->spa_max_ashift != 0);
-
- /*
- * Choose a block size >= the ashift.
- * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
- */
- int maxbs = SPA_OLD_MAXBLOCKSHIFT;
- if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
- maxbs = 20;
- block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
- return (1 << (SPA_MINBLOCKSHIFT + block_shift));
-}
-
-static int
-ztest_random_dnodesize(void)
-{
- int slots;
- int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;
-
- if (max_slots == DNODE_MIN_SLOTS)
- return (DNODE_MIN_SIZE);
-
- /*
- * Weight the random distribution more heavily toward smaller
- * dnode sizes since that is more likely to reflect real-world
- * usage.
- */
- ASSERT3U(max_slots, >, 4);
- switch (ztest_random(10)) {
- case 0:
- slots = 5 + ztest_random(max_slots - 4);
- break;
- case 1 ... 4:
- slots = 2 + ztest_random(3);
- break;
- default:
- slots = 1;
- break;
- }
-
- return (slots << DNODE_SHIFT);
-}
-
-static int
-ztest_random_ibshift(void)
-{
- return (DN_MIN_INDBLKSHIFT +
- ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
-}
-
-static uint64_t
-ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
-{
- uint64_t top;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *tvd;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-
- do {
- top = ztest_random(rvd->vdev_children);
- tvd = rvd->vdev_child[top];
- } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
- tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
-
- return (top);
-}
-
-static uint64_t
-ztest_random_dsl_prop(zfs_prop_t prop)
-{
- uint64_t value;
-
- do {
- value = zfs_prop_random_value(prop, ztest_random(-1ULL));
- } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
-
- return (value);
-}
-
-static int
-ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
- boolean_t inherit)
-{
- const char *propname = zfs_prop_to_name(prop);
- const char *valname;
- char setpoint[MAXPATHLEN];
- uint64_t curval;
- int error;
-
- error = dsl_prop_set_int(osname, propname,
- (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
-
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- return (error);
- }
- ASSERT0(error);
-
- VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
-
- if (ztest_opts.zo_verbose >= 6) {
- VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
- (void) printf("%s %s = %s at '%s'\n",
- osname, propname, valname, setpoint);
- }
-
- return (error);
-}
-
-static int
-ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
-{
- spa_t *spa = ztest_spa;
- nvlist_t *props = NULL;
- int error;
-
- VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
-
- error = spa_prop_set(spa, props);
-
- nvlist_free(props);
-
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- return (error);
- }
- ASSERT0(error);
-
- return (error);
-}
-
-static void
-ztest_rll_init(rll_t *rll)
-{
- rll->rll_writer = NULL;
- rll->rll_readers = 0;
- mutex_init(&rll->rll_lock, NULL, USYNC_THREAD, NULL);
- cv_init(&rll->rll_cv, NULL, USYNC_THREAD, NULL);
-}
-
-static void
-ztest_rll_destroy(rll_t *rll)
-{
- ASSERT(rll->rll_writer == NULL);
- ASSERT(rll->rll_readers == 0);
- mutex_destroy(&rll->rll_lock);
- cv_destroy(&rll->rll_cv);
-}
-
-static void
-ztest_rll_lock(rll_t *rll, rl_type_t type)
-{
- mutex_enter(&rll->rll_lock);
-
- if (type == RL_READER) {
- while (rll->rll_writer != NULL)
- cv_wait(&rll->rll_cv, &rll->rll_lock);
- rll->rll_readers++;
- } else {
- while (rll->rll_writer != NULL || rll->rll_readers)
- cv_wait(&rll->rll_cv, &rll->rll_lock);
- rll->rll_writer = curthread;
- }
-
- mutex_exit(&rll->rll_lock);
-}
-
-static void
-ztest_rll_unlock(rll_t *rll)
-{
- mutex_enter(&rll->rll_lock);
-
- if (rll->rll_writer) {
- ASSERT(rll->rll_readers == 0);
- rll->rll_writer = NULL;
- } else {
- ASSERT(rll->rll_readers != 0);
- ASSERT(rll->rll_writer == NULL);
- rll->rll_readers--;
- }
-
- if (rll->rll_writer == NULL && rll->rll_readers == 0)
- cv_broadcast(&rll->rll_cv);
-
- mutex_exit(&rll->rll_lock);
-}
-
-static void
-ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
-{
- rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
-
- ztest_rll_lock(rll, type);
-}
-
-static void
-ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
-{
- rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
-
- ztest_rll_unlock(rll);
-}
-
-static rl_t *
-ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
- uint64_t size, rl_type_t type)
-{
- uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
- rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
- rl_t *rl;
-
- rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
- rl->rl_object = object;
- rl->rl_offset = offset;
- rl->rl_size = size;
- rl->rl_lock = rll;
-
- ztest_rll_lock(rll, type);
-
- return (rl);
-}
-
-static void
-ztest_range_unlock(rl_t *rl)
-{
- rll_t *rll = rl->rl_lock;
-
- ztest_rll_unlock(rll);
-
- umem_free(rl, sizeof (*rl));
-}
-
-static void
-ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
-{
- zd->zd_os = os;
- zd->zd_zilog = dmu_objset_zil(os);
- zd->zd_shared = szd;
- dmu_objset_name(os, zd->zd_name);
-
- if (zd->zd_shared != NULL)
- zd->zd_shared->zd_seq = 0;
-
- rw_init(&zd->zd_zilog_lock, NULL, USYNC_THREAD, NULL);
- mutex_init(&zd->zd_dirobj_lock, NULL, USYNC_THREAD, NULL);
-
- for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
- ztest_rll_init(&zd->zd_object_lock[l]);
-
- for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
- ztest_rll_init(&zd->zd_range_lock[l]);
-}
-
-static void
-ztest_zd_fini(ztest_ds_t *zd)
-{
- mutex_destroy(&zd->zd_dirobj_lock);
-
- for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
- ztest_rll_destroy(&zd->zd_object_lock[l]);
-
- for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
- ztest_rll_destroy(&zd->zd_range_lock[l]);
-}
-
-#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
-
-static uint64_t
-ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
-{
- uint64_t txg;
- int error;
-
- /*
- * Attempt to assign tx to some transaction group.
- */
- error = dmu_tx_assign(tx, txg_how);
- if (error) {
- if (error == ERESTART) {
- ASSERT(txg_how == TXG_NOWAIT);
- dmu_tx_wait(tx);
- } else {
- ASSERT3U(error, ==, ENOSPC);
- ztest_record_enospc(tag);
- }
- dmu_tx_abort(tx);
- return (0);
- }
- txg = dmu_tx_get_txg(tx);
- ASSERT(txg != 0);
- return (txg);
-}
-
-static void
-ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
-{
- uint64_t *ip = buf;
- uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
-
- while (ip < ip_end)
- *ip++ = value;
-}
-
-static boolean_t
-ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
-{
- uint64_t *ip = buf;
- uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
- uint64_t diff = 0;
-
- while (ip < ip_end)
- diff |= (value - *ip++);
-
- return (diff == 0);
-}
-
-static void
-ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
- uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
- uint64_t crtxg)
-{
- bt->bt_magic = BT_MAGIC;
- bt->bt_objset = dmu_objset_id(os);
- bt->bt_object = object;
- bt->bt_dnodesize = dnodesize;
- bt->bt_offset = offset;
- bt->bt_gen = gen;
- bt->bt_txg = txg;
- bt->bt_crtxg = crtxg;
-}
-
-static void
-ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
- uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
- uint64_t crtxg)
-{
- ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
- ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
- ASSERT3U(bt->bt_object, ==, object);
- ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
- ASSERT3U(bt->bt_offset, ==, offset);
- ASSERT3U(bt->bt_gen, <=, gen);
- ASSERT3U(bt->bt_txg, <=, txg);
- ASSERT3U(bt->bt_crtxg, ==, crtxg);
-}
-
-static ztest_block_tag_t *
-ztest_bt_bonus(dmu_buf_t *db)
-{
- dmu_object_info_t doi;
- ztest_block_tag_t *bt;
-
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
- ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
- bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
-
- return (bt);
-}
-
-/*
- * Generate a token to fill up unused bonus buffer space. Try to make
- * it unique to the object, generation, and offset to verify that data
- * is not getting overwritten by data from other dnodes.
- */
-#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
- (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))
-
-/*
- * Fill up the unused bonus buffer region before the block tag with a
- * verifiable pattern. Filling the whole bonus area with non-zero data
- * helps ensure that all dnode traversal code properly skips the
- * interior regions of large dnodes.
- */
-void
-ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
- objset_t *os, uint64_t gen)
-{
- uint64_t *bonusp;
-
- ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));
-
- for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
- uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
- gen, bonusp - (uint64_t *)db->db_data);
- *bonusp = token;
- }
-}
-
-/*
- * Verify that the unused area of a bonus buffer is filled with the
- * expected tokens.
- */
-void
-ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
- objset_t *os, uint64_t gen)
-{
- uint64_t *bonusp;
-
- for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
- uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
- gen, bonusp - (uint64_t *)db->db_data);
- VERIFY3U(*bonusp, ==, token);
- }
-}
-
-/*
- * ZIL logging ops
- */
-
-#define lrz_type lr_mode
-#define lrz_blocksize lr_uid
-#define lrz_ibshift lr_gid
-#define lrz_bonustype lr_rdev
-#define lrz_dnodesize lr_crtime[1]
-
-static void
-ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
-{
- char *name = (void *)(lr + 1); /* name follows lr */
- size_t namesize = strlen(name) + 1;
- itx_t *itx;
-
- if (zil_replaying(zd->zd_zilog, tx))
- return;
-
- itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
- bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
- sizeof (*lr) + namesize - sizeof (lr_t));
-
- zil_itx_assign(zd->zd_zilog, itx, tx);
-}
-
-static void
-ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
-{
- char *name = (void *)(lr + 1); /* name follows lr */
- size_t namesize = strlen(name) + 1;
- itx_t *itx;
-
- if (zil_replaying(zd->zd_zilog, tx))
- return;
-
- itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
- bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
- sizeof (*lr) + namesize - sizeof (lr_t));
-
- itx->itx_oid = object;
- zil_itx_assign(zd->zd_zilog, itx, tx);
-}
-
-static void
-ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
-{
- itx_t *itx;
- itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
-
- if (zil_replaying(zd->zd_zilog, tx))
- return;
-
- if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
- write_state = WR_INDIRECT;
-
- itx = zil_itx_create(TX_WRITE,
- sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
-
- if (write_state == WR_COPIED &&
- dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
- ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
- zil_itx_destroy(itx);
- itx = zil_itx_create(TX_WRITE, sizeof (*lr));
- write_state = WR_NEED_COPY;
- }
- itx->itx_private = zd;
- itx->itx_wr_state = write_state;
- itx->itx_sync = (ztest_random(8) == 0);
-
- bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
- sizeof (*lr) - sizeof (lr_t));
-
- zil_itx_assign(zd->zd_zilog, itx, tx);
-}
-
-static void
-ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
-{
- itx_t *itx;
-
- if (zil_replaying(zd->zd_zilog, tx))
- return;
-
- itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
- bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
- sizeof (*lr) - sizeof (lr_t));
-
- itx->itx_sync = B_FALSE;
- zil_itx_assign(zd->zd_zilog, itx, tx);
-}
-
-static void
-ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
-{
- itx_t *itx;
-
- if (zil_replaying(zd->zd_zilog, tx))
- return;
-
- itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
- bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
- sizeof (*lr) - sizeof (lr_t));
-
- itx->itx_sync = B_FALSE;
- zil_itx_assign(zd->zd_zilog, itx, tx);
-}
-
-/*
- * ZIL replay ops
- */
-static int
-ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
-{
- ztest_ds_t *zd = arg1;
- lr_create_t *lr = arg2;
- char *name = (void *)(lr + 1); /* name follows lr */
- objset_t *os = zd->zd_os;
- ztest_block_tag_t *bbt;
- dmu_buf_t *db;
- dmu_tx_t *tx;
- uint64_t txg;
- int error = 0;
- int bonuslen;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- ASSERT(lr->lr_doid == ZTEST_DIROBJ);
- ASSERT(name[0] != '\0');
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
-
- if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- }
-
- txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
- if (txg == 0)
- return (ENOSPC);
-
- ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
- bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize);
-
- if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
- if (lr->lr_foid == 0) {
- lr->lr_foid = zap_create_dnsize(os,
- lr->lrz_type, lr->lrz_bonustype,
- bonuslen, lr->lrz_dnodesize, tx);
- } else {
- error = zap_create_claim_dnsize(os, lr->lr_foid,
- lr->lrz_type, lr->lrz_bonustype,
- bonuslen, lr->lrz_dnodesize, tx);
- }
- } else {
- if (lr->lr_foid == 0) {
- lr->lr_foid = dmu_object_alloc_dnsize(os,
- lr->lrz_type, 0, lr->lrz_bonustype,
- bonuslen, lr->lrz_dnodesize, tx);
- } else {
- error = dmu_object_claim_dnsize(os, lr->lr_foid,
- lr->lrz_type, 0, lr->lrz_bonustype,
- bonuslen, lr->lrz_dnodesize, tx);
- }
- }
-
- if (error) {
- ASSERT3U(error, ==, EEXIST);
- ASSERT(zd->zd_zilog->zl_replay);
- dmu_tx_commit(tx);
- return (error);
- }
-
- ASSERT(lr->lr_foid != 0);
-
- if (lr->lrz_type != DMU_OT_ZAP_OTHER)
- VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
- lr->lrz_blocksize, lr->lrz_ibshift, tx));
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
- bbt = ztest_bt_bonus(db);
- dmu_buf_will_dirty(db, tx);
- ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL,
- lr->lr_gen, txg, txg);
- ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen);
- dmu_buf_rele(db, FTAG);
-
- VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
- &lr->lr_foid, tx));
-
- (void) ztest_log_create(zd, tx, lr);
-
- dmu_tx_commit(tx);
-
- return (0);
-}
-
-static int
-ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
-{
- ztest_ds_t *zd = arg1;
- lr_remove_t *lr = arg2;
- char *name = (void *)(lr + 1); /* name follows lr */
- objset_t *os = zd->zd_os;
- dmu_object_info_t doi;
- dmu_tx_t *tx;
- uint64_t object, txg;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- ASSERT(lr->lr_doid == ZTEST_DIROBJ);
- ASSERT(name[0] != '\0');
-
- VERIFY3U(0, ==,
- zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
- ASSERT(object != 0);
-
- ztest_object_lock(zd, object, RL_WRITER);
-
- VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-
- txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
- if (txg == 0) {
- ztest_object_unlock(zd, object);
- return (ENOSPC);
- }
-
- if (doi.doi_type == DMU_OT_ZAP_OTHER) {
- VERIFY3U(0, ==, zap_destroy(os, object, tx));
- } else {
- VERIFY3U(0, ==, dmu_object_free(os, object, tx));
- }
-
- VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
-
- (void) ztest_log_remove(zd, tx, lr, object);
-
- dmu_tx_commit(tx);
-
- ztest_object_unlock(zd, object);
-
- return (0);
-}
-
-static int
-ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
-{
- ztest_ds_t *zd = arg1;
- lr_write_t *lr = arg2;
- objset_t *os = zd->zd_os;
- void *data = lr + 1; /* data follows lr */
- uint64_t offset, length;
- ztest_block_tag_t *bt = data;
- ztest_block_tag_t *bbt;
- uint64_t gen, txg, lrtxg, crtxg;
- dmu_object_info_t doi;
- dmu_tx_t *tx;
- dmu_buf_t *db;
- arc_buf_t *abuf = NULL;
- rl_t *rl;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- offset = lr->lr_offset;
- length = lr->lr_length;
-
- /* If it's a dmu_sync() block, write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
- uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
- if (length < blocksize) {
- offset -= offset % blocksize;
- length = blocksize;
- }
- }
-
- if (bt->bt_magic == BSWAP_64(BT_MAGIC))
- byteswap_uint64_array(bt, sizeof (*bt));
-
- if (bt->bt_magic != BT_MAGIC)
- bt = NULL;
-
- ztest_object_lock(zd, lr->lr_foid, RL_READER);
- rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
-
- dmu_object_info_from_db(db, &doi);
-
- bbt = ztest_bt_bonus(db);
- ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
- gen = bbt->bt_gen;
- crtxg = bbt->bt_crtxg;
- lrtxg = lr->lr_common.lrc_txg;
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
-
- if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
- P2PHASE(offset, length) == 0)
- abuf = dmu_request_arcbuf(db, length);
-
- txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
- if (txg == 0) {
- if (abuf != NULL)
- dmu_return_arcbuf(abuf);
- dmu_buf_rele(db, FTAG);
- ztest_range_unlock(rl);
- ztest_object_unlock(zd, lr->lr_foid);
- return (ENOSPC);
- }
-
- if (bt != NULL) {
- /*
- * Usually, verify the old data before writing new data --
- * but not always, because we also want to verify correct
- * behavior when the data was not recently read into cache.
- */
- ASSERT(offset % doi.doi_data_block_size == 0);
- if (ztest_random(4) != 0) {
- int prefetch = ztest_random(2) ?
- DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
- ztest_block_tag_t rbt;
-
- VERIFY(dmu_read(os, lr->lr_foid, offset,
- sizeof (rbt), &rbt, prefetch) == 0);
- if (rbt.bt_magic == BT_MAGIC) {
- ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
- offset, gen, txg, crtxg);
- }
- }
-
- /*
- * Writes can appear to be newer than the bonus buffer because
- * the ztest_get_data() callback does a dmu_read() of the
- * open-context data, which may be different than the data
- * as it was when the write was generated.
- */
- if (zd->zd_zilog->zl_replay) {
- ztest_bt_verify(bt, os, lr->lr_foid, 0, offset,
- MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
- bt->bt_crtxg);
- }
-
- /*
- * Set the bt's gen/txg to the bonus buffer's gen/txg
- * so that all of the usual ASSERTs will work.
- */
- ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg,
- crtxg);
- }
-
- if (abuf == NULL) {
- dmu_write(os, lr->lr_foid, offset, length, data, tx);
- } else {
- bcopy(data, abuf->b_data, length);
- dmu_assign_arcbuf(db, offset, abuf, tx);
- }
-
- (void) ztest_log_write(zd, tx, lr);
-
- dmu_buf_rele(db, FTAG);
-
- dmu_tx_commit(tx);
-
- ztest_range_unlock(rl);
- ztest_object_unlock(zd, lr->lr_foid);
-
- return (0);
-}
-
-static int
-ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
-{
- ztest_ds_t *zd = arg1;
- lr_truncate_t *lr = arg2;
- objset_t *os = zd->zd_os;
- dmu_tx_t *tx;
- uint64_t txg;
- rl_t *rl;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- ztest_object_lock(zd, lr->lr_foid, RL_READER);
- rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
- RL_WRITER);
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
-
- txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
- if (txg == 0) {
- ztest_range_unlock(rl);
- ztest_object_unlock(zd, lr->lr_foid);
- return (ENOSPC);
- }
-
- VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
- lr->lr_length, tx) == 0);
-
- (void) ztest_log_truncate(zd, tx, lr);
-
- dmu_tx_commit(tx);
-
- ztest_range_unlock(rl);
- ztest_object_unlock(zd, lr->lr_foid);
-
- return (0);
-}
-
-static int
-ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
-{
- ztest_ds_t *zd = arg1;
- lr_setattr_t *lr = arg2;
- objset_t *os = zd->zd_os;
- dmu_tx_t *tx;
- dmu_buf_t *db;
- ztest_block_tag_t *bbt;
- uint64_t txg, lrtxg, crtxg, dnodesize;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, lr->lr_foid);
-
- txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
- if (txg == 0) {
- dmu_buf_rele(db, FTAG);
- ztest_object_unlock(zd, lr->lr_foid);
- return (ENOSPC);
- }
-
- bbt = ztest_bt_bonus(db);
- ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
- crtxg = bbt->bt_crtxg;
- lrtxg = lr->lr_common.lrc_txg;
- dnodesize = bbt->bt_dnodesize;
-
- if (zd->zd_zilog->zl_replay) {
- ASSERT(lr->lr_size != 0);
- ASSERT(lr->lr_mode != 0);
- ASSERT(lrtxg != 0);
- } else {
- /*
- * Randomly change the size and increment the generation.
- */
- lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
- sizeof (*bbt);
- lr->lr_mode = bbt->bt_gen + 1;
- ASSERT(lrtxg == 0);
- }
-
- /*
- * Verify that the current bonus buffer is not newer than our txg.
- */
- ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
- MAX(txg, lrtxg), crtxg);
-
- dmu_buf_will_dirty(db, tx);
-
- ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
- ASSERT3U(lr->lr_size, <=, db->db_size);
- VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
- bbt = ztest_bt_bonus(db);
-
- ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
- txg, crtxg);
- ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
-
- dmu_buf_rele(db, FTAG);
-
- (void) ztest_log_setattr(zd, tx, lr);
-
- dmu_tx_commit(tx);
-
- ztest_object_unlock(zd, lr->lr_foid);
-
- return (0);
-}
-
-zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
- NULL, /* 0 no such transaction type */
- ztest_replay_create, /* TX_CREATE */
- NULL, /* TX_MKDIR */
- NULL, /* TX_MKXATTR */
- NULL, /* TX_SYMLINK */
- ztest_replay_remove, /* TX_REMOVE */
- NULL, /* TX_RMDIR */
- NULL, /* TX_LINK */
- NULL, /* TX_RENAME */
- ztest_replay_write, /* TX_WRITE */
- ztest_replay_truncate, /* TX_TRUNCATE */
- ztest_replay_setattr, /* TX_SETATTR */
- NULL, /* TX_ACL */
- NULL, /* TX_CREATE_ACL */
- NULL, /* TX_CREATE_ATTR */
- NULL, /* TX_CREATE_ACL_ATTR */
- NULL, /* TX_MKDIR_ACL */
- NULL, /* TX_MKDIR_ATTR */
- NULL, /* TX_MKDIR_ACL_ATTR */
- NULL, /* TX_WRITE2 */
-};
-
-/*
- * ZIL get_data callbacks
- */
-
-/* ARGSUSED */
-static void
-ztest_get_done(zgd_t *zgd, int error)
-{
- ztest_ds_t *zd = zgd->zgd_private;
- uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object;
-
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- ztest_range_unlock((rl_t *)zgd->zgd_lr);
- ztest_object_unlock(zd, object);
-
- umem_free(zgd, sizeof (*zgd));
-}
-
-static int
-ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
- zio_t *zio)
-{
- ztest_ds_t *zd = arg;
- objset_t *os = zd->zd_os;
- uint64_t object = lr->lr_foid;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- uint64_t txg = lr->lr_common.lrc_txg;
- uint64_t crtxg;
- dmu_object_info_t doi;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- ztest_object_lock(zd, object, RL_READER);
- error = dmu_bonus_hold(os, object, FTAG, &db);
- if (error) {
- ztest_object_unlock(zd, object);
- return (error);
- }
-
- crtxg = ztest_bt_bonus(db)->bt_crtxg;
-
- if (crtxg == 0 || crtxg > txg) {
- dmu_buf_rele(db, FTAG);
- ztest_object_unlock(zd, object);
- return (ENOENT);
- }
-
- dmu_object_info_from_db(db, &doi);
- dmu_buf_rele(db, FTAG);
- db = NULL;
-
- zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
- zgd->zgd_lwb = lwb;
- zgd->zgd_private = zd;
-
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
- object, offset, size, RL_READER);
-
- error = dmu_read(os, object, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- ASSERT(error == 0);
- } else {
- size = doi.doi_data_block_size;
- if (ISP2(size)) {
- offset = P2ALIGN(offset, size);
- } else {
- ASSERT(offset < size);
- offset = 0;
- }
-
- zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
- object, offset, size, RL_READER);
-
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
-
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- ztest_get_done, zgd);
-
- if (error == 0)
- return (0);
- }
- }
-
- ztest_get_done(zgd, error);
-
- return (error);
-}
-
-static void *
-ztest_lr_alloc(size_t lrsize, char *name)
-{
- char *lr;
- size_t namesize = name ? strlen(name) + 1 : 0;
-
- lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
-
- if (name)
- bcopy(name, lr + lrsize, namesize);
-
- return (lr);
-}
-
-void
-ztest_lr_free(void *lr, size_t lrsize, char *name)
-{
- size_t namesize = name ? strlen(name) + 1 : 0;
-
- umem_free(lr, lrsize + namesize);
-}
-
-/*
- * Lookup a bunch of objects. Returns the number of objects not found.
- */
-static int
-ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
-{
- int missing = 0;
- int error;
-
- ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
-
- for (int i = 0; i < count; i++, od++) {
- od->od_object = 0;
- error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
- sizeof (uint64_t), 1, &od->od_object);
- if (error) {
- ASSERT(error == ENOENT);
- ASSERT(od->od_object == 0);
- missing++;
- } else {
- dmu_buf_t *db;
- ztest_block_tag_t *bbt;
- dmu_object_info_t doi;
-
- ASSERT(od->od_object != 0);
- ASSERT(missing == 0); /* there should be no gaps */
-
- ztest_object_lock(zd, od->od_object, RL_READER);
- VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
- od->od_object, FTAG, &db));
- dmu_object_info_from_db(db, &doi);
- bbt = ztest_bt_bonus(db);
- ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
- od->od_type = doi.doi_type;
- od->od_blocksize = doi.doi_data_block_size;
- od->od_gen = bbt->bt_gen;
- dmu_buf_rele(db, FTAG);
- ztest_object_unlock(zd, od->od_object);
- }
- }
-
- return (missing);
-}
-
-static int
-ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
-{
- int missing = 0;
-
- ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
-
- for (int i = 0; i < count; i++, od++) {
- if (missing) {
- od->od_object = 0;
- missing++;
- continue;
- }
-
- lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
-
- lr->lr_doid = od->od_dir;
- lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
- lr->lrz_type = od->od_crtype;
- lr->lrz_blocksize = od->od_crblocksize;
- lr->lrz_ibshift = ztest_random_ibshift();
- lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
- lr->lrz_dnodesize = od->od_crdnodesize;
- lr->lr_gen = od->od_crgen;
- lr->lr_crtime[0] = time(NULL);
-
- if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
- ASSERT(missing == 0);
- od->od_object = 0;
- missing++;
- } else {
- od->od_object = lr->lr_foid;
- od->od_type = od->od_crtype;
- od->od_blocksize = od->od_crblocksize;
- od->od_gen = od->od_crgen;
- ASSERT(od->od_object != 0);
- }
-
- ztest_lr_free(lr, sizeof (*lr), od->od_name);
- }
-
- return (missing);
-}
-
-static int
-ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
-{
- int missing = 0;
- int error;
-
- ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
-
- od += count - 1;
-
- for (int i = count - 1; i >= 0; i--, od--) {
- if (missing) {
- missing++;
- continue;
- }
-
- /*
- * No object was found.
- */
- if (od->od_object == 0)
- continue;
-
- lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
-
- lr->lr_doid = od->od_dir;
-
- if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
- ASSERT3U(error, ==, ENOSPC);
- missing++;
- } else {
- od->od_object = 0;
- }
- ztest_lr_free(lr, sizeof (*lr), od->od_name);
- }
-
- return (missing);
-}
-
-static int
-ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
- void *data)
-{
- lr_write_t *lr;
- int error;
-
- lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
-
- lr->lr_foid = object;
- lr->lr_offset = offset;
- lr->lr_length = size;
- lr->lr_blkoff = 0;
- BP_ZERO(&lr->lr_blkptr);
-
- bcopy(data, lr + 1, size);
-
- error = ztest_replay_write(zd, lr, B_FALSE);
-
- ztest_lr_free(lr, sizeof (*lr) + size, NULL);
-
- return (error);
-}
-
-static int
-ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
-{
- lr_truncate_t *lr;
- int error;
-
- lr = ztest_lr_alloc(sizeof (*lr), NULL);
-
- lr->lr_foid = object;
- lr->lr_offset = offset;
- lr->lr_length = size;
-
- error = ztest_replay_truncate(zd, lr, B_FALSE);
-
- ztest_lr_free(lr, sizeof (*lr), NULL);
-
- return (error);
-}
-
-static int
-ztest_setattr(ztest_ds_t *zd, uint64_t object)
-{
- lr_setattr_t *lr;
- int error;
-
- lr = ztest_lr_alloc(sizeof (*lr), NULL);
-
- lr->lr_foid = object;
- lr->lr_size = 0;
- lr->lr_mode = 0;
-
- error = ztest_replay_setattr(zd, lr, B_FALSE);
-
- ztest_lr_free(lr, sizeof (*lr), NULL);
-
- return (error);
-}
-
-static void
-ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
-{
- objset_t *os = zd->zd_os;
- dmu_tx_t *tx;
- uint64_t txg;
- rl_t *rl;
-
- txg_wait_synced(dmu_objset_pool(os), 0);
-
- ztest_object_lock(zd, object, RL_READER);
- rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_write(tx, object, offset, size);
-
- txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
-
- if (txg != 0) {
- dmu_prealloc(os, object, offset, size, tx);
- dmu_tx_commit(tx);
- txg_wait_synced(dmu_objset_pool(os), txg);
- } else {
- (void) dmu_free_long_range(os, object, offset, size);
- }
-
- ztest_range_unlock(rl);
- ztest_object_unlock(zd, object);
-}
-
-static void
-ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
-{
- int err;
- ztest_block_tag_t wbt;
- dmu_object_info_t doi;
- enum ztest_io_type io_type;
- uint64_t blocksize;
- void *data;
-
- VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
- blocksize = doi.doi_data_block_size;
- data = umem_alloc(blocksize, UMEM_NOFAIL);
-
- /*
- * Pick an i/o type at random, biased toward writing block tags.
- */
- io_type = ztest_random(ZTEST_IO_TYPES);
- if (ztest_random(2) == 0)
- io_type = ZTEST_IO_WRITE_TAG;
-
- rw_enter(&zd->zd_zilog_lock, RW_READER);
-
- switch (io_type) {
-
- case ZTEST_IO_WRITE_TAG:
- ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize,
- offset, 0, 0, 0);
- (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
- break;
-
- case ZTEST_IO_WRITE_PATTERN:
- (void) memset(data, 'a' + (object + offset) % 5, blocksize);
- if (ztest_random(2) == 0) {
- /*
- * Induce fletcher2 collisions to ensure that
- * zio_ddt_collision() detects and resolves them
- * when using fletcher2-verify for deduplication.
- */
- ((uint64_t *)data)[0] ^= 1ULL << 63;
- ((uint64_t *)data)[4] ^= 1ULL << 63;
- }
- (void) ztest_write(zd, object, offset, blocksize, data);
- break;
-
- case ZTEST_IO_WRITE_ZEROES:
- bzero(data, blocksize);
- (void) ztest_write(zd, object, offset, blocksize, data);
- break;
-
- case ZTEST_IO_TRUNCATE:
- (void) ztest_truncate(zd, object, offset, blocksize);
- break;
-
- case ZTEST_IO_SETATTR:
- (void) ztest_setattr(zd, object);
- break;
-
- case ZTEST_IO_REWRITE:
- rw_enter(&ztest_name_lock, RW_READER);
- err = ztest_dsl_prop_set_uint64(zd->zd_name,
- ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
- B_FALSE);
- VERIFY(err == 0 || err == ENOSPC);
- err = ztest_dsl_prop_set_uint64(zd->zd_name,
- ZFS_PROP_COMPRESSION,
- ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
- B_FALSE);
- VERIFY(err == 0 || err == ENOSPC);
- rw_exit(&ztest_name_lock);
-
- VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
- DMU_READ_NO_PREFETCH));
-
- (void) ztest_write(zd, object, offset, blocksize, data);
- break;
- }
-
- rw_exit(&zd->zd_zilog_lock);
-
- umem_free(data, blocksize);
-}
-
-/*
- * Initialize an object description template.
- */
-static void
-ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
- dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize,
- uint64_t gen)
-{
- od->od_dir = ZTEST_DIROBJ;
- od->od_object = 0;
-
- od->od_crtype = type;
- od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
- od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize();
- od->od_crgen = gen;
-
- od->od_type = DMU_OT_NONE;
- od->od_blocksize = 0;
- od->od_gen = 0;
-
- (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
- tag, (int64_t)id, index);
-}
-
-/*
- * Lookup or create the objects for a test using the od template.
- * If the objects do not all exist, or if 'remove' is specified,
- * remove any existing objects and create new ones. Otherwise,
- * use the existing objects.
- */
-static int
-ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
-{
- int count = size / sizeof (*od);
- int rv = 0;
-
- mutex_enter(&zd->zd_dirobj_lock);
- if ((ztest_lookup(zd, od, count) != 0 || remove) &&
- (ztest_remove(zd, od, count) != 0 ||
- ztest_create(zd, od, count) != 0))
- rv = -1;
- zd->zd_od = od;
- mutex_exit(&zd->zd_dirobj_lock);
-
- return (rv);
-}
-
-/* ARGSUSED */
-void
-ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
-{
- zilog_t *zilog = zd->zd_zilog;
-
- rw_enter(&zd->zd_zilog_lock, RW_READER);
-
- zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
-
- /*
- * Remember the committed values in zd, which is in parent/child
- * shared memory. If we die, the next iteration of ztest_run()
- * will verify that the log really does contain this record.
- */
- mutex_enter(&zilog->zl_lock);
- ASSERT(zd->zd_shared != NULL);
- ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
- zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
- mutex_exit(&zilog->zl_lock);
-
- rw_exit(&zd->zd_zilog_lock);
-}
-
-/*
- * This function is designed to simulate the operations that occur during a
- * mount/unmount operation. We hold the dataset across these operations in an
- * attempt to expose any implicit assumptions about ZIL management.
- */
-/* ARGSUSED */
-void
-ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
-
- /*
- * We grab the zd_dirobj_lock to ensure that no other thread is
- * updating the zil (i.e. adding in-memory log records) and the
- * zd_zilog_lock to block any I/O.
- */
- mutex_enter(&zd->zd_dirobj_lock);
- rw_enter(&zd->zd_zilog_lock, RW_WRITER);
-
- /* zfsvfs_teardown() */
- zil_close(zd->zd_zilog);
-
- /* zfsvfs_setup() */
- VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
- zil_replay(os, zd, ztest_replay_vector);
-
- rw_exit(&zd->zd_zilog_lock);
- mutex_exit(&zd->zd_dirobj_lock);
-}
-
-/*
- * Verify that we can't destroy an active pool, create an existing pool,
- * or create a pool with a bad vdev spec.
- */
-/* ARGSUSED */
-void
-ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_opts_t *zo = &ztest_opts;
- spa_t *spa;
- nvlist_t *nvroot;
-
- if (zo->zo_mmp_test)
- return;
-
- /*
- * Attempt to create using a bad file.
- */
- nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
- VERIFY3U(ENOENT, ==,
- spa_create("ztest_bad_file", nvroot, NULL, NULL));
- nvlist_free(nvroot);
-
- /*
- * Attempt to create using a bad mirror.
- */
- nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1);
- VERIFY3U(ENOENT, ==,
- spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
- nvlist_free(nvroot);
-
- /*
- * Attempt to create an existing pool. It shouldn't matter
- * what's in the nvroot; we should fail with EEXIST.
- */
- rw_enter(&ztest_name_lock, RW_READER);
- nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
- VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
- nvlist_free(nvroot);
- VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
- VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
- spa_close(spa, FTAG);
-
- rw_exit(&ztest_name_lock);
-}
-
-/*
- * Start and then stop the MMP threads to ensure the startup and shutdown code
- * works properly. Actual protection and property-related code tested via ZTS.
- */
-/* ARGSUSED */
-void
-ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_opts_t *zo = &ztest_opts;
- spa_t *spa = ztest_spa;
-
- if (zo->zo_mmp_test)
- return;
-
- /*
- * Since enabling MMP involves setting a property, it could not be done
- * while the pool is suspended.
- */
- if (spa_suspended(spa))
- return;
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- mutex_enter(&spa->spa_props_lock);
-
- zfs_multihost_fail_intervals = 0;
-
- if (!spa_multihost(spa)) {
- spa->spa_multihost = B_TRUE;
- mmp_thread_start(spa);
- }
-
- mutex_exit(&spa->spa_props_lock);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-
- txg_wait_synced(spa_get_dsl(spa), 0);
- mmp_signal_all_threads();
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- mutex_enter(&spa->spa_props_lock);
-
- if (spa_multihost(spa)) {
- mmp_thread_stop(spa);
- spa->spa_multihost = B_FALSE;
- }
-
- mutex_exit(&spa->spa_props_lock);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-}
-
-/* ARGSUSED */
-void
-ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
-{
- spa_t *spa;
- uint64_t initial_version = SPA_VERSION_INITIAL;
- uint64_t version, newversion;
- nvlist_t *nvroot, *props;
- char *name;
-
- if (ztest_opts.zo_mmp_test)
- return;
-
- mutex_enter(&ztest_vdev_lock);
- name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
-
- /*
- * Clean up from previous runs.
- */
- (void) spa_destroy(name);
-
- nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
- NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
-
- /*
- * If we're configuring a RAIDZ device then make sure that the
- * the initial version is capable of supporting that feature.
- */
- switch (ztest_opts.zo_raidz_parity) {
- case 0:
- case 1:
- initial_version = SPA_VERSION_INITIAL;
- break;
- case 2:
- initial_version = SPA_VERSION_RAIDZ2;
- break;
- case 3:
- initial_version = SPA_VERSION_RAIDZ3;
- break;
- }
-
- /*
- * Create a pool with a spa version that can be upgraded. Pick
- * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
- */
- do {
- version = ztest_random_spa_version(initial_version);
- } while (version > SPA_VERSION_BEFORE_FEATURES);
-
- props = fnvlist_alloc();
- fnvlist_add_uint64(props,
- zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
- VERIFY0(spa_create(name, nvroot, props, NULL));
- fnvlist_free(nvroot);
- fnvlist_free(props);
-
- VERIFY0(spa_open(name, &spa, FTAG));
- VERIFY3U(spa_version(spa), ==, version);
- newversion = ztest_random_spa_version(version + 1);
-
- if (ztest_opts.zo_verbose >= 4) {
- (void) printf("upgrading spa version from %llu to %llu\n",
- (u_longlong_t)version, (u_longlong_t)newversion);
- }
-
- spa_upgrade(spa, newversion);
- VERIFY3U(spa_version(spa), >, version);
- VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
- zpool_prop_to_name(ZPOOL_PROP_VERSION)));
- spa_close(spa, FTAG);
-
- strfree(name);
- mutex_exit(&ztest_vdev_lock);
-}
-
-static void
-ztest_spa_checkpoint(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
-
- int error = spa_checkpoint(spa->spa_name);
-
- switch (error) {
- case 0:
- case ZFS_ERR_DEVRM_IN_PROGRESS:
- case ZFS_ERR_DISCARDING_CHECKPOINT:
- case ZFS_ERR_CHECKPOINT_EXISTS:
- break;
- case ENOSPC:
- ztest_record_enospc(FTAG);
- break;
- default:
- fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error);
- }
-}
-
-static void
-ztest_spa_discard_checkpoint(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
-
- int error = spa_checkpoint_discard(spa->spa_name);
-
- switch (error) {
- case 0:
- case ZFS_ERR_DISCARDING_CHECKPOINT:
- case ZFS_ERR_NO_CHECKPOINT:
- break;
- default:
- fatal(0, "spa_discard_checkpoint(%s) = %d",
- spa->spa_name, error);
- }
-
-}
-
-/* ARGSUSED */
-void
-ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id)
-{
- spa_t *spa = ztest_spa;
-
- mutex_enter(&ztest_checkpoint_lock);
- if (ztest_random(2) == 0) {
- ztest_spa_checkpoint(spa);
- } else {
- ztest_spa_discard_checkpoint(spa);
- }
- mutex_exit(&ztest_checkpoint_lock);
-}
-
-
-static vdev_t *
-vdev_lookup_by_path(vdev_t *vd, const char *path)
-{
- vdev_t *mvd;
-
- if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
- return (vd);
-
- for (int c = 0; c < vd->vdev_children; c++)
- if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
- NULL)
- return (mvd);
-
- return (NULL);
-}
-
-/*
- * Find the first available hole which can be used as a top-level.
- */
-int
-find_vdev_hole(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- int c;
-
- ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
-
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *cvd = rvd->vdev_child[c];
-
- if (cvd->vdev_ishole)
- break;
- }
- return (c);
-}
-
-/*
- * Verify that vdev_add() works as expected.
- */
-/* ARGSUSED */
-void
-ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_t *zs = ztest_shared;
- spa_t *spa = ztest_spa;
- uint64_t leaves;
- uint64_t guid;
- nvlist_t *nvroot;
- int error;
-
- if (ztest_opts.zo_mmp_test)
- return;
-
- mutex_enter(&ztest_vdev_lock);
- leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
- ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
-
- /*
- * If we have slogs then remove them 1/4 of the time.
- */
- if (spa_has_slogs(spa) && ztest_random(4) == 0) {
- metaslab_group_t *mg;
-
- /*
- * find the first real slog in log allocation class
- */
- mg = spa_log_class(spa)->mc_rotor;
- while (!mg->mg_vd->vdev_islog)
- mg = mg->mg_next;
-
- guid = mg->mg_vd->vdev_guid;
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- /*
- * We have to grab the zs_name_lock as writer to
- * prevent a race between removing a slog (dmu_objset_find)
- * and destroying a dataset. Removing the slog will
- * grab a reference on the dataset which may cause
- * dmu_objset_destroy() to fail with EBUSY thus
- * leaving the dataset in an inconsistent state.
- */
- rw_enter(&ztest_name_lock, RW_WRITER);
- error = spa_vdev_remove(spa, guid, B_FALSE);
- rw_exit(&ztest_name_lock);
-
- switch (error) {
- case 0:
- case EEXIST:
- case ZFS_ERR_CHECKPOINT_EXISTS:
- case ZFS_ERR_DISCARDING_CHECKPOINT:
- break;
- default:
- fatal(0, "spa_vdev_remove() = %d", error);
- }
- } else {
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- /*
- * Make 1/4 of the devices be log devices
- */
- nvroot = make_vdev_root(NULL, NULL, NULL,
- ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
- "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
-
- error = spa_vdev_add(spa, nvroot);
- nvlist_free(nvroot);
-
- switch (error) {
- case 0:
- break;
- case ENOSPC:
- ztest_record_enospc("spa_vdev_add");
- break;
- default:
- fatal(0, "spa_vdev_add() = %d", error);
- }
- }
-
- mutex_exit(&ztest_vdev_lock);
-}
-
-/* ARGSUSED */
-void
-ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_t *zs = ztest_shared;
- spa_t *spa = ztest_spa;
- uint64_t leaves;
- nvlist_t *nvroot;
- const char *class = (ztest_random(2) == 0) ?
- VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP;
- int error;
-
- /*
- * By default add a special vdev 50% of the time
- */
- if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) ||
- (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND &&
- ztest_random(2) == 0)) {
- return;
- }
-
- mutex_enter(&ztest_vdev_lock);
-
- /* Only test with mirrors */
- if (zs->zs_mirrors < 2) {
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- /* requires feature@allocation_classes */
- if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) {
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
- class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
-
- error = spa_vdev_add(spa, nvroot);
- nvlist_free(nvroot);
-
- if (error == ENOSPC)
- ztest_record_enospc("spa_vdev_add");
- else if (error != 0)
- fatal(0, "spa_vdev_add() = %d", error);
-
- /*
- * 50% of the time allow small blocks in the special class
- */
- if (error == 0 &&
- spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) {
- if (ztest_opts.zo_verbose >= 3)
- (void) printf("Enabling special VDEV small blocks\n");
- (void) ztest_dsl_prop_set_uint64(zd->zd_name,
- ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE);
- }
-
- mutex_exit(&ztest_vdev_lock);
-
- if (ztest_opts.zo_verbose >= 3) {
- metaslab_class_t *mc;
-
- if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0)
- mc = spa_special_class(spa);
- else
- mc = spa_dedup_class(spa);
- (void) printf("Added a %s mirrored vdev (of %d)\n",
- class, (int)mc->mc_groups);
- }
-}
-
-/*
- * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
- */
-/* ARGSUSED */
-void
-ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_t *zs = ztest_shared;
- spa_t *spa = ztest_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- spa_aux_vdev_t *sav;
- char *aux;
- uint64_t guid = 0;
- int error;
-
- if (ztest_opts.zo_mmp_test)
- return;
-
- if (ztest_random(2) == 0) {
- sav = &spa->spa_spares;
- aux = ZPOOL_CONFIG_SPARES;
- } else {
- sav = &spa->spa_l2cache;
- aux = ZPOOL_CONFIG_L2CACHE;
- }
-
- mutex_enter(&ztest_vdev_lock);
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
- if (sav->sav_count != 0 && ztest_random(4) == 0) {
- /*
- * Pick a random device to remove.
- */
- guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
- } else {
- /*
- * Find an unused device we can add.
- */
- zs->zs_vdev_aux = 0;
- for (;;) {
- char path[MAXPATHLEN];
- int c;
- (void) snprintf(path, sizeof (path), ztest_aux_template,
- ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
- zs->zs_vdev_aux);
- for (c = 0; c < sav->sav_count; c++)
- if (strcmp(sav->sav_vdevs[c]->vdev_path,
- path) == 0)
- break;
- if (c == sav->sav_count &&
- vdev_lookup_by_path(rvd, path) == NULL)
- break;
- zs->zs_vdev_aux++;
- }
- }
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- if (guid == 0) {
- /*
- * Add a new device.
- */
- nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
- (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
- error = spa_vdev_add(spa, nvroot);
-
- switch (error) {
- case 0:
- break;
- default:
- fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
- }
- nvlist_free(nvroot);
- } else {
- /*
- * Remove an existing device. Sometimes, dirty its
- * vdev state first to make sure we handle removal
- * of devices that have pending state changes.
- */
- if (ztest_random(2) == 0)
- (void) vdev_online(spa, guid, 0, NULL);
-
- error = spa_vdev_remove(spa, guid, B_FALSE);
-
- switch (error) {
- case 0:
- case EBUSY:
- case ZFS_ERR_CHECKPOINT_EXISTS:
- case ZFS_ERR_DISCARDING_CHECKPOINT:
- break;
- default:
- fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
- }
- }
-
- mutex_exit(&ztest_vdev_lock);
-}
-
-/*
- * split a pool if it has mirror tlvdevs
- */
-/* ARGSUSED */
-void
-ztest_split_pool(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_t *zs = ztest_shared;
- spa_t *spa = ztest_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- nvlist_t *tree, **child, *config, *split, **schild;
- uint_t c, children, schildren = 0, lastlogid = 0;
- int error = 0;
-
- if (ztest_opts.zo_mmp_test)
- return;
-
- mutex_enter(&ztest_vdev_lock);
-
- /* ensure we have a useable config; mirrors of raidz aren't supported */
- if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- /* clean up the old pool, if any */
- (void) spa_destroy("splitp");
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
- /* generate a config from the existing config */
- mutex_enter(&spa->spa_props_lock);
- VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
- &tree) == 0);
- mutex_exit(&spa->spa_props_lock);
-
- VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
- &children) == 0);
-
- schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
- for (c = 0; c < children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- nvlist_t **mchild;
- uint_t mchildren;
-
- if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
- VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
- 0) == 0);
- VERIFY(nvlist_add_string(schild[schildren],
- ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
- VERIFY(nvlist_add_uint64(schild[schildren],
- ZPOOL_CONFIG_IS_HOLE, 1) == 0);
- if (lastlogid == 0)
- lastlogid = schildren;
- ++schildren;
- continue;
- }
- lastlogid = 0;
- VERIFY(nvlist_lookup_nvlist_array(child[c],
- ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
- VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
- }
-
- /* OK, create a config that can be used to split */
- VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_ROOT) == 0);
- VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
- lastlogid != 0 ? lastlogid : schildren) == 0);
-
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
-
- for (c = 0; c < schildren; c++)
- nvlist_free(schild[c]);
- free(schild);
- nvlist_free(split);
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- rw_enter(&ztest_name_lock, RW_WRITER);
- error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
- rw_exit(&ztest_name_lock);
-
- nvlist_free(config);
-
- if (error == 0) {
- (void) printf("successful split - results:\n");
- mutex_enter(&spa_namespace_lock);
- show_pool_stats(spa);
- show_pool_stats(spa_lookup("splitp"));
- mutex_exit(&spa_namespace_lock);
- ++zs->zs_splits;
- --zs->zs_mirrors;
- }
- mutex_exit(&ztest_vdev_lock);
-}
-
-/*
- * Verify that we can attach and detach devices.
- */
-/* ARGSUSED */
-void
-ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_t *zs = ztest_shared;
- spa_t *spa = ztest_spa;
- spa_aux_vdev_t *sav = &spa->spa_spares;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *oldvd, *newvd, *pvd;
- nvlist_t *root;
- uint64_t leaves;
- uint64_t leaf, top;
- uint64_t ashift = ztest_get_ashift();
- uint64_t oldguid, pguid;
- uint64_t oldsize, newsize;
- char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
- int replacing;
- int oldvd_has_siblings = B_FALSE;
- int newvd_is_spare = B_FALSE;
- int oldvd_is_log;
- int error, expected_error;
-
- if (ztest_opts.zo_mmp_test)
- return;
-
- mutex_enter(&ztest_vdev_lock);
- leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
- /*
- * If a vdev is in the process of being removed, its removal may
- * finish while we are in progress, leading to an unexpected error
- * value. Don't bother trying to attach while we are in the middle
- * of removal.
- */
- if (ztest_device_removal_active) {
- spa_config_exit(spa, SCL_ALL, FTAG);
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- /*
- * Decide whether to do an attach or a replace.
- */
- replacing = ztest_random(2);
-
- /*
- * Pick a random top-level vdev.
- */
- top = ztest_random_vdev_top(spa, B_TRUE);
-
- /*
- * Pick a random leaf within it.
- */
- leaf = ztest_random(leaves);
-
- /*
- * Locate this vdev.
- */
- oldvd = rvd->vdev_child[top];
-
- /* pick a child from the mirror */
- if (zs->zs_mirrors >= 1) {
- ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
- ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
- oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
- }
-
- /* pick a child out of the raidz group */
- if (ztest_opts.zo_raidz > 1) {
- ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
- ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
- oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
- }
-
- /*
- * If we're already doing an attach or replace, oldvd may be a
- * mirror vdev -- in which case, pick a random child.
- */
- while (oldvd->vdev_children != 0) {
- oldvd_has_siblings = B_TRUE;
- ASSERT(oldvd->vdev_children >= 2);
- oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
- }
-
- oldguid = oldvd->vdev_guid;
- oldsize = vdev_get_min_asize(oldvd);
- oldvd_is_log = oldvd->vdev_top->vdev_islog;
- (void) strcpy(oldpath, oldvd->vdev_path);
- pvd = oldvd->vdev_parent;
- pguid = pvd->vdev_guid;
-
- /*
- * If oldvd has siblings, then half of the time, detach it.
- */
- if (oldvd_has_siblings && ztest_random(2) == 0) {
- spa_config_exit(spa, SCL_ALL, FTAG);
- error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
- if (error != 0 && error != ENODEV && error != EBUSY &&
- error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS &&
- error != ZFS_ERR_DISCARDING_CHECKPOINT)
- fatal(0, "detach (%s) returned %d", oldpath, error);
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- /*
- * For the new vdev, choose with equal probability between the two
- * standard paths (ending in either 'a' or 'b') or a random hot spare.
- */
- if (sav->sav_count != 0 && ztest_random(3) == 0) {
- newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
- newvd_is_spare = B_TRUE;
- (void) strcpy(newpath, newvd->vdev_path);
- } else {
- (void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
- ztest_opts.zo_dir, ztest_opts.zo_pool,
- top * leaves + leaf);
- if (ztest_random(2) == 0)
- newpath[strlen(newpath) - 1] = 'b';
- newvd = vdev_lookup_by_path(rvd, newpath);
- }
-
- if (newvd) {
- /*
- * Reopen to ensure the vdev's asize field isn't stale.
- */
- vdev_reopen(newvd);
- newsize = vdev_get_min_asize(newvd);
- } else {
- /*
- * Make newsize a little bigger or smaller than oldsize.
- * If it's smaller, the attach should fail.
- * If it's larger, and we're doing a replace,
- * we should get dynamic LUN growth when we're done.
- */
- newsize = 10 * oldsize / (9 + ztest_random(3));
- }
-
- /*
- * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
- * unless it's a replace; in that case any non-replacing parent is OK.
- *
- * If newvd is already part of the pool, it should fail with EBUSY.
- *
- * If newvd is too small, it should fail with EOVERFLOW.
- */
- if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops && (!replacing ||
- pvd->vdev_ops == &vdev_replacing_ops ||
- pvd->vdev_ops == &vdev_spare_ops))
- expected_error = ENOTSUP;
- else if (newvd_is_spare && (!replacing || oldvd_is_log))
- expected_error = ENOTSUP;
- else if (newvd == oldvd)
- expected_error = replacing ? 0 : EBUSY;
- else if (vdev_lookup_by_path(rvd, newpath) != NULL)
- expected_error = EBUSY;
- else if (newsize < oldsize)
- expected_error = EOVERFLOW;
- else if (ashift > oldvd->vdev_top->vdev_ashift)
- expected_error = EDOM;
- else
- expected_error = 0;
-
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- /*
- * Build the nvlist describing newpath.
- */
- root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
- ashift, NULL, 0, 0, 1);
-
- error = spa_vdev_attach(spa, oldguid, root, replacing);
-
- nvlist_free(root);
-
- /*
- * If our parent was the replacing vdev, but the replace completed,
- * then instead of failing with ENOTSUP we may either succeed,
- * fail with ENODEV, or fail with EOVERFLOW.
- */
- if (expected_error == ENOTSUP &&
- (error == 0 || error == ENODEV || error == EOVERFLOW))
- expected_error = error;
-
- /*
- * If someone grew the LUN, the replacement may be too small.
- */
- if (error == EOVERFLOW || error == EBUSY)
- expected_error = error;
-
- if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
- error == ZFS_ERR_DISCARDING_CHECKPOINT)
- expected_error = error;
-
- /* XXX workaround 6690467 */
- if (error != expected_error && expected_error != EBUSY) {
- fatal(0, "attach (%s %llu, %s %llu, %d) "
- "returned %d, expected %d",
- oldpath, oldsize, newpath,
- newsize, replacing, error, expected_error);
- }
-
- mutex_exit(&ztest_vdev_lock);
-}
-
-/* ARGSUSED */
-void
-ztest_device_removal(ztest_ds_t *zd, uint64_t id)
-{
- spa_t *spa = ztest_spa;
- vdev_t *vd;
- uint64_t guid;
- int error;
-
- mutex_enter(&ztest_vdev_lock);
-
- if (ztest_device_removal_active) {
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- /*
- * Remove a random top-level vdev and wait for removal to finish.
- */
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
- guid = vd->vdev_guid;
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- error = spa_vdev_remove(spa, guid, B_FALSE);
- if (error == 0) {
- ztest_device_removal_active = B_TRUE;
- mutex_exit(&ztest_vdev_lock);
-
- while (spa->spa_vdev_removal != NULL)
- txg_wait_synced(spa_get_dsl(spa), 0);
- } else {
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- /*
- * The pool needs to be scrubbed after completing device removal.
- * Failure to do so may result in checksum errors due to the
- * strategy employed by ztest_fault_inject() when selecting which
- * offset are redundant and can be damaged.
- */
- error = spa_scan(spa, POOL_SCAN_SCRUB);
- if (error == 0) {
- while (dsl_scan_scrubbing(spa_get_dsl(spa)))
- txg_wait_synced(spa_get_dsl(spa), 0);
- }
-
- mutex_enter(&ztest_vdev_lock);
- ztest_device_removal_active = B_FALSE;
- mutex_exit(&ztest_vdev_lock);
-}
-
-/*
- * Callback function which expands the physical size of the vdev.
- */
-vdev_t *
-grow_vdev(vdev_t *vd, void *arg)
-{
- spa_t *spa = vd->vdev_spa;
- size_t *newsize = arg;
- size_t fsize;
- int fd;
-
- ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
- return (vd);
-
- fsize = lseek(fd, 0, SEEK_END);
- (void) ftruncate(fd, *newsize);
-
- if (ztest_opts.zo_verbose >= 6) {
- (void) printf("%s grew from %lu to %lu bytes\n",
- vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
- }
- (void) close(fd);
- return (NULL);
-}
-
-/*
- * Callback function which expands a given vdev by calling vdev_online().
- */
-/* ARGSUSED */
-vdev_t *
-online_vdev(vdev_t *vd, void *arg)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *tvd = vd->vdev_top;
- uint64_t guid = vd->vdev_guid;
- uint64_t generation = spa->spa_config_generation + 1;
- vdev_state_t newstate = VDEV_STATE_UNKNOWN;
- int error;
-
- ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- /* Calling vdev_online will initialize the new metaslabs */
- spa_config_exit(spa, SCL_STATE, spa);
- error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
- spa_config_enter(spa, SCL_STATE, spa, RW_READER);
-
- /*
- * If vdev_online returned an error or the underlying vdev_open
- * failed then we abort the expand. The only way to know that
- * vdev_open fails is by checking the returned newstate.
- */
- if (error || newstate != VDEV_STATE_HEALTHY) {
- if (ztest_opts.zo_verbose >= 5) {
- (void) printf("Unable to expand vdev, state %llu, "
- "error %d\n", (u_longlong_t)newstate, error);
- }
- return (vd);
- }
- ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
-
- /*
- * Since we dropped the lock we need to ensure that we're
- * still talking to the original vdev. It's possible this
- * vdev may have been detached/replaced while we were
- * trying to online it.
- */
- if (generation != spa->spa_config_generation) {
- if (ztest_opts.zo_verbose >= 5) {
- (void) printf("vdev configuration has changed, "
- "guid %llu, state %llu, expected gen %llu, "
- "got gen %llu\n",
- (u_longlong_t)guid,
- (u_longlong_t)tvd->vdev_state,
- (u_longlong_t)generation,
- (u_longlong_t)spa->spa_config_generation);
- }
- return (vd);
- }
- return (NULL);
-}
-
-/*
- * Traverse the vdev tree calling the supplied function.
- * We continue to walk the tree until we either have walked all
- * children or we receive a non-NULL return from the callback.
- * If a NULL callback is passed, then we just return back the first
- * leaf vdev we encounter.
- */
-vdev_t *
-vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
-{
- if (vd->vdev_ops->vdev_op_leaf) {
- if (func == NULL)
- return (vd);
- else
- return (func(vd, arg));
- }
-
- for (uint_t c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
- return (cvd);
- }
- return (NULL);
-}
-
-/*
- * Verify that dynamic LUN growth works as expected.
- */
-/* ARGSUSED */
-void
-ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
-{
- spa_t *spa = ztest_spa;
- vdev_t *vd, *tvd;
- metaslab_class_t *mc;
- metaslab_group_t *mg;
- size_t psize, newsize;
- uint64_t top;
- uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
-
- mutex_enter(&ztest_checkpoint_lock);
- mutex_enter(&ztest_vdev_lock);
- spa_config_enter(spa, SCL_STATE, spa, RW_READER);
-
- /*
- * If there is a vdev removal in progress, it could complete while
- * we are running, in which case we would not be able to verify
- * that the metaslab_class space increased (because it decreases
- * when the device removal completes).
- */
- if (ztest_device_removal_active) {
- spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&ztest_vdev_lock);
- mutex_exit(&ztest_checkpoint_lock);
- return;
- }
-
- top = ztest_random_vdev_top(spa, B_TRUE);
-
- tvd = spa->spa_root_vdev->vdev_child[top];
- mg = tvd->vdev_mg;
- mc = mg->mg_class;
- old_ms_count = tvd->vdev_ms_count;
- old_class_space = metaslab_class_get_space(mc);
-
- /*
- * Determine the size of the first leaf vdev associated with
- * our top-level device.
- */
- vd = vdev_walk_tree(tvd, NULL, NULL);
- ASSERT3P(vd, !=, NULL);
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- psize = vd->vdev_psize;
-
- /*
- * We only try to expand the vdev if it's healthy, less than 4x its
- * original size, and it has a valid psize.
- */
- if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
- psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
- spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&ztest_vdev_lock);
- mutex_exit(&ztest_checkpoint_lock);
- return;
- }
- ASSERT(psize > 0);
- newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE);
- ASSERT3U(newsize, >, psize);
-
- if (ztest_opts.zo_verbose >= 6) {
- (void) printf("Expanding LUN %s from %lu to %lu\n",
- vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
- }
-
- /*
- * Growing the vdev is a two step process:
- * 1). expand the physical size (i.e. relabel)
- * 2). online the vdev to create the new metaslabs
- */
- if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
- vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
- tvd->vdev_state != VDEV_STATE_HEALTHY) {
- if (ztest_opts.zo_verbose >= 5) {
- (void) printf("Could not expand LUN because "
- "the vdev configuration changed.\n");
- }
- spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&ztest_vdev_lock);
- mutex_exit(&ztest_checkpoint_lock);
- return;
- }
-
- spa_config_exit(spa, SCL_STATE, spa);
-
- /*
- * Expanding the LUN will update the config asynchronously,
- * thus we must wait for the async thread to complete any
- * pending tasks before proceeding.
- */
- for (;;) {
- boolean_t done;
- mutex_enter(&spa->spa_async_lock);
- done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
- mutex_exit(&spa->spa_async_lock);
- if (done)
- break;
- txg_wait_synced(spa_get_dsl(spa), 0);
- (void) poll(NULL, 0, 100);
- }
-
- spa_config_enter(spa, SCL_STATE, spa, RW_READER);
-
- tvd = spa->spa_root_vdev->vdev_child[top];
- new_ms_count = tvd->vdev_ms_count;
- new_class_space = metaslab_class_get_space(mc);
-
- if (tvd->vdev_mg != mg || mg->mg_class != mc) {
- if (ztest_opts.zo_verbose >= 5) {
- (void) printf("Could not verify LUN expansion due to "
- "intervening vdev offline or remove.\n");
- }
- spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&ztest_vdev_lock);
- mutex_exit(&ztest_checkpoint_lock);
- return;
- }
-
- /*
- * Make sure we were able to grow the vdev.
- */
- if (new_ms_count <= old_ms_count) {
- fatal(0, "LUN expansion failed: ms_count %llu < %llu\n",
- old_ms_count, new_ms_count);
- }
-
- /*
- * Make sure we were able to grow the pool.
- */
- if (new_class_space <= old_class_space) {
- fatal(0, "LUN expansion failed: class_space %llu < %llu\n",
- old_class_space, new_class_space);
- }
-
- if (ztest_opts.zo_verbose >= 5) {
- char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];
-
- nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
- nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
- (void) printf("%s grew from %s to %s\n",
- spa->spa_name, oldnumbuf, newnumbuf);
- }
-
- spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&ztest_vdev_lock);
- mutex_exit(&ztest_checkpoint_lock);
-}
-
-/*
- * Verify that dmu_objset_{create,destroy,open,close} work as expected.
- */
-/* ARGSUSED */
-static void
-ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
-{
- /*
- * Create the objects common to all ztest datasets.
- */
- VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
- DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
-}
-
-static int
-ztest_dataset_create(char *dsname)
-{
- uint64_t zilset = ztest_random(100);
- int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
- ztest_objset_create_cb, NULL);
-
- if (err || zilset < 80)
- return (err);
-
- if (ztest_opts.zo_verbose >= 6)
- (void) printf("Setting dataset %s to sync always\n", dsname);
- return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
- ZFS_SYNC_ALWAYS, B_FALSE));
-}
-
-/* ARGSUSED */
-static int
-ztest_objset_destroy_cb(const char *name, void *arg)
-{
- objset_t *os;
- dmu_object_info_t doi;
- int error;
-
- /*
- * Verify that the dataset contains a directory object.
- */
- VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
- error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
- if (error != ENOENT) {
- /* We could have crashed in the middle of destroying it */
- ASSERT0(error);
- ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
- ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
- }
- dmu_objset_disown(os, FTAG);
-
- /*
- * Destroy the dataset.
- */
- if (strchr(name, '@') != NULL) {
- VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
- } else {
- VERIFY0(dsl_destroy_head(name));
- }
- return (0);
-}
-
-static boolean_t
-ztest_snapshot_create(char *osname, uint64_t id)
-{
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- int error;
-
- (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
-
- error = dmu_objset_snapshot_one(osname, snapname);
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- return (B_FALSE);
- }
- if (error != 0 && error != EEXIST) {
- fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
- snapname, error);
- }
- return (B_TRUE);
-}
-
-static boolean_t
-ztest_snapshot_destroy(char *osname, uint64_t id)
-{
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- int error;
-
- (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
- (u_longlong_t)id);
-
- error = dsl_destroy_snapshot(snapname, B_FALSE);
- if (error != 0 && error != ENOENT)
- fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
- return (B_TRUE);
-}
-
-/* ARGSUSED */
-void
-ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
-{
- ztest_ds_t zdtmp;
- int iters;
- int error;
- objset_t *os, *os2;
- char name[ZFS_MAX_DATASET_NAME_LEN];
- zilog_t *zilog;
-
- rw_enter(&ztest_name_lock, RW_READER);
-
- (void) snprintf(name, sizeof (name), "%s/temp_%llu",
- ztest_opts.zo_pool, (u_longlong_t)id);
-
- /*
- * If this dataset exists from a previous run, process its replay log
- * half of the time. If we don't replay it, then dmu_objset_destroy()
- * (invoked from ztest_objset_destroy_cb()) should just throw it away.
- */
- if (ztest_random(2) == 0 &&
- dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
- ztest_zd_init(&zdtmp, NULL, os);
- zil_replay(os, &zdtmp, ztest_replay_vector);
- ztest_zd_fini(&zdtmp);
- dmu_objset_disown(os, FTAG);
- }
-
- /*
- * There may be an old instance of the dataset we're about to
- * create lying around from a previous run. If so, destroy it
- * and all of its snapshots.
- */
- (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
- DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
-
- /*
- * Verify that the destroyed dataset is no longer in the namespace.
- */
- VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
- FTAG, &os));
-
- /*
- * Verify that we can create a new dataset.
- */
- error = ztest_dataset_create(name);
- if (error) {
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- rw_exit(&ztest_name_lock);
- return;
- }
- fatal(0, "dmu_objset_create(%s) = %d", name, error);
- }
-
- VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
-
- ztest_zd_init(&zdtmp, NULL, os);
-
- /*
- * Open the intent log for it.
- */
- zilog = zil_open(os, ztest_get_data);
-
- /*
- * Put some objects in there, do a little I/O to them,
- * and randomly take a couple of snapshots along the way.
- */
- iters = ztest_random(5);
- for (int i = 0; i < iters; i++) {
- ztest_dmu_object_alloc_free(&zdtmp, id);
- if (ztest_random(iters) == 0)
- (void) ztest_snapshot_create(name, i);
- }
-
- /*
- * Verify that we cannot create an existing dataset.
- */
- VERIFY3U(EEXIST, ==,
- dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
-
- /*
- * Verify that we can hold an objset that is also owned.
- */
- VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
- dmu_objset_rele(os2, FTAG);
-
- /*
- * Verify that we cannot own an objset that is already owned.
- */
- VERIFY3U(EBUSY, ==,
- dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
-
- zil_close(zilog);
- dmu_objset_disown(os, FTAG);
- ztest_zd_fini(&zdtmp);
-
- rw_exit(&ztest_name_lock);
-}
-
-/*
- * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
- */
-void
-ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
-{
- rw_enter(&ztest_name_lock, RW_READER);
- (void) ztest_snapshot_destroy(zd->zd_name, id);
- (void) ztest_snapshot_create(zd->zd_name, id);
- rw_exit(&ztest_name_lock);
-}
-
-/*
- * Cleanup non-standard snapshots and clones.
- */
-void
-ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
-{
- char snap1name[ZFS_MAX_DATASET_NAME_LEN];
- char clone1name[ZFS_MAX_DATASET_NAME_LEN];
- char snap2name[ZFS_MAX_DATASET_NAME_LEN];
- char clone2name[ZFS_MAX_DATASET_NAME_LEN];
- char snap3name[ZFS_MAX_DATASET_NAME_LEN];
- int error;
-
- (void) snprintf(snap1name, sizeof (snap1name),
- "%s@s1_%llu", osname, id);
- (void) snprintf(clone1name, sizeof (clone1name),
- "%s/c1_%llu", osname, id);
- (void) snprintf(snap2name, sizeof (snap2name),
- "%s@s2_%llu", clone1name, id);
- (void) snprintf(clone2name, sizeof (clone2name),
- "%s/c2_%llu", osname, id);
- (void) snprintf(snap3name, sizeof (snap3name),
- "%s@s3_%llu", clone1name, id);
-
- error = dsl_destroy_head(clone2name);
- if (error && error != ENOENT)
- fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
- error = dsl_destroy_snapshot(snap3name, B_FALSE);
- if (error && error != ENOENT)
- fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
- error = dsl_destroy_snapshot(snap2name, B_FALSE);
- if (error && error != ENOENT)
- fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
- error = dsl_destroy_head(clone1name);
- if (error && error != ENOENT)
- fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
- error = dsl_destroy_snapshot(snap1name, B_FALSE);
- if (error && error != ENOENT)
- fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
-}
-
-/*
- * Verify dsl_dataset_promote handles EBUSY
- */
-void
-ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os;
- char snap1name[ZFS_MAX_DATASET_NAME_LEN];
- char clone1name[ZFS_MAX_DATASET_NAME_LEN];
- char snap2name[ZFS_MAX_DATASET_NAME_LEN];
- char clone2name[ZFS_MAX_DATASET_NAME_LEN];
- char snap3name[ZFS_MAX_DATASET_NAME_LEN];
- char *osname = zd->zd_name;
- int error;
-
- rw_enter(&ztest_name_lock, RW_READER);
-
- ztest_dsl_dataset_cleanup(osname, id);
-
- (void) snprintf(snap1name, sizeof (snap1name),
- "%s@s1_%llu", osname, id);
- (void) snprintf(clone1name, sizeof (clone1name),
- "%s/c1_%llu", osname, id);
- (void) snprintf(snap2name, sizeof (snap2name),
- "%s@s2_%llu", clone1name, id);
- (void) snprintf(clone2name, sizeof (clone2name),
- "%s/c2_%llu", osname, id);
- (void) snprintf(snap3name, sizeof (snap3name),
- "%s@s3_%llu", clone1name, id);
-
- error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
- if (error && error != EEXIST) {
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- goto out;
- }
- fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
- }
-
- error = dmu_objset_clone(clone1name, snap1name);
- if (error) {
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- goto out;
- }
- fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
- }
-
- error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
- if (error && error != EEXIST) {
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- goto out;
- }
- fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
- }
-
- error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
- if (error && error != EEXIST) {
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- goto out;
- }
- fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
- }
-
- error = dmu_objset_clone(clone2name, snap3name);
- if (error) {
- if (error == ENOSPC) {
- ztest_record_enospc(FTAG);
- goto out;
- }
- fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
- }
-
- error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
- if (error)
- fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
- error = dsl_dataset_promote(clone2name, NULL);
- if (error == ENOSPC) {
- dmu_objset_disown(os, FTAG);
- ztest_record_enospc(FTAG);
- goto out;
- }
- if (error != EBUSY)
- fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
- error);
- dmu_objset_disown(os, FTAG);
-
-out:
- ztest_dsl_dataset_cleanup(osname, id);
-
- rw_exit(&ztest_name_lock);
-}
-
-/*
- * Verify that dmu_object_{alloc,free} work as expected.
- */
-void
-ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
-{
- ztest_od_t od[4];
- int batchsize = sizeof (od) / sizeof (od[0]);
-
- for (int b = 0; b < batchsize; b++) {
- ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER,
- 0, 0, 0);
- }
-
- /*
- * Destroy the previous batch of objects, create a new batch,
- * and do some I/O on the new objects.
- */
- if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
- return;
-
- while (ztest_random(4 * batchsize) != 0)
- ztest_io(zd, od[ztest_random(batchsize)].od_object,
- ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
-}
-
-/*
- * Rewind the global allocator to verify object allocation backfilling.
- */
-void
-ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
- uint64_t object;
-
- /*
- * Rewind the global allocator randomly back to a lower object number
- * to force backfilling and reclamation of recently freed dnodes.
- */
- mutex_enter(&os->os_obj_lock);
- object = ztest_random(os->os_obj_next_chunk);
- os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
- mutex_exit(&os->os_obj_lock);
-}
-
-/*
- * Verify that dmu_{read,write} work as expected.
- */
-void
-ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- ztest_od_t od[2];
- dmu_tx_t *tx;
- int i, freeit, error;
- uint64_t n, s, txg;
- bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
- uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
- uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
- uint64_t regions = 997;
- uint64_t stride = 123456789ULL;
- uint64_t width = 40;
- int free_percent = 5;
-
- /*
- * This test uses two objects, packobj and bigobj, that are always
- * updated together (i.e. in the same tx) so that their contents are
- * in sync and can be compared. Their contents relate to each other
- * in a simple way: packobj is a dense array of 'bufwad' structures,
- * while bigobj is a sparse array of the same bufwads. Specifically,
- * for any index n, there are three bufwads that should be identical:
- *
- * packobj, at offset n * sizeof (bufwad_t)
- * bigobj, at the head of the nth chunk
- * bigobj, at the tail of the nth chunk
- *
- * The chunk size is arbitrary. It doesn't have to be a power of two,
- * and it doesn't have any relation to the object blocksize.
- * The only requirement is that it can hold at least two bufwads.
- *
- * Normally, we write the bufwad to each of these locations.
- * However, free_percent of the time we instead write zeroes to
- * packobj and perform a dmu_free_range() on bigobj. By comparing
- * bigobj to packobj, we can verify that the DMU is correctly
- * tracking which parts of an object are allocated and free,
- * and that the contents of the allocated blocks are correct.
- */
-
- /*
- * Read the directory info. If it's the first time, set things up.
- */
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0,
- chunksize);
- ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
- chunksize);
-
- if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
- return;
-
- bigobj = od[0].od_object;
- packobj = od[1].od_object;
- chunksize = od[0].od_gen;
- ASSERT(chunksize == od[1].od_gen);
-
- /*
- * Prefetch a random chunk of the big object.
- * Our aim here is to get some async reads in flight
- * for blocks that we may free below; the DMU should
- * handle this race correctly.
- */
- n = ztest_random(regions) * stride + ztest_random(width);
- s = 1 + ztest_random(2 * width - 1);
- dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
- ZIO_PRIORITY_SYNC_READ);
-
- /*
- * Pick a random index and compute the offsets into packobj and bigobj.
- */
- n = ztest_random(regions) * stride + ztest_random(width);
- s = 1 + ztest_random(width - 1);
-
- packoff = n * sizeof (bufwad_t);
- packsize = s * sizeof (bufwad_t);
-
- bigoff = n * chunksize;
- bigsize = s * chunksize;
-
- packbuf = umem_alloc(packsize, UMEM_NOFAIL);
- bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
-
- /*
- * free_percent of the time, free a range of bigobj rather than
- * overwriting it.
- */
- freeit = (ztest_random(100) < free_percent);
-
- /*
- * Read the current contents of our objects.
- */
- error = dmu_read(os, packobj, packoff, packsize, packbuf,
- DMU_READ_PREFETCH);
- ASSERT0(error);
- error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
- DMU_READ_PREFETCH);
- ASSERT0(error);
-
- /*
- * Get a tx for the mods to both packobj and bigobj.
- */
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_write(tx, packobj, packoff, packsize);
-
- if (freeit)
- dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
- else
- dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
-
- /* This accounts for setting the checksum/compression. */
- dmu_tx_hold_bonus(tx, bigobj);
-
- txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
- if (txg == 0) {
- umem_free(packbuf, packsize);
- umem_free(bigbuf, bigsize);
- return;
- }
-
- enum zio_checksum cksum;
- do {
- cksum = (enum zio_checksum)
- ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
- } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
- dmu_object_set_checksum(os, bigobj, cksum, tx);
-
- enum zio_compress comp;
- do {
- comp = (enum zio_compress)
- ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
- } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
- dmu_object_set_compress(os, bigobj, comp, tx);
-
- /*
- * For each index from n to n + s, verify that the existing bufwad
- * in packobj matches the bufwads at the head and tail of the
- * corresponding chunk in bigobj. Then update all three bufwads
- * with the new values we want to write out.
- */
- for (i = 0; i < s; i++) {
- /* LINTED */
- pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
- /* LINTED */
- bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
- /* LINTED */
- bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
-
- ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
- ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
-
- if (pack->bw_txg > txg)
- fatal(0, "future leak: got %llx, open txg is %llx",
- pack->bw_txg, txg);
-
- if (pack->bw_data != 0 && pack->bw_index != n + i)
- fatal(0, "wrong index: got %llx, wanted %llx+%llx",
- pack->bw_index, n, i);
-
- if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
- fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
-
- if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
- fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
-
- if (freeit) {
- bzero(pack, sizeof (bufwad_t));
- } else {
- pack->bw_index = n + i;
- pack->bw_txg = txg;
- pack->bw_data = 1 + ztest_random(-2ULL);
- }
- *bigH = *pack;
- *bigT = *pack;
- }
-
- /*
- * We've verified all the old bufwads, and made new ones.
- * Now write them out.
- */
- dmu_write(os, packobj, packoff, packsize, packbuf, tx);
-
- if (freeit) {
- if (ztest_opts.zo_verbose >= 7) {
- (void) printf("freeing offset %llx size %llx"
- " txg %llx\n",
- (u_longlong_t)bigoff,
- (u_longlong_t)bigsize,
- (u_longlong_t)txg);
- }
- VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
- } else {
- if (ztest_opts.zo_verbose >= 7) {
- (void) printf("writing offset %llx size %llx"
- " txg %llx\n",
- (u_longlong_t)bigoff,
- (u_longlong_t)bigsize,
- (u_longlong_t)txg);
- }
- dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
- }
-
- dmu_tx_commit(tx);
-
- /*
- * Sanity check the stuff we just wrote.
- */
- {
- void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
- void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
-
- VERIFY(0 == dmu_read(os, packobj, packoff,
- packsize, packcheck, DMU_READ_PREFETCH));
- VERIFY(0 == dmu_read(os, bigobj, bigoff,
- bigsize, bigcheck, DMU_READ_PREFETCH));
-
- ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
- ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
-
- umem_free(packcheck, packsize);
- umem_free(bigcheck, bigsize);
- }
-
- umem_free(packbuf, packsize);
- umem_free(bigbuf, bigsize);
-}
-
-void
-compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
- uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
-{
- uint64_t i;
- bufwad_t *pack;
- bufwad_t *bigH;
- bufwad_t *bigT;
-
- /*
- * For each index from n to n + s, verify that the existing bufwad
- * in packobj matches the bufwads at the head and tail of the
- * corresponding chunk in bigobj. Then update all three bufwads
- * with the new values we want to write out.
- */
- for (i = 0; i < s; i++) {
- /* LINTED */
- pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
- /* LINTED */
- bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
- /* LINTED */
- bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
-
- ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
- ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
-
- if (pack->bw_txg > txg)
- fatal(0, "future leak: got %llx, open txg is %llx",
- pack->bw_txg, txg);
-
- if (pack->bw_data != 0 && pack->bw_index != n + i)
- fatal(0, "wrong index: got %llx, wanted %llx+%llx",
- pack->bw_index, n, i);
-
- if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
- fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
-
- if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
- fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
-
- pack->bw_index = n + i;
- pack->bw_txg = txg;
- pack->bw_data = 1 + ztest_random(-2ULL);
-
- *bigH = *pack;
- *bigT = *pack;
- }
-}
-
-void
-ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- ztest_od_t od[2];
- dmu_tx_t *tx;
- uint64_t i;
- int error;
- uint64_t n, s, txg;
- bufwad_t *packbuf, *bigbuf;
- uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
- uint64_t blocksize = ztest_random_blocksize();
- uint64_t chunksize = blocksize;
- uint64_t regions = 997;
- uint64_t stride = 123456789ULL;
- uint64_t width = 9;
- dmu_buf_t *bonus_db;
- arc_buf_t **bigbuf_arcbufs;
- dmu_object_info_t doi;
-
- /*
- * This test uses two objects, packobj and bigobj, that are always
- * updated together (i.e. in the same tx) so that their contents are
- * in sync and can be compared. Their contents relate to each other
- * in a simple way: packobj is a dense array of 'bufwad' structures,
- * while bigobj is a sparse array of the same bufwads. Specifically,
- * for any index n, there are three bufwads that should be identical:
- *
- * packobj, at offset n * sizeof (bufwad_t)
- * bigobj, at the head of the nth chunk
- * bigobj, at the tail of the nth chunk
- *
- * The chunk size is set equal to bigobj block size so that
- * dmu_assign_arcbuf() can be tested for object updates.
- */
-
- /*
- * Read the directory info. If it's the first time, set things up.
- */
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
- 0, 0);
- ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
- chunksize);
-
- if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
- return;
-
- bigobj = od[0].od_object;
- packobj = od[1].od_object;
- blocksize = od[0].od_blocksize;
- chunksize = blocksize;
- ASSERT(chunksize == od[1].od_gen);
-
- VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
- VERIFY(ISP2(doi.doi_data_block_size));
- VERIFY(chunksize == doi.doi_data_block_size);
- VERIFY(chunksize >= 2 * sizeof (bufwad_t));
-
- /*
- * Pick a random index and compute the offsets into packobj and bigobj.
- */
- n = ztest_random(regions) * stride + ztest_random(width);
- s = 1 + ztest_random(width - 1);
-
- packoff = n * sizeof (bufwad_t);
- packsize = s * sizeof (bufwad_t);
-
- bigoff = n * chunksize;
- bigsize = s * chunksize;
-
- packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
- bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
-
- bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
-
- /*
- * Iteration 0 test zcopy for DB_UNCACHED dbufs.
- * Iteration 1 test zcopy to already referenced dbufs.
- * Iteration 2 test zcopy to dirty dbuf in the same txg.
- * Iteration 3 test zcopy to dbuf dirty in previous txg.
- * Iteration 4 test zcopy when dbuf is no longer dirty.
- * Iteration 5 test zcopy when it can't be done.
- * Iteration 6 one more zcopy write.
- */
- for (i = 0; i < 7; i++) {
- uint64_t j;
- uint64_t off;
-
- /*
- * In iteration 5 (i == 5) use arcbufs
- * that don't match bigobj blksz to test
- * dmu_assign_arcbuf() when it can't directly
- * assign an arcbuf to a dbuf.
- */
- for (j = 0; j < s; j++) {
- if (i != 5) {
- bigbuf_arcbufs[j] =
- dmu_request_arcbuf(bonus_db, chunksize);
- } else {
- bigbuf_arcbufs[2 * j] =
- dmu_request_arcbuf(bonus_db, chunksize / 2);
- bigbuf_arcbufs[2 * j + 1] =
- dmu_request_arcbuf(bonus_db, chunksize / 2);
- }
- }
-
- /*
- * Get a tx for the mods to both packobj and bigobj.
- */
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_write(tx, packobj, packoff, packsize);
- dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
-
- txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
- if (txg == 0) {
- umem_free(packbuf, packsize);
- umem_free(bigbuf, bigsize);
- for (j = 0; j < s; j++) {
- if (i != 5) {
- dmu_return_arcbuf(bigbuf_arcbufs[j]);
- } else {
- dmu_return_arcbuf(
- bigbuf_arcbufs[2 * j]);
- dmu_return_arcbuf(
- bigbuf_arcbufs[2 * j + 1]);
- }
- }
- umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
- dmu_buf_rele(bonus_db, FTAG);
- return;
- }
-
- /*
- * 50% of the time don't read objects in the 1st iteration to
- * test dmu_assign_arcbuf() for the case when there're no
- * existing dbufs for the specified offsets.
- */
- if (i != 0 || ztest_random(2) != 0) {
- error = dmu_read(os, packobj, packoff,
- packsize, packbuf, DMU_READ_PREFETCH);
- ASSERT0(error);
- error = dmu_read(os, bigobj, bigoff, bigsize,
- bigbuf, DMU_READ_PREFETCH);
- ASSERT0(error);
- }
- compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
- n, chunksize, txg);
-
- /*
- * We've verified all the old bufwads, and made new ones.
- * Now write them out.
- */
- dmu_write(os, packobj, packoff, packsize, packbuf, tx);
- if (ztest_opts.zo_verbose >= 7) {
- (void) printf("writing offset %llx size %llx"
- " txg %llx\n",
- (u_longlong_t)bigoff,
- (u_longlong_t)bigsize,
- (u_longlong_t)txg);
- }
- for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
- dmu_buf_t *dbt;
- if (i != 5) {
- bcopy((caddr_t)bigbuf + (off - bigoff),
- bigbuf_arcbufs[j]->b_data, chunksize);
- } else {
- bcopy((caddr_t)bigbuf + (off - bigoff),
- bigbuf_arcbufs[2 * j]->b_data,
- chunksize / 2);
- bcopy((caddr_t)bigbuf + (off - bigoff) +
- chunksize / 2,
- bigbuf_arcbufs[2 * j + 1]->b_data,
- chunksize / 2);
- }
-
- if (i == 1) {
- VERIFY(dmu_buf_hold(os, bigobj, off,
- FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
- }
- if (i != 5) {
- dmu_assign_arcbuf(bonus_db, off,
- bigbuf_arcbufs[j], tx);
- } else {
- dmu_assign_arcbuf(bonus_db, off,
- bigbuf_arcbufs[2 * j], tx);
- dmu_assign_arcbuf(bonus_db,
- off + chunksize / 2,
- bigbuf_arcbufs[2 * j + 1], tx);
- }
- if (i == 1) {
- dmu_buf_rele(dbt, FTAG);
- }
- }
- dmu_tx_commit(tx);
-
- /*
- * Sanity check the stuff we just wrote.
- */
- {
- void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
- void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
-
- VERIFY(0 == dmu_read(os, packobj, packoff,
- packsize, packcheck, DMU_READ_PREFETCH));
- VERIFY(0 == dmu_read(os, bigobj, bigoff,
- bigsize, bigcheck, DMU_READ_PREFETCH));
-
- ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
- ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
-
- umem_free(packcheck, packsize);
- umem_free(bigcheck, bigsize);
- }
- if (i == 2) {
- txg_wait_open(dmu_objset_pool(os), 0);
- } else if (i == 3) {
- txg_wait_synced(dmu_objset_pool(os), 0);
- }
- }
-
- dmu_buf_rele(bonus_db, FTAG);
- umem_free(packbuf, packsize);
- umem_free(bigbuf, bigsize);
- umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
-}
-
-/* ARGSUSED */
-void
-ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
-{
- ztest_od_t od[1];
- uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
- (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
-
- /*
- * Have multiple threads write to large offsets in an object
- * to verify that parallel writes to an object -- even to the
- * same blocks within the object -- doesn't cause any trouble.
- */
- ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER,
- 0, 0, 0);
-
- if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
- return;
-
- while (ztest_random(10) != 0)
- ztest_io(zd, od[0].od_object, offset);
-}
-
-void
-ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
-{
- ztest_od_t od[1];
- uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
- (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
- uint64_t count = ztest_random(20) + 1;
- uint64_t blocksize = ztest_random_blocksize();
- void *data;
-
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
- 0, 0);
-
- if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
- return;
-
- if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
- return;
-
- ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
-
- data = umem_zalloc(blocksize, UMEM_NOFAIL);
-
- while (ztest_random(count) != 0) {
- uint64_t randoff = offset + (ztest_random(count) * blocksize);
- if (ztest_write(zd, od[0].od_object, randoff, blocksize,
- data) != 0)
- break;
- while (ztest_random(4) != 0)
- ztest_io(zd, od[0].od_object, randoff);
- }
-
- umem_free(data, blocksize);
-}
-
-/*
- * Verify that zap_{create,destroy,add,remove,update} work as expected.
- */
-#define ZTEST_ZAP_MIN_INTS 1
-#define ZTEST_ZAP_MAX_INTS 4
-#define ZTEST_ZAP_MAX_PROPS 1000
-
-void
-ztest_zap(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- ztest_od_t od[1];
- uint64_t object;
- uint64_t txg, last_txg;
- uint64_t value[ZTEST_ZAP_MAX_INTS];
- uint64_t zl_ints, zl_intsize, prop;
- int i, ints;
- dmu_tx_t *tx;
- char propname[100], txgname[100];
- int error;
- char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
-
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
-
- if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
- return;
-
- object = od[0].od_object;
-
- /*
- * Generate a known hash collision, and verify that
- * we can lookup and remove both entries.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
- txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
- if (txg == 0)
- return;
- for (i = 0; i < 2; i++) {
- value[i] = i;
- VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
- 1, &value[i], tx));
- }
- for (i = 0; i < 2; i++) {
- VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
- sizeof (uint64_t), 1, &value[i], tx));
- VERIFY3U(0, ==,
- zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
- ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
- ASSERT3U(zl_ints, ==, 1);
- }
- for (i = 0; i < 2; i++) {
- VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
- }
- dmu_tx_commit(tx);
-
- /*
- * Generate a buch of random entries.
- */
- ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
-
- prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
- (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
- (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
- bzero(value, sizeof (value));
- last_txg = 0;
-
- /*
- * If these zap entries already exist, validate their contents.
- */
- error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
- if (error == 0) {
- ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
- ASSERT3U(zl_ints, ==, 1);
-
- VERIFY(zap_lookup(os, object, txgname, zl_intsize,
- zl_ints, &last_txg) == 0);
-
- VERIFY(zap_length(os, object, propname, &zl_intsize,
- &zl_ints) == 0);
-
- ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
- ASSERT3U(zl_ints, ==, ints);
-
- VERIFY(zap_lookup(os, object, propname, zl_intsize,
- zl_ints, value) == 0);
-
- for (i = 0; i < ints; i++) {
- ASSERT3U(value[i], ==, last_txg + object + i);
- }
- } else {
- ASSERT3U(error, ==, ENOENT);
- }
-
- /*
- * Atomically update two entries in our zap object.
- * The first is named txg_%llu, and contains the txg
- * in which the property was last updated. The second
- * is named prop_%llu, and the nth element of its value
- * should be txg + object + n.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
- txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
- if (txg == 0)
- return;
-
- if (last_txg > txg)
- fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
-
- for (i = 0; i < ints; i++)
- value[i] = txg + object + i;
-
- VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
- 1, &txg, tx));
- VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
- ints, value, tx));
-
- dmu_tx_commit(tx);
-
- /*
- * Remove a random pair of entries.
- */
- prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
- (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
- (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
-
- error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
-
- if (error == ENOENT)
- return;
-
- ASSERT0(error);
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
- txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
- if (txg == 0)
- return;
- VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
- VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
- dmu_tx_commit(tx);
-}
-
-/*
- * Testcase to test the upgrading of a microzap to fatzap.
- */
-void
-ztest_fzap(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- ztest_od_t od[1];
- uint64_t object, txg;
-
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
-
- if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
- return;
-
- object = od[0].od_object;
-
- /*
- * Add entries to this ZAP and make sure it spills over
- * and gets upgraded to a fatzap. Also, since we are adding
- * 2050 entries we should see ptrtbl growth and leaf-block split.
- */
- for (int i = 0; i < 2050; i++) {
- char name[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t value = i;
- dmu_tx_t *tx;
- int error;
-
- (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
- id, value);
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, B_TRUE, name);
- txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
- if (txg == 0)
- return;
- error = zap_add(os, object, name, sizeof (uint64_t), 1,
- &value, tx);
- ASSERT(error == 0 || error == EEXIST);
- dmu_tx_commit(tx);
- }
-}
-
-/* ARGSUSED */
-void
-ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- ztest_od_t od[1];
- uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
- dmu_tx_t *tx;
- int i, namelen, error;
- int micro = ztest_random(2);
- char name[20], string_value[20];
- void *data;
-
- ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER,
- 0, 0, 0);
-
- if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
- return;
-
- object = od[0].od_object;
-
- /*
- * Generate a random name of the form 'xxx.....' where each
- * x is a random printable character and the dots are dots.
- * There are 94 such characters, and the name length goes from
- * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
- */
- namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
-
- for (i = 0; i < 3; i++)
- name[i] = '!' + ztest_random('~' - '!' + 1);
- for (; i < namelen - 1; i++)
- name[i] = '.';
- name[i] = '\0';
-
- if ((namelen & 1) || micro) {
- wsize = sizeof (txg);
- wc = 1;
- data = &txg;
- } else {
- wsize = 1;
- wc = namelen;
- data = string_value;
- }
-
- count = -1ULL;
- VERIFY0(zap_count(os, object, &count));
- ASSERT(count != -1ULL);
-
- /*
- * Select an operation: length, lookup, add, update, remove.
- */
- i = ztest_random(5);
-
- if (i >= 2) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
- txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
- if (txg == 0)
- return;
- bcopy(name, string_value, namelen);
- } else {
- tx = NULL;
- txg = 0;
- bzero(string_value, namelen);
- }
-
- switch (i) {
-
- case 0:
- error = zap_length(os, object, name, &zl_wsize, &zl_wc);
- if (error == 0) {
- ASSERT3U(wsize, ==, zl_wsize);
- ASSERT3U(wc, ==, zl_wc);
- } else {
- ASSERT3U(error, ==, ENOENT);
- }
- break;
-
- case 1:
- error = zap_lookup(os, object, name, wsize, wc, data);
- if (error == 0) {
- if (data == string_value &&
- bcmp(name, data, namelen) != 0)
- fatal(0, "name '%s' != val '%s' len %d",
- name, data, namelen);
- } else {
- ASSERT3U(error, ==, ENOENT);
- }
- break;
-
- case 2:
- error = zap_add(os, object, name, wsize, wc, data, tx);
- ASSERT(error == 0 || error == EEXIST);
- break;
-
- case 3:
- VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
- break;
-
- case 4:
- error = zap_remove(os, object, name, tx);
- ASSERT(error == 0 || error == ENOENT);
- break;
- }
-
- if (tx != NULL)
- dmu_tx_commit(tx);
-}
-
-/*
- * Commit callback data.
- */
-typedef struct ztest_cb_data {
- list_node_t zcd_node;
- uint64_t zcd_txg;
- int zcd_expected_err;
- boolean_t zcd_added;
- boolean_t zcd_called;
- spa_t *zcd_spa;
-} ztest_cb_data_t;
-
-/* This is the actual commit callback function */
-static void
-ztest_commit_callback(void *arg, int error)
-{
- ztest_cb_data_t *data = arg;
- uint64_t synced_txg;
-
- VERIFY(data != NULL);
- VERIFY3S(data->zcd_expected_err, ==, error);
- VERIFY(!data->zcd_called);
-
- synced_txg = spa_last_synced_txg(data->zcd_spa);
- if (data->zcd_txg > synced_txg)
- fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
- ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
- synced_txg);
-
- data->zcd_called = B_TRUE;
-
- if (error == ECANCELED) {
- ASSERT0(data->zcd_txg);
- ASSERT(!data->zcd_added);
-
- /*
- * The private callback data should be destroyed here, but
- * since we are going to check the zcd_called field after
- * dmu_tx_abort(), we will destroy it there.
- */
- return;
- }
-
- /* Was this callback added to the global callback list? */
- if (!data->zcd_added)
- goto out;
-
- ASSERT3U(data->zcd_txg, !=, 0);
-
- /* Remove our callback from the list */
- mutex_enter(&zcl.zcl_callbacks_lock);
- list_remove(&zcl.zcl_callbacks, data);
- mutex_exit(&zcl.zcl_callbacks_lock);
-
-out:
- umem_free(data, sizeof (ztest_cb_data_t));
-}
-
-/* Allocate and initialize callback data structure */
-static ztest_cb_data_t *
-ztest_create_cb_data(objset_t *os, uint64_t txg)
-{
- ztest_cb_data_t *cb_data;
-
- cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
-
- cb_data->zcd_txg = txg;
- cb_data->zcd_spa = dmu_objset_spa(os);
-
- return (cb_data);
-}
-
-/*
- * If a number of txgs equal to this threshold have been created after a commit
- * callback has been registered but not called, then we assume there is an
- * implementation bug.
- */
-#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2)
-
-/*
- * Commit callback test.
- */
-void
-ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- ztest_od_t od[1];
- dmu_tx_t *tx;
- ztest_cb_data_t *cb_data[3], *tmp_cb;
- uint64_t old_txg, txg;
- int i, error;
-
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
-
- if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
- return;
-
- tx = dmu_tx_create(os);
-
- cb_data[0] = ztest_create_cb_data(os, 0);
- dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
-
- dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
-
- /* Every once in a while, abort the transaction on purpose */
- if (ztest_random(100) == 0)
- error = -1;
-
- if (!error)
- error = dmu_tx_assign(tx, TXG_NOWAIT);
-
- txg = error ? 0 : dmu_tx_get_txg(tx);
-
- cb_data[0]->zcd_txg = txg;
- cb_data[1] = ztest_create_cb_data(os, txg);
- dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
-
- if (error) {
- /*
- * It's not a strict requirement to call the registered
- * callbacks from inside dmu_tx_abort(), but that's what
- * it's supposed to happen in the current implementation
- * so we will check for that.
- */
- for (i = 0; i < 2; i++) {
- cb_data[i]->zcd_expected_err = ECANCELED;
- VERIFY(!cb_data[i]->zcd_called);
- }
-
- dmu_tx_abort(tx);
-
- for (i = 0; i < 2; i++) {
- VERIFY(cb_data[i]->zcd_called);
- umem_free(cb_data[i], sizeof (ztest_cb_data_t));
- }
-
- return;
- }
-
- cb_data[2] = ztest_create_cb_data(os, txg);
- dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
-
- /*
- * Read existing data to make sure there isn't a future leak.
- */
- VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
- &old_txg, DMU_READ_PREFETCH));
-
- if (old_txg > txg)
- fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
- old_txg, txg);
-
- dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
-
- mutex_enter(&zcl.zcl_callbacks_lock);
-
- /*
- * Since commit callbacks don't have any ordering requirement and since
- * it is theoretically possible for a commit callback to be called
- * after an arbitrary amount of time has elapsed since its txg has been
- * synced, it is difficult to reliably determine whether a commit
- * callback hasn't been called due to high load or due to a flawed
- * implementation.
- *
- * In practice, we will assume that if after a certain number of txgs a
- * commit callback hasn't been called, then most likely there's an
- * implementation bug..
- */
- tmp_cb = list_head(&zcl.zcl_callbacks);
- if (tmp_cb != NULL &&
- (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
- fatal(0, "Commit callback threshold exceeded, oldest txg: %"
- PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
- }
-
- /*
- * Let's find the place to insert our callbacks.
- *
- * Even though the list is ordered by txg, it is possible for the
- * insertion point to not be the end because our txg may already be
- * quiescing at this point and other callbacks in the open txg
- * (from other objsets) may have sneaked in.
- */
- tmp_cb = list_tail(&zcl.zcl_callbacks);
- while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
- tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
-
- /* Add the 3 callbacks to the list */
- for (i = 0; i < 3; i++) {
- if (tmp_cb == NULL)
- list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
- else
- list_insert_after(&zcl.zcl_callbacks, tmp_cb,
- cb_data[i]);
-
- cb_data[i]->zcd_added = B_TRUE;
- VERIFY(!cb_data[i]->zcd_called);
-
- tmp_cb = cb_data[i];
- }
-
- mutex_exit(&zcl.zcl_callbacks_lock);
-
- dmu_tx_commit(tx);
-}
-
-/*
- * Visit each object in the dataset. Verify that its properties
- * are consistent what was stored in the block tag when it was created,
- * and that its unused bonus buffer space has not been overwritten.
- */
-void
-ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- uint64_t obj;
- int err = 0;
-
- for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
- ztest_block_tag_t *bt = NULL;
- dmu_object_info_t doi;
- dmu_buf_t *db;
-
- if (dmu_bonus_hold(os, obj, FTAG, &db) != 0)
- continue;
-
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_size >= sizeof (*bt))
- bt = ztest_bt_bonus(db);
-
- if (bt && bt->bt_magic == BT_MAGIC) {
- ztest_bt_verify(bt, os, obj, doi.doi_dnodesize,
- bt->bt_offset, bt->bt_gen, bt->bt_txg,
- bt->bt_crtxg);
- ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen);
- }
-
- dmu_buf_rele(db, FTAG);
- }
-}
-
-/* ARGSUSED */
-void
-ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
-{
- zfs_prop_t proplist[] = {
- ZFS_PROP_CHECKSUM,
- ZFS_PROP_COMPRESSION,
- ZFS_PROP_COPIES,
- ZFS_PROP_DEDUP
- };
-
- rw_enter(&ztest_name_lock, RW_READER);
-
- for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
- (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
- ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
-
- rw_exit(&ztest_name_lock);
-}
-
-/* ARGSUSED */
-void
-ztest_remap_blocks(ztest_ds_t *zd, uint64_t id)
-{
- rw_enter(&ztest_name_lock, RW_READER);
-
- int error = dmu_objset_remap_indirects(zd->zd_name);
- if (error == ENOSPC)
- error = 0;
- ASSERT0(error);
-
- rw_exit(&ztest_name_lock);
-}
-
-/* ARGSUSED */
-void
-ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
-{
- nvlist_t *props = NULL;
-
- rw_enter(&ztest_name_lock, RW_READER);
-
- (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
- ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
-
- VERIFY0(spa_prop_get(ztest_spa, &props));
-
- if (ztest_opts.zo_verbose >= 6)
- dump_nvlist(props, 4);
-
- nvlist_free(props);
-
- rw_exit(&ztest_name_lock);
-}
-
-static int
-user_release_one(const char *snapname, const char *holdname)
-{
- nvlist_t *snaps, *holds;
- int error;
-
- snaps = fnvlist_alloc();
- holds = fnvlist_alloc();
- fnvlist_add_boolean(holds, holdname);
- fnvlist_add_nvlist(snaps, snapname, holds);
- fnvlist_free(holds);
- error = dsl_dataset_user_release(snaps, NULL);
- fnvlist_free(snaps);
- return (error);
-}
-
-/*
- * Test snapshot hold/release and deferred destroy.
- */
-void
-ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
-{
- int error;
- objset_t *os = zd->zd_os;
- objset_t *origin;
- char snapname[100];
- char fullname[100];
- char clonename[100];
- char tag[100];
- char osname[ZFS_MAX_DATASET_NAME_LEN];
- nvlist_t *holds;
-
- rw_enter(&ztest_name_lock, RW_READER);
-
- dmu_objset_name(os, osname);
-
- (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id);
- (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
- (void) snprintf(clonename, sizeof (clonename),
- "%s/ch1_%llu", osname, id);
- (void) snprintf(tag, sizeof (tag), "tag_%llu", id);
-
- /*
- * Clean up from any previous run.
- */
- error = dsl_destroy_head(clonename);
- if (error != ENOENT)
- ASSERT0(error);
- error = user_release_one(fullname, tag);
- if (error != ESRCH && error != ENOENT)
- ASSERT0(error);
- error = dsl_destroy_snapshot(fullname, B_FALSE);
- if (error != ENOENT)
- ASSERT0(error);
-
- /*
- * Create snapshot, clone it, mark snap for deferred destroy,
- * destroy clone, verify snap was also destroyed.
- */
- error = dmu_objset_snapshot_one(osname, snapname);
- if (error) {
- if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_snapshot");
- goto out;
- }
- fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
- }
-
- error = dmu_objset_clone(clonename, fullname);
- if (error) {
- if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_clone");
- goto out;
- }
- fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
- }
-
- error = dsl_destroy_snapshot(fullname, B_TRUE);
- if (error) {
- fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
- fullname, error);
- }
-
- error = dsl_destroy_head(clonename);
- if (error)
- fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
-
- error = dmu_objset_hold(fullname, FTAG, &origin);
- if (error != ENOENT)
- fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
-
- /*
- * Create snapshot, add temporary hold, verify that we can't
- * destroy a held snapshot, mark for deferred destroy,
- * release hold, verify snapshot was destroyed.
- */
- error = dmu_objset_snapshot_one(osname, snapname);
- if (error) {
- if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_snapshot");
- goto out;
- }
- fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
- }
-
- holds = fnvlist_alloc();
- fnvlist_add_string(holds, fullname, tag);
- error = dsl_dataset_user_hold(holds, 0, NULL);
- fnvlist_free(holds);
-
- if (error == ENOSPC) {
- ztest_record_enospc("dsl_dataset_user_hold");
- goto out;
- } else if (error) {
- fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
- fullname, tag, error);
- }
-
- error = dsl_destroy_snapshot(fullname, B_FALSE);
- if (error != EBUSY) {
- fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
- fullname, error);
- }
-
- error = dsl_destroy_snapshot(fullname, B_TRUE);
- if (error) {
- fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
- fullname, error);
- }
-
- error = user_release_one(fullname, tag);
- if (error)
- fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
-
- VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
-
-out:
- rw_exit(&ztest_name_lock);
-}
-
-/*
- * Inject random faults into the on-disk data.
- */
-/* ARGSUSED */
-void
-ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_t *zs = ztest_shared;
- spa_t *spa = ztest_spa;
- int fd;
- uint64_t offset;
- uint64_t leaves;
- uint64_t bad = 0x1990c0ffeedecadeULL;
- uint64_t top, leaf;
- char path0[MAXPATHLEN];
- char pathrand[MAXPATHLEN];
- size_t fsize;
- int bshift = SPA_MAXBLOCKSHIFT + 2;
- int iters = 1000;
- int maxfaults;
- int mirror_save;
- vdev_t *vd0 = NULL;
- uint64_t guid0 = 0;
- boolean_t islog = B_FALSE;
-
- mutex_enter(&ztest_vdev_lock);
-
- /*
- * Device removal is in progress, fault injection must be disabled
- * until it completes and the pool is scrubbed. The fault injection
- * strategy for damaging blocks does not take in to account evacuated
- * blocks which may have already been damaged.
- */
- if (ztest_device_removal_active) {
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- maxfaults = MAXFAULTS();
- leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
- mirror_save = zs->zs_mirrors;
- mutex_exit(&ztest_vdev_lock);
-
- ASSERT(leaves >= 1);
-
- /*
- * Grab the name lock as reader. There are some operations
- * which don't like to have their vdevs changed while
- * they are in progress (i.e. spa_change_guid). Those
- * operations will have grabbed the name lock as writer.
- */
- rw_enter(&ztest_name_lock, RW_READER);
-
- /*
- * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- if (ztest_random(2) == 0) {
- /*
- * Inject errors on a normal data device or slog device.
- */
- top = ztest_random_vdev_top(spa, B_TRUE);
- leaf = ztest_random(leaves) + zs->zs_splits;
-
- /*
- * Generate paths to the first leaf in this top-level vdev,
- * and to the random leaf we selected. We'll induce transient
- * write failures and random online/offline activity on leaf 0,
- * and we'll write random garbage to the randomly chosen leaf.
- */
- (void) snprintf(path0, sizeof (path0), ztest_dev_template,
- ztest_opts.zo_dir, ztest_opts.zo_pool,
- top * leaves + zs->zs_splits);
- (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
- ztest_opts.zo_dir, ztest_opts.zo_pool,
- top * leaves + leaf);
-
- vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
- if (vd0 != NULL && vd0->vdev_top->vdev_islog)
- islog = B_TRUE;
-
- /*
- * If the top-level vdev needs to be resilvered
- * then we only allow faults on the device that is
- * resilvering.
- */
- if (vd0 != NULL && maxfaults != 1 &&
- (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
- vd0->vdev_resilver_txg != 0)) {
- /*
- * Make vd0 explicitly claim to be unreadable,
- * or unwriteable, or reach behind its back
- * and close the underlying fd. We can do this if
- * maxfaults == 0 because we'll fail and reexecute,
- * and we can do it if maxfaults >= 2 because we'll
- * have enough redundancy. If maxfaults == 1, the
- * combination of this with injection of random data
- * corruption below exceeds the pool's fault tolerance.
- */
- vdev_file_t *vf = vd0->vdev_tsd;
-
- zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d",
- (long long)vd0->vdev_id, (int)maxfaults);
-
- if (vf != NULL && ztest_random(3) == 0) {
- (void) close(vf->vf_vnode->v_fd);
- vf->vf_vnode->v_fd = -1;
- } else if (ztest_random(2) == 0) {
- vd0->vdev_cant_read = B_TRUE;
- } else {
- vd0->vdev_cant_write = B_TRUE;
- }
- guid0 = vd0->vdev_guid;
- }
- } else {
- /*
- * Inject errors on an l2cache device.
- */
- spa_aux_vdev_t *sav = &spa->spa_l2cache;
-
- if (sav->sav_count == 0) {
- spa_config_exit(spa, SCL_STATE, FTAG);
- rw_exit(&ztest_name_lock);
- return;
- }
- vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
- guid0 = vd0->vdev_guid;
- (void) strcpy(path0, vd0->vdev_path);
- (void) strcpy(pathrand, vd0->vdev_path);
-
- leaf = 0;
- leaves = 1;
- maxfaults = INT_MAX; /* no limit on cache devices */
- }
-
- spa_config_exit(spa, SCL_STATE, FTAG);
- rw_exit(&ztest_name_lock);
-
- /*
- * If we can tolerate two or more faults, or we're dealing
- * with a slog, randomly online/offline vd0.
- */
- if ((maxfaults >= 2 || islog) && guid0 != 0) {
- if (ztest_random(10) < 6) {
- int flags = (ztest_random(2) == 0 ?
- ZFS_OFFLINE_TEMPORARY : 0);
-
- /*
- * We have to grab the zs_name_lock as writer to
- * prevent a race between offlining a slog and
- * destroying a dataset. Offlining the slog will
- * grab a reference on the dataset which may cause
- * dmu_objset_destroy() to fail with EBUSY thus
- * leaving the dataset in an inconsistent state.
- */
- if (islog)
- rw_enter(&ztest_name_lock, RW_WRITER);
-
- VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
-
- if (islog)
- rw_exit(&ztest_name_lock);
- } else {
- /*
- * Ideally we would like to be able to randomly
- * call vdev_[on|off]line without holding locks
- * to force unpredictable failures but the side
- * effects of vdev_[on|off]line prevent us from
- * doing so. We grab the ztest_vdev_lock here to
- * prevent a race between injection testing and
- * aux_vdev removal.
- */
- mutex_enter(&ztest_vdev_lock);
- (void) vdev_online(spa, guid0, 0, NULL);
- mutex_exit(&ztest_vdev_lock);
- }
- }
-
- if (maxfaults == 0)
- return;
-
- /*
- * We have at least single-fault tolerance, so inject data corruption.
- */
- fd = open(pathrand, O_RDWR);
-
- if (fd == -1) /* we hit a gap in the device namespace */
- return;
-
- fsize = lseek(fd, 0, SEEK_END);
-
- while (--iters != 0) {
- /*
- * The offset must be chosen carefully to ensure that
- * we do not inject a given logical block with errors
- * on two different leaf devices, because ZFS can not
- * tolerate that (if maxfaults==1).
- *
- * We divide each leaf into chunks of size
- * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk
- * there is a series of ranges to which we can inject errors.
- * Each range can accept errors on only a single leaf vdev.
- * The error injection ranges are separated by ranges
- * which we will not inject errors on any device (DMZs).
- * Each DMZ must be large enough such that a single block
- * can not straddle it, so that a single block can not be
- * a target in two different injection ranges (on different
- * leaf vdevs).
- *
- * For example, with 3 leaves, each chunk looks like:
- * 0 to 32M: injection range for leaf 0
- * 32M to 64M: DMZ - no injection allowed
- * 64M to 96M: injection range for leaf 1
- * 96M to 128M: DMZ - no injection allowed
- * 128M to 160M: injection range for leaf 2
- * 160M to 192M: DMZ - no injection allowed
- */
- offset = ztest_random(fsize / (leaves << bshift)) *
- (leaves << bshift) + (leaf << bshift) +
- (ztest_random(1ULL << (bshift - 1)) & -8ULL);
-
- /*
- * Only allow damage to the labels at one end of the vdev.
- *
- * If all labels are damaged, the device will be totally
- * inaccessible, which will result in loss of data,
- * because we also damage (parts of) the other side of
- * the mirror/raidz.
- *
- * Additionally, we will always have both an even and an
- * odd label, so that we can handle crashes in the
- * middle of vdev_config_sync().
- */
- if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
- continue;
-
- /*
- * The two end labels are stored at the "end" of the disk, but
- * the end of the disk (vdev_psize) is aligned to
- * sizeof (vdev_label_t).
- */
- uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
- if ((leaf & 1) == 1 &&
- offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
- continue;
-
- mutex_enter(&ztest_vdev_lock);
- if (mirror_save != zs->zs_mirrors) {
- mutex_exit(&ztest_vdev_lock);
- (void) close(fd);
- return;
- }
-
- if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
- fatal(1, "can't inject bad word at 0x%llx in %s",
- offset, pathrand);
-
- mutex_exit(&ztest_vdev_lock);
-
- if (ztest_opts.zo_verbose >= 7)
- (void) printf("injected bad word into %s,"
- " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
- }
-
- (void) close(fd);
-}
-
-/*
- * Verify that DDT repair works as expected.
- */
-void
-ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
-{
- ztest_shared_t *zs = ztest_shared;
- spa_t *spa = ztest_spa;
- objset_t *os = zd->zd_os;
- ztest_od_t od[1];
- uint64_t object, blocksize, txg, pattern, psize;
- enum zio_checksum checksum = spa_dedup_checksum(spa);
- dmu_buf_t *db;
- dmu_tx_t *tx;
- abd_t *abd;
- blkptr_t blk;
- int copies = 2 * ZIO_DEDUPDITTO_MIN;
-
- blocksize = ztest_random_blocksize();
- blocksize = MIN(blocksize, 2048); /* because we write so many */
-
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
- 0, 0);
-
- if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
- return;
-
- /*
- * Take the name lock as writer to prevent anyone else from changing
- * the pool and dataset properies we need to maintain during this test.
- */
- rw_enter(&ztest_name_lock, RW_WRITER);
-
- if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
- B_FALSE) != 0 ||
- ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
- B_FALSE) != 0) {
- rw_exit(&ztest_name_lock);
- return;
- }
-
- dmu_objset_stats_t dds;
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
- dmu_objset_fast_stat(os, &dds);
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
-
- object = od[0].od_object;
- blocksize = od[0].od_blocksize;
- pattern = zs->zs_guid ^ dds.dds_guid;
-
- ASSERT(object != 0);
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, object, 0, copies * blocksize);
- txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
- if (txg == 0) {
- rw_exit(&ztest_name_lock);
- return;
- }
-
- /*
- * Write all the copies of our block.
- */
- for (int i = 0; i < copies; i++) {
- uint64_t offset = i * blocksize;
- int error = dmu_buf_hold(os, object, offset, FTAG, &db,
- DMU_READ_NO_PREFETCH);
- if (error != 0) {
- fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
- os, (long long)object, (long long) offset, error);
- }
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == blocksize);
- ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
- ztest_pattern_match(db->db_data, db->db_size, 0ULL));
- dmu_buf_will_fill(db, tx);
- ztest_pattern_set(db->db_data, db->db_size, pattern);
- dmu_buf_rele(db, FTAG);
- }
-
- dmu_tx_commit(tx);
- txg_wait_synced(spa_get_dsl(spa), txg);
-
- /*
- * Find out what block we got.
- */
- VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
- DMU_READ_NO_PREFETCH));
- blk = *((dmu_buf_impl_t *)db)->db_blkptr;
- dmu_buf_rele(db, FTAG);
-
- /*
- * Damage the block. Dedup-ditto will save us when we read it later.
- */
- psize = BP_GET_PSIZE(&blk);
- abd = abd_alloc_linear(psize, B_TRUE);
- ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
-
- (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
- abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
-
- abd_free(abd);
-
- rw_exit(&ztest_name_lock);
-}
-
-/*
- * Scrub the pool.
- */
-/* ARGSUSED */
-void
-ztest_scrub(ztest_ds_t *zd, uint64_t id)
-{
- spa_t *spa = ztest_spa;
-
- /*
- * Scrub in progress by device removal.
- */
- if (ztest_device_removal_active)
- return;
-
- (void) spa_scan(spa, POOL_SCAN_SCRUB);
- (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
- (void) spa_scan(spa, POOL_SCAN_SCRUB);
-}
-
-/*
- * Change the guid for the pool.
- */
-/* ARGSUSED */
-void
-ztest_reguid(ztest_ds_t *zd, uint64_t id)
-{
- spa_t *spa = ztest_spa;
- uint64_t orig, load;
- int error;
-
- if (ztest_opts.zo_mmp_test)
- return;
-
- orig = spa_guid(spa);
- load = spa_load_guid(spa);
-
- rw_enter(&ztest_name_lock, RW_WRITER);
- error = spa_change_guid(spa);
- rw_exit(&ztest_name_lock);
-
- if (error != 0)
- return;
-
- if (ztest_opts.zo_verbose >= 4) {
- (void) printf("Changed guid old %llu -> %llu\n",
- (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
- }
-
- VERIFY3U(orig, !=, spa_guid(spa));
- VERIFY3U(load, ==, spa_load_guid(spa));
-}
-
-static vdev_t *
-ztest_random_concrete_vdev_leaf(vdev_t *vd)
-{
- if (vd == NULL)
- return (NULL);
-
- if (vd->vdev_children == 0)
- return (vd);
-
- vdev_t *eligible[vd->vdev_children];
- int eligible_idx = 0, i;
- for (i = 0; i < vd->vdev_children; i++) {
- vdev_t *cvd = vd->vdev_child[i];
- if (cvd->vdev_top->vdev_removing)
- continue;
- if (cvd->vdev_children > 0 ||
- (vdev_is_concrete(cvd) && !cvd->vdev_detached)) {
- eligible[eligible_idx++] = cvd;
- }
- }
- VERIFY(eligible_idx > 0);
-
- uint64_t child_no = ztest_random(eligible_idx);
- return (ztest_random_concrete_vdev_leaf(eligible[child_no]));
-}
-
-/* ARGSUSED */
-void
-ztest_initialize(ztest_ds_t *zd, uint64_t id)
-{
- spa_t *spa = ztest_spa;
- int error = 0;
-
- mutex_enter(&ztest_vdev_lock);
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
- /* Random leaf vdev */
- vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
- if (rand_vd == NULL) {
- spa_config_exit(spa, SCL_VDEV, FTAG);
- mutex_exit(&ztest_vdev_lock);
- return;
- }
-
- /*
- * The random vdev we've selected may change as soon as we
- * drop the spa_config_lock. We create local copies of things
- * we're interested in.
- */
- uint64_t guid = rand_vd->vdev_guid;
- char *path = strdup(rand_vd->vdev_path);
- boolean_t active = rand_vd->vdev_initialize_thread != NULL;
-
- zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS);
- error = spa_vdev_initialize(spa, guid, cmd);
- switch (cmd) {
- case POOL_INITIALIZE_CANCEL:
- if (ztest_opts.zo_verbose >= 4) {
- (void) printf("Cancel initialize %s", path);
- if (!active)
- (void) printf(" failed (no initialize active)");
- (void) printf("\n");
- }
- break;
- case POOL_INITIALIZE_DO:
- if (ztest_opts.zo_verbose >= 4) {
- (void) printf("Start initialize %s", path);
- if (active && error == 0)
- (void) printf(" failed (already active)");
- else if (error != 0)
- (void) printf(" failed (error %d)", error);
- (void) printf("\n");
- }
- break;
- case POOL_INITIALIZE_SUSPEND:
- if (ztest_opts.zo_verbose >= 4) {
- (void) printf("Suspend initialize %s", path);
- if (!active)
- (void) printf(" failed (no initialize active)");
- (void) printf("\n");
- }
- break;
- }
- free(path);
- mutex_exit(&ztest_vdev_lock);
-}
-
-/*
- * Verify pool integrity by running zdb.
- */
-static void
-ztest_run_zdb(char *pool)
-{
- int status;
- char zdb[MAXPATHLEN + MAXNAMELEN + 20];
- char zbuf[1024];
- char *bin;
- char *ztest;
- char *isa;
- int isalen;
- FILE *fp;
-
- strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb));
-
- /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
- bin = strstr(zdb, "/usr/bin/");
- ztest = strstr(bin, "/ztest");
- isa = bin + 8;
- isalen = ztest - isa;
- isa = strdup(isa);
- /* LINTED */
- (void) sprintf(bin,
- "/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s "
- "-o zfs_reconstruct_indirect_combinations_max=65536 %s",
- isalen,
- isa,
- ztest_opts.zo_verbose >= 3 ? "s" : "",
- ztest_opts.zo_verbose >= 4 ? "v" : "",
- spa_config_path,
- pool);
- free(isa);
-
- if (ztest_opts.zo_verbose >= 5)
- (void) printf("Executing %s\n", strstr(zdb, "zdb "));
-
- fp = popen(zdb, "r");
- assert(fp != NULL);
-
- while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
- if (ztest_opts.zo_verbose >= 3)
- (void) printf("%s", zbuf);
-
- status = pclose(fp);
-
- if (status == 0)
- return;
-
- ztest_dump_core = 0;
- if (WIFEXITED(status))
- fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
- else
- fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
-}
-
-static void
-ztest_walk_pool_directory(char *header)
-{
- spa_t *spa = NULL;
-
- if (ztest_opts.zo_verbose >= 6)
- (void) printf("%s\n", header);
-
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL)
- if (ztest_opts.zo_verbose >= 6)
- (void) printf("\t%s\n", spa_name(spa));
- mutex_exit(&spa_namespace_lock);
-}
-
-static void
-ztest_spa_import_export(char *oldname, char *newname)
-{
- nvlist_t *config, *newconfig;
- uint64_t pool_guid;
- spa_t *spa;
- int error;
-
- if (ztest_opts.zo_verbose >= 4) {
- (void) printf("import/export: old = %s, new = %s\n",
- oldname, newname);
- }
-
- /*
- * Clean up from previous runs.
- */
- (void) spa_destroy(newname);
-
- /*
- * Get the pool's configuration and guid.
- */
- VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
-
- /*
- * Kick off a scrub to tickle scrub/export races.
- */
- if (ztest_random(2) == 0)
- (void) spa_scan(spa, POOL_SCAN_SCRUB);
-
- pool_guid = spa_guid(spa);
- spa_close(spa, FTAG);
-
- ztest_walk_pool_directory("pools before export");
-
- /*
- * Export it.
- */
- VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
-
- ztest_walk_pool_directory("pools after export");
-
- /*
- * Try to import it.
- */
- newconfig = spa_tryimport(config);
- ASSERT(newconfig != NULL);
- nvlist_free(newconfig);
-
- /*
- * Import it under the new name.
- */
- error = spa_import(newname, config, NULL, 0);
- if (error != 0) {
- dump_nvlist(config, 0);
- fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
- oldname, newname, error);
- }
-
- ztest_walk_pool_directory("pools after import");
-
- /*
- * Try to import it again -- should fail with EEXIST.
- */
- VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
-
- /*
- * Try to import it under a different name -- should fail with EEXIST.
- */
- VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
-
- /*
- * Verify that the pool is no longer visible under the old name.
- */
- VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
-
- /*
- * Verify that we can open and close the pool using the new name.
- */
- VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
- ASSERT(pool_guid == spa_guid(spa));
- spa_close(spa, FTAG);
-
- nvlist_free(config);
-}
-
-static void
-ztest_resume(spa_t *spa)
-{
- if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6)
- (void) printf("resuming from suspended state\n");
- spa_vdev_state_enter(spa, SCL_NONE);
- vdev_clear(spa, NULL);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- (void) zio_resume(spa);
-}
-
-static void *
-ztest_resume_thread(void *arg)
-{
- spa_t *spa = arg;
-
- while (!ztest_exiting) {
- if (spa_suspended(spa))
- ztest_resume(spa);
- (void) poll(NULL, 0, 100);
-
- /*
- * Periodically change the zfs_compressed_arc_enabled setting.
- */
- if (ztest_random(10) == 0)
- zfs_compressed_arc_enabled = ztest_random(2);
-
- /*
- * Periodically change the zfs_abd_scatter_enabled setting.
- */
- if (ztest_random(10) == 0)
- zfs_abd_scatter_enabled = ztest_random(2);
- }
- return (NULL);
-}
-
-static void *
-ztest_deadman_thread(void *arg)
-{
- ztest_shared_t *zs = arg;
- spa_t *spa = ztest_spa;
- hrtime_t delta, total = 0;
-
- for (;;) {
- delta = zs->zs_thread_stop - zs->zs_thread_start +
- MSEC2NSEC(zfs_deadman_synctime_ms);
-
- (void) poll(NULL, 0, (int)NSEC2MSEC(delta));
-
- /*
- * If the pool is suspended then fail immediately. Otherwise,
- * check to see if the pool is making any progress. If
- * vdev_deadman() discovers that there hasn't been any recent
- * I/Os then it will end up aborting the tests.
- */
- if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
- fatal(0, "aborting test after %llu seconds because "
- "pool has transitioned to a suspended state.",
- zfs_deadman_synctime_ms / 1000);
- return (NULL);
- }
- vdev_deadman(spa->spa_root_vdev);
-
- total += zfs_deadman_synctime_ms/1000;
- (void) printf("ztest has been running for %lld seconds\n",
- total);
- }
-}
-
-static void
-ztest_execute(int test, ztest_info_t *zi, uint64_t id)
-{
- ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
- ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
- hrtime_t functime = gethrtime();
-
- for (int i = 0; i < zi->zi_iters; i++)
- zi->zi_func(zd, id);
-
- functime = gethrtime() - functime;
-
- atomic_add_64(&zc->zc_count, 1);
- atomic_add_64(&zc->zc_time, functime);
-
- if (ztest_opts.zo_verbose >= 4) {
- Dl_info dli;
- (void) dladdr((void *)zi->zi_func, &dli);
- (void) printf("%6.2f sec in %s\n",
- (double)functime / NANOSEC, dli.dli_sname);
- }
-}
-
-static void *
-ztest_thread(void *arg)
-{
- int rand;
- uint64_t id = (uintptr_t)arg;
- ztest_shared_t *zs = ztest_shared;
- uint64_t call_next;
- hrtime_t now;
- ztest_info_t *zi;
- ztest_shared_callstate_t *zc;
-
- while ((now = gethrtime()) < zs->zs_thread_stop) {
- /*
- * See if it's time to force a crash.
- */
- if (now > zs->zs_thread_kill)
- ztest_kill(zs);
-
- /*
- * If we're getting ENOSPC with some regularity, stop.
- */
- if (zs->zs_enospc_count > 10)
- break;
-
- /*
- * Pick a random function to execute.
- */
- rand = ztest_random(ZTEST_FUNCS);
- zi = &ztest_info[rand];
- zc = ZTEST_GET_SHARED_CALLSTATE(rand);
- call_next = zc->zc_next;
-
- if (now >= call_next &&
- atomic_cas_64(&zc->zc_next, call_next, call_next +
- ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) {
- ztest_execute(rand, zi, id);
- }
- }
-
- return (NULL);
-}
-
-static void
-ztest_dataset_name(char *dsname, char *pool, int d)
-{
- (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d);
-}
-
-static void
-ztest_dataset_destroy(int d)
-{
- char name[ZFS_MAX_DATASET_NAME_LEN];
-
- ztest_dataset_name(name, ztest_opts.zo_pool, d);
-
- if (ztest_opts.zo_verbose >= 3)
- (void) printf("Destroying %s to free up space\n", name);
-
- /*
- * Cleanup any non-standard clones and snapshots. In general,
- * ztest thread t operates on dataset (t % zopt_datasets),
- * so there may be more than one thing to clean up.
- */
- for (int t = d; t < ztest_opts.zo_threads;
- t += ztest_opts.zo_datasets) {
- ztest_dsl_dataset_cleanup(name, t);
- }
-
- (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
- DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
-}
-
-static void
-ztest_dataset_dirobj_verify(ztest_ds_t *zd)
-{
- uint64_t usedobjs, dirobjs, scratch;
-
- /*
- * ZTEST_DIROBJ is the object directory for the entire dataset.
- * Therefore, the number of objects in use should equal the
- * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
- * If not, we have an object leak.
- *
- * Note that we can only check this in ztest_dataset_open(),
- * when the open-context and syncing-context values agree.
- * That's because zap_count() returns the open-context value,
- * while dmu_objset_space() returns the rootbp fill count.
- */
- VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
- dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
- ASSERT3U(dirobjs + 1, ==, usedobjs);
-}
-
-static int
-ztest_dataset_open(int d)
-{
- ztest_ds_t *zd = &ztest_ds[d];
- uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
- objset_t *os;
- zilog_t *zilog;
- char name[ZFS_MAX_DATASET_NAME_LEN];
- int error;
-
- ztest_dataset_name(name, ztest_opts.zo_pool, d);
-
- rw_enter(&ztest_name_lock, RW_READER);
-
- error = ztest_dataset_create(name);
- if (error == ENOSPC) {
- rw_exit(&ztest_name_lock);
- ztest_record_enospc(FTAG);
- return (error);
- }
- ASSERT(error == 0 || error == EEXIST);
-
- VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
- rw_exit(&ztest_name_lock);
-
- ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
-
- zilog = zd->zd_zilog;
-
- if (zilog->zl_header->zh_claim_lr_seq != 0 &&
- zilog->zl_header->zh_claim_lr_seq < committed_seq)
- fatal(0, "missing log records: claimed %llu < committed %llu",
- zilog->zl_header->zh_claim_lr_seq, committed_seq);
-
- ztest_dataset_dirobj_verify(zd);
-
- zil_replay(os, zd, ztest_replay_vector);
-
- ztest_dataset_dirobj_verify(zd);
-
- if (ztest_opts.zo_verbose >= 6)
- (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
- zd->zd_name,
- (u_longlong_t)zilog->zl_parse_blk_count,
- (u_longlong_t)zilog->zl_parse_lr_count,
- (u_longlong_t)zilog->zl_replaying_seq);
-
- zilog = zil_open(os, ztest_get_data);
-
- if (zilog->zl_replaying_seq != 0 &&
- zilog->zl_replaying_seq < committed_seq)
- fatal(0, "missing log records: replayed %llu < committed %llu",
- zilog->zl_replaying_seq, committed_seq);
-
- return (0);
-}
-
-static void
-ztest_dataset_close(int d)
-{
- ztest_ds_t *zd = &ztest_ds[d];
-
- zil_close(zd->zd_zilog);
- dmu_objset_disown(zd->zd_os, zd);
-
- ztest_zd_fini(zd);
-}
-
-/*
- * Kick off threads to run tests on all datasets in parallel.
- */
-static void
-ztest_run(ztest_shared_t *zs)
-{
- thread_t *tid;
- spa_t *spa;
- objset_t *os;
- thread_t resume_tid;
- int error;
-
- ztest_exiting = B_FALSE;
-
- /*
- * Initialize parent/child shared state.
- */
- mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL);
- mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
- rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
-
- zs->zs_thread_start = gethrtime();
- zs->zs_thread_stop =
- zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
- zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
- zs->zs_thread_kill = zs->zs_thread_stop;
- if (ztest_random(100) < ztest_opts.zo_killrate) {
- zs->zs_thread_kill -=
- ztest_random(ztest_opts.zo_passtime * NANOSEC);
- }
-
- mutex_init(&zcl.zcl_callbacks_lock, NULL, USYNC_THREAD, NULL);
-
- list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
- offsetof(ztest_cb_data_t, zcd_node));
-
- /*
- * Open our pool.
- */
- kernel_init(FREAD | FWRITE);
- VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
- metaslab_preload_limit = ztest_random(20) + 1;
- ztest_spa = spa;
-
- dmu_objset_stats_t dds;
- VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
- DMU_OST_ANY, B_TRUE, FTAG, &os));
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
- dmu_objset_fast_stat(os, &dds);
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
- zs->zs_guid = dds.dds_guid;
- dmu_objset_disown(os, FTAG);
-
- spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
-
- /*
- * We don't expect the pool to suspend unless maxfaults == 0,
- * in which case ztest_fault_inject() temporarily takes away
- * the only valid replica.
- */
- if (MAXFAULTS() == 0)
- spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
- else
- spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
-
- /*
- * Create a thread to periodically resume suspended I/O.
- */
- VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
- &resume_tid) == 0);
-
- /*
- * Create a deadman thread to abort() if we hang.
- */
- VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
- NULL) == 0);
-
- /*
- * Verify that we can safely inquire about any object,
- * whether it's allocated or not. To make it interesting,
- * we probe a 5-wide window around each power of two.
- * This hits all edge cases, including zero and the max.
- */
- for (int t = 0; t < 64; t++) {
- for (int d = -5; d <= 5; d++) {
- error = dmu_object_info(spa->spa_meta_objset,
- (1ULL << t) + d, NULL);
- ASSERT(error == 0 || error == ENOENT ||
- error == EINVAL);
- }
- }
-
- /*
- * If we got any ENOSPC errors on the previous run, destroy something.
- */
- if (zs->zs_enospc_count != 0) {
- int d = ztest_random(ztest_opts.zo_datasets);
- ztest_dataset_destroy(d);
- }
- zs->zs_enospc_count = 0;
-
- tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t),
- UMEM_NOFAIL);
-
- if (ztest_opts.zo_verbose >= 4)
- (void) printf("starting main threads...\n");
-
- /*
- * Kick off all the tests that run in parallel.
- */
- for (int t = 0; t < ztest_opts.zo_threads; t++) {
- if (t < ztest_opts.zo_datasets &&
- ztest_dataset_open(t) != 0)
- return;
- VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
- THR_BOUND, &tid[t]) == 0);
- }
-
- /*
- * Wait for all of the tests to complete. We go in reverse order
- * so we don't close datasets while threads are still using them.
- */
- for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
- VERIFY(thr_join(tid[t], NULL, NULL) == 0);
- if (t < ztest_opts.zo_datasets)
- ztest_dataset_close(t);
- }
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
- zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
- zfs_dbgmsg_print(FTAG);
-
- umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
-
- /* Kill the resume thread */
- ztest_exiting = B_TRUE;
- VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
- ztest_resume(spa);
-
- /*
- * Right before closing the pool, kick off a bunch of async I/O;
- * spa_close() should wait for it to complete.
- */
- for (uint64_t object = 1; object < 50; object++) {
- dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
- ZIO_PRIORITY_SYNC_READ);
- }
-
- spa_close(spa, FTAG);
-
- /*
- * Verify that we can loop over all pools.
- */
- mutex_enter(&spa_namespace_lock);
- for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
- if (ztest_opts.zo_verbose > 3)
- (void) printf("spa_next: found %s\n", spa_name(spa));
- mutex_exit(&spa_namespace_lock);
-
- /*
- * Verify that we can export the pool and reimport it under a
- * different name.
- */
- if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) {
- char name[ZFS_MAX_DATASET_NAME_LEN];
- (void) snprintf(name, sizeof (name), "%s_import",
- ztest_opts.zo_pool);
- ztest_spa_import_export(ztest_opts.zo_pool, name);
- ztest_spa_import_export(name, ztest_opts.zo_pool);
- }
-
- kernel_fini();
-
- list_destroy(&zcl.zcl_callbacks);
-
- mutex_destroy(&zcl.zcl_callbacks_lock);
-
- rw_destroy(&ztest_name_lock);
- mutex_destroy(&ztest_vdev_lock);
- mutex_destroy(&ztest_checkpoint_lock);
-}
-
-static void
-ztest_freeze(void)
-{
- ztest_ds_t *zd = &ztest_ds[0];
- spa_t *spa;
- int numloops = 0;
-
- if (ztest_opts.zo_verbose >= 3)
- (void) printf("testing spa_freeze()...\n");
-
- kernel_init(FREAD | FWRITE);
- VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
- VERIFY3U(0, ==, ztest_dataset_open(0));
- ztest_spa = spa;
-
- /*
- * Force the first log block to be transactionally allocated.
- * We have to do this before we freeze the pool -- otherwise
- * the log chain won't be anchored.
- */
- while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
- ztest_dmu_object_alloc_free(zd, 0);
- zil_commit(zd->zd_zilog, 0);
- }
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- /*
- * Freeze the pool. This stops spa_sync() from doing anything,
- * so that the only way to record changes from now on is the ZIL.
- */
- spa_freeze(spa);
-
- /*
- * Because it is hard to predict how much space a write will actually
- * require beforehand, we leave ourselves some fudge space to write over
- * capacity.
- */
- uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
-
- /*
- * Run tests that generate log records but don't alter the pool config
- * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
- * We do a txg_wait_synced() after each iteration to force the txg
- * to increase well beyond the last synced value in the uberblock.
- * The ZIL should be OK with that.
- *
- * Run a random number of times less than zo_maxloops and ensure we do
- * not run out of space on the pool.
- */
- while (ztest_random(10) != 0 &&
- numloops++ < ztest_opts.zo_maxloops &&
- metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
- ztest_od_t od;
- ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
- VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
- ztest_io(zd, od.od_object,
- ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
- txg_wait_synced(spa_get_dsl(spa), 0);
- }
-
- /*
- * Commit all of the changes we just generated.
- */
- zil_commit(zd->zd_zilog, 0);
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- /*
- * Close our dataset and close the pool.
- */
- ztest_dataset_close(0);
- spa_close(spa, FTAG);
- kernel_fini();
-
- /*
- * Open and close the pool and dataset to induce log replay.
- */
- kernel_init(FREAD | FWRITE);
- VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
- ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
- VERIFY3U(0, ==, ztest_dataset_open(0));
- ztest_dataset_close(0);
-
- ztest_spa = spa;
- txg_wait_synced(spa_get_dsl(spa), 0);
- ztest_reguid(NULL, 0);
-
- spa_close(spa, FTAG);
- kernel_fini();
-}
-
-void
-print_time(hrtime_t t, char *timebuf)
-{
- hrtime_t s = t / NANOSEC;
- hrtime_t m = s / 60;
- hrtime_t h = m / 60;
- hrtime_t d = h / 24;
-
- s -= m * 60;
- m -= h * 60;
- h -= d * 24;
-
- timebuf[0] = '\0';
-
- if (d)
- (void) sprintf(timebuf,
- "%llud%02lluh%02llum%02llus", d, h, m, s);
- else if (h)
- (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
- else if (m)
- (void) sprintf(timebuf, "%llum%02llus", m, s);
- else
- (void) sprintf(timebuf, "%llus", s);
-}
-
-static nvlist_t *
-make_random_props()
-{
- nvlist_t *props;
-
- VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
-
- if (ztest_random(2) == 0)
- return (props);
- VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
-
- return (props);
-}
-
-/*
- * Import a storage pool with the given name.
- */
-static void
-ztest_import(ztest_shared_t *zs)
-{
- libzfs_handle_t *hdl;
- importargs_t args = { 0 };
- spa_t *spa;
- nvlist_t *cfg = NULL;
- int nsearch = 1;
- char *searchdirs[nsearch];
- char *name = ztest_opts.zo_pool;
- int flags = ZFS_IMPORT_MISSING_LOG;
- int error;
-
- mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
-
- kernel_init(FREAD | FWRITE);
- hdl = libzfs_init();
-
- searchdirs[0] = ztest_opts.zo_dir;
- args.paths = nsearch;
- args.path = searchdirs;
- args.can_be_active = B_FALSE;
-
- error = zpool_tryimport(hdl, name, &cfg, &args);
- if (error)
- (void) fatal(0, "No pools found\n");
-
- VERIFY0(spa_import(name, cfg, NULL, flags));
- VERIFY0(spa_open(name, &spa, FTAG));
- zs->zs_metaslab_sz =
- 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
- spa_close(spa, FTAG);
-
- libzfs_fini(hdl);
- kernel_fini();
-
- if (!ztest_opts.zo_mmp_test) {
- ztest_run_zdb(ztest_opts.zo_pool);
- ztest_freeze();
- ztest_run_zdb(ztest_opts.zo_pool);
- }
-
- rw_destroy(&ztest_name_lock);
- mutex_destroy(&ztest_vdev_lock);
-}
-
-/*
- * Create a storage pool with the given name and initial vdev size.
- * Then test spa_freeze() functionality.
- */
-static void
-ztest_init(ztest_shared_t *zs)
-{
- spa_t *spa;
- nvlist_t *nvroot, *props;
-
- mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
- mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL);
- rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
-
- kernel_init(FREAD | FWRITE);
-
- /*
- * Create the storage pool.
- */
- (void) spa_destroy(ztest_opts.zo_pool);
- ztest_shared->zs_vdev_next_leaf = 0;
- zs->zs_splits = 0;
- zs->zs_mirrors = ztest_opts.zo_mirrors;
- nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
- NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
- props = make_random_props();
- for (int i = 0; i < SPA_FEATURES; i++) {
- char buf[1024];
- (void) snprintf(buf, sizeof (buf), "feature@%s",
- spa_feature_table[i].fi_uname);
- VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
- }
- VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
- nvlist_free(nvroot);
- nvlist_free(props);
-
- VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
- zs->zs_metaslab_sz =
- 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
-
- spa_close(spa, FTAG);
-
- kernel_fini();
-
- if (!ztest_opts.zo_mmp_test) {
- ztest_run_zdb(ztest_opts.zo_pool);
- ztest_freeze();
- ztest_run_zdb(ztest_opts.zo_pool);
- }
-
- rw_destroy(&ztest_name_lock);
- mutex_destroy(&ztest_vdev_lock);
- mutex_destroy(&ztest_checkpoint_lock);
-}
-
-static void
-setup_data_fd(void)
-{
- static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
-
- ztest_fd_data = mkstemp(ztest_name_data);
- ASSERT3S(ztest_fd_data, >=, 0);
- (void) unlink(ztest_name_data);
-}
-
-
-static int
-shared_data_size(ztest_shared_hdr_t *hdr)
-{
- int size;
-
- size = hdr->zh_hdr_size;
- size += hdr->zh_opts_size;
- size += hdr->zh_size;
- size += hdr->zh_stats_size * hdr->zh_stats_count;
- size += hdr->zh_ds_size * hdr->zh_ds_count;
-
- return (size);
-}
-
-static void
-setup_hdr(void)
-{
- int size;
- ztest_shared_hdr_t *hdr;
-
- hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
- PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
- ASSERT(hdr != MAP_FAILED);
-
- VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
-
- hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
- hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
- hdr->zh_size = sizeof (ztest_shared_t);
- hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
- hdr->zh_stats_count = ZTEST_FUNCS;
- hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
- hdr->zh_ds_count = ztest_opts.zo_datasets;
-
- size = shared_data_size(hdr);
- VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
-
- (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
-}
-
-static void
-setup_data(void)
-{
- int size, offset;
- ztest_shared_hdr_t *hdr;
- uint8_t *buf;
-
- hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
- PROT_READ, MAP_SHARED, ztest_fd_data, 0);
- ASSERT(hdr != MAP_FAILED);
-
- size = shared_data_size(hdr);
-
- (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
- hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
- PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
- ASSERT(hdr != MAP_FAILED);
- buf = (uint8_t *)hdr;
-
- offset = hdr->zh_hdr_size;
- ztest_shared_opts = (void *)&buf[offset];
- offset += hdr->zh_opts_size;
- ztest_shared = (void *)&buf[offset];
- offset += hdr->zh_size;
- ztest_shared_callstate = (void *)&buf[offset];
- offset += hdr->zh_stats_size * hdr->zh_stats_count;
- ztest_shared_ds = (void *)&buf[offset];
-}
-
-static boolean_t
-exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
-{
- pid_t pid;
- int status;
- char *cmdbuf = NULL;
-
- pid = fork();
-
- if (cmd == NULL) {
- cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
- (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
- cmd = cmdbuf;
- }
-
- if (pid == -1)
- fatal(1, "fork failed");
-
- if (pid == 0) { /* child */
- char *emptyargv[2] = { cmd, NULL };
- char fd_data_str[12];
-
- struct rlimit rl = { 1024, 1024 };
- (void) setrlimit(RLIMIT_NOFILE, &rl);
-
- (void) close(ztest_fd_rand);
- VERIFY3U(11, >=,
- snprintf(fd_data_str, 12, "%d", ztest_fd_data));
- VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));
-
- (void) enable_extended_FILE_stdio(-1, -1);
- if (libpath != NULL)
- VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
-#ifdef illumos
- (void) execv(cmd, emptyargv);
-#else
- (void) execvp(cmd, emptyargv);
-#endif
- ztest_dump_core = B_FALSE;
- fatal(B_TRUE, "exec failed: %s", cmd);
- }
-
- if (cmdbuf != NULL) {
- umem_free(cmdbuf, MAXPATHLEN);
- cmd = NULL;
- }
-
- while (waitpid(pid, &status, 0) != pid)
- continue;
- if (statusp != NULL)
- *statusp = status;
-
- if (WIFEXITED(status)) {
- if (WEXITSTATUS(status) != 0) {
- (void) fprintf(stderr, "child exited with code %d\n",
- WEXITSTATUS(status));
- exit(2);
- }
- return (B_FALSE);
- } else if (WIFSIGNALED(status)) {
- if (!ignorekill || WTERMSIG(status) != SIGKILL) {
- (void) fprintf(stderr, "child died with signal %d\n",
- WTERMSIG(status));
- exit(3);
- }
- return (B_TRUE);
- } else {
- (void) fprintf(stderr, "something strange happened to child\n");
- exit(4);
- /* NOTREACHED */
- }
-}
-
-static void
-ztest_run_init(void)
-{
- ztest_shared_t *zs = ztest_shared;
-
- /*
- * Blow away any existing copy of zpool.cache
- */
- (void) remove(spa_config_path);
-
- if (ztest_opts.zo_init == 0) {
- if (ztest_opts.zo_verbose >= 1)
- (void) printf("Importing pool %s\n",
- ztest_opts.zo_pool);
- ztest_import(zs);
- return;
- }
-
- /*
- * Create and initialize our storage pool.
- */
- for (int i = 1; i <= ztest_opts.zo_init; i++) {
- bzero(zs, sizeof (ztest_shared_t));
- if (ztest_opts.zo_verbose >= 3 &&
- ztest_opts.zo_init != 1) {
- (void) printf("ztest_init(), pass %d\n", i);
- }
- ztest_init(zs);
- }
-}
-
-int
-main(int argc, char **argv)
-{
- int kills = 0;
- int iters = 0;
- int older = 0;
- int newer = 0;
- ztest_shared_t *zs;
- ztest_info_t *zi;
- ztest_shared_callstate_t *zc;
- char timebuf[100];
- char numbuf[NN_NUMBUF_SZ];
- char *cmd;
- boolean_t hasalt;
- char *fd_data_str = getenv("ZTEST_FD_DATA");
-
- (void) setvbuf(stdout, NULL, _IOLBF, 0);
-
- dprintf_setup(&argc, argv);
- zfs_deadman_synctime_ms = 300000;
- /*
- * As two-word space map entries may not come up often (especially
- * if pool and vdev sizes are small) we want to force at least some
- * of them so the feature get tested.
- */
- zfs_force_some_double_word_sm_entries = B_TRUE;
-
- /*
- * Verify that even extensively damaged split blocks with many
- * segments can be reconstructed in a reasonable amount of time
- * when reconstruction is known to be possible.
- */
- zfs_reconstruct_indirect_damage_fraction = 4;
-
- ztest_fd_rand = open("/dev/urandom", O_RDONLY);
- ASSERT3S(ztest_fd_rand, >=, 0);
-
- if (!fd_data_str) {
- process_options(argc, argv);
-
- setup_data_fd();
- setup_hdr();
- setup_data();
- bcopy(&ztest_opts, ztest_shared_opts,
- sizeof (*ztest_shared_opts));
- } else {
- ztest_fd_data = atoi(fd_data_str);
- setup_data();
- bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
- }
- ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
-
- /* Override location of zpool.cache */
- VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
- ztest_opts.zo_dir), !=, -1);
-
- ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
- UMEM_NOFAIL);
- zs = ztest_shared;
-
- if (fd_data_str) {
- metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
- metaslab_df_alloc_threshold =
- zs->zs_metaslab_df_alloc_threshold;
-
- if (zs->zs_do_init)
- ztest_run_init();
- else
- ztest_run(zs);
- exit(0);
- }
-
- hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
-
- if (ztest_opts.zo_verbose >= 1) {
- (void) printf("%llu vdevs, %d datasets, %d threads,"
- " %llu seconds...\n",
- (u_longlong_t)ztest_opts.zo_vdevs,
- ztest_opts.zo_datasets,
- ztest_opts.zo_threads,
- (u_longlong_t)ztest_opts.zo_time);
- }
-
- cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
- (void) strlcpy(cmd, getexecname(), MAXNAMELEN);
-
- zs->zs_do_init = B_TRUE;
- if (strlen(ztest_opts.zo_alt_ztest) != 0) {
- if (ztest_opts.zo_verbose >= 1) {
- (void) printf("Executing older ztest for "
- "initialization: %s\n", ztest_opts.zo_alt_ztest);
- }
- VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
- ztest_opts.zo_alt_libpath, B_FALSE, NULL));
- } else {
- VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
- }
- zs->zs_do_init = B_FALSE;
-
- zs->zs_proc_start = gethrtime();
- zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
-
- for (int f = 0; f < ZTEST_FUNCS; f++) {
- zi = &ztest_info[f];
- zc = ZTEST_GET_SHARED_CALLSTATE(f);
- if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
- zc->zc_next = UINT64_MAX;
- else
- zc->zc_next = zs->zs_proc_start +
- ztest_random(2 * zi->zi_interval[0] + 1);
- }
-
- /*
- * Run the tests in a loop. These tests include fault injection
- * to verify that self-healing data works, and forced crashes
- * to verify that we never lose on-disk consistency.
- */
- while (gethrtime() < zs->zs_proc_stop) {
- int status;
- boolean_t killed;
-
- /*
- * Initialize the workload counters for each function.
- */
- for (int f = 0; f < ZTEST_FUNCS; f++) {
- zc = ZTEST_GET_SHARED_CALLSTATE(f);
- zc->zc_count = 0;
- zc->zc_time = 0;
- }
-
- /* Set the allocation switch size */
- zs->zs_metaslab_df_alloc_threshold =
- ztest_random(zs->zs_metaslab_sz / 4) + 1;
-
- if (!hasalt || ztest_random(2) == 0) {
- if (hasalt && ztest_opts.zo_verbose >= 1) {
- (void) printf("Executing newer ztest: %s\n",
- cmd);
- }
- newer++;
- killed = exec_child(cmd, NULL, B_TRUE, &status);
- } else {
- if (hasalt && ztest_opts.zo_verbose >= 1) {
- (void) printf("Executing older ztest: %s\n",
- ztest_opts.zo_alt_ztest);
- }
- older++;
- killed = exec_child(ztest_opts.zo_alt_ztest,
- ztest_opts.zo_alt_libpath, B_TRUE, &status);
- }
-
- if (killed)
- kills++;
- iters++;
-
- if (ztest_opts.zo_verbose >= 1) {
- hrtime_t now = gethrtime();
-
- now = MIN(now, zs->zs_proc_stop);
- print_time(zs->zs_proc_stop - now, timebuf);
- nicenum(zs->zs_space, numbuf, sizeof (numbuf));
-
- (void) printf("Pass %3d, %8s, %3llu ENOSPC, "
- "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
- iters,
- WIFEXITED(status) ? "Complete" : "SIGKILL",
- (u_longlong_t)zs->zs_enospc_count,
- 100.0 * zs->zs_alloc / zs->zs_space,
- numbuf,
- 100.0 * (now - zs->zs_proc_start) /
- (ztest_opts.zo_time * NANOSEC), timebuf);
- }
-
- if (ztest_opts.zo_verbose >= 2) {
- (void) printf("\nWorkload summary:\n\n");
- (void) printf("%7s %9s %s\n",
- "Calls", "Time", "Function");
- (void) printf("%7s %9s %s\n",
- "-----", "----", "--------");
- for (int f = 0; f < ZTEST_FUNCS; f++) {
- Dl_info dli;
-
- zi = &ztest_info[f];
- zc = ZTEST_GET_SHARED_CALLSTATE(f);
- print_time(zc->zc_time, timebuf);
- (void) dladdr((void *)zi->zi_func, &dli);
- (void) printf("%7llu %9s %s\n",
- (u_longlong_t)zc->zc_count, timebuf,
- dli.dli_sname);
- }
- (void) printf("\n");
- }
-
- if (!ztest_opts.zo_mmp_test)
- ztest_run_zdb(ztest_opts.zo_pool);
- }
-
- if (ztest_opts.zo_verbose >= 1) {
- if (hasalt) {
- (void) printf("%d runs of older ztest: %s\n", older,
- ztest_opts.zo_alt_ztest);
- (void) printf("%d runs of newer ztest: %s\n", newer,
- cmd);
- }
- (void) printf("%d killed, %d completed, %.0f%% kill rate\n",
- kills, iters - kills, (100.0 * kills) / MAX(1, iters));
- }
-
- umem_free(cmd, MAXNAMELEN);
-
- return (0);
-}
Index: head/cddl/contrib/opensolaris/lib/libdtrace/common/drti.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libdtrace/common/drti.c
+++ head/cddl/contrib/opensolaris/lib/libdtrace/common/drti.c
@@ -24,6 +24,7 @@
* Use is subject to license terms.
*/
+#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>
#include <dlfcn.h>
Index: head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c
+++ head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c
@@ -31,6 +31,7 @@
#include <assert.h>
#include <elf.h>
+#include <sys/types.h>
#include <fcntl.h>
#include <gelf.h>
#include <limits.h>
Index: head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c
+++ head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c
@@ -77,7 +77,6 @@
#include <netdb.h>
#include <netinet/in.h>
#include <arpa/inet.h>
-#include <arpa/nameser.h>
#include <dt_module.h>
#include <dt_printf.h>
Index: head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c
+++ head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c
@@ -44,11 +44,18 @@
#include <netdb.h>
#include <netinet/in.h>
#include <arpa/inet.h>
-#include <arpa/nameser.h>
-
+#include <sys/byteorder.h>
#include <dt_printf.h>
#include <dt_string.h>
#include <dt_impl.h>
+
+#ifndef NS_IN6ADDRSZ
+#define NS_IN6ADDRSZ 16
+#endif
+
+#ifndef NS_INADDRSZ
+#define NS_INADDRSZ 4
+#endif
/*ARGSUSED*/
static int
Index: head/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h
+++ head/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h
@@ -1,196 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- */
-
-#ifndef _LIBNVPAIR_H
-#define _LIBNVPAIR_H
-
-#include <sys/nvpair.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <regex.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * All interfaces described in this file are private to Solaris, and
- * are subject to change at any time and without notice. The public
- * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR,
- * are all imported from <sys/nvpair.h> included above.
- */
-
-extern int nvpair_value_match(nvpair_t *, int, char *, char **);
-extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *,
- char **);
-
-extern void nvlist_print(FILE *, nvlist_t *);
-extern int nvlist_print_json(FILE *, nvlist_t *);
-extern void dump_nvlist(nvlist_t *, int);
-
-/*
- * Private nvlist printing interface that allows the caller some control
- * over output rendering (as opposed to nvlist_print and dump_nvlist).
- *
- * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc
- * (NULL on failure); on return the cookie is set up for default formatting
- * and rendering. Quote the cookie in subsequent customisation functions and
- * then pass the cookie to nvlist_prt to render the nvlist. Finally,
- * use nvlist_prtctl_free to release the cookie.
- *
- * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions
- * we have a corresponding brace of functions that appoint replacement
- * rendering functions:
- *
- * extern void nvlist_prtctl_xxx(nvlist_prtctl_t,
- * void (*)(nvlist_prtctl_t ctl, void *private, const char *name,
- * xxxtype value))
- *
- * and
- *
- * extern void nvlist_prtctl_xxx_array(nvlist_prtctl_t,
- * void (*)(nvlist_prtctl_t ctl, void *private, const char *name,
- * xxxtype value, uint_t count))
- *
- * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8"
- * and char * for "string". The function that is appointed to render the
- * specified datatype receives as arguments the cookie, the nvlist
- * member name, the value of that member (or a pointer for array function),
- * and (for array rendering functions) a count of the number of elements.
- */
-
-typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */
-
-enum nvlist_indent_mode {
- NVLIST_INDENT_ABS, /* Absolute indentation */
- NVLIST_INDENT_TABBED /* Indent with tabstops */
-};
-
-extern nvlist_prtctl_t nvlist_prtctl_alloc(void);
-extern void nvlist_prtctl_free(nvlist_prtctl_t);
-extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t);
-
-/* Output stream */
-extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *);
-extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t);
-
-/* Indentation mode, start indent, indent increment; default tabbed/0/1 */
-extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode,
- int, int);
-extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int);
-
-enum nvlist_prtctl_fmt {
- NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */
- NVLIST_FMT_MEMBER_POSTAMBLE, /* after nvlist member; default "\n" */
- NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */
-};
-
-extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt,
- const char *);
-extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...);
-
-/*
- * Function prototypes for interfaces that appoint a new rendering function
- * for single-valued nvlist members.
- *
- * A replacement function receives arguments as follows:
- *
- * nvlist_prtctl_t Print control structure; do not change preferences
- * for this object from a print callback function.
- *
- * void * The function-private cookie argument registered
- * when the replacement function was appointed.
- *
- * nvlist_t * The full nvlist that is being processed. The
- * rendering function is called to render a single
- * member (name and value passed as below) but it may
- * want to reference or incorporate other aspects of
- * the full nvlist.
- *
- * const char * Member name to render
- *
- * valtype Value of the member to render
- *
- * The function must return non-zero if it has rendered output for this
- * member, or 0 if it wants to default to standard rendering for this
- * one member.
- */
-
-#define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \
- extern void funcname(nvlist_prtctl_t, \
- int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \
- void *)
-
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t);
-NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *);
-
-#undef NVLIST_PRINTCTL_SVDECL /* was just for "clarity" above */
-
-/*
- * Function prototypes for interfaces that appoint a new rendering function
- * for array-valued nvlist members.
- *
- * One additional argument is taken: uint_t for the number of array elements
- *
- * Return values as above.
- */
-#define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \
- extern void funcname(nvlist_prtctl_t, \
- int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \
- void *)
-
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **);
-NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **);
-
-#undef NVLIST_PRINTCTL_AVDECL /* was just for "clarity" above */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBNVPAIR_H */
Index: head/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c
+++ head/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c
@@ -1,1286 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#include <solaris.h>
-#include <inttypes.h>
-#include <unistd.h>
-#include <string.h>
-#include <libintl.h>
-#include <stdarg.h>
-#include "libnvpair.h"
-
-/*
- * libnvpair - A tools library for manipulating <name, value> pairs.
- *
- * This library provides routines packing an unpacking nv pairs
- * for transporting data across process boundaries, transporting
- * between kernel and userland, and possibly saving onto disk files.
- */
-
-/*
- * Print control structure.
- */
-
-#define DEFINEOP(opname, vtype) \
- struct { \
- int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
- const char *, vtype); \
- void *arg; \
- } opname
-
-#define DEFINEARROP(opname, vtype) \
- struct { \
- int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
- const char *, vtype, uint_t); \
- void *arg; \
- } opname
-
-struct nvlist_printops {
- DEFINEOP(print_boolean, int);
- DEFINEOP(print_boolean_value, boolean_t);
- DEFINEOP(print_byte, uchar_t);
- DEFINEOP(print_int8, int8_t);
- DEFINEOP(print_uint8, uint8_t);
- DEFINEOP(print_int16, int16_t);
- DEFINEOP(print_uint16, uint16_t);
- DEFINEOP(print_int32, int32_t);
- DEFINEOP(print_uint32, uint32_t);
- DEFINEOP(print_int64, int64_t);
- DEFINEOP(print_uint64, uint64_t);
- DEFINEOP(print_double, double);
- DEFINEOP(print_string, char *);
- DEFINEOP(print_hrtime, hrtime_t);
- DEFINEOP(print_nvlist, nvlist_t *);
- DEFINEARROP(print_boolean_array, boolean_t *);
- DEFINEARROP(print_byte_array, uchar_t *);
- DEFINEARROP(print_int8_array, int8_t *);
- DEFINEARROP(print_uint8_array, uint8_t *);
- DEFINEARROP(print_int16_array, int16_t *);
- DEFINEARROP(print_uint16_array, uint16_t *);
- DEFINEARROP(print_int32_array, int32_t *);
- DEFINEARROP(print_uint32_array, uint32_t *);
- DEFINEARROP(print_int64_array, int64_t *);
- DEFINEARROP(print_uint64_array, uint64_t *);
- DEFINEARROP(print_string_array, char **);
- DEFINEARROP(print_nvlist_array, nvlist_t **);
-};
-
-struct nvlist_prtctl {
- FILE *nvprt_fp; /* output destination */
- enum nvlist_indent_mode nvprt_indent_mode; /* see above */
- int nvprt_indent; /* absolute indent, or tab depth */
- int nvprt_indentinc; /* indent or tab increment */
- const char *nvprt_nmfmt; /* member name format, max one %s */
- const char *nvprt_eomfmt; /* after member format, e.g. "\n" */
- const char *nvprt_btwnarrfmt; /* between array members */
- int nvprt_btwnarrfmt_nl; /* nvprt_eoamfmt includes newline? */
- struct nvlist_printops *nvprt_dfltops;
- struct nvlist_printops *nvprt_custops;
-};
-
-#define DFLTPRTOP(pctl, type) \
- ((pctl)->nvprt_dfltops->print_##type.op)
-
-#define DFLTPRTOPARG(pctl, type) \
- ((pctl)->nvprt_dfltops->print_##type.arg)
-
-#define CUSTPRTOP(pctl, type) \
- ((pctl)->nvprt_custops->print_##type.op)
-
-#define CUSTPRTOPARG(pctl, type) \
- ((pctl)->nvprt_custops->print_##type.arg)
-
-#define RENDER(pctl, type, nvl, name, val) \
- { \
- int done = 0; \
- if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
- done = CUSTPRTOP(pctl, type)(pctl, \
- CUSTPRTOPARG(pctl, type), nvl, name, val); \
- } \
- if (!done) { \
- (void) DFLTPRTOP(pctl, type)(pctl, \
- DFLTPRTOPARG(pctl, type), nvl, name, val); \
- } \
- (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \
- }
-
-#define ARENDER(pctl, type, nvl, name, arrp, count) \
- { \
- int done = 0; \
- if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
- done = CUSTPRTOP(pctl, type)(pctl, \
- CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \
- } \
- if (!done) { \
- (void) DFLTPRTOP(pctl, type)(pctl, \
- DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \
- } \
- (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \
- }
-
-static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t);
-
-/*
- * ======================================================================
- * | |
- * | Indentation |
- * | |
- * ======================================================================
- */
-
-static void
-indent(nvlist_prtctl_t pctl, int onemore)
-{
- int depth;
-
- switch (pctl->nvprt_indent_mode) {
- case NVLIST_INDENT_ABS:
- (void) fprintf(pctl->nvprt_fp, "%*s",
- pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, "");
- break;
-
- case NVLIST_INDENT_TABBED:
- depth = pctl->nvprt_indent + onemore;
- while (depth-- > 0)
- (void) fprintf(pctl->nvprt_fp, "\t");
- }
-}
-
-/*
- * ======================================================================
- * | |
- * | Default nvlist member rendering functions. |
- * | |
- * ======================================================================
- */
-
-/*
- * Generate functions to print single-valued nvlist members.
- *
- * type_and_variant - suffix to form function name
- * vtype - C type for the member value
- * ptype - C type to cast value to for printing
- * vfmt - format string for pair value, e.g "%d" or "0x%llx"
- */
-
-#define NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \
-static int \
-nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
- nvlist_t *nvl, const char *name, vtype value) \
-{ \
- FILE *fp = pctl->nvprt_fp; \
- NOTE(ARGUNUSED(private)) \
- NOTE(ARGUNUSED(nvl)) \
- indent(pctl, 1); \
- (void) fprintf(fp, pctl->nvprt_nmfmt, name); \
- (void) fprintf(fp, vfmt, (ptype)value); \
- return (1); \
-}
-
-NVLIST_PRTFUNC(boolean, int, int, "%d")
-NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d")
-NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x")
-NVLIST_PRTFUNC(int8, int8_t, int, "%d")
-NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x")
-NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d")
-NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x")
-NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d")
-NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x")
-NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld")
-NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx")
-NVLIST_PRTFUNC(double, double, double, "0x%f")
-NVLIST_PRTFUNC(string, char *, char *, "%s")
-NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx")
-
-/*
- * Generate functions to print array-valued nvlist members.
- */
-
-#define NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \
-static int \
-nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
- nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \
-{ \
- FILE *fp = pctl->nvprt_fp; \
- uint_t i; \
- NOTE(ARGUNUSED(private)) \
- NOTE(ARGUNUSED(nvl)) \
- for (i = 0; i < count; i++) { \
- if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \
- indent(pctl, 1); \
- (void) fprintf(fp, pctl->nvprt_nmfmt, name); \
- if (pctl->nvprt_btwnarrfmt_nl) \
- (void) fprintf(fp, "[%d]: ", i); \
- } \
- if (i != 0) \
- (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \
- (void) fprintf(fp, vfmt, (ptype)valuep[i]); \
- } \
- return (1); \
-}
-
-NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d")
-NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x")
-NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d")
-NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x")
-NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d")
-NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x")
-NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d")
-NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x")
-NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld")
-NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx")
-NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s")
-
-/*ARGSUSED*/
-static int
-nvprint_nvlist(nvlist_prtctl_t pctl, void *private,
- nvlist_t *nvl, const char *name, nvlist_t *value)
-{
- FILE *fp = pctl->nvprt_fp;
-
- indent(pctl, 1);
- (void) fprintf(fp, "%s = (embedded nvlist)\n", name);
-
- pctl->nvprt_indent += pctl->nvprt_indentinc;
- nvlist_print_with_indent(value, pctl);
- pctl->nvprt_indent -= pctl->nvprt_indentinc;
-
- indent(pctl, 1);
- (void) fprintf(fp, "(end %s)\n", name);
-
- return (1);
-}
-
-/*ARGSUSED*/
-static int
-nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private,
- nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count)
-{
- FILE *fp = pctl->nvprt_fp;
- uint_t i;
-
- indent(pctl, 1);
- (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name);
-
- for (i = 0; i < count; i++) {
- indent(pctl, 1);
- (void) fprintf(fp, "(start %s[%d])\n", name, i);
-
- pctl->nvprt_indent += pctl->nvprt_indentinc;
- nvlist_print_with_indent(valuep[i], pctl);
- pctl->nvprt_indent -= pctl->nvprt_indentinc;
-
- indent(pctl, 1);
- (void) fprintf(fp, "(end %s[%d])\n", name, i);
- }
-
- return (1);
-}
-
-/*
- * ======================================================================
- * | |
- * | Interfaces that allow control over formatting. |
- * | |
- * ======================================================================
- */
-
-void
-nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp)
-{
- pctl->nvprt_fp = fp;
-}
-
-FILE *
-nvlist_prtctl_getdest(nvlist_prtctl_t pctl)
-{
- return (pctl->nvprt_fp);
-}
-
-
-void
-nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode,
- int start, int inc)
-{
- if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED)
- mode = NVLIST_INDENT_TABBED;
-
- if (start < 0)
- start = 0;
-
- if (inc < 0)
- inc = 1;
-
- pctl->nvprt_indent_mode = mode;
- pctl->nvprt_indent = start;
- pctl->nvprt_indentinc = inc;
-}
-
-void
-nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore)
-{
- indent(pctl, onemore);
-}
-
-
-void
-nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which,
- const char *fmt)
-{
- switch (which) {
- case NVLIST_FMT_MEMBER_NAME:
- if (fmt == NULL)
- fmt = "%s = ";
- pctl->nvprt_nmfmt = fmt;
- break;
-
- case NVLIST_FMT_MEMBER_POSTAMBLE:
- if (fmt == NULL)
- fmt = "\n";
- pctl->nvprt_eomfmt = fmt;
- break;
-
- case NVLIST_FMT_BTWN_ARRAY:
- if (fmt == NULL) {
- pctl->nvprt_btwnarrfmt = " ";
- pctl->nvprt_btwnarrfmt_nl = 0;
- } else {
- pctl->nvprt_btwnarrfmt = fmt;
- pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL);
- }
- break;
-
- default:
- break;
- }
-}
-
-
-void
-nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...)
-{
- FILE *fp = pctl->nvprt_fp;
- va_list ap;
- char *name;
-
- va_start(ap, which);
-
- switch (which) {
- case NVLIST_FMT_MEMBER_NAME:
- name = va_arg(ap, char *);
- (void) fprintf(fp, pctl->nvprt_nmfmt, name);
- break;
-
- case NVLIST_FMT_MEMBER_POSTAMBLE:
- (void) fprintf(fp, pctl->nvprt_eomfmt);
- break;
-
- case NVLIST_FMT_BTWN_ARRAY:
- (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \
- break;
-
- default:
- break;
- }
-
- va_end(ap);
-}
-
-/*
- * ======================================================================
- * | |
- * | Interfaces to allow appointment of replacement rendering functions.|
- * | |
- * ======================================================================
- */
-
-#define NVLIST_PRINTCTL_REPLACE(type, vtype) \
-void \
-nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
- int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \
- void *private) \
-{ \
- CUSTPRTOP(pctl, type) = func; \
- CUSTPRTOPARG(pctl, type) = private; \
-}
-
-NVLIST_PRINTCTL_REPLACE(boolean, int)
-NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t)
-NVLIST_PRINTCTL_REPLACE(byte, uchar_t)
-NVLIST_PRINTCTL_REPLACE(int8, int8_t)
-NVLIST_PRINTCTL_REPLACE(uint8, uint8_t)
-NVLIST_PRINTCTL_REPLACE(int16, int16_t)
-NVLIST_PRINTCTL_REPLACE(uint16, uint16_t)
-NVLIST_PRINTCTL_REPLACE(int32, int32_t)
-NVLIST_PRINTCTL_REPLACE(uint32, uint32_t)
-NVLIST_PRINTCTL_REPLACE(int64, int64_t)
-NVLIST_PRINTCTL_REPLACE(uint64, uint64_t)
-NVLIST_PRINTCTL_REPLACE(double, double)
-NVLIST_PRINTCTL_REPLACE(string, char *)
-NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t)
-NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *)
-
-#define NVLIST_PRINTCTL_AREPLACE(type, vtype) \
-void \
-nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
- int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \
- uint_t), void *private) \
-{ \
- CUSTPRTOP(pctl, type) = func; \
- CUSTPRTOPARG(pctl, type) = private; \
-}
-
-NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *)
-NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *)
-NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *)
-NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *)
-NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *)
-NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *)
-NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *)
-NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *)
-NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *)
-NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *)
-NVLIST_PRINTCTL_AREPLACE(string_array, char **)
-NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **)
-
-/*
- * ======================================================================
- * | |
- * | Interfaces to manage nvlist_prtctl_t cookies. |
- * | |
- * ======================================================================
- */
-
-
-static const struct nvlist_printops defprtops = {
- { nvprint_boolean, NULL },
- { nvprint_boolean_value, NULL },
- { nvprint_byte, NULL },
- { nvprint_int8, NULL },
- { nvprint_uint8, NULL },
- { nvprint_int16, NULL },
- { nvprint_uint16, NULL },
- { nvprint_int32, NULL },
- { nvprint_uint32, NULL },
- { nvprint_int64, NULL },
- { nvprint_uint64, NULL },
- { nvprint_double, NULL },
- { nvprint_string, NULL },
- { nvprint_hrtime, NULL },
- { nvprint_nvlist, NULL },
- { nvaprint_boolean_array, NULL },
- { nvaprint_byte_array, NULL },
- { nvaprint_int8_array, NULL },
- { nvaprint_uint8_array, NULL },
- { nvaprint_int16_array, NULL },
- { nvaprint_uint16_array, NULL },
- { nvaprint_int32_array, NULL },
- { nvaprint_uint32_array, NULL },
- { nvaprint_int64_array, NULL },
- { nvaprint_uint64_array, NULL },
- { nvaprint_string_array, NULL },
- { nvaprint_nvlist_array, NULL },
-};
-
-static void
-prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl,
- struct nvlist_printops *ops)
-{
- pctl->nvprt_fp = fp;
- pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED;
- pctl->nvprt_indent = 0;
- pctl->nvprt_indentinc = 1;
- pctl->nvprt_nmfmt = "%s = ";
- pctl->nvprt_eomfmt = "\n";
- pctl->nvprt_btwnarrfmt = " ";
- pctl->nvprt_btwnarrfmt_nl = 0;
-
- pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops;
- pctl->nvprt_custops = ops;
-}
-
-nvlist_prtctl_t
-nvlist_prtctl_alloc(void)
-{
- struct nvlist_prtctl *pctl;
- struct nvlist_printops *ops;
-
- if ((pctl = malloc(sizeof (*pctl))) == NULL)
- return (NULL);
-
- if ((ops = calloc(1, sizeof (*ops))) == NULL) {
- free(pctl);
- return (NULL);
- }
-
- prtctl_defaults(stdout, pctl, ops);
-
- return (pctl);
-}
-
-void
-nvlist_prtctl_free(nvlist_prtctl_t pctl)
-{
- if (pctl != NULL) {
- free(pctl->nvprt_custops);
- free(pctl);
- }
-}
-
-/*
- * ======================================================================
- * | |
- * | Top-level print request interfaces. |
- * | |
- * ======================================================================
- */
-
-/*
- * nvlist_print - Prints elements in an event buffer
- */
-static void
-nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl)
-{
- FILE *fp = pctl->nvprt_fp;
- char *name;
- uint_t nelem;
- nvpair_t *nvp;
-
- if (nvl == NULL)
- return;
-
- indent(pctl, 0);
- (void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl));
-
- nvp = nvlist_next_nvpair(nvl, NULL);
-
- while (nvp) {
- data_type_t type = nvpair_type(nvp);
-
- name = nvpair_name(nvp);
- nelem = 0;
-
- switch (type) {
- case DATA_TYPE_BOOLEAN: {
- RENDER(pctl, boolean, nvl, name, 1);
- break;
- }
- case DATA_TYPE_BOOLEAN_VALUE: {
- boolean_t val;
- (void) nvpair_value_boolean_value(nvp, &val);
- RENDER(pctl, boolean_value, nvl, name, val);
- break;
- }
- case DATA_TYPE_BYTE: {
- uchar_t val;
- (void) nvpair_value_byte(nvp, &val);
- RENDER(pctl, byte, nvl, name, val);
- break;
- }
- case DATA_TYPE_INT8: {
- int8_t val;
- (void) nvpair_value_int8(nvp, &val);
- RENDER(pctl, int8, nvl, name, val);
- break;
- }
- case DATA_TYPE_UINT8: {
- uint8_t val;
- (void) nvpair_value_uint8(nvp, &val);
- RENDER(pctl, uint8, nvl, name, val);
- break;
- }
- case DATA_TYPE_INT16: {
- int16_t val;
- (void) nvpair_value_int16(nvp, &val);
- RENDER(pctl, int16, nvl, name, val);
- break;
- }
- case DATA_TYPE_UINT16: {
- uint16_t val;
- (void) nvpair_value_uint16(nvp, &val);
- RENDER(pctl, uint16, nvl, name, val);
- break;
- }
- case DATA_TYPE_INT32: {
- int32_t val;
- (void) nvpair_value_int32(nvp, &val);
- RENDER(pctl, int32, nvl, name, val);
- break;
- }
- case DATA_TYPE_UINT32: {
- uint32_t val;
- (void) nvpair_value_uint32(nvp, &val);
- RENDER(pctl, uint32, nvl, name, val);
- break;
- }
- case DATA_TYPE_INT64: {
- int64_t val;
- (void) nvpair_value_int64(nvp, &val);
- RENDER(pctl, int64, nvl, name, val);
- break;
- }
- case DATA_TYPE_UINT64: {
- uint64_t val;
- (void) nvpair_value_uint64(nvp, &val);
- RENDER(pctl, uint64, nvl, name, val);
- break;
- }
- case DATA_TYPE_DOUBLE: {
- double val;
- (void) nvpair_value_double(nvp, &val);
- RENDER(pctl, double, nvl, name, val);
- break;
- }
- case DATA_TYPE_STRING: {
- char *val;
- (void) nvpair_value_string(nvp, &val);
- RENDER(pctl, string, nvl, name, val);
- break;
- }
- case DATA_TYPE_BOOLEAN_ARRAY: {
- boolean_t *val;
- (void) nvpair_value_boolean_array(nvp, &val, &nelem);
- ARENDER(pctl, boolean_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_BYTE_ARRAY: {
- uchar_t *val;
- (void) nvpair_value_byte_array(nvp, &val, &nelem);
- ARENDER(pctl, byte_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_INT8_ARRAY: {
- int8_t *val;
- (void) nvpair_value_int8_array(nvp, &val, &nelem);
- ARENDER(pctl, int8_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_UINT8_ARRAY: {
- uint8_t *val;
- (void) nvpair_value_uint8_array(nvp, &val, &nelem);
- ARENDER(pctl, uint8_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_INT16_ARRAY: {
- int16_t *val;
- (void) nvpair_value_int16_array(nvp, &val, &nelem);
- ARENDER(pctl, int16_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_UINT16_ARRAY: {
- uint16_t *val;
- (void) nvpair_value_uint16_array(nvp, &val, &nelem);
- ARENDER(pctl, uint16_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_INT32_ARRAY: {
- int32_t *val;
- (void) nvpair_value_int32_array(nvp, &val, &nelem);
- ARENDER(pctl, int32_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_UINT32_ARRAY: {
- uint32_t *val;
- (void) nvpair_value_uint32_array(nvp, &val, &nelem);
- ARENDER(pctl, uint32_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_INT64_ARRAY: {
- int64_t *val;
- (void) nvpair_value_int64_array(nvp, &val, &nelem);
- ARENDER(pctl, int64_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_UINT64_ARRAY: {
- uint64_t *val;
- (void) nvpair_value_uint64_array(nvp, &val, &nelem);
- ARENDER(pctl, uint64_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_STRING_ARRAY: {
- char **val;
- (void) nvpair_value_string_array(nvp, &val, &nelem);
- ARENDER(pctl, string_array, nvl, name, val, nelem);
- break;
- }
- case DATA_TYPE_HRTIME: {
- hrtime_t val;
- (void) nvpair_value_hrtime(nvp, &val);
- RENDER(pctl, hrtime, nvl, name, val);
- break;
- }
- case DATA_TYPE_NVLIST: {
- nvlist_t *val;
- (void) nvpair_value_nvlist(nvp, &val);
- RENDER(pctl, nvlist, nvl, name, val);
- break;
- }
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **val;
- (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
- ARENDER(pctl, nvlist_array, nvl, name, val, nelem);
- break;
- }
- default:
- (void) fprintf(fp, " unknown data type (%d)", type);
- break;
- }
- nvp = nvlist_next_nvpair(nvl, nvp);
- }
-}
-
-void
-nvlist_print(FILE *fp, nvlist_t *nvl)
-{
- struct nvlist_prtctl pc;
-
- prtctl_defaults(fp, &pc, NULL);
- nvlist_print_with_indent(nvl, &pc);
-}
-
-void
-nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl)
-{
- nvlist_print_with_indent(nvl, pctl);
-}
-
-#define NVP(elem, type, vtype, ptype, format) { \
- vtype value; \
-\
- (void) nvpair_value_##type(elem, &value); \
- (void) printf("%*s%s: " format "\n", indent, "", \
- nvpair_name(elem), (ptype)value); \
-}
-
-#define NVPA(elem, type, vtype, ptype, format) { \
- uint_t i, count; \
- vtype *value; \
-\
- (void) nvpair_value_##type(elem, &value, &count); \
- for (i = 0; i < count; i++) { \
- (void) printf("%*s%s[%d]: " format "\n", indent, "", \
- nvpair_name(elem), i, (ptype)value[i]); \
- } \
-}
-
-/*
- * Similar to nvlist_print() but handles arrays slightly differently.
- */
-void
-dump_nvlist(nvlist_t *list, int indent)
-{
- nvpair_t *elem = NULL;
- boolean_t bool_value;
- boolean_t *bool_array_value;
- nvlist_t *nvlist_value;
- nvlist_t **nvlist_array_value;
- uint_t i, count;
-
- if (list == NULL) {
- return;
- }
-
- while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
- switch (nvpair_type(elem)) {
- case DATA_TYPE_BOOLEAN:
- (void) printf("%*s%s\n", indent, "", nvpair_name(elem));
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- (void) nvpair_value_boolean_value(elem, &bool_value);
- (void) printf("%*s%s: %s\n", indent, "",
- nvpair_name(elem), bool_value ? "true" : "false");
- break;
-
- case DATA_TYPE_BYTE:
- NVP(elem, byte, uchar_t, int, "%u");
- break;
-
- case DATA_TYPE_INT8:
- NVP(elem, int8, int8_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT8:
- NVP(elem, uint8, uint8_t, int, "%u");
- break;
-
- case DATA_TYPE_INT16:
- NVP(elem, int16, int16_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT16:
- NVP(elem, uint16, uint16_t, int, "%u");
- break;
-
- case DATA_TYPE_INT32:
- NVP(elem, int32, int32_t, long, "%ld");
- break;
-
- case DATA_TYPE_UINT32:
- NVP(elem, uint32, uint32_t, ulong_t, "%lu");
- break;
-
- case DATA_TYPE_INT64:
- NVP(elem, int64, int64_t, longlong_t, "%lld");
- break;
-
- case DATA_TYPE_UINT64:
- NVP(elem, uint64, uint64_t, u_longlong_t, "%llu");
- break;
-
- case DATA_TYPE_STRING:
- NVP(elem, string, char *, char *, "'%s'");
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- (void) nvpair_value_boolean_array(elem,
- &bool_array_value, &count);
- for (i = 0; i < count; i++) {
- (void) printf("%*s%s[%d]: %s\n", indent, "",
- nvpair_name(elem), i,
- bool_array_value[i] ? "true" : "false");
- }
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- NVPA(elem, byte_array, uchar_t, int, "%u");
- break;
-
- case DATA_TYPE_INT8_ARRAY:
- NVPA(elem, int8_array, int8_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT8_ARRAY:
- NVPA(elem, uint8_array, uint8_t, int, "%u");
- break;
-
- case DATA_TYPE_INT16_ARRAY:
- NVPA(elem, int16_array, int16_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT16_ARRAY:
- NVPA(elem, uint16_array, uint16_t, int, "%u");
- break;
-
- case DATA_TYPE_INT32_ARRAY:
- NVPA(elem, int32_array, int32_t, long, "%ld");
- break;
-
- case DATA_TYPE_UINT32_ARRAY:
- NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu");
- break;
-
- case DATA_TYPE_INT64_ARRAY:
- NVPA(elem, int64_array, int64_t, longlong_t, "%lld");
- break;
-
- case DATA_TYPE_UINT64_ARRAY:
- NVPA(elem, uint64_array, uint64_t, u_longlong_t,
- "%llu");
- break;
-
- case DATA_TYPE_STRING_ARRAY:
- NVPA(elem, string_array, char *, char *, "'%s'");
- break;
-
- case DATA_TYPE_NVLIST:
- (void) nvpair_value_nvlist(elem, &nvlist_value);
- (void) printf("%*s%s:\n", indent, "",
- nvpair_name(elem));
- dump_nvlist(nvlist_value, indent + 4);
- break;
-
- case DATA_TYPE_NVLIST_ARRAY:
- (void) nvpair_value_nvlist_array(elem,
- &nvlist_array_value, &count);
- for (i = 0; i < count; i++) {
- (void) printf("%*s%s[%u]:\n", indent, "",
- nvpair_name(elem), i);
- dump_nvlist(nvlist_array_value[i], indent + 4);
- }
- break;
-
- default:
- (void) printf(dgettext(TEXT_DOMAIN, "bad config type "
- "%d for %s\n"), nvpair_type(elem),
- nvpair_name(elem));
- }
- }
-}
-
-/*
- * ======================================================================
- * | |
- * | Misc private interface. |
- * | |
- * ======================================================================
- */
-
-/*
- * Determine if string 'value' matches 'nvp' value. The 'value' string is
- * converted, depending on the type of 'nvp', prior to match. For numeric
- * types, a radix independent sscanf conversion of 'value' is used. If 'nvp'
- * is an array type, 'ai' is the index into the array against which we are
- * checking for match. If nvp is of DATA_TYPE_STRING*, the caller can pass
- * in a regex_t compilation of value in 'value_regex' to trigger regular
- * expression string match instead of simple strcmp().
- *
- * Return 1 on match, 0 on no-match, and -1 on error. If the error is
- * related to value syntax error and 'ep' is non-NULL, *ep will point into
- * the 'value' string at the location where the error exists.
- *
- * NOTE: It may be possible to move the non-regex_t version of this into
- * common code used by library/kernel/boot.
- */
-int
-nvpair_value_match_regex(nvpair_t *nvp, int ai,
- char *value, regex_t *value_regex, char **ep)
-{
- char *evalue;
- uint_t a_len;
- int sr;
-
- if (ep)
- *ep = NULL;
-
- if ((nvp == NULL) || (value == NULL))
- return (-1); /* error fail match - invalid args */
-
- /* make sure array and index combination make sense */
- if ((nvpair_type_is_array(nvp) && (ai < 0)) ||
- (!nvpair_type_is_array(nvp) && (ai >= 0)))
- return (-1); /* error fail match - bad index */
-
- /* non-string values should be single 'chunk' */
- if ((nvpair_type(nvp) != DATA_TYPE_STRING) &&
- (nvpair_type(nvp) != DATA_TYPE_STRING_ARRAY)) {
- value += strspn(value, " \t");
- evalue = value + strcspn(value, " \t");
- if (*evalue) {
- if (ep)
- *ep = evalue;
- return (-1); /* error fail match - syntax */
- }
- }
-
- sr = EOF;
- switch (nvpair_type(nvp)) {
- case DATA_TYPE_STRING: {
- char *val;
-
- /* check string value for match */
- if (nvpair_value_string(nvp, &val) == 0) {
- if (value_regex) {
- if (regexec(value_regex, val,
- (size_t)0, NULL, 0) == 0)
- return (1); /* match */
- } else {
- if (strcmp(value, val) == 0)
- return (1); /* match */
- }
- }
- break;
- }
- case DATA_TYPE_STRING_ARRAY: {
- char **val_array;
-
- /* check indexed string value of array for match */
- if ((nvpair_value_string_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len)) {
- if (value_regex) {
- if (regexec(value_regex, val_array[ai],
- (size_t)0, NULL, 0) == 0)
- return (1);
- } else {
- if (strcmp(value, val_array[ai]) == 0)
- return (1);
- }
- }
- break;
- }
- case DATA_TYPE_BYTE: {
- uchar_t val, val_arg;
-
- /* scanf uchar_t from value and check for match */
- sr = sscanf(value, "%c", &val_arg);
- if ((sr == 1) && (nvpair_value_byte(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_BYTE_ARRAY: {
- uchar_t *val_array, val_arg;
-
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%c", &val_arg);
- if ((sr == 1) &&
- (nvpair_value_byte_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT8: {
- int8_t val, val_arg;
-
- /* scanf int8_t from value and check for match */
- sr = sscanf(value, "%"SCNi8, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int8(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT8_ARRAY: {
- int8_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi8, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int8_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT8: {
- uint8_t val, val_arg;
-
- /* scanf uint8_t from value and check for match */
- sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint8(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT8_ARRAY: {
- uint8_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint8_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT16: {
- int16_t val, val_arg;
-
- /* scanf int16_t from value and check for match */
- sr = sscanf(value, "%"SCNi16, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int16(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT16_ARRAY: {
- int16_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi16, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int16_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT16: {
- uint16_t val, val_arg;
-
- /* scanf uint16_t from value and check for match */
- sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint16(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT16_ARRAY: {
- uint16_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint16_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT32: {
- int32_t val, val_arg;
-
- /* scanf int32_t from value and check for match */
- sr = sscanf(value, "%"SCNi32, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int32(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT32_ARRAY: {
- int32_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi32, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int32_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT32: {
- uint32_t val, val_arg;
-
- /* scanf uint32_t from value and check for match */
- sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint32(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT32_ARRAY: {
- uint32_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint32_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT64: {
- int64_t val, val_arg;
-
- /* scanf int64_t from value and check for match */
- sr = sscanf(value, "%"SCNi64, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int64(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_INT64_ARRAY: {
- int64_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi64, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_int64_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT64: {
- uint64_t val_arg, val;
-
- /* scanf uint64_t from value and check for match */
- sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint64(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_UINT64_ARRAY: {
- uint64_t *val_array, val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg);
- if ((sr == 1) &&
- (nvpair_value_uint64_array(nvp, &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_BOOLEAN_VALUE: {
- int32_t val_arg;
- boolean_t val;
-
- /* scanf boolean_t from value and check for match */
- sr = sscanf(value, "%"SCNi32, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_boolean_value(nvp, &val) == 0) &&
- (val == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_BOOLEAN_ARRAY: {
- boolean_t *val_array;
- int32_t val_arg;
-
- /* check indexed value of array for match */
- sr = sscanf(value, "%"SCNi32, &val_arg);
- if ((sr == 1) &&
- (nvpair_value_boolean_array(nvp,
- &val_array, &a_len) == 0) &&
- (ai < a_len) &&
- (val_array[ai] == val_arg))
- return (1);
- break;
- }
- case DATA_TYPE_HRTIME:
- case DATA_TYPE_NVLIST:
- case DATA_TYPE_NVLIST_ARRAY:
- case DATA_TYPE_BOOLEAN:
- case DATA_TYPE_DOUBLE:
- case DATA_TYPE_UNKNOWN:
- default:
- /*
- * unknown/unsupported data type
- */
- return (-1); /* error fail match */
- }
-
- /*
- * check to see if sscanf failed conversion, return approximate
- * pointer to problem
- */
- if (sr != 1) {
- if (ep)
- *ep = value;
- return (-1); /* error fail match - syntax */
- }
-
- return (0); /* fail match */
-}
-
-int
-nvpair_value_match(nvpair_t *nvp, int ai, char *value, char **ep)
-{
- return (nvpair_value_match_regex(nvp, ai, value, NULL, ep));
-}
Index: head/cddl/contrib/opensolaris/lib/libnvpair/nvpair_alloc_system.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libnvpair/nvpair_alloc_system.c
+++ head/cddl/contrib/opensolaris/lib/libnvpair/nvpair_alloc_system.c
@@ -1,59 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/nvpair.h>
-#include <stdlib.h>
-
-/*ARGSUSED*/
-static void *
-nv_alloc_sys(nv_alloc_t *nva, size_t size)
-{
- return (malloc(size));
-}
-
-/*ARGSUSED*/
-static void
-nv_free_sys(nv_alloc_t *nva, void *buf, size_t size)
-{
- free(buf);
-}
-
-const nv_alloc_ops_t system_ops_def = {
- NULL, /* nv_ao_init() */
- NULL, /* nv_ao_fini() */
- nv_alloc_sys, /* nv_ao_alloc() */
- nv_free_sys, /* nv_ao_free() */
- NULL /* nv_ao_reset() */
-};
-
-nv_alloc_t nv_alloc_nosleep_def = {
- &system_ops_def,
- NULL
-};
-
-nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def;
Index: head/cddl/contrib/opensolaris/lib/libnvpair/nvpair_json.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libnvpair/nvpair_json.c
+++ head/cddl/contrib/opensolaris/lib/libnvpair/nvpair_json.c
@@ -1,406 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-/*
- * Copyright (c) 2014, Joyent, Inc.
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <wchar.h>
-#include <sys/debug.h>
-
-#include "libnvpair.h"
-
-#define FPRINTF(fp, ...) \
- do { \
- if (fprintf(fp, __VA_ARGS__) < 0) \
- return (-1); \
- } while (0)
-
-/*
- * When formatting a string for JSON output we must escape certain characters,
- * as described in RFC4627. This applies to both member names and
- * DATA_TYPE_STRING values.
- *
- * This function will only operate correctly if the following conditions are
- * met:
- *
- * 1. The input String is encoded in the current locale.
- *
- * 2. The current locale includes the Basic Multilingual Plane (plane 0)
- * as defined in the Unicode standard.
- *
- * The output will be entirely 7-bit ASCII (as a subset of UTF-8) with all
- * representable Unicode characters included in their escaped numeric form.
- */
-static int
-nvlist_print_json_string(FILE *fp, const char *input)
-{
- mbstate_t mbr;
- wchar_t c;
- size_t sz;
-
- bzero(&mbr, sizeof (mbr));
-
- FPRINTF(fp, "\"");
- while ((sz = mbrtowc(&c, input, MB_CUR_MAX, &mbr)) > 0) {
- switch (c) {
- case '"':
- FPRINTF(fp, "\\\"");
- break;
- case '\n':
- FPRINTF(fp, "\\n");
- break;
- case '\r':
- FPRINTF(fp, "\\r");
- break;
- case '\\':
- FPRINTF(fp, "\\\\");
- break;
- case '\f':
- FPRINTF(fp, "\\f");
- break;
- case '\t':
- FPRINTF(fp, "\\t");
- break;
- case '\b':
- FPRINTF(fp, "\\b");
- break;
- default:
- if ((c >= 0x00 && c <= 0x1f) ||
- (c > 0x7f && c <= 0xffff)) {
- /*
- * Render both Control Characters and Unicode
- * characters in the Basic Multilingual Plane
- * as JSON-escaped multibyte characters.
- */
- FPRINTF(fp, "\\u%04x", (int)(0xffff & c));
- } else if (c >= 0x20 && c <= 0x7f) {
- /*
- * Render other 7-bit ASCII characters directly
- * and drop other, unrepresentable characters.
- */
- FPRINTF(fp, "%c", (int)(0xff & c));
- }
- break;
- }
- input += sz;
- }
-
- if (sz == (size_t)-1 || sz == (size_t)-2) {
- /*
- * We last read an invalid multibyte character sequence,
- * so return an error.
- */
- return (-1);
- }
-
- FPRINTF(fp, "\"");
- return (0);
-}
-
-/*
- * Dump a JSON-formatted representation of an nvlist to the provided FILE *.
- * This routine does not output any new-lines or additional whitespace other
- * than that contained in strings, nor does it call fflush(3C).
- */
-int
-nvlist_print_json(FILE *fp, nvlist_t *nvl)
-{
- nvpair_t *curr;
- boolean_t first = B_TRUE;
-
- FPRINTF(fp, "{");
-
- for (curr = nvlist_next_nvpair(nvl, NULL); curr;
- curr = nvlist_next_nvpair(nvl, curr)) {
- data_type_t type = nvpair_type(curr);
-
- if (!first)
- FPRINTF(fp, ",");
- else
- first = B_FALSE;
-
- if (nvlist_print_json_string(fp, nvpair_name(curr)) == -1)
- return (-1);
- FPRINTF(fp, ":");
-
- switch (type) {
- case DATA_TYPE_STRING: {
- char *string = fnvpair_value_string(curr);
- if (nvlist_print_json_string(fp, string) == -1)
- return (-1);
- break;
- }
-
- case DATA_TYPE_BOOLEAN: {
- FPRINTF(fp, "true");
- break;
- }
-
- case DATA_TYPE_BOOLEAN_VALUE: {
- FPRINTF(fp, "%s", fnvpair_value_boolean_value(curr) ==
- B_TRUE ? "true" : "false");
- break;
- }
-
- case DATA_TYPE_BYTE: {
- FPRINTF(fp, "%hhu", fnvpair_value_byte(curr));
- break;
- }
-
- case DATA_TYPE_INT8: {
- FPRINTF(fp, "%hhd", fnvpair_value_int8(curr));
- break;
- }
-
- case DATA_TYPE_UINT8: {
- FPRINTF(fp, "%hhu", fnvpair_value_uint8_t(curr));
- break;
- }
-
- case DATA_TYPE_INT16: {
- FPRINTF(fp, "%hd", fnvpair_value_int16(curr));
- break;
- }
-
- case DATA_TYPE_UINT16: {
- FPRINTF(fp, "%hu", fnvpair_value_uint16(curr));
- break;
- }
-
- case DATA_TYPE_INT32: {
- FPRINTF(fp, "%d", fnvpair_value_int32(curr));
- break;
- }
-
- case DATA_TYPE_UINT32: {
- FPRINTF(fp, "%u", fnvpair_value_uint32(curr));
- break;
- }
-
- case DATA_TYPE_INT64: {
- FPRINTF(fp, "%lld",
- (long long)fnvpair_value_int64(curr));
- break;
- }
-
- case DATA_TYPE_UINT64: {
- FPRINTF(fp, "%llu",
- (unsigned long long)fnvpair_value_uint64(curr));
- break;
- }
-
- case DATA_TYPE_HRTIME: {
- hrtime_t val;
- VERIFY0(nvpair_value_hrtime(curr, &val));
- FPRINTF(fp, "%llu", (unsigned long long)val);
- break;
- }
-
- case DATA_TYPE_DOUBLE: {
- double val;
- VERIFY0(nvpair_value_double(curr, &val));
- FPRINTF(fp, "%f", val);
- break;
- }
-
- case DATA_TYPE_NVLIST: {
- if (nvlist_print_json(fp,
- fnvpair_value_nvlist(curr)) == -1)
- return (-1);
- break;
- }
-
- case DATA_TYPE_STRING_ARRAY: {
- char **val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_string_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- if (nvlist_print_json_string(fp, val[i]) == -1)
- return (-1);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_nvlist_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- if (nvlist_print_json(fp, val[i]) == -1)
- return (-1);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_BOOLEAN_ARRAY: {
- boolean_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_boolean_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, val[i] == B_TRUE ?
- "true" : "false");
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_BYTE_ARRAY: {
- uchar_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_byte_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%hhu", val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_UINT8_ARRAY: {
- uint8_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_uint8_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%hhu", val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_INT8_ARRAY: {
- int8_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_int8_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%hhd", val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_UINT16_ARRAY: {
- uint16_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_uint16_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%hu", val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_INT16_ARRAY: {
- int16_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_int16_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%hd", val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_UINT32_ARRAY: {
- uint32_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_uint32_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%u", val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_INT32_ARRAY: {
- int32_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_int32_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%d", val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_UINT64_ARRAY: {
- uint64_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_uint64_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%llu",
- (unsigned long long)val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_INT64_ARRAY: {
- int64_t *val;
- uint_t valsz, i;
- VERIFY0(nvpair_value_int64_array(curr, &val, &valsz));
- FPRINTF(fp, "[");
- for (i = 0; i < valsz; i++) {
- if (i > 0)
- FPRINTF(fp, ",");
- FPRINTF(fp, "%lld", (long long)val[i]);
- }
- FPRINTF(fp, "]");
- break;
- }
-
- case DATA_TYPE_UNKNOWN:
- case DATA_TYPE_DONTCARE:
- return (-1);
- }
-
- }
-
- FPRINTF(fp, "}");
- return (0);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h
@@ -1,391 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _LIBUUTIL_H
-#define _LIBUUTIL_H
-
-#include <solaris.h>
-#include <sys/types.h>
-#include <stdarg.h>
-#include <stdio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Standard flags codes.
- */
-#define UU_DEFAULT 0
-
-/*
- * Standard error codes.
- */
-#define UU_ERROR_NONE 0 /* no error */
-#define UU_ERROR_INVALID_ARGUMENT 1 /* invalid argument */
-#define UU_ERROR_UNKNOWN_FLAG 2 /* passed flag invalid */
-#define UU_ERROR_NO_MEMORY 3 /* out of memory */
-#define UU_ERROR_CALLBACK_FAILED 4 /* callback-initiated error */
-#define UU_ERROR_NOT_SUPPORTED 5 /* operation not supported */
-#define UU_ERROR_EMPTY 6 /* no value provided */
-#define UU_ERROR_UNDERFLOW 7 /* value is too small */
-#define UU_ERROR_OVERFLOW 8 /* value is too value */
-#define UU_ERROR_INVALID_CHAR 9 /* value contains unexpected char */
-#define UU_ERROR_INVALID_DIGIT 10 /* value contains digit not in base */
-
-#define UU_ERROR_SYSTEM 99 /* underlying system error */
-#define UU_ERROR_UNKNOWN 100 /* error status not known */
-
-/*
- * Standard program exit codes.
- */
-#define UU_EXIT_OK (*(uu_exit_ok()))
-#define UU_EXIT_FATAL (*(uu_exit_fatal()))
-#define UU_EXIT_USAGE (*(uu_exit_usage()))
-
-/*
- * Exit status profiles.
- */
-#define UU_PROFILE_DEFAULT 0
-#define UU_PROFILE_LAUNCHER 1
-
-/*
- * Error reporting functions.
- */
-uint32_t uu_error(void);
-const char *uu_strerror(uint32_t);
-
-/*
- * Program notification functions.
- */
-extern void uu_alt_exit(int);
-extern const char *uu_setpname(char *);
-extern const char *uu_getpname(void);
-/*PRINTFLIKE1*/
-extern void uu_warn(const char *, ...);
-extern void uu_vwarn(const char *, va_list);
-/*PRINTFLIKE1*/
-extern void uu_die(const char *, ...) __NORETURN;
-extern void uu_vdie(const char *, va_list) __NORETURN;
-/*PRINTFLIKE2*/
-extern void uu_xdie(int, const char *, ...) __NORETURN;
-extern void uu_vxdie(int, const char *, va_list) __NORETURN;
-
-/*
- * Exit status functions (not to be used directly)
- */
-extern int *uu_exit_ok(void);
-extern int *uu_exit_fatal(void);
-extern int *uu_exit_usage(void);
-
-/*
- * string->number conversions
- */
-extern int uu_strtoint(const char *, void *, size_t, int, int64_t, int64_t);
-extern int uu_strtouint(const char *, void *, size_t, int, uint64_t, uint64_t);
-
-/*
- * Debug print facility functions.
- */
-typedef struct uu_dprintf uu_dprintf_t;
-
-typedef enum {
- UU_DPRINTF_SILENT,
- UU_DPRINTF_FATAL,
- UU_DPRINTF_WARNING,
- UU_DPRINTF_NOTICE,
- UU_DPRINTF_INFO,
- UU_DPRINTF_DEBUG
-} uu_dprintf_severity_t;
-
-extern uu_dprintf_t *uu_dprintf_create(const char *, uu_dprintf_severity_t,
- uint_t);
-/*PRINTFLIKE3*/
-extern void uu_dprintf(uu_dprintf_t *, uu_dprintf_severity_t,
- const char *, ...);
-extern void uu_dprintf_destroy(uu_dprintf_t *);
-extern const char *uu_dprintf_getname(uu_dprintf_t *);
-
-/*
- * Identifier test flags and function.
- */
-#define UU_NAME_DOMAIN 0x1 /* allow SUNW, or com.sun, prefix */
-#define UU_NAME_PATH 0x2 /* allow '/'-delimited paths */
-
-int uu_check_name(const char *, uint_t);
-
-/*
- * File creation functions.
- */
-extern int uu_open_tmp(const char *dir, uint_t uflags);
-
-/*
- * Convenience functions.
- */
-#define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0]))
-
-/*PRINTFLIKE1*/
-extern char *uu_msprintf(const char *format, ...);
-extern void *uu_zalloc(size_t);
-extern char *uu_strdup(const char *);
-extern void uu_free(void *);
-
-extern boolean_t uu_strcaseeq(const char *a, const char *b);
-extern boolean_t uu_streq(const char *a, const char *b);
-extern char *uu_strndup(const char *s, size_t n);
-extern boolean_t uu_strbw(const char *a, const char *b);
-extern void *uu_memdup(const void *buf, size_t sz);
-extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len);
-
-/*
- * Comparison function type definition.
- * Developers should be careful in their use of the _private argument. If you
- * break interface guarantees, you get undefined behavior.
- */
-typedef int uu_compare_fn_t(const void *__left, const void *__right,
- void *__private);
-
-/*
- * Walk variant flags.
- * A data structure need not provide support for all variants and
- * combinations. Refer to the appropriate documentation.
- */
-#define UU_WALK_ROBUST 0x00000001 /* walk can survive removes */
-#define UU_WALK_REVERSE 0x00000002 /* reverse walk order */
-
-#define UU_WALK_PREORDER 0x00000010 /* walk tree in pre-order */
-#define UU_WALK_POSTORDER 0x00000020 /* walk tree in post-order */
-
-/*
- * Walk callback function return codes.
- */
-#define UU_WALK_ERROR -1
-#define UU_WALK_NEXT 0
-#define UU_WALK_DONE 1
-
-/*
- * Walk callback function type definition.
- */
-typedef int uu_walk_fn_t(void *_elem, void *_private);
-
-/*
- * lists: opaque structures
- */
-typedef struct uu_list_pool uu_list_pool_t;
-typedef struct uu_list uu_list_t;
-
-typedef struct uu_list_node {
- uintptr_t uln_opaque[2];
-} uu_list_node_t;
-
-typedef struct uu_list_walk uu_list_walk_t;
-
-typedef uintptr_t uu_list_index_t;
-
-/*
- * lists: interface
- *
- * basic usage:
- * typedef struct foo {
- * ...
- * uu_list_node_t foo_node;
- * ...
- * } foo_t;
- *
- * static int
- * foo_compare(void *l_arg, void *r_arg, void *private)
- * {
- * foo_t *l = l_arg;
- * foo_t *r = r_arg;
- *
- * if (... l greater than r ...)
- * return (1);
- * if (... l less than r ...)
- * return (-1);
- * return (0);
- * }
- *
- * ...
- * // at initialization time
- * foo_pool = uu_list_pool_create("foo_pool",
- * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare,
- * debugging? 0 : UU_AVL_POOL_DEBUG);
- * ...
- */
-uu_list_pool_t *uu_list_pool_create(const char *, size_t, size_t,
- uu_compare_fn_t *, uint32_t);
-#define UU_LIST_POOL_DEBUG 0x00000001
-
-void uu_list_pool_destroy(uu_list_pool_t *);
-
-/*
- * usage:
- *
- * foo_t *a;
- * a = malloc(sizeof(*a));
- * uu_list_node_init(a, &a->foo_list, pool);
- * ...
- * uu_list_node_fini(a, &a->foo_list, pool);
- * free(a);
- */
-void uu_list_node_init(void *, uu_list_node_t *, uu_list_pool_t *);
-void uu_list_node_fini(void *, uu_list_node_t *, uu_list_pool_t *);
-
-uu_list_t *uu_list_create(uu_list_pool_t *, void *_parent, uint32_t);
-#define UU_LIST_DEBUG 0x00000001
-#define UU_LIST_SORTED 0x00000002 /* list is sorted */
-
-void uu_list_destroy(uu_list_t *); /* list must be empty */
-
-size_t uu_list_numnodes(uu_list_t *);
-
-void *uu_list_first(uu_list_t *);
-void *uu_list_last(uu_list_t *);
-
-void *uu_list_next(uu_list_t *, void *);
-void *uu_list_prev(uu_list_t *, void *);
-
-int uu_list_walk(uu_list_t *, uu_walk_fn_t *, void *, uint32_t);
-
-uu_list_walk_t *uu_list_walk_start(uu_list_t *, uint32_t);
-void *uu_list_walk_next(uu_list_walk_t *);
-void uu_list_walk_end(uu_list_walk_t *);
-
-void *uu_list_find(uu_list_t *, void *, void *, uu_list_index_t *);
-void uu_list_insert(uu_list_t *, void *, uu_list_index_t);
-
-void *uu_list_nearest_next(uu_list_t *, uu_list_index_t);
-void *uu_list_nearest_prev(uu_list_t *, uu_list_index_t);
-
-void *uu_list_teardown(uu_list_t *, void **);
-
-void uu_list_remove(uu_list_t *, void *);
-
-/*
- * lists: interfaces for non-sorted lists only
- */
-int uu_list_insert_before(uu_list_t *, void *_target, void *_elem);
-int uu_list_insert_after(uu_list_t *, void *_target, void *_elem);
-
-/*
- * avl trees: opaque structures
- */
-typedef struct uu_avl_pool uu_avl_pool_t;
-typedef struct uu_avl uu_avl_t;
-
-typedef struct uu_avl_node {
-#ifdef _LP64
- uintptr_t uan_opaque[3];
-#else
- uintptr_t uan_opaque[4];
-#endif
-} uu_avl_node_t;
-
-typedef struct uu_avl_walk uu_avl_walk_t;
-
-typedef uintptr_t uu_avl_index_t;
-
-/*
- * avl trees: interface
- *
- * basic usage:
- * typedef struct foo {
- * ...
- * uu_avl_node_t foo_node;
- * ...
- * } foo_t;
- *
- * static int
- * foo_compare(void *l_arg, void *r_arg, void *private)
- * {
- * foo_t *l = l_arg;
- * foo_t *r = r_arg;
- *
- * if (... l greater than r ...)
- * return (1);
- * if (... l less than r ...)
- * return (-1);
- * return (0);
- * }
- *
- * ...
- * // at initialization time
- * foo_pool = uu_avl_pool_create("foo_pool",
- * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare,
- * debugging? 0 : UU_AVL_POOL_DEBUG);
- * ...
- */
-uu_avl_pool_t *uu_avl_pool_create(const char *, size_t, size_t,
- uu_compare_fn_t *, uint32_t);
-#define UU_AVL_POOL_DEBUG 0x00000001
-
-void uu_avl_pool_destroy(uu_avl_pool_t *);
-
-/*
- * usage:
- *
- * foo_t *a;
- * a = malloc(sizeof(*a));
- * uu_avl_node_init(a, &a->foo_avl, pool);
- * ...
- * uu_avl_node_fini(a, &a->foo_avl, pool);
- * free(a);
- */
-void uu_avl_node_init(void *, uu_avl_node_t *, uu_avl_pool_t *);
-void uu_avl_node_fini(void *, uu_avl_node_t *, uu_avl_pool_t *);
-
-uu_avl_t *uu_avl_create(uu_avl_pool_t *, void *_parent, uint32_t);
-#define UU_AVL_DEBUG 0x00000001
-
-void uu_avl_destroy(uu_avl_t *); /* list must be empty */
-
-size_t uu_avl_numnodes(uu_avl_t *);
-
-void *uu_avl_first(uu_avl_t *);
-void *uu_avl_last(uu_avl_t *);
-
-void *uu_avl_next(uu_avl_t *, void *);
-void *uu_avl_prev(uu_avl_t *, void *);
-
-int uu_avl_walk(uu_avl_t *, uu_walk_fn_t *, void *, uint32_t);
-
-uu_avl_walk_t *uu_avl_walk_start(uu_avl_t *, uint32_t);
-void *uu_avl_walk_next(uu_avl_walk_t *);
-void uu_avl_walk_end(uu_avl_walk_t *);
-
-void *uu_avl_find(uu_avl_t *, void *, void *, uu_avl_index_t *);
-void uu_avl_insert(uu_avl_t *, void *, uu_avl_index_t);
-
-void *uu_avl_nearest_next(uu_avl_t *, uu_avl_index_t);
-void *uu_avl_nearest_prev(uu_avl_t *, uu_avl_index_t);
-
-void *uu_avl_teardown(uu_avl_t *, void **);
-
-void uu_avl_remove(uu_avl_t *, void *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBUUTIL_H */
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_common.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_common.h
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_common.h
@@ -1,35 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _LIBUUTIL_COMMON_H
-#define _LIBUUTIL_COMMON_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <libuutil.h>
-#include <libuutil_impl.h>
-
-#endif /* _LIBUUTIL_COMMON_H */
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_impl.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_impl.h
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_impl.h
@@ -1,181 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _LIBUUTIL_IMPL_H
-#define _LIBUUTIL_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <libuutil.h>
-#include <pthread.h>
-
-#include <sys/avl_impl.h>
-#include <sys/byteorder.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void uu_set_error(uint_t);
-#pragma rarely_called(uu_set_error)
-
-/*PRINTFLIKE1*/
-void uu_panic(const char *format, ...);
-#pragma rarely_called(uu_panic)
-
-struct uu_dprintf {
- char *uud_name;
- uu_dprintf_severity_t uud_severity;
- uint_t uud_flags;
-};
-
-/*
- * For debugging purposes, libuutil keeps around linked lists of all uu_lists
- * and uu_avls, along with pointers to their parents. These can cause false
- * negatives when looking for memory leaks, so we encode the pointers by
- * storing them with swapped endianness; this is not perfect, but it's about
- * the best we can do without wasting a lot of space.
- */
-#ifdef _LP64
-#define UU_PTR_ENCODE(ptr) BSWAP_64((uintptr_t)(void *)(ptr))
-#else
-#define UU_PTR_ENCODE(ptr) BSWAP_32((uintptr_t)(void *)(ptr))
-#endif
-
-#define UU_PTR_DECODE(ptr) ((void *)UU_PTR_ENCODE(ptr))
-
-/*
- * uu_list structures
- */
-typedef struct uu_list_node_impl {
- struct uu_list_node_impl *uln_next;
- struct uu_list_node_impl *uln_prev;
-} uu_list_node_impl_t;
-
-struct uu_list_walk {
- uu_list_walk_t *ulw_next;
- uu_list_walk_t *ulw_prev;
-
- uu_list_t *ulw_list;
- int8_t ulw_dir;
- uint8_t ulw_robust;
- uu_list_node_impl_t *ulw_next_result;
-};
-
-struct uu_list {
- uintptr_t ul_next_enc;
- uintptr_t ul_prev_enc;
-
- uu_list_pool_t *ul_pool;
- uintptr_t ul_parent_enc; /* encoded parent pointer */
- size_t ul_offset;
- size_t ul_numnodes;
- uint8_t ul_debug;
- uint8_t ul_sorted;
- uint8_t ul_index; /* mark for uu_list_index_ts */
-
- uu_list_node_impl_t ul_null_node;
- uu_list_walk_t ul_null_walk; /* for robust walkers */
-};
-
-#define UU_LIST_PTR(ptr) ((uu_list_t *)UU_PTR_DECODE(ptr))
-
-#define UU_LIST_POOL_MAXNAME 64
-
-struct uu_list_pool {
- uu_list_pool_t *ulp_next;
- uu_list_pool_t *ulp_prev;
-
- char ulp_name[UU_LIST_POOL_MAXNAME];
- size_t ulp_nodeoffset;
- size_t ulp_objsize;
- uu_compare_fn_t *ulp_cmp;
- uint8_t ulp_debug;
- uint8_t ulp_last_index;
- pthread_mutex_t ulp_lock; /* protects null_list */
- uu_list_t ulp_null_list;
-};
-
-/*
- * uu_avl structures
- */
-typedef struct avl_node uu_avl_node_impl_t;
-
-struct uu_avl_walk {
- uu_avl_walk_t *uaw_next;
- uu_avl_walk_t *uaw_prev;
-
- uu_avl_t *uaw_avl;
- void *uaw_next_result;
- int8_t uaw_dir;
- uint8_t uaw_robust;
-};
-
-struct uu_avl {
- uintptr_t ua_next_enc;
- uintptr_t ua_prev_enc;
-
- uu_avl_pool_t *ua_pool;
- uintptr_t ua_parent_enc;
- uint8_t ua_debug;
- uint8_t ua_index; /* mark for uu_avl_index_ts */
-
- struct avl_tree ua_tree;
- uu_avl_walk_t ua_null_walk;
-};
-
-#define UU_AVL_PTR(x) ((uu_avl_t *)UU_PTR_DECODE(x))
-
-#define UU_AVL_POOL_MAXNAME 64
-
-struct uu_avl_pool {
- uu_avl_pool_t *uap_next;
- uu_avl_pool_t *uap_prev;
-
- char uap_name[UU_AVL_POOL_MAXNAME];
- size_t uap_nodeoffset;
- size_t uap_objsize;
- uu_compare_fn_t *uap_cmp;
- uint8_t uap_debug;
- uint8_t uap_last_index;
- pthread_mutex_t uap_lock; /* protects null_avl */
- uu_avl_t uap_null_avl;
-};
-
-/*
- * atfork() handlers
- */
-void uu_avl_lockup(void);
-void uu_avl_release(void);
-
-void uu_list_lockup(void);
-void uu_list_release(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBUUTIL_IMPL_H */
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c
@@ -1,135 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include "libuutil_common.h"
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-void *
-uu_zalloc(size_t n)
-{
- void *p = malloc(n);
-
- if (p == NULL) {
- uu_set_error(UU_ERROR_SYSTEM);
- return (NULL);
- }
-
- (void) memset(p, 0, n);
-
- return (p);
-}
-
-void
-uu_free(void *p)
-{
- free(p);
-}
-
-char *
-uu_strdup(const char *str)
-{
- char *buf = NULL;
-
- if (str != NULL) {
- size_t sz;
-
- sz = strlen(str) + 1;
- buf = uu_zalloc(sz);
- if (buf != NULL)
- (void) memcpy(buf, str, sz);
- }
- return (buf);
-}
-
-/*
- * Duplicate up to n bytes of a string. Kind of sort of like
- * strdup(strlcpy(s, n)).
- */
-char *
-uu_strndup(const char *s, size_t n)
-{
- size_t len;
- char *p;
-
- len = strnlen(s, n);
- p = uu_zalloc(len + 1);
- if (p == NULL)
- return (NULL);
-
- if (len > 0)
- (void) memcpy(p, s, len);
- p[len] = '\0';
-
- return (p);
-}
-
-/*
- * Duplicate a block of memory. Combines malloc with memcpy, much as
- * strdup combines malloc, strlen, and strcpy.
- */
-void *
-uu_memdup(const void *buf, size_t sz)
-{
- void *p;
-
- p = uu_zalloc(sz);
- if (p == NULL)
- return (NULL);
- (void) memcpy(p, buf, sz);
- return (p);
-}
-
-char *
-uu_msprintf(const char *format, ...)
-{
- va_list args;
- char attic[1];
- uint_t M, m;
- char *b;
-
- va_start(args, format);
- M = vsnprintf(attic, 1, format, args);
- va_end(args);
-
- for (;;) {
- m = M;
- if ((b = uu_zalloc(m + 1)) == NULL)
- return (NULL);
-
- va_start(args, format);
- M = vsnprintf(b, m + 1, format, args);
- va_end(args);
-
- if (M == m)
- break; /* sizes match */
-
- uu_free(b);
- }
-
- return (b);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_avl.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_avl.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_avl.c
@@ -1,570 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "libuutil_common.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/avl.h>
-
-static uu_avl_pool_t uu_null_apool = { &uu_null_apool, &uu_null_apool };
-static pthread_mutex_t uu_apool_list_lock = PTHREAD_MUTEX_INITIALIZER;
-
-/*
- * The index mark change on every insert and delete, to catch stale
- * references.
- *
- * We leave the low bit alone, since the avl code uses it.
- */
-#define INDEX_MAX (sizeof (uintptr_t) - 2)
-#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 2 : ((m) + 2) & INDEX_MAX)
-
-#define INDEX_DECODE(i) ((i) & ~INDEX_MAX)
-#define INDEX_ENCODE(p, n) (((n) & ~INDEX_MAX) | (p)->ua_index)
-#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ua_index)
-#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0)
-
-/*
- * When an element is inactive (not in a tree), we keep a marked pointer to
- * its containing pool in its first word, and a NULL pointer in its second.
- *
- * On insert, we use these to verify that it comes from the correct pool.
- */
-#define NODE_ARRAY(p, n) ((uintptr_t *)((uintptr_t)(n) + \
- (pp)->uap_nodeoffset))
-
-#define POOL_TO_MARKER(pp) (((uintptr_t)(pp) | 1))
-
-#define DEAD_MARKER 0xc4
-
-uu_avl_pool_t *
-uu_avl_pool_create(const char *name, size_t objsize, size_t nodeoffset,
- uu_compare_fn_t *compare_func, uint32_t flags)
-{
- uu_avl_pool_t *pp, *next, *prev;
-
- if (name == NULL ||
- uu_check_name(name, UU_NAME_DOMAIN) == -1 ||
- nodeoffset + sizeof (uu_avl_node_t) > objsize ||
- compare_func == NULL) {
- uu_set_error(UU_ERROR_INVALID_ARGUMENT);
- return (NULL);
- }
-
- if (flags & ~UU_AVL_POOL_DEBUG) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (NULL);
- }
-
- pp = uu_zalloc(sizeof (uu_avl_pool_t));
- if (pp == NULL) {
- uu_set_error(UU_ERROR_NO_MEMORY);
- return (NULL);
- }
-
- (void) strlcpy(pp->uap_name, name, sizeof (pp->uap_name));
- pp->uap_nodeoffset = nodeoffset;
- pp->uap_objsize = objsize;
- pp->uap_cmp = compare_func;
- if (flags & UU_AVL_POOL_DEBUG)
- pp->uap_debug = 1;
- pp->uap_last_index = 0;
-
- (void) pthread_mutex_init(&pp->uap_lock, NULL);
-
- pp->uap_null_avl.ua_next_enc = UU_PTR_ENCODE(&pp->uap_null_avl);
- pp->uap_null_avl.ua_prev_enc = UU_PTR_ENCODE(&pp->uap_null_avl);
-
- (void) pthread_mutex_lock(&uu_apool_list_lock);
- pp->uap_next = next = &uu_null_apool;
- pp->uap_prev = prev = next->uap_prev;
- next->uap_prev = pp;
- prev->uap_next = pp;
- (void) pthread_mutex_unlock(&uu_apool_list_lock);
-
- return (pp);
-}
-
-void
-uu_avl_pool_destroy(uu_avl_pool_t *pp)
-{
- if (pp->uap_debug) {
- if (pp->uap_null_avl.ua_next_enc !=
- UU_PTR_ENCODE(&pp->uap_null_avl) ||
- pp->uap_null_avl.ua_prev_enc !=
- UU_PTR_ENCODE(&pp->uap_null_avl)) {
- uu_panic("uu_avl_pool_destroy: Pool \"%.*s\" (%p) has "
- "outstanding avls, or is corrupt.\n",
- (int)sizeof (pp->uap_name), pp->uap_name,
- (void *)pp);
- }
- }
- (void) pthread_mutex_lock(&uu_apool_list_lock);
- pp->uap_next->uap_prev = pp->uap_prev;
- pp->uap_prev->uap_next = pp->uap_next;
- (void) pthread_mutex_unlock(&uu_apool_list_lock);
- (void) pthread_mutex_destroy(&pp->uap_lock);
- pp->uap_prev = NULL;
- pp->uap_next = NULL;
- uu_free(pp);
-}
-
-void
-uu_avl_node_init(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp)
-{
- uintptr_t *na = (uintptr_t *)np;
-
- if (pp->uap_debug) {
- uintptr_t offset = (uintptr_t)np - (uintptr_t)base;
- if (offset + sizeof (*np) > pp->uap_objsize) {
- uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): "
- "offset %ld doesn't fit in object (size %ld)\n",
- base, (void *)np, (void *)pp, pp->uap_name,
- (long)offset, (long)pp->uap_objsize);
- }
- if (offset != pp->uap_nodeoffset) {
- uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): "
- "offset %ld doesn't match pool's offset (%ld)\n",
- base, (void *)np, (void *)pp, pp->uap_name,
- (long)offset, (long)pp->uap_objsize);
- }
- }
-
- na[0] = POOL_TO_MARKER(pp);
- na[1] = 0;
-}
-
-void
-uu_avl_node_fini(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp)
-{
- uintptr_t *na = (uintptr_t *)np;
-
- if (pp->uap_debug) {
- if (na[0] == DEAD_MARKER && na[1] == DEAD_MARKER) {
- uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): "
- "node already finied\n",
- base, (void *)np, (void *)pp, pp->uap_name);
- }
- if (na[0] != POOL_TO_MARKER(pp) || na[1] != 0) {
- uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): "
- "node corrupt, in tree, or in different pool\n",
- base, (void *)np, (void *)pp, pp->uap_name);
- }
- }
-
- na[0] = DEAD_MARKER;
- na[1] = DEAD_MARKER;
- na[2] = DEAD_MARKER;
-}
-
-struct uu_avl_node_compare_info {
- uu_compare_fn_t *ac_compare;
- void *ac_private;
- void *ac_right;
- void *ac_found;
-};
-
-static int
-uu_avl_node_compare(const void *l, const void *r)
-{
- struct uu_avl_node_compare_info *info =
- (struct uu_avl_node_compare_info *)l;
-
- int res = info->ac_compare(r, info->ac_right, info->ac_private);
-
- if (res == 0) {
- if (info->ac_found == NULL)
- info->ac_found = (void *)r;
- return (-1);
- }
- if (res < 0)
- return (1);
- return (-1);
-}
-
-uu_avl_t *
-uu_avl_create(uu_avl_pool_t *pp, void *parent, uint32_t flags)
-{
- uu_avl_t *ap, *next, *prev;
-
- if (flags & ~UU_AVL_DEBUG) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (NULL);
- }
-
- ap = uu_zalloc(sizeof (*ap));
- if (ap == NULL) {
- uu_set_error(UU_ERROR_NO_MEMORY);
- return (NULL);
- }
-
- ap->ua_pool = pp;
- ap->ua_parent_enc = UU_PTR_ENCODE(parent);
- ap->ua_debug = pp->uap_debug || (flags & UU_AVL_DEBUG);
- ap->ua_index = (pp->uap_last_index = INDEX_NEXT(pp->uap_last_index));
-
- avl_create(&ap->ua_tree, &uu_avl_node_compare, pp->uap_objsize,
- pp->uap_nodeoffset);
-
- ap->ua_null_walk.uaw_next = &ap->ua_null_walk;
- ap->ua_null_walk.uaw_prev = &ap->ua_null_walk;
-
- (void) pthread_mutex_lock(&pp->uap_lock);
- next = &pp->uap_null_avl;
- prev = UU_PTR_DECODE(next->ua_prev_enc);
- ap->ua_next_enc = UU_PTR_ENCODE(next);
- ap->ua_prev_enc = UU_PTR_ENCODE(prev);
- next->ua_prev_enc = UU_PTR_ENCODE(ap);
- prev->ua_next_enc = UU_PTR_ENCODE(ap);
- (void) pthread_mutex_unlock(&pp->uap_lock);
-
- return (ap);
-}
-
-void
-uu_avl_destroy(uu_avl_t *ap)
-{
- uu_avl_pool_t *pp = ap->ua_pool;
-
- if (ap->ua_debug) {
- if (avl_numnodes(&ap->ua_tree) != 0) {
- uu_panic("uu_avl_destroy(%p): tree not empty\n",
- (void *)ap);
- }
- if (ap->ua_null_walk.uaw_next != &ap->ua_null_walk ||
- ap->ua_null_walk.uaw_prev != &ap->ua_null_walk) {
- uu_panic("uu_avl_destroy(%p): outstanding walkers\n",
- (void *)ap);
- }
- }
- (void) pthread_mutex_lock(&pp->uap_lock);
- UU_AVL_PTR(ap->ua_next_enc)->ua_prev_enc = ap->ua_prev_enc;
- UU_AVL_PTR(ap->ua_prev_enc)->ua_next_enc = ap->ua_next_enc;
- (void) pthread_mutex_unlock(&pp->uap_lock);
- ap->ua_prev_enc = UU_PTR_ENCODE(NULL);
- ap->ua_next_enc = UU_PTR_ENCODE(NULL);
-
- ap->ua_pool = NULL;
- avl_destroy(&ap->ua_tree);
-
- uu_free(ap);
-}
-
-size_t
-uu_avl_numnodes(uu_avl_t *ap)
-{
- return (avl_numnodes(&ap->ua_tree));
-}
-
-void *
-uu_avl_first(uu_avl_t *ap)
-{
- return (avl_first(&ap->ua_tree));
-}
-
-void *
-uu_avl_last(uu_avl_t *ap)
-{
- return (avl_last(&ap->ua_tree));
-}
-
-void *
-uu_avl_next(uu_avl_t *ap, void *node)
-{
- return (AVL_NEXT(&ap->ua_tree, node));
-}
-
-void *
-uu_avl_prev(uu_avl_t *ap, void *node)
-{
- return (AVL_PREV(&ap->ua_tree, node));
-}
-
-static void
-_avl_walk_init(uu_avl_walk_t *wp, uu_avl_t *ap, uint32_t flags)
-{
- uu_avl_walk_t *next, *prev;
-
- int robust = (flags & UU_WALK_ROBUST);
- int direction = (flags & UU_WALK_REVERSE)? -1 : 1;
-
- (void) memset(wp, 0, sizeof (*wp));
- wp->uaw_avl = ap;
- wp->uaw_robust = robust;
- wp->uaw_dir = direction;
-
- if (direction > 0)
- wp->uaw_next_result = avl_first(&ap->ua_tree);
- else
- wp->uaw_next_result = avl_last(&ap->ua_tree);
-
- if (ap->ua_debug || robust) {
- wp->uaw_next = next = &ap->ua_null_walk;
- wp->uaw_prev = prev = next->uaw_prev;
- next->uaw_prev = wp;
- prev->uaw_next = wp;
- }
-}
-
-static void *
-_avl_walk_advance(uu_avl_walk_t *wp, uu_avl_t *ap)
-{
- void *np = wp->uaw_next_result;
-
- avl_tree_t *t = &ap->ua_tree;
-
- if (np == NULL)
- return (NULL);
-
- wp->uaw_next_result = (wp->uaw_dir > 0)? AVL_NEXT(t, np) :
- AVL_PREV(t, np);
-
- return (np);
-}
-
-static void
-_avl_walk_fini(uu_avl_walk_t *wp)
-{
- if (wp->uaw_next != NULL) {
- wp->uaw_next->uaw_prev = wp->uaw_prev;
- wp->uaw_prev->uaw_next = wp->uaw_next;
- wp->uaw_next = NULL;
- wp->uaw_prev = NULL;
- }
- wp->uaw_avl = NULL;
- wp->uaw_next_result = NULL;
-}
-
-uu_avl_walk_t *
-uu_avl_walk_start(uu_avl_t *ap, uint32_t flags)
-{
- uu_avl_walk_t *wp;
-
- if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (NULL);
- }
-
- wp = uu_zalloc(sizeof (*wp));
- if (wp == NULL) {
- uu_set_error(UU_ERROR_NO_MEMORY);
- return (NULL);
- }
-
- _avl_walk_init(wp, ap, flags);
- return (wp);
-}
-
-void *
-uu_avl_walk_next(uu_avl_walk_t *wp)
-{
- return (_avl_walk_advance(wp, wp->uaw_avl));
-}
-
-void
-uu_avl_walk_end(uu_avl_walk_t *wp)
-{
- _avl_walk_fini(wp);
- uu_free(wp);
-}
-
-int
-uu_avl_walk(uu_avl_t *ap, uu_walk_fn_t *func, void *private, uint32_t flags)
-{
- void *e;
- uu_avl_walk_t my_walk;
-
- int status = UU_WALK_NEXT;
-
- if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (-1);
- }
-
- _avl_walk_init(&my_walk, ap, flags);
- while (status == UU_WALK_NEXT &&
- (e = _avl_walk_advance(&my_walk, ap)) != NULL)
- status = (*func)(e, private);
- _avl_walk_fini(&my_walk);
-
- if (status >= 0)
- return (0);
- uu_set_error(UU_ERROR_CALLBACK_FAILED);
- return (-1);
-}
-
-void
-uu_avl_remove(uu_avl_t *ap, void *elem)
-{
- uu_avl_walk_t *wp;
- uu_avl_pool_t *pp = ap->ua_pool;
- uintptr_t *na = NODE_ARRAY(pp, elem);
-
- if (ap->ua_debug) {
- /*
- * invalidate outstanding uu_avl_index_ts.
- */
- ap->ua_index = INDEX_NEXT(ap->ua_index);
- }
-
- /*
- * Robust walkers most be advanced, if we are removing the node
- * they are currently using. In debug mode, non-robust walkers
- * are also on the walker list.
- */
- for (wp = ap->ua_null_walk.uaw_next; wp != &ap->ua_null_walk;
- wp = wp->uaw_next) {
- if (wp->uaw_robust) {
- if (elem == wp->uaw_next_result)
- (void) _avl_walk_advance(wp, ap);
- } else if (wp->uaw_next_result != NULL) {
- uu_panic("uu_avl_remove(%p, %p): active non-robust "
- "walker\n", (void *)ap, elem);
- }
- }
-
- avl_remove(&ap->ua_tree, elem);
-
- na[0] = POOL_TO_MARKER(pp);
- na[1] = 0;
-}
-
-void *
-uu_avl_teardown(uu_avl_t *ap, void **cookie)
-{
- void *elem = avl_destroy_nodes(&ap->ua_tree, cookie);
-
- if (elem != NULL) {
- uu_avl_pool_t *pp = ap->ua_pool;
- uintptr_t *na = NODE_ARRAY(pp, elem);
-
- na[0] = POOL_TO_MARKER(pp);
- na[1] = 0;
- }
- return (elem);
-}
-
-void *
-uu_avl_find(uu_avl_t *ap, void *elem, void *private, uu_avl_index_t *out)
-{
- struct uu_avl_node_compare_info info;
- void *result;
-
- info.ac_compare = ap->ua_pool->uap_cmp;
- info.ac_private = private;
- info.ac_right = elem;
- info.ac_found = NULL;
-
- result = avl_find(&ap->ua_tree, &info, out);
- if (out != NULL)
- *out = INDEX_ENCODE(ap, *out);
-
- if (ap->ua_debug && result != NULL)
- uu_panic("uu_avl_find: internal error: avl_find succeeded\n");
-
- return (info.ac_found);
-}
-
-void
-uu_avl_insert(uu_avl_t *ap, void *elem, uu_avl_index_t idx)
-{
- if (ap->ua_debug) {
- uu_avl_pool_t *pp = ap->ua_pool;
- uintptr_t *na = NODE_ARRAY(pp, elem);
-
- if (na[1] != 0)
- uu_panic("uu_avl_insert(%p, %p, %p): node already "
- "in tree, or corrupt\n",
- (void *)ap, elem, (void *)idx);
- if (na[0] == 0)
- uu_panic("uu_avl_insert(%p, %p, %p): node not "
- "initialized\n",
- (void *)ap, elem, (void *)idx);
- if (na[0] != POOL_TO_MARKER(pp))
- uu_panic("uu_avl_insert(%p, %p, %p): node from "
- "other pool, or corrupt\n",
- (void *)ap, elem, (void *)idx);
-
- if (!INDEX_VALID(ap, idx))
- uu_panic("uu_avl_insert(%p, %p, %p): %s\n",
- (void *)ap, elem, (void *)idx,
- INDEX_CHECK(idx)? "outdated index" :
- "invalid index");
-
- /*
- * invalidate outstanding uu_avl_index_ts.
- */
- ap->ua_index = INDEX_NEXT(ap->ua_index);
- }
- avl_insert(&ap->ua_tree, elem, INDEX_DECODE(idx));
-}
-
-void *
-uu_avl_nearest_next(uu_avl_t *ap, uu_avl_index_t idx)
-{
- if (ap->ua_debug && !INDEX_VALID(ap, idx))
- uu_panic("uu_avl_nearest_next(%p, %p): %s\n",
- (void *)ap, (void *)idx, INDEX_CHECK(idx)?
- "outdated index" : "invalid index");
- return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_AFTER));
-}
-
-void *
-uu_avl_nearest_prev(uu_avl_t *ap, uu_avl_index_t idx)
-{
- if (ap->ua_debug && !INDEX_VALID(ap, idx))
- uu_panic("uu_avl_nearest_prev(%p, %p): %s\n",
- (void *)ap, (void *)idx, INDEX_CHECK(idx)?
- "outdated index" : "invalid index");
- return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_BEFORE));
-}
-
-/*
- * called from uu_lockup() and uu_release(), as part of our fork1()-safety.
- */
-void
-uu_avl_lockup(void)
-{
- uu_avl_pool_t *pp;
-
- (void) pthread_mutex_lock(&uu_apool_list_lock);
- for (pp = uu_null_apool.uap_next; pp != &uu_null_apool;
- pp = pp->uap_next)
- (void) pthread_mutex_lock(&pp->uap_lock);
-}
-
-void
-uu_avl_release(void)
-{
- uu_avl_pool_t *pp;
-
- for (pp = uu_null_apool.uap_next; pp != &uu_null_apool;
- pp = pp->uap_next)
- (void) pthread_mutex_unlock(&pp->uap_lock);
- (void) pthread_mutex_unlock(&uu_apool_list_lock);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_dprintf.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_dprintf.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_dprintf.c
@@ -1,128 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "libuutil_common.h"
-
-#include <errno.h>
-#include <libintl.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define FACILITY_FMT "%s (%s): "
-
-#if !defined(TEXT_DOMAIN)
-#define TEXT_DOMAIN "SYS_TEST"
-#endif
-
-static const char *
-strseverity(uu_dprintf_severity_t severity)
-{
- switch (severity) {
- case UU_DPRINTF_SILENT:
- return (dgettext(TEXT_DOMAIN, "silent"));
- case UU_DPRINTF_FATAL:
- return (dgettext(TEXT_DOMAIN, "FATAL"));
- case UU_DPRINTF_WARNING:
- return (dgettext(TEXT_DOMAIN, "WARNING"));
- case UU_DPRINTF_NOTICE:
- return (dgettext(TEXT_DOMAIN, "note"));
- case UU_DPRINTF_INFO:
- return (dgettext(TEXT_DOMAIN, "info"));
- case UU_DPRINTF_DEBUG:
- return (dgettext(TEXT_DOMAIN, "debug"));
- default:
- return (dgettext(TEXT_DOMAIN, "unspecified"));
- }
-}
-
-uu_dprintf_t *
-uu_dprintf_create(const char *name, uu_dprintf_severity_t severity,
- uint_t flags)
-{
- uu_dprintf_t *D;
-
- if (uu_check_name(name, UU_NAME_DOMAIN) == -1) {
- uu_set_error(UU_ERROR_INVALID_ARGUMENT);
- return (NULL);
- }
-
- if ((D = uu_zalloc(sizeof (uu_dprintf_t))) == NULL)
- return (NULL);
-
- if (name != NULL) {
- D->uud_name = strdup(name);
- if (D->uud_name == NULL) {
- uu_free(D);
- return (NULL);
- }
- } else {
- D->uud_name = NULL;
- }
-
- D->uud_severity = severity;
- D->uud_flags = flags;
-
- return (D);
-}
-
-/*PRINTFLIKE3*/
-void
-uu_dprintf(uu_dprintf_t *D, uu_dprintf_severity_t severity,
- const char *format, ...)
-{
- va_list alist;
-
- /* XXX Assert that severity is not UU_DPRINTF_SILENT. */
-
- if (severity > D->uud_severity)
- return;
-
- (void) fprintf(stderr, FACILITY_FMT, D->uud_name,
- strseverity(severity));
-
- va_start(alist, format);
- (void) vfprintf(stderr, format, alist);
- va_end(alist);
-}
-
-void
-uu_dprintf_destroy(uu_dprintf_t *D)
-{
- if (D->uud_name)
- free(D->uud_name);
-
- uu_free(D);
-}
-
-const char *
-uu_dprintf_getname(uu_dprintf_t *D)
-{
- return (D->uud_name);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_ident.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_ident.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_ident.c
@@ -1,122 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "libuutil_common.h"
-
-#include <string.h>
-
-/*
- * We require names of the form:
- * [provider,]identifier[/[provider,]identifier]...
- *
- * Where provider is either a stock symbol (SUNW) or a java-style reversed
- * domain name (com.sun).
- *
- * Both providers and identifiers must start with a letter, and may
- * only contain alphanumerics, dashes, and underlines. Providers
- * may also contain periods.
- *
- * Note that we do _not_ use the macros in <ctype.h>, since they are affected
- * by the current locale settings.
- */
-
-#define IS_ALPHA(c) \
- (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
-
-#define IS_DIGIT(c) \
- ((c) >= '0' && (c) <= '9')
-
-static int
-is_valid_ident(const char *s, const char *e, int allowdot)
-{
- char c;
-
- if (s >= e)
- return (0); /* name is empty */
-
- c = *s++;
- if (!IS_ALPHA(c))
- return (0); /* does not start with letter */
-
- while (s < e && (c = *s++) != 0) {
- if (IS_ALPHA(c) || IS_DIGIT(c) || c == '-' || c == '_' ||
- (allowdot && c == '.'))
- continue;
- return (0); /* invalid character */
- }
- return (1);
-}
-
-static int
-is_valid_component(const char *b, const char *e, uint_t flags)
-{
- char *sp;
-
- if (flags & UU_NAME_DOMAIN) {
- sp = strchr(b, ',');
- if (sp != NULL && sp < e) {
- if (!is_valid_ident(b, sp, 1))
- return (0);
- b = sp + 1;
- }
- }
-
- return (is_valid_ident(b, e, 0));
-}
-
-int
-uu_check_name(const char *name, uint_t flags)
-{
- const char *end = name + strlen(name);
- const char *p;
-
- if (flags & ~(UU_NAME_DOMAIN | UU_NAME_PATH)) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (-1);
- }
-
- if (!(flags & UU_NAME_PATH)) {
- if (!is_valid_component(name, end, flags))
- goto bad;
- return (0);
- }
-
- while ((p = strchr(name, '/')) != NULL) {
- if (!is_valid_component(name, p - 1, flags))
- goto bad;
- name = p + 1;
- }
- if (!is_valid_component(name, end, flags))
- goto bad;
-
- return (0);
-
-bad:
- uu_set_error(UU_ERROR_INVALID_ARGUMENT);
- return (-1);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_list.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_list.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_list.c
@@ -1,718 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "libuutil_common.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/time.h>
-
-#define ELEM_TO_NODE(lp, e) \
- ((uu_list_node_impl_t *)((uintptr_t)(e) + (lp)->ul_offset))
-
-#define NODE_TO_ELEM(lp, n) \
- ((void *)((uintptr_t)(n) - (lp)->ul_offset))
-
-/*
- * uu_list_index_ts define a location for insertion. They are simply a
- * pointer to the object after the insertion point. We store a mark
- * in the low-bits of the index, to help prevent mistakes.
- *
- * When debugging, the index mark changes on every insert and delete, to
- * catch stale references.
- */
-#define INDEX_MAX (sizeof (uintptr_t) - 1)
-#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 1 : ((m) + 1) & INDEX_MAX)
-
-#define INDEX_TO_NODE(i) ((uu_list_node_impl_t *)((i) & ~INDEX_MAX))
-#define NODE_TO_INDEX(p, n) (((uintptr_t)(n) & ~INDEX_MAX) | (p)->ul_index)
-#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ul_index)
-#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0)
-
-#define POOL_TO_MARKER(pp) ((void *)((uintptr_t)(pp) | 1))
-
-static uu_list_pool_t uu_null_lpool = { &uu_null_lpool, &uu_null_lpool };
-static pthread_mutex_t uu_lpool_list_lock = PTHREAD_MUTEX_INITIALIZER;
-
-uu_list_pool_t *
-uu_list_pool_create(const char *name, size_t objsize,
- size_t nodeoffset, uu_compare_fn_t *compare_func, uint32_t flags)
-{
- uu_list_pool_t *pp, *next, *prev;
-
- if (name == NULL ||
- uu_check_name(name, UU_NAME_DOMAIN) == -1 ||
- nodeoffset + sizeof (uu_list_node_t) > objsize) {
- uu_set_error(UU_ERROR_INVALID_ARGUMENT);
- return (NULL);
- }
-
- if (flags & ~UU_LIST_POOL_DEBUG) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (NULL);
- }
-
- pp = uu_zalloc(sizeof (uu_list_pool_t));
- if (pp == NULL) {
- uu_set_error(UU_ERROR_NO_MEMORY);
- return (NULL);
- }
-
- (void) strlcpy(pp->ulp_name, name, sizeof (pp->ulp_name));
- pp->ulp_nodeoffset = nodeoffset;
- pp->ulp_objsize = objsize;
- pp->ulp_cmp = compare_func;
- if (flags & UU_LIST_POOL_DEBUG)
- pp->ulp_debug = 1;
- pp->ulp_last_index = 0;
-
- (void) pthread_mutex_init(&pp->ulp_lock, NULL);
-
- pp->ulp_null_list.ul_next_enc = UU_PTR_ENCODE(&pp->ulp_null_list);
- pp->ulp_null_list.ul_prev_enc = UU_PTR_ENCODE(&pp->ulp_null_list);
-
- (void) pthread_mutex_lock(&uu_lpool_list_lock);
- pp->ulp_next = next = &uu_null_lpool;
- pp->ulp_prev = prev = next->ulp_prev;
- next->ulp_prev = pp;
- prev->ulp_next = pp;
- (void) pthread_mutex_unlock(&uu_lpool_list_lock);
-
- return (pp);
-}
-
-void
-uu_list_pool_destroy(uu_list_pool_t *pp)
-{
- if (pp->ulp_debug) {
- if (pp->ulp_null_list.ul_next_enc !=
- UU_PTR_ENCODE(&pp->ulp_null_list) ||
- pp->ulp_null_list.ul_prev_enc !=
- UU_PTR_ENCODE(&pp->ulp_null_list)) {
- uu_panic("uu_list_pool_destroy: Pool \"%.*s\" (%p) has "
- "outstanding lists, or is corrupt.\n",
- (int)sizeof (pp->ulp_name), pp->ulp_name,
- (void *)pp);
- }
- }
- (void) pthread_mutex_lock(&uu_lpool_list_lock);
- pp->ulp_next->ulp_prev = pp->ulp_prev;
- pp->ulp_prev->ulp_next = pp->ulp_next;
- (void) pthread_mutex_unlock(&uu_lpool_list_lock);
- pp->ulp_prev = NULL;
- pp->ulp_next = NULL;
- uu_free(pp);
-}
-
-void
-uu_list_node_init(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp)
-{
- uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg;
-
- if (pp->ulp_debug) {
- uintptr_t offset = (uintptr_t)np - (uintptr_t)base;
- if (offset + sizeof (*np) > pp->ulp_objsize) {
- uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): "
- "offset %ld doesn't fit in object (size %ld)\n",
- base, (void *)np, (void *)pp, pp->ulp_name,
- (long)offset, (long)pp->ulp_objsize);
- }
- if (offset != pp->ulp_nodeoffset) {
- uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): "
- "offset %ld doesn't match pool's offset (%ld)\n",
- base, (void *)np, (void *)pp, pp->ulp_name,
- (long)offset, (long)pp->ulp_objsize);
- }
- }
- np->uln_next = POOL_TO_MARKER(pp);
- np->uln_prev = NULL;
-}
-
-void
-uu_list_node_fini(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp)
-{
- uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg;
-
- if (pp->ulp_debug) {
- if (np->uln_next == NULL &&
- np->uln_prev == NULL) {
- uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): "
- "node already finied\n",
- base, (void *)np_arg, (void *)pp, pp->ulp_name);
- }
- if (np->uln_next != POOL_TO_MARKER(pp) ||
- np->uln_prev != NULL) {
- uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): "
- "node corrupt or on list\n",
- base, (void *)np_arg, (void *)pp, pp->ulp_name);
- }
- }
- np->uln_next = NULL;
- np->uln_prev = NULL;
-}
-
-uu_list_t *
-uu_list_create(uu_list_pool_t *pp, void *parent, uint32_t flags)
-{
- uu_list_t *lp, *next, *prev;
-
- if (flags & ~(UU_LIST_DEBUG | UU_LIST_SORTED)) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (NULL);
- }
-
- if ((flags & UU_LIST_SORTED) && pp->ulp_cmp == NULL) {
- if (pp->ulp_debug)
- uu_panic("uu_list_create(%p, ...): requested "
- "UU_LIST_SORTED, but pool has no comparison func\n",
- (void *)pp);
- uu_set_error(UU_ERROR_NOT_SUPPORTED);
- return (NULL);
- }
-
- lp = uu_zalloc(sizeof (*lp));
- if (lp == NULL) {
- uu_set_error(UU_ERROR_NO_MEMORY);
- return (NULL);
- }
-
- lp->ul_pool = pp;
- lp->ul_parent_enc = UU_PTR_ENCODE(parent);
- lp->ul_offset = pp->ulp_nodeoffset;
- lp->ul_debug = pp->ulp_debug || (flags & UU_LIST_DEBUG);
- lp->ul_sorted = (flags & UU_LIST_SORTED);
- lp->ul_numnodes = 0;
- lp->ul_index = (pp->ulp_last_index = INDEX_NEXT(pp->ulp_last_index));
-
- lp->ul_null_node.uln_next = &lp->ul_null_node;
- lp->ul_null_node.uln_prev = &lp->ul_null_node;
-
- lp->ul_null_walk.ulw_next = &lp->ul_null_walk;
- lp->ul_null_walk.ulw_prev = &lp->ul_null_walk;
-
- (void) pthread_mutex_lock(&pp->ulp_lock);
- next = &pp->ulp_null_list;
- prev = UU_PTR_DECODE(next->ul_prev_enc);
- lp->ul_next_enc = UU_PTR_ENCODE(next);
- lp->ul_prev_enc = UU_PTR_ENCODE(prev);
- next->ul_prev_enc = UU_PTR_ENCODE(lp);
- prev->ul_next_enc = UU_PTR_ENCODE(lp);
- (void) pthread_mutex_unlock(&pp->ulp_lock);
-
- return (lp);
-}
-
-void
-uu_list_destroy(uu_list_t *lp)
-{
- uu_list_pool_t *pp = lp->ul_pool;
-
- if (lp->ul_debug) {
- if (lp->ul_null_node.uln_next != &lp->ul_null_node ||
- lp->ul_null_node.uln_prev != &lp->ul_null_node) {
- uu_panic("uu_list_destroy(%p): list not empty\n",
- (void *)lp);
- }
- if (lp->ul_numnodes != 0) {
- uu_panic("uu_list_destroy(%p): numnodes is nonzero, "
- "but list is empty\n", (void *)lp);
- }
- if (lp->ul_null_walk.ulw_next != &lp->ul_null_walk ||
- lp->ul_null_walk.ulw_prev != &lp->ul_null_walk) {
- uu_panic("uu_list_destroy(%p): outstanding walkers\n",
- (void *)lp);
- }
- }
-
- (void) pthread_mutex_lock(&pp->ulp_lock);
- UU_LIST_PTR(lp->ul_next_enc)->ul_prev_enc = lp->ul_prev_enc;
- UU_LIST_PTR(lp->ul_prev_enc)->ul_next_enc = lp->ul_next_enc;
- (void) pthread_mutex_unlock(&pp->ulp_lock);
- lp->ul_prev_enc = UU_PTR_ENCODE(NULL);
- lp->ul_next_enc = UU_PTR_ENCODE(NULL);
- lp->ul_pool = NULL;
- uu_free(lp);
-}
-
-static void
-list_insert(uu_list_t *lp, uu_list_node_impl_t *np, uu_list_node_impl_t *prev,
- uu_list_node_impl_t *next)
-{
- if (lp->ul_debug) {
- if (next->uln_prev != prev || prev->uln_next != next)
- uu_panic("insert(%p): internal error: %p and %p not "
- "neighbors\n", (void *)lp, (void *)next,
- (void *)prev);
-
- if (np->uln_next != POOL_TO_MARKER(lp->ul_pool) ||
- np->uln_prev != NULL) {
- uu_panic("insert(%p): elem %p node %p corrupt, "
- "not initialized, or already in a list.\n",
- (void *)lp, NODE_TO_ELEM(lp, np), (void *)np);
- }
- /*
- * invalidate outstanding uu_list_index_ts.
- */
- lp->ul_index = INDEX_NEXT(lp->ul_index);
- }
- np->uln_next = next;
- np->uln_prev = prev;
- next->uln_prev = np;
- prev->uln_next = np;
-
- lp->ul_numnodes++;
-}
-
-void
-uu_list_insert(uu_list_t *lp, void *elem, uu_list_index_t idx)
-{
- uu_list_node_impl_t *np;
-
- np = INDEX_TO_NODE(idx);
- if (np == NULL)
- np = &lp->ul_null_node;
-
- if (lp->ul_debug) {
- if (!INDEX_VALID(lp, idx))
- uu_panic("uu_list_insert(%p, %p, %p): %s\n",
- (void *)lp, elem, (void *)idx,
- INDEX_CHECK(idx)? "outdated index" :
- "invalid index");
- if (np->uln_prev == NULL)
- uu_panic("uu_list_insert(%p, %p, %p): out-of-date "
- "index\n", (void *)lp, elem, (void *)idx);
- }
-
- list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np);
-}
-
-void *
-uu_list_find(uu_list_t *lp, void *elem, void *private, uu_list_index_t *out)
-{
- int sorted = lp->ul_sorted;
- uu_compare_fn_t *func = lp->ul_pool->ulp_cmp;
- uu_list_node_impl_t *np;
-
- if (func == NULL) {
- if (out != NULL)
- *out = 0;
- uu_set_error(UU_ERROR_NOT_SUPPORTED);
- return (NULL);
- }
- for (np = lp->ul_null_node.uln_next; np != &lp->ul_null_node;
- np = np->uln_next) {
- void *ep = NODE_TO_ELEM(lp, np);
- int cmp = func(ep, elem, private);
- if (cmp == 0) {
- if (out != NULL)
- *out = NODE_TO_INDEX(lp, np);
- return (ep);
- }
- if (sorted && cmp > 0) {
- if (out != NULL)
- *out = NODE_TO_INDEX(lp, np);
- return (NULL);
- }
- }
- if (out != NULL)
- *out = NODE_TO_INDEX(lp, 0);
- return (NULL);
-}
-
-void *
-uu_list_nearest_next(uu_list_t *lp, uu_list_index_t idx)
-{
- uu_list_node_impl_t *np = INDEX_TO_NODE(idx);
-
- if (np == NULL)
- np = &lp->ul_null_node;
-
- if (lp->ul_debug) {
- if (!INDEX_VALID(lp, idx))
- uu_panic("uu_list_nearest_next(%p, %p): %s\n",
- (void *)lp, (void *)idx,
- INDEX_CHECK(idx)? "outdated index" :
- "invalid index");
- if (np->uln_prev == NULL)
- uu_panic("uu_list_nearest_next(%p, %p): out-of-date "
- "index\n", (void *)lp, (void *)idx);
- }
-
- if (np == &lp->ul_null_node)
- return (NULL);
- else
- return (NODE_TO_ELEM(lp, np));
-}
-
-void *
-uu_list_nearest_prev(uu_list_t *lp, uu_list_index_t idx)
-{
- uu_list_node_impl_t *np = INDEX_TO_NODE(idx);
-
- if (np == NULL)
- np = &lp->ul_null_node;
-
- if (lp->ul_debug) {
- if (!INDEX_VALID(lp, idx))
- uu_panic("uu_list_nearest_prev(%p, %p): %s\n",
- (void *)lp, (void *)idx, INDEX_CHECK(idx)?
- "outdated index" : "invalid index");
- if (np->uln_prev == NULL)
- uu_panic("uu_list_nearest_prev(%p, %p): out-of-date "
- "index\n", (void *)lp, (void *)idx);
- }
-
- if ((np = np->uln_prev) == &lp->ul_null_node)
- return (NULL);
- else
- return (NODE_TO_ELEM(lp, np));
-}
-
-static void
-list_walk_init(uu_list_walk_t *wp, uu_list_t *lp, uint32_t flags)
-{
- uu_list_walk_t *next, *prev;
-
- int robust = (flags & UU_WALK_ROBUST);
- int direction = (flags & UU_WALK_REVERSE)? -1 : 1;
-
- (void) memset(wp, 0, sizeof (*wp));
- wp->ulw_list = lp;
- wp->ulw_robust = robust;
- wp->ulw_dir = direction;
- if (direction > 0)
- wp->ulw_next_result = lp->ul_null_node.uln_next;
- else
- wp->ulw_next_result = lp->ul_null_node.uln_prev;
-
- if (lp->ul_debug || robust) {
- /*
- * Add this walker to the list's list of walkers so
- * uu_list_remove() can advance us if somebody tries to
- * remove ulw_next_result.
- */
- wp->ulw_next = next = &lp->ul_null_walk;
- wp->ulw_prev = prev = next->ulw_prev;
- next->ulw_prev = wp;
- prev->ulw_next = wp;
- }
-}
-
-static uu_list_node_impl_t *
-list_walk_advance(uu_list_walk_t *wp, uu_list_t *lp)
-{
- uu_list_node_impl_t *np = wp->ulw_next_result;
- uu_list_node_impl_t *next;
-
- if (np == &lp->ul_null_node)
- return (NULL);
-
- next = (wp->ulw_dir > 0)? np->uln_next : np->uln_prev;
-
- wp->ulw_next_result = next;
- return (np);
-}
-
-static void
-list_walk_fini(uu_list_walk_t *wp)
-{
- /* GLXXX debugging? */
- if (wp->ulw_next != NULL) {
- wp->ulw_next->ulw_prev = wp->ulw_prev;
- wp->ulw_prev->ulw_next = wp->ulw_next;
- wp->ulw_next = NULL;
- wp->ulw_prev = NULL;
- }
- wp->ulw_list = NULL;
- wp->ulw_next_result = NULL;
-}
-
-uu_list_walk_t *
-uu_list_walk_start(uu_list_t *lp, uint32_t flags)
-{
- uu_list_walk_t *wp;
-
- if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (NULL);
- }
-
- wp = uu_zalloc(sizeof (*wp));
- if (wp == NULL) {
- uu_set_error(UU_ERROR_NO_MEMORY);
- return (NULL);
- }
-
- list_walk_init(wp, lp, flags);
- return (wp);
-}
-
-void *
-uu_list_walk_next(uu_list_walk_t *wp)
-{
- uu_list_t *lp = wp->ulw_list;
- uu_list_node_impl_t *np = list_walk_advance(wp, lp);
-
- if (np == NULL)
- return (NULL);
-
- return (NODE_TO_ELEM(lp, np));
-}
-
-void
-uu_list_walk_end(uu_list_walk_t *wp)
-{
- list_walk_fini(wp);
- uu_free(wp);
-}
-
-int
-uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags)
-{
- uu_list_node_impl_t *np;
-
- int status = UU_WALK_NEXT;
-
- int robust = (flags & UU_WALK_ROBUST);
- int reverse = (flags & UU_WALK_REVERSE);
-
- if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) {
- uu_set_error(UU_ERROR_UNKNOWN_FLAG);
- return (-1);
- }
-
- if (lp->ul_debug || robust) {
- uu_list_walk_t my_walk;
- void *e;
-
- list_walk_init(&my_walk, lp, flags);
- while (status == UU_WALK_NEXT &&
- (e = uu_list_walk_next(&my_walk)) != NULL)
- status = (*func)(e, private);
- list_walk_fini(&my_walk);
- } else {
- if (!reverse) {
- for (np = lp->ul_null_node.uln_next;
- status == UU_WALK_NEXT && np != &lp->ul_null_node;
- np = np->uln_next) {
- status = (*func)(NODE_TO_ELEM(lp, np), private);
- }
- } else {
- for (np = lp->ul_null_node.uln_prev;
- status == UU_WALK_NEXT && np != &lp->ul_null_node;
- np = np->uln_prev) {
- status = (*func)(NODE_TO_ELEM(lp, np), private);
- }
- }
- }
- if (status >= 0)
- return (0);
- uu_set_error(UU_ERROR_CALLBACK_FAILED);
- return (-1);
-}
-
-void
-uu_list_remove(uu_list_t *lp, void *elem)
-{
- uu_list_node_impl_t *np = ELEM_TO_NODE(lp, elem);
- uu_list_walk_t *wp;
-
- if (lp->ul_debug) {
- if (np->uln_prev == NULL)
- uu_panic("uu_list_remove(%p, %p): elem not on list\n",
- (void *)lp, elem);
- /*
- * invalidate outstanding uu_list_index_ts.
- */
- lp->ul_index = INDEX_NEXT(lp->ul_index);
- }
-
- /*
- * robust walkers must be advanced. In debug mode, non-robust
- * walkers are also on the list. If there are any, it's an error.
- */
- for (wp = lp->ul_null_walk.ulw_next; wp != &lp->ul_null_walk;
- wp = wp->ulw_next) {
- if (wp->ulw_robust) {
- if (np == wp->ulw_next_result)
- (void) list_walk_advance(wp, lp);
- } else if (wp->ulw_next_result != NULL) {
- uu_panic("uu_list_remove(%p, %p): active non-robust "
- "walker\n", (void *)lp, elem);
- }
- }
-
- np->uln_next->uln_prev = np->uln_prev;
- np->uln_prev->uln_next = np->uln_next;
-
- lp->ul_numnodes--;
-
- np->uln_next = POOL_TO_MARKER(lp->ul_pool);
- np->uln_prev = NULL;
-}
-
-void *
-uu_list_teardown(uu_list_t *lp, void **cookie)
-{
- void *ep;
-
- /*
- * XXX: disable list modification until list is empty
- */
- if (lp->ul_debug && *cookie != NULL)
- uu_panic("uu_list_teardown(%p, %p): unexpected cookie\n",
- (void *)lp, (void *)cookie);
-
- ep = uu_list_first(lp);
- if (ep)
- uu_list_remove(lp, ep);
- return (ep);
-}
-
-int
-uu_list_insert_before(uu_list_t *lp, void *target, void *elem)
-{
- uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target);
-
- if (target == NULL)
- np = &lp->ul_null_node;
-
- if (lp->ul_debug) {
- if (np->uln_prev == NULL)
- uu_panic("uu_list_insert_before(%p, %p, %p): %p is "
- "not currently on a list\n",
- (void *)lp, target, elem, target);
- }
- if (lp->ul_sorted) {
- if (lp->ul_debug)
- uu_panic("uu_list_insert_before(%p, ...): list is "
- "UU_LIST_SORTED\n", (void *)lp);
- uu_set_error(UU_ERROR_NOT_SUPPORTED);
- return (-1);
- }
-
- list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np);
- return (0);
-}
-
-int
-uu_list_insert_after(uu_list_t *lp, void *target, void *elem)
-{
- uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target);
-
- if (target == NULL)
- np = &lp->ul_null_node;
-
- if (lp->ul_debug) {
- if (np->uln_prev == NULL)
- uu_panic("uu_list_insert_after(%p, %p, %p): %p is "
- "not currently on a list\n",
- (void *)lp, target, elem, target);
- }
- if (lp->ul_sorted) {
- if (lp->ul_debug)
- uu_panic("uu_list_insert_after(%p, ...): list is "
- "UU_LIST_SORTED\n", (void *)lp);
- uu_set_error(UU_ERROR_NOT_SUPPORTED);
- return (-1);
- }
-
- list_insert(lp, ELEM_TO_NODE(lp, elem), np, np->uln_next);
- return (0);
-}
-
-size_t
-uu_list_numnodes(uu_list_t *lp)
-{
- return (lp->ul_numnodes);
-}
-
-void *
-uu_list_first(uu_list_t *lp)
-{
- uu_list_node_impl_t *n = lp->ul_null_node.uln_next;
- if (n == &lp->ul_null_node)
- return (NULL);
- return (NODE_TO_ELEM(lp, n));
-}
-
-void *
-uu_list_last(uu_list_t *lp)
-{
- uu_list_node_impl_t *n = lp->ul_null_node.uln_prev;
- if (n == &lp->ul_null_node)
- return (NULL);
- return (NODE_TO_ELEM(lp, n));
-}
-
-void *
-uu_list_next(uu_list_t *lp, void *elem)
-{
- uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem);
-
- n = n->uln_next;
- if (n == &lp->ul_null_node)
- return (NULL);
- return (NODE_TO_ELEM(lp, n));
-}
-
-void *
-uu_list_prev(uu_list_t *lp, void *elem)
-{
- uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem);
-
- n = n->uln_prev;
- if (n == &lp->ul_null_node)
- return (NULL);
- return (NODE_TO_ELEM(lp, n));
-}
-
-/*
- * called from uu_lockup() and uu_release(), as part of our fork1()-safety.
- */
-void
-uu_list_lockup(void)
-{
- uu_list_pool_t *pp;
-
- (void) pthread_mutex_lock(&uu_lpool_list_lock);
- for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool;
- pp = pp->ulp_next)
- (void) pthread_mutex_lock(&pp->ulp_lock);
-}
-
-void
-uu_list_release(void)
-{
- uu_list_pool_t *pp;
-
- for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool;
- pp = pp->ulp_next)
- (void) pthread_mutex_unlock(&pp->ulp_lock);
- (void) pthread_mutex_unlock(&uu_lpool_list_lock);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c
@@ -1,277 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include "libuutil_common.h"
-
-#define HAVE_ASSFAIL 1
-
-#include <assert.h>
-#include <errno.h>
-#include <libintl.h>
-#include <pthread.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/debug.h>
-#include <thread.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#if !defined(TEXT_DOMAIN)
-#define TEXT_DOMAIN "SYS_TEST"
-#endif
-
-/*
- * All of the old code under !defined(PTHREAD_ONCE_KEY_NP)
- * is here to enable the building of a native version of
- * libuutil.so when the build machine has not yet been upgraded
- * to a version of libc that provides pthread_key_create_once_np().
- * It should all be deleted when solaris_nevada ships.
- * The code is not MT-safe in a relaxed memory model.
- */
-
-#if defined(PTHREAD_ONCE_KEY_NP)
-static pthread_key_t uu_error_key = PTHREAD_ONCE_KEY_NP;
-#else /* PTHREAD_ONCE_KEY_NP */
-static pthread_key_t uu_error_key = 0;
-static pthread_mutex_t uu_key_lock = PTHREAD_MUTEX_INITIALIZER;
-#endif /* PTHREAD_ONCE_KEY_NP */
-
-static int uu_error_key_setup = 0;
-
-static pthread_mutex_t uu_panic_lock = PTHREAD_MUTEX_INITIALIZER;
-/* LINTED static unused */
-static const char *uu_panic_format;
-/* LINTED static unused */
-static va_list uu_panic_args;
-static pthread_t uu_panic_thread;
-
-static uint32_t _uu_main_error;
-
-void
-uu_set_error(uint_t code)
-{
-
-#if defined(PTHREAD_ONCE_KEY_NP)
- if (pthread_key_create_once_np(&uu_error_key, NULL) != 0)
- uu_error_key_setup = -1;
- else
- uu_error_key_setup = 1;
-#else /* PTHREAD_ONCE_KEY_NP */
- if (uu_error_key_setup == 0) {
- (void) pthread_mutex_lock(&uu_key_lock);
- if (uu_error_key_setup == 0) {
- if (pthread_key_create(&uu_error_key, NULL) != 0)
- uu_error_key_setup = -1;
- else
- uu_error_key_setup = 1;
- }
- (void) pthread_mutex_unlock(&uu_key_lock);
- }
-#endif /* PTHREAD_ONCE_KEY_NP */
- if (uu_error_key_setup > 0)
- (void) pthread_setspecific(uu_error_key,
- (void *)(uintptr_t)code);
-}
-
-uint32_t
-uu_error(void)
-{
-
- if (uu_error_key_setup < 0) /* can't happen? */
- return (UU_ERROR_UNKNOWN);
-
- /*
- * Because UU_ERROR_NONE == 0, if uu_set_error() was
- * never called, then this will return UU_ERROR_NONE:
- */
- return ((uint32_t)(uintptr_t)pthread_getspecific(uu_error_key));
-}
-
-const char *
-uu_strerror(uint32_t code)
-{
- const char *str;
-
- switch (code) {
- case UU_ERROR_NONE:
- str = dgettext(TEXT_DOMAIN, "No error");
- break;
-
- case UU_ERROR_INVALID_ARGUMENT:
- str = dgettext(TEXT_DOMAIN, "Invalid argument");
- break;
-
- case UU_ERROR_UNKNOWN_FLAG:
- str = dgettext(TEXT_DOMAIN, "Unknown flag passed");
- break;
-
- case UU_ERROR_NO_MEMORY:
- str = dgettext(TEXT_DOMAIN, "Out of memory");
- break;
-
- case UU_ERROR_CALLBACK_FAILED:
- str = dgettext(TEXT_DOMAIN, "Callback-initiated failure");
- break;
-
- case UU_ERROR_NOT_SUPPORTED:
- str = dgettext(TEXT_DOMAIN, "Operation not supported");
- break;
-
- case UU_ERROR_EMPTY:
- str = dgettext(TEXT_DOMAIN, "No value provided");
- break;
-
- case UU_ERROR_UNDERFLOW:
- str = dgettext(TEXT_DOMAIN, "Value too small");
- break;
-
- case UU_ERROR_OVERFLOW:
- str = dgettext(TEXT_DOMAIN, "Value too large");
- break;
-
- case UU_ERROR_INVALID_CHAR:
- str = dgettext(TEXT_DOMAIN,
- "Value contains unexpected character");
- break;
-
- case UU_ERROR_INVALID_DIGIT:
- str = dgettext(TEXT_DOMAIN,
- "Value contains digit not in base");
- break;
-
- case UU_ERROR_SYSTEM:
- str = dgettext(TEXT_DOMAIN, "Underlying system error");
- break;
-
- case UU_ERROR_UNKNOWN:
- str = dgettext(TEXT_DOMAIN, "Error status not known");
- break;
-
- default:
- errno = ESRCH;
- str = NULL;
- break;
- }
- return (str);
-}
-
-void
-uu_panic(const char *format, ...)
-{
- va_list args;
-
- va_start(args, format);
-
- (void) pthread_mutex_lock(&uu_panic_lock);
- if (uu_panic_thread == 0) {
- uu_panic_thread = pthread_self();
- uu_panic_format = format;
- va_copy(uu_panic_args, args);
- }
- (void) pthread_mutex_unlock(&uu_panic_lock);
-
- (void) vfprintf(stderr, format, args);
-
- if (uu_panic_thread == pthread_self())
- abort();
- else
- for (;;)
- (void) pause();
-}
-
-int
-assfail(const char *astring, const char *file, int line)
-{
- __assert(astring, file, line);
- /*NOTREACHED*/
- return (0);
-}
-
-static void
-uu_lockup(void)
-{
- (void) pthread_mutex_lock(&uu_panic_lock);
-#if !defined(PTHREAD_ONCE_KEY_NP)
- (void) pthread_mutex_lock(&uu_key_lock);
-#endif
- uu_avl_lockup();
- uu_list_lockup();
-}
-
-static void
-uu_release(void)
-{
- (void) pthread_mutex_unlock(&uu_panic_lock);
-#if !defined(PTHREAD_ONCE_KEY_NP)
- (void) pthread_mutex_unlock(&uu_key_lock);
-#endif
- uu_avl_release();
- uu_list_release();
-}
-
-static void
-uu_release_child(void)
-{
- uu_panic_format = NULL;
- uu_panic_thread = 0;
-
- uu_release();
-}
-
-#pragma init(uu_init)
-static void
-uu_init(void)
-{
- (void) pthread_atfork(uu_lockup, uu_release, uu_release_child);
-}
-
-/*
- * Dump a block of memory in hex+ascii, for debugging
- */
-void
-uu_dump(FILE *out, const char *prefix, const void *buf, size_t len)
-{
- const unsigned char *p = buf;
- int i;
-
- for (i = 0; i < len; i += 16) {
- int j;
-
- (void) fprintf(out, "%s", prefix);
- for (j = 0; j < 16 && i + j < len; j++) {
- (void) fprintf(out, "%2.2x ", p[i + j]);
- }
- for (; j < 16; j++) {
- (void) fprintf(out, " ");
- }
- for (j = 0; j < 16 && i + j < len; j++) {
- (void) fprintf(out, "%c",
- isprint(p[i + j]) ? p[i + j] : '.');
- }
- (void) fprintf(out, "\n");
- }
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_open.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_open.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_open.c
@@ -1,70 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "libuutil_common.h"
-
-#include <sys/time.h>
-
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#ifdef _LP64
-#define TMPPATHFMT "%s/uu%ld"
-#else /* _LP64 */
-#define TMPPATHFMT "%s/uu%lld"
-#endif /* _LP64 */
-
-/*ARGSUSED*/
-int
-uu_open_tmp(const char *dir, uint_t uflags)
-{
- int f;
- char *fname = uu_zalloc(PATH_MAX);
-
- if (fname == NULL)
- return (-1);
-
- for (;;) {
- (void) snprintf(fname, PATH_MAX, "%s/uu%lld", dir, gethrtime());
-
- f = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600);
-
- if (f >= 0 || errno != EEXIST)
- break;
- }
-
- if (f >= 0)
- (void) unlink(fname);
-
- uu_free(fname);
-
- return (f);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_pname.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_pname.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_pname.c
@@ -1,205 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "libuutil_common.h"
-
-#include <libintl.h>
-#include <limits.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <errno.h>
-#include <wchar.h>
-#include <unistd.h>
-
-static const char PNAME_FMT[] = "%s: ";
-static const char ERRNO_FMT[] = ": %s\n";
-
-static const char *pname;
-
-static void
-uu_die_internal(int status, const char *format, va_list alist) __NORETURN;
-
-int uu_exit_ok_value = EXIT_SUCCESS;
-int uu_exit_fatal_value = EXIT_FAILURE;
-int uu_exit_usage_value = 2;
-
-int *
-uu_exit_ok(void)
-{
- return (&uu_exit_ok_value);
-}
-
-int *
-uu_exit_fatal(void)
-{
- return (&uu_exit_fatal_value);
-}
-
-int *
-uu_exit_usage(void)
-{
- return (&uu_exit_usage_value);
-}
-
-void
-uu_alt_exit(int profile)
-{
- switch (profile) {
- case UU_PROFILE_DEFAULT:
- uu_exit_ok_value = EXIT_SUCCESS;
- uu_exit_fatal_value = EXIT_FAILURE;
- uu_exit_usage_value = 2;
- break;
- case UU_PROFILE_LAUNCHER:
- uu_exit_ok_value = EXIT_SUCCESS;
- uu_exit_fatal_value = 124;
- uu_exit_usage_value = 125;
- break;
- }
-}
-
-static void
-uu_warn_internal(int err, const char *format, va_list alist)
-{
- if (pname != NULL)
- (void) fprintf(stderr, PNAME_FMT, pname);
-
- (void) vfprintf(stderr, format, alist);
-
- if (strrchr(format, '\n') == NULL)
- (void) fprintf(stderr, ERRNO_FMT, strerror(err));
-}
-
-void
-uu_vwarn(const char *format, va_list alist)
-{
- uu_warn_internal(errno, format, alist);
-}
-
-/*PRINTFLIKE1*/
-void
-uu_warn(const char *format, ...)
-{
- va_list alist;
- va_start(alist, format);
- uu_warn_internal(errno, format, alist);
- va_end(alist);
-}
-
-static void
-uu_die_internal(int status, const char *format, va_list alist)
-{
- uu_warn_internal(errno, format, alist);
-#ifdef DEBUG
- {
- char *cp;
-
- if (!issetugid()) {
- cp = getenv("UU_DIE_ABORTS");
- if (cp != NULL && *cp != '\0')
- abort();
- }
- }
-#endif
- exit(status);
-}
-
-void
-uu_vdie(const char *format, va_list alist)
-{
- uu_die_internal(UU_EXIT_FATAL, format, alist);
-}
-
-/*PRINTFLIKE1*/
-void
-uu_die(const char *format, ...)
-{
- va_list alist;
- va_start(alist, format);
- uu_die_internal(UU_EXIT_FATAL, format, alist);
- va_end(alist);
-}
-
-void
-uu_vxdie(int status, const char *format, va_list alist)
-{
- uu_die_internal(status, format, alist);
-}
-
-/*PRINTFLIKE2*/
-void
-uu_xdie(int status, const char *format, ...)
-{
- va_list alist;
- va_start(alist, format);
- uu_die_internal(status, format, alist);
- va_end(alist);
-}
-
-const char *
-uu_setpname(char *arg0)
-{
- /*
- * Having a NULL argv[0], while uncommon, is possible. It
- * makes more sense to handle this event in uu_setpname rather
- * than in each of its consumers.
- */
- if (arg0 == NULL) {
- pname = "unknown_command";
- return (pname);
- }
-
- /*
- * Guard against '/' at end of command invocation.
- */
- for (;;) {
- char *p = strrchr(arg0, '/');
- if (p == NULL) {
- pname = arg0;
- break;
- } else {
- if (*(p + 1) == '\0') {
- *p = '\0';
- continue;
- }
-
- pname = p + 1;
- break;
- }
- }
-
- return (pname);
-}
-
-const char *
-uu_getpname(void)
-{
- return (pname);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c
@@ -1,56 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-/*
- * String helper functions
- */
-
-#include <string.h>
-#include <sys/types.h>
-#include <stdio.h>
-#include <malloc.h>
-#include <ctype.h>
-#include "libuutil.h"
-
-/* Return true if strings are equal */
-boolean_t
-uu_streq(const char *a, const char *b)
-{
- return (strcmp(a, b) == 0);
-}
-
-/* Return true if strings are equal, case-insensitively */
-boolean_t
-uu_strcaseeq(const char *a, const char *b)
-{
- return (strcasecmp(a, b) == 0);
-}
-
-/* Return true if string a Begins With string b */
-boolean_t
-uu_strbw(const char *a, const char *b)
-{
- return (strncmp(a, b, strlen(b)) == 0);
-}
Index: head/cddl/contrib/opensolaris/lib/libuutil/common/uu_strtoint.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libuutil/common/uu_strtoint.c
+++ head/cddl/contrib/opensolaris/lib/libuutil/common/uu_strtoint.c
@@ -1,300 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "libuutil_common.h"
-
-#include <limits.h>
-#include <ctype.h>
-
-#define MAX_BASE 36
-
-#define IS_DIGIT(x) ((x) >= '0' && (x) <= '9')
-
-#define CTOI(x) (((x) >= '0' && (x) <= '9') ? (x) - '0' : \
- ((x) >= 'a' && (x) <= 'z') ? (x) + 10 - 'a' : (x) + 10 - 'A')
-
-static int
-strtoint(const char *s_arg, uint64_t *out, uint32_t base, int sign)
-{
- const unsigned char *s = (const unsigned char *)s_arg;
-
- uint64_t val = 0;
- uint64_t multmax;
-
- unsigned c, i;
-
- int neg = 0;
-
- int bad_digit = 0;
- int bad_char = 0;
- int overflow = 0;
-
- if (s == NULL || base == 1 || base > MAX_BASE) {
- uu_set_error(UU_ERROR_INVALID_ARGUMENT);
- return (-1);
- }
-
- while ((c = *s) != 0 && isspace(c))
- s++;
-
- switch (c) {
- case '-':
- if (!sign)
- overflow = 1; /* becomes underflow below */
- neg = 1;
- /*FALLTHRU*/
- case '+':
- c = *++s;
- break;
- default:
- break;
- }
-
- if (c == '\0') {
- uu_set_error(UU_ERROR_EMPTY);
- return (-1);
- }
-
- if (base == 0) {
- if (c != '0')
- base = 10;
- else if (s[1] == 'x' || s[1] == 'X')
- base = 16;
- else
- base = 8;
- }
-
- if (base == 16 && c == '0' && (s[1] == 'x' || s[1] == 'X'))
- c = *(s += 2);
-
- if ((val = CTOI(c)) >= base) {
- if (IS_DIGIT(c))
- bad_digit = 1;
- else
- bad_char = 1;
- val = 0;
- }
-
- multmax = (uint64_t)UINT64_MAX / (uint64_t)base;
-
- for (c = *++s; c != '\0'; c = *++s) {
- if ((i = CTOI(c)) >= base) {
- if (isspace(c))
- break;
- if (IS_DIGIT(c))
- bad_digit = 1;
- else
- bad_char = 1;
- i = 0;
- }
-
- if (val > multmax)
- overflow = 1;
-
- val *= base;
- if ((uint64_t)UINT64_MAX - val < (uint64_t)i)
- overflow = 1;
-
- val += i;
- }
-
- while ((c = *s) != 0) {
- if (!isspace(c))
- bad_char = 1;
- s++;
- }
-
- if (sign) {
- if (neg) {
- if (val > -(uint64_t)INT64_MIN)
- overflow = 1;
- } else {
- if (val > INT64_MAX)
- overflow = 1;
- }
- }
-
- if (neg)
- val = -val;
-
- if (bad_char | bad_digit | overflow) {
- if (bad_char)
- uu_set_error(UU_ERROR_INVALID_CHAR);
- else if (bad_digit)
- uu_set_error(UU_ERROR_INVALID_DIGIT);
- else if (overflow) {
- if (neg)
- uu_set_error(UU_ERROR_UNDERFLOW);
- else
- uu_set_error(UU_ERROR_OVERFLOW);
- }
- return (-1);
- }
-
- *out = val;
- return (0);
-}
-
-int
-uu_strtoint(const char *s, void *v, size_t sz, int base,
- int64_t min, int64_t max)
-{
- uint64_t val_u;
- int64_t val;
-
- if (min > max)
- goto bad_argument;
-
- switch (sz) {
- case 1:
- if (max > INT8_MAX || min < INT8_MIN)
- goto bad_argument;
- break;
- case 2:
- if (max > INT16_MAX || min < INT16_MIN)
- goto bad_argument;
- break;
- case 4:
- if (max > INT32_MAX || min < INT32_MIN)
- goto bad_argument;
- break;
- case 8:
- if (max > INT64_MAX || min < INT64_MIN)
- goto bad_argument;
- break;
- default:
- goto bad_argument;
- }
-
- if (min == 0 && max == 0) {
- min = -(1ULL << (8 * sz - 1));
- max = (1ULL << (8 * sz - 1)) - 1;
- }
-
- if (strtoint(s, &val_u, base, 1) == -1)
- return (-1);
-
- val = (int64_t)val_u;
-
- if (val < min) {
- uu_set_error(UU_ERROR_UNDERFLOW);
- return (-1);
- } else if (val > max) {
- uu_set_error(UU_ERROR_OVERFLOW);
- return (-1);
- }
-
- switch (sz) {
- case 1:
- *(int8_t *)v = val;
- return (0);
- case 2:
- *(int16_t *)v = val;
- return (0);
- case 4:
- *(int32_t *)v = val;
- return (0);
- case 8:
- *(int64_t *)v = val;
- return (0);
- default:
- break; /* fall through to bad_argument */
- }
-
-bad_argument:
- uu_set_error(UU_ERROR_INVALID_ARGUMENT);
- return (-1);
-}
-
-int
-uu_strtouint(const char *s, void *v, size_t sz, int base,
- uint64_t min, uint64_t max)
-{
- uint64_t val;
-
- if (min > max)
- goto bad_argument;
-
- switch (sz) {
- case 1:
- if (max > UINT8_MAX)
- goto bad_argument;
- break;
- case 2:
- if (max > UINT16_MAX)
- goto bad_argument;
- break;
- case 4:
- if (max > UINT32_MAX)
- goto bad_argument;
- break;
- case 8:
- if (max > UINT64_MAX)
- goto bad_argument;
- break;
- default:
- goto bad_argument;
- }
-
- if (min == 0 && max == 0) {
- /* we have to be careful, since << can overflow */
- max = (1ULL << (8 * sz - 1)) * 2 - 1;
- }
-
- if (strtoint(s, &val, base, 0) == -1)
- return (-1);
-
- if (val < min) {
- uu_set_error(UU_ERROR_UNDERFLOW);
- return (-1);
- } else if (val > max) {
- uu_set_error(UU_ERROR_OVERFLOW);
- return (-1);
- }
-
- switch (sz) {
- case 1:
- *(uint8_t *)v = val;
- return (0);
- case 2:
- *(uint16_t *)v = val;
- return (0);
- case 4:
- *(uint32_t *)v = val;
- return (0);
- case 8:
- *(uint64_t *)v = val;
- return (0);
- default:
- break; /* shouldn't happen, fall through */
- }
-
-bad_argument:
- uu_set_error(UU_ERROR_INVALID_ARGUMENT);
- return (-1);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
@@ -1,894 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
- * Copyright 2019 Joyent, Inc.
- * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Nexenta Systems, Inc.
- * Copyright (c) 2019 Datto Inc.
- */
-
-#ifndef _LIBZFS_H
-#define _LIBZFS_H
-
-#include <assert.h>
-#include <libnvpair.h>
-#include <sys/mnttab.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/varargs.h>
-#include <sys/fs/zfs.h>
-#include <sys/avl.h>
-#include <sys/zfs_ioctl.h>
-#include <libzfs_core.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Miscellaneous ZFS constants
- */
-#define ZFS_MAXPROPLEN MAXPATHLEN
-#define ZPOOL_MAXPROPLEN MAXPATHLEN
-
-/*
- * libzfs errors
- */
-typedef enum zfs_error {
- EZFS_SUCCESS = 0, /* no error -- success */
- EZFS_NOMEM = 2000, /* out of memory */
- EZFS_BADPROP, /* invalid property value */
- EZFS_PROPREADONLY, /* cannot set readonly property */
- EZFS_PROPTYPE, /* property does not apply to dataset type */
- EZFS_PROPNONINHERIT, /* property is not inheritable */
- EZFS_PROPSPACE, /* bad quota or reservation */
- EZFS_BADTYPE, /* dataset is not of appropriate type */
- EZFS_BUSY, /* pool or dataset is busy */
- EZFS_EXISTS, /* pool or dataset already exists */
- EZFS_NOENT, /* no such pool or dataset */
- EZFS_BADSTREAM, /* bad backup stream */
- EZFS_DSREADONLY, /* dataset is readonly */
- EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */
- EZFS_INVALIDNAME, /* invalid dataset name */
- EZFS_BADRESTORE, /* unable to restore to destination */
- EZFS_BADBACKUP, /* backup failed */
- EZFS_BADTARGET, /* bad attach/detach/replace target */
- EZFS_NODEVICE, /* no such device in pool */
- EZFS_BADDEV, /* invalid device to add */
- EZFS_NOREPLICAS, /* no valid replicas */
- EZFS_RESILVERING, /* currently resilvering */
- EZFS_BADVERSION, /* unsupported version */
- EZFS_POOLUNAVAIL, /* pool is currently unavailable */
- EZFS_DEVOVERFLOW, /* too many devices in one vdev */
- EZFS_BADPATH, /* must be an absolute path */
- EZFS_CROSSTARGET, /* rename or clone across pool or dataset */
- EZFS_ZONED, /* used improperly in local zone */
- EZFS_MOUNTFAILED, /* failed to mount dataset */
- EZFS_UMOUNTFAILED, /* failed to unmount dataset */
- EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */
- EZFS_SHARENFSFAILED, /* share(1M) failed */
- EZFS_PERM, /* permission denied */
- EZFS_NOSPC, /* out of space */
- EZFS_FAULT, /* bad address */
- EZFS_IO, /* I/O error */
- EZFS_INTR, /* signal received */
- EZFS_ISSPARE, /* device is a hot spare */
- EZFS_INVALCONFIG, /* invalid vdev configuration */
- EZFS_RECURSIVE, /* recursive dependency */
- EZFS_NOHISTORY, /* no history object */
- EZFS_POOLPROPS, /* couldn't retrieve pool props */
- EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */
- EZFS_POOL_INVALARG, /* invalid argument for this pool operation */
- EZFS_NAMETOOLONG, /* dataset name is too long */
- EZFS_OPENFAILED, /* open of device failed */
- EZFS_NOCAP, /* couldn't get capacity */
- EZFS_LABELFAILED, /* write of label failed */
- EZFS_BADWHO, /* invalid permission who */
- EZFS_BADPERM, /* invalid permission */
- EZFS_BADPERMSET, /* invalid permission set name */
- EZFS_NODELEGATION, /* delegated administration is disabled */
- EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */
- EZFS_SHARESMBFAILED, /* failed to share over smb */
- EZFS_BADCACHE, /* bad cache file */
- EZFS_ISL2CACHE, /* device is for the level 2 ARC */
- EZFS_VDEVNOTSUP, /* unsupported vdev type */
- EZFS_NOTSUP, /* ops not supported on this dataset */
- EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */
- EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */
- EZFS_REFTAG_RELE, /* snapshot release: tag not found */
- EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */
- EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */
- EZFS_PIPEFAILED, /* pipe create failed */
- EZFS_THREADCREATEFAILED, /* thread create failed */
- EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */
- EZFS_SCRUBBING, /* currently scrubbing */
- EZFS_NO_SCRUB, /* no active scrub */
- EZFS_DIFF, /* general failure of zfs diff */
- EZFS_DIFFDATA, /* bad zfs diff data */
- EZFS_POOLREADONLY, /* pool is in read-only mode */
- EZFS_SCRUB_PAUSED, /* scrub currently paused */
- EZFS_ACTIVE_POOL, /* pool is imported on a different system */
- EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
- EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */
- EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */
- EZFS_NO_CHECKPOINT, /* pool has no checkpoint */
- EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
- EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
- EZFS_TOOMANY, /* argument list too long */
- EZFS_INITIALIZING, /* currently initializing */
- EZFS_NO_INITIALIZE, /* no active initialize */
- EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */
- EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */
- EZFS_UNKNOWN
-} zfs_error_t;
-
-/*
- * UEFI boot support parameters. When creating whole disk boot pool,
- * zpool create should allow to create EFI System partition for UEFI boot
- * program. In case of BIOS, the EFI System partition is not used
- * even if it does exist.
- */
-typedef enum zpool_boot_label {
- ZPOOL_NO_BOOT_LABEL = 0,
- ZPOOL_CREATE_BOOT_LABEL,
- ZPOOL_COPY_BOOT_LABEL
-} zpool_boot_label_t;
-
-/*
- * The following data structures are all part
- * of the zfs_allow_t data structure which is
- * used for printing 'allow' permissions.
- * It is a linked list of zfs_allow_t's which
- * then contain avl tree's for user/group/sets/...
- * and each one of the entries in those trees have
- * avl tree's for the permissions they belong to and
- * whether they are local,descendent or local+descendent
- * permissions. The AVL trees are used primarily for
- * sorting purposes, but also so that we can quickly find
- * a given user and or permission.
- */
-typedef struct zfs_perm_node {
- avl_node_t z_node;
- char z_pname[MAXPATHLEN];
-} zfs_perm_node_t;
-
-typedef struct zfs_allow_node {
- avl_node_t z_node;
- char z_key[MAXPATHLEN]; /* name, such as joe */
- avl_tree_t z_localdescend; /* local+descendent perms */
- avl_tree_t z_local; /* local permissions */
- avl_tree_t z_descend; /* descendent permissions */
-} zfs_allow_node_t;
-
-typedef struct zfs_allow {
- struct zfs_allow *z_next;
- char z_setpoint[MAXPATHLEN];
- avl_tree_t z_sets;
- avl_tree_t z_crperms;
- avl_tree_t z_user;
- avl_tree_t z_group;
- avl_tree_t z_everyone;
-} zfs_allow_t;
-
-/*
- * Basic handle types
- */
-typedef struct zfs_handle zfs_handle_t;
-typedef struct zpool_handle zpool_handle_t;
-typedef struct libzfs_handle libzfs_handle_t;
-
-/*
- * Library initialization
- */
-extern libzfs_handle_t *libzfs_init(void);
-extern void libzfs_fini(libzfs_handle_t *);
-
-extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *);
-extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *);
-
-extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
-
-extern void zfs_save_arguments(int argc, char **, char *, int);
-extern int zpool_log_history(libzfs_handle_t *, const char *);
-
-extern int libzfs_errno(libzfs_handle_t *);
-extern const char *libzfs_error_action(libzfs_handle_t *);
-extern const char *libzfs_error_description(libzfs_handle_t *);
-extern int zfs_standard_error(libzfs_handle_t *, int, const char *);
-extern void libzfs_mnttab_init(libzfs_handle_t *);
-extern void libzfs_mnttab_fini(libzfs_handle_t *);
-extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t);
-extern int libzfs_mnttab_find(libzfs_handle_t *, const char *,
- struct mnttab *);
-extern void libzfs_mnttab_add(libzfs_handle_t *, const char *,
- const char *, const char *);
-extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *);
-
-/*
- * Basic handle functions
- */
-extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *);
-extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *);
-extern void zpool_close(zpool_handle_t *);
-extern const char *zpool_get_name(zpool_handle_t *);
-extern int zpool_get_state(zpool_handle_t *);
-extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t);
-extern const char *zpool_pool_state_to_name(pool_state_t);
-extern void zpool_free_handles(libzfs_handle_t *);
-extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *);
-
-/*
- * Iterate over all active pools in the system.
- */
-typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
-extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *);
-extern boolean_t zpool_skip_pool(const char *);
-
-/*
- * Functions to create and destroy pools
- */
-extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
- nvlist_t *, nvlist_t *);
-extern int zpool_destroy(zpool_handle_t *, const char *);
-extern int zpool_add(zpool_handle_t *, nvlist_t *);
-
-typedef struct splitflags {
- /* do not split, but return the config that would be split off */
- int dryrun : 1;
-
- /* after splitting, import the pool */
- int import : 1;
- int name_flags;
-} splitflags_t;
-
-/*
- * Functions to manipulate pool and vdev state
- */
-extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
-extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
- nvlist_t *);
-extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
-extern int zpool_reguid(zpool_handle_t *);
-extern int zpool_reopen(zpool_handle_t *);
-
-extern int zpool_sync_one(zpool_handle_t *, void *);
-
-extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
- vdev_state_t *);
-extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
-extern int zpool_vdev_attach(zpool_handle_t *, const char *,
- const char *, nvlist_t *, int);
-extern int zpool_vdev_detach(zpool_handle_t *, const char *);
-extern int zpool_vdev_remove(zpool_handle_t *, const char *);
-extern int zpool_vdev_remove_cancel(zpool_handle_t *);
-extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *);
-extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
- splitflags_t);
-
-extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
-extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
-extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
-
-extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
- boolean_t *, boolean_t *);
-extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
- boolean_t *, boolean_t *, boolean_t *);
-extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *,
- zpool_boot_label_t, uint64_t, int *);
-
-/*
- * Functions to manage pool properties
- */
-extern int zpool_set_prop(zpool_handle_t *, const char *, const char *);
-extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *,
- size_t proplen, zprop_source_t *, boolean_t);
-extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t,
- zprop_source_t *);
-
-extern const char *zpool_prop_to_name(zpool_prop_t);
-extern const char *zpool_prop_values(zpool_prop_t);
-
-/*
- * Pool health statistics.
- */
-typedef enum {
- /*
- * The following correspond to faults as defined in the (fault.fs.zfs.*)
- * event namespace. Each is associated with a corresponding message ID.
- * This must be kept in sync with the zfs_msgid_table in
- * lib/libzfs/libzfs_status.c.
- */
- ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */
- ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */
- ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */
- ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */
- ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */
- ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */
- ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */
- ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */
- ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */
- ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */
- ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */
- ZPOOL_STATUS_HOSTID_ACTIVE, /* currently active on another system */
- ZPOOL_STATUS_HOSTID_REQUIRED, /* multihost=on and hostid=0 */
- ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */
- ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */
- ZPOOL_STATUS_IO_FAILURE_MMP, /* failed MMP, failmode not 'panic' */
- ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */
-
- /*
- * If the pool has unsupported features but can still be opened in
- * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the
- * pool has unsupported features but cannot be opened at all, its
- * status is ZPOOL_STATUS_UNSUP_FEAT_READ.
- */
- ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */
- ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */
-
- /*
- * These faults have no corresponding message ID. At the time we are
- * checking the status, the original reason for the FMA fault (I/O or
- * checksum errors) has been lost.
- */
- ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */
- ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */
-
- /*
- * The following are not faults per se, but still an error possibly
- * requiring administrative attention. There is no corresponding
- * message ID.
- */
- ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */
- ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */
- ZPOOL_STATUS_RESILVERING, /* device being resilvered */
- ZPOOL_STATUS_OFFLINE_DEV, /* device offline */
- ZPOOL_STATUS_REMOVED_DEV, /* removed device */
- ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */
-
- /*
- * Finally, the following indicates a healthy pool.
- */
- ZPOOL_STATUS_OK
-} zpool_status_t;
-
-extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
-extern zpool_status_t zpool_import_status(nvlist_t *, char **);
-extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);
-
-/*
- * Statistics and configuration functions.
- */
-extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
-extern nvlist_t *zpool_get_features(zpool_handle_t *);
-extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
-extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
-extern boolean_t zpool_is_bootable(zpool_handle_t *);
-
-/*
- * Import and export functions
- */
-extern int zpool_export(zpool_handle_t *, boolean_t, const char *);
-extern int zpool_export_force(zpool_handle_t *, const char *);
-extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
- char *altroot);
-extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
- nvlist_t *, int);
-extern void zpool_print_unsup_feat(nvlist_t *config);
-
-/*
- * Search for pools to import
- */
-
-typedef struct importargs {
- char **path; /* a list of paths to search */
- int paths; /* number of paths to search */
- char *poolname; /* name of a pool to find */
- uint64_t guid; /* guid of a pool to find */
- char *cachefile; /* cachefile to use for import */
- int can_be_active : 1; /* can the pool be active? */
- int unique : 1; /* does 'poolname' already exist? */
- int exists : 1; /* set on return if pool already exists */
- nvlist_t *policy; /* load policy (max txg, rewind, etc.) */
-} importargs_t;
-
-extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
-extern int zpool_tryimport(libzfs_handle_t *hdl, char *target,
- nvlist_t **configp, importargs_t *args);
-
-/* legacy pool search routines */
-extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
-extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
- char *, uint64_t);
-
-/*
- * Miscellaneous pool functions
- */
-struct zfs_cmd;
-
-extern const char *zfs_history_event_names[];
-
-typedef enum {
- VDEV_NAME_PATH = 1 << 0,
- VDEV_NAME_GUID = 1 << 1,
- VDEV_NAME_FOLLOW_LINKS = 1 << 2,
- VDEV_NAME_TYPE_ID = 1 << 3,
-} vdev_name_t;
-
-extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
- int name_flags);
-extern int zpool_upgrade(zpool_handle_t *, uint64_t);
-extern int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *,
- boolean_t *);
-extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
- nvlist_t ***, uint_t *);
-extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
- size_t len);
-extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *);
-extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
-extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
- nvlist_t *);
-extern int zpool_checkpoint(zpool_handle_t *);
-extern int zpool_discard_checkpoint(zpool_handle_t *);
-
-/*
- * Basic handle manipulations. These functions do not create or destroy the
- * underlying datasets, only the references to them.
- */
-extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int);
-extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *);
-extern void zfs_close(zfs_handle_t *);
-extern zfs_type_t zfs_get_type(const zfs_handle_t *);
-extern const char *zfs_get_name(const zfs_handle_t *);
-extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *);
-extern const char *zfs_get_pool_name(const zfs_handle_t *);
-
-/*
- * Property management functions. Some functions are shared with the kernel,
- * and are found in sys/fs/zfs.h.
- */
-
-/*
- * zfs dataset property management
- */
-extern const char *zfs_prop_default_string(zfs_prop_t);
-extern uint64_t zfs_prop_default_numeric(zfs_prop_t);
-extern const char *zfs_prop_column_name(zfs_prop_t);
-extern boolean_t zfs_prop_align_right(zfs_prop_t);
-
-extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t,
- nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, const char *);
-
-extern const char *zfs_prop_to_name(zfs_prop_t);
-extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
-extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *);
-extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
- zprop_source_t *, char *, size_t, boolean_t);
-extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
- boolean_t);
-extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
- zprop_source_t *, char *, size_t);
-extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
- uint64_t *propvalue);
-extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
- char *propbuf, int proplen, boolean_t literal);
-extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
- uint64_t *propvalue);
-extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
- char *propbuf, int proplen, boolean_t literal);
-extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname,
- char *buf, size_t len);
-extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
-extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
-extern const char *zfs_prop_values(zfs_prop_t);
-extern int zfs_prop_is_string(zfs_prop_t prop);
-extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
-extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);
-extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *);
-
-
-typedef struct zprop_list {
- int pl_prop;
- char *pl_user_prop;
- struct zprop_list *pl_next;
- boolean_t pl_all;
- size_t pl_width;
- size_t pl_recvd_width;
- boolean_t pl_fixed;
-} zprop_list_t;
-
-extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t,
- boolean_t);
-extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
-
-#define ZFS_MOUNTPOINT_NONE "none"
-#define ZFS_MOUNTPOINT_LEGACY "legacy"
-
-#define ZFS_FEATURE_DISABLED "disabled"
-#define ZFS_FEATURE_ENABLED "enabled"
-#define ZFS_FEATURE_ACTIVE "active"
-
-#define ZFS_UNSUPPORTED_INACTIVE "inactive"
-#define ZFS_UNSUPPORTED_READONLY "readonly"
-
-/*
- * zpool property management
- */
-extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **);
-extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *,
- size_t);
-extern const char *zpool_prop_default_string(zpool_prop_t);
-extern uint64_t zpool_prop_default_numeric(zpool_prop_t);
-extern const char *zpool_prop_column_name(zpool_prop_t);
-extern boolean_t zpool_prop_align_right(zpool_prop_t);
-
-/*
- * Functions shared by zfs and zpool property management.
- */
-extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all,
- boolean_t ordered, zfs_type_t type);
-extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **,
- zfs_type_t);
-extern void zprop_free_list(zprop_list_t *);
-
-#define ZFS_GET_NCOLS 5
-
-typedef enum {
- GET_COL_NONE,
- GET_COL_NAME,
- GET_COL_PROPERTY,
- GET_COL_VALUE,
- GET_COL_RECVD,
- GET_COL_SOURCE
-} zfs_get_column_t;
-
-/*
- * Functions for printing zfs or zpool properties
- */
-typedef struct zprop_get_cbdata {
- int cb_sources;
- zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
- int cb_colwidths[ZFS_GET_NCOLS + 1];
- boolean_t cb_scripted;
- boolean_t cb_literal;
- boolean_t cb_first;
- zprop_list_t *cb_proplist;
- zfs_type_t cb_type;
-} zprop_get_cbdata_t;
-
-void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
- const char *, const char *, zprop_source_t, const char *,
- const char *);
-
-/*
- * Iterator functions.
- */
-typedef int (*zfs_iter_f)(zfs_handle_t *, void *);
-extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *);
-extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
-extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
-extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
-extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *,
- uint64_t, uint64_t);
-extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *,
- uint64_t, uint64_t);
-extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *);
-extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *);
-
-typedef struct get_all_cb {
- zfs_handle_t **cb_handles;
- size_t cb_alloc;
- size_t cb_used;
-} get_all_cb_t;
-
-void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t,
- zfs_iter_f, void*, boolean_t);
-
-void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
-
-/*
- * Functions to create and destroy datasets.
- */
-extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
- nvlist_t *);
-extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
-extern int zfs_destroy(zfs_handle_t *, boolean_t);
-extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
-extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t);
-extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
-extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
-extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps,
- nvlist_t *props);
-extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
-
-typedef struct renameflags {
- /* recursive rename */
- int recurse : 1;
-
- /* don't unmount file systems */
- int nounmount : 1;
-
- /* force unmount file systems */
- int forceunmount : 1;
-} renameflags_t;
-
-extern int zfs_rename(zfs_handle_t *, const char *, const char *,
- renameflags_t flags);
-
-typedef struct sendflags {
- /* print informational messages (ie, -v was specified) */
- boolean_t verbose;
-
- /* recursive send (ie, -R) */
- boolean_t replicate;
-
- /* for incrementals, do all intermediate snapshots */
- boolean_t doall;
-
- /* if dataset is a clone, do incremental from its origin */
- boolean_t fromorigin;
-
- /* do deduplication */
- boolean_t dedup;
-
- /* send properties (ie, -p) */
- boolean_t props;
-
- /* do not send (no-op, ie. -n) */
- boolean_t dryrun;
-
- /* parsable verbose output (ie. -P) */
- boolean_t parsable;
-
- /* show progress (ie. -v) */
- boolean_t progress;
-
- /* large blocks (>128K) are permitted */
- boolean_t largeblock;
-
- /* WRITE_EMBEDDED records of type DATA are permitted */
- boolean_t embed_data;
-
- /* compressed WRITE records are permitted */
- boolean_t compress;
-
- /* show progress as process title(ie. -V) */
- boolean_t progressastitle;
-} sendflags_t;
-
-typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
-
-extern int zfs_send(zfs_handle_t *, const char *, const char *,
- sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
-extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t flags);
-extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd,
- const char *);
-extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl,
- const char *token);
-
-extern int zfs_promote(zfs_handle_t *);
-extern int zfs_hold(zfs_handle_t *, const char *, const char *,
- boolean_t, int);
-extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
-extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
-extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
-extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
-
-typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
- uid_t rid, uint64_t space);
-
-extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
- zfs_userspace_cb_t, void *);
-
-extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **);
-extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *);
-
-typedef struct recvflags {
- /* print informational messages (ie, -v was specified) */
- boolean_t verbose;
-
- /* the destination is a prefix, not the exact fs (ie, -d) */
- boolean_t isprefix;
-
- /*
- * Only the tail of the sent snapshot path is appended to the
- * destination to determine the received snapshot name (ie, -e).
- */
- boolean_t istail;
-
- /* do not actually do the recv, just check if it would work (ie, -n) */
- boolean_t dryrun;
-
- /* rollback/destroy filesystems as necessary (eg, -F) */
- boolean_t force;
-
- /* set "canmount=off" on all modified filesystems */
- boolean_t canmountoff;
-
- /*
- * Mark the file systems as "resumable" and do not destroy them if the
- * receive is interrupted
- */
- boolean_t resumable;
-
- /* byteswap flag is used internally; callers need not specify */
- boolean_t byteswap;
-
- /* do not mount file systems as they are extracted (private) */
- boolean_t nomount;
-
- /* force unmount while recv snapshot (private) */
- boolean_t forceunmount;
-} recvflags_t;
-
-extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
- recvflags_t *, int, avl_tree_t *);
-
-typedef enum diff_flags {
- ZFS_DIFF_PARSEABLE = 0x1,
- ZFS_DIFF_TIMESTAMP = 0x2,
- ZFS_DIFF_CLASSIFY = 0x4
-} diff_flags_t;
-
-extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *,
- int);
-
-/*
- * Miscellaneous functions.
- */
-extern const char *zfs_type_to_name(zfs_type_t);
-extern void zfs_refresh_properties(zfs_handle_t *);
-extern int zfs_name_valid(const char *, zfs_type_t);
-extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t);
-extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *,
- zfs_type_t);
-extern int zfs_spa_version(zfs_handle_t *, int *);
-extern boolean_t zfs_bookmark_exists(const char *path);
-extern ulong_t get_system_hostid(void);
-
-/*
- * Mount support functions.
- */
-extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **);
-extern boolean_t zfs_is_mounted(zfs_handle_t *, char **);
-extern int zfs_mount(zfs_handle_t *, const char *, int);
-extern int zfs_mount_at(zfs_handle_t *, const char *, int, const char *);
-extern int zfs_unmount(zfs_handle_t *, const char *, int);
-extern int zfs_unmountall(zfs_handle_t *, int);
-
-/*
- * Share support functions.
- */
-extern boolean_t zfs_is_shared(zfs_handle_t *);
-extern int zfs_share(zfs_handle_t *);
-extern int zfs_unshare(zfs_handle_t *);
-
-/*
- * Protocol-specific share support functions.
- */
-extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **);
-extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **);
-extern int zfs_share_nfs(zfs_handle_t *);
-extern int zfs_share_smb(zfs_handle_t *);
-extern int zfs_shareall(zfs_handle_t *);
-extern int zfs_unshare_nfs(zfs_handle_t *, const char *);
-extern int zfs_unshare_smb(zfs_handle_t *, const char *);
-extern int zfs_unshareall_nfs(zfs_handle_t *);
-extern int zfs_unshareall_smb(zfs_handle_t *);
-extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
-extern int zfs_unshareall(zfs_handle_t *);
-extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
- void *, void *, int, zfs_share_op_t);
-
-/*
- * FreeBSD-specific jail support function.
- */
-extern int zfs_jail(zfs_handle_t *, int, int);
-
-/*
- * When dealing with nvlists, verify() is extremely useful
- */
-#ifndef verify
-#ifdef NDEBUG
-#define verify(EX) ((void)(EX))
-#else
-#define verify(EX) assert(EX)
-#endif
-#endif
-
-/*
- * Utility function to convert a number to a human-readable form.
- */
-extern void zfs_nicenum(uint64_t, char *, size_t);
-extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
-
-/*
- * Given a device or file, determine if it is part of a pool.
- */
-extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
- boolean_t *);
-
-/*
- * Label manipulation.
- */
-extern int zpool_read_label(int, nvlist_t **);
-extern int zpool_read_all_labels(int, nvlist_t **);
-extern int zpool_clear_label(int);
-extern int zpool_set_bootenv(zpool_handle_t *, const char *);
-extern int zpool_get_bootenv(zpool_handle_t *, char *, size_t, off_t);
-
-/* is this zvol valid for use as a dump device? */
-extern int zvol_check_dump_config(char *);
-
-/*
- * Management interfaces for SMB ACL files
- */
-
-int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *);
-int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *);
-int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *);
-int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
-
-/*
- * Enable and disable datasets within a pool by mounting/unmounting and
- * sharing/unsharing them.
- */
-extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
-extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
-
-/*
- * Mappings between vdev and FRU.
- */
-extern void libzfs_fru_refresh(libzfs_handle_t *);
-extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *);
-extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *);
-extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *,
- const char *);
-extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
-extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
-
-#ifndef illumos
-extern int zmount(const char *, const char *, int, char *, char *, int, char *,
- int);
-#endif
-extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *);
-
-/* Allow consumers to initialize libshare externally for optimal performance */
-extern int zfs_init_libshare_arg(libzfs_handle_t *, int, void *);
-/*
- * For most consumers, zfs_init_libshare_arg is sufficient on its own, and
- * zfs_uninit_libshare is unnecessary. zfs_uninit_libshare should only be called
- * if the caller has already initialized libshare for one set of zfs handles,
- * and wishes to share or unshare filesystems outside of that set. In that case,
- * the caller should uninitialize libshare, and then re-initialize it with the
- * new handles being shared or unshared.
- */
-extern void zfs_uninit_libshare(libzfs_handle_t *);
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBZFS_H */
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c
@@ -1,736 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- *
- * Portions Copyright 2007 Ramprakash Jelari
- * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- */
-
-#include <libintl.h>
-#include <libuutil.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <zone.h>
-
-#include <libzfs.h>
-
-#include "libzfs_impl.h"
-
-/*
- * Structure to keep track of dataset state. Before changing the 'sharenfs' or
- * 'mountpoint' property, we record whether the filesystem was previously
- * mounted/shared. This prior state dictates whether we remount/reshare the
- * dataset after the property has been changed.
- *
- * The interface consists of the following sequence of functions:
- *
- * changelist_gather()
- * changelist_prefix()
- * < change property >
- * changelist_postfix()
- * changelist_free()
- *
- * Other interfaces:
- *
- * changelist_remove() - remove a node from a gathered list
- * changelist_rename() - renames all datasets appropriately when doing a rename
- * changelist_unshare() - unshares all the nodes in a given changelist
- * changelist_haszonedchild() - check if there is any child exported to
- * a local zone
- */
-typedef struct prop_changenode {
- zfs_handle_t *cn_handle;
- int cn_shared;
- int cn_mounted;
- int cn_zoned;
- boolean_t cn_needpost; /* is postfix() needed? */
- uu_list_node_t cn_listnode;
-} prop_changenode_t;
-
-struct prop_changelist {
- zfs_prop_t cl_prop;
- zfs_prop_t cl_realprop;
- zfs_prop_t cl_shareprop; /* used with sharenfs/sharesmb */
- uu_list_pool_t *cl_pool;
- uu_list_t *cl_list;
- boolean_t cl_waslegacy;
- boolean_t cl_allchildren;
- boolean_t cl_alldependents;
- int cl_mflags; /* Mount flags */
- int cl_gflags; /* Gather request flags */
- boolean_t cl_haszonedchild;
- boolean_t cl_sorted;
-};
-
-/*
- * If the property is 'mountpoint', go through and unmount filesystems as
- * necessary. We don't do the same for 'sharenfs', because we can just re-share
- * with different options without interrupting service. We do handle 'sharesmb'
- * since there may be old resource names that need to be removed.
- */
-int
-changelist_prefix(prop_changelist_t *clp)
-{
- prop_changenode_t *cn;
- int ret = 0;
-
- if (clp->cl_prop != ZFS_PROP_MOUNTPOINT &&
- clp->cl_prop != ZFS_PROP_SHARESMB)
- return (0);
-
- for (cn = uu_list_first(clp->cl_list); cn != NULL;
- cn = uu_list_next(clp->cl_list, cn)) {
-
- /* if a previous loop failed, set the remaining to false */
- if (ret == -1) {
- cn->cn_needpost = B_FALSE;
- continue;
- }
-
- /*
- * If we are in the global zone, but this dataset is exported
- * to a local zone, do nothing.
- */
- if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned)
- continue;
-
- if (!ZFS_IS_VOLUME(cn->cn_handle)) {
- /*
- * Do the property specific processing.
- */
- switch (clp->cl_prop) {
- case ZFS_PROP_MOUNTPOINT:
- if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)
- break;
- if (zfs_unmount(cn->cn_handle, NULL,
- clp->cl_mflags) != 0) {
- ret = -1;
- cn->cn_needpost = B_FALSE;
- }
- break;
- case ZFS_PROP_SHARESMB:
- (void) zfs_unshare_smb(cn->cn_handle, NULL);
- break;
-
- default:
- break;
- }
- }
- }
-
- if (ret == -1)
- (void) changelist_postfix(clp);
-
- return (ret);
-}
-
-/*
- * If the property is 'mountpoint' or 'sharenfs', go through and remount and/or
- * reshare the filesystems as necessary. In changelist_gather() we recorded
- * whether the filesystem was previously shared or mounted. The action we take
- * depends on the previous state, and whether the value was previously 'legacy'.
- * For non-legacy properties, we only remount/reshare the filesystem if it was
- * previously mounted/shared. Otherwise, we always remount/reshare the
- * filesystem.
- */
-int
-changelist_postfix(prop_changelist_t *clp)
-{
- prop_changenode_t *cn;
- char shareopts[ZFS_MAXPROPLEN];
- int errors = 0;
- libzfs_handle_t *hdl;
-#ifdef illumos
- size_t num_datasets = 0, i;
- zfs_handle_t **zhandle_arr;
- sa_init_selective_arg_t sharearg;
-#endif
-
- /*
- * If we're changing the mountpoint, attempt to destroy the underlying
- * mountpoint. All other datasets will have inherited from this dataset
- * (in which case their mountpoints exist in the filesystem in the new
- * location), or have explicit mountpoints set (in which case they won't
- * be in the changelist).
- */
- if ((cn = uu_list_last(clp->cl_list)) == NULL)
- return (0);
-
- if (clp->cl_prop == ZFS_PROP_MOUNTPOINT &&
- !(clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)) {
- remove_mountpoint(cn->cn_handle);
- }
-
- /*
- * It is possible that the changelist_prefix() used libshare
- * to unshare some entries. Since libshare caches data, an
- * attempt to reshare during postfix can fail unless libshare
- * is uninitialized here so that it will reinitialize later.
- */
- if (cn->cn_handle != NULL) {
- hdl = cn->cn_handle->zfs_hdl;
- assert(hdl != NULL);
- zfs_uninit_libshare(hdl);
-
-#ifdef illumos
- /*
- * For efficiencies sake, we initialize libshare for only a few
- * shares (the ones affected here). Future initializations in
- * this process should just use the cached initialization.
- */
- for (cn = uu_list_last(clp->cl_list); cn != NULL;
- cn = uu_list_prev(clp->cl_list, cn)) {
- num_datasets++;
- }
-
- zhandle_arr = zfs_alloc(hdl,
- num_datasets * sizeof (zfs_handle_t *));
- for (i = 0, cn = uu_list_last(clp->cl_list); cn != NULL;
- cn = uu_list_prev(clp->cl_list, cn)) {
- zhandle_arr[i++] = cn->cn_handle;
- zfs_refresh_properties(cn->cn_handle);
- }
- assert(i == num_datasets);
- sharearg.zhandle_arr = zhandle_arr;
- sharearg.zhandle_len = num_datasets;
- errors = zfs_init_libshare_arg(hdl, SA_INIT_SHARE_API_SELECTIVE,
- &sharearg);
- free(zhandle_arr);
-#endif
- }
- /*
- * We walk the datasets in reverse, because we want to mount any parent
- * datasets before mounting the children. We walk all datasets even if
- * there are errors.
- */
- for (cn = uu_list_last(clp->cl_list); cn != NULL;
- cn = uu_list_prev(clp->cl_list, cn)) {
-
- boolean_t sharenfs;
- boolean_t sharesmb;
- boolean_t mounted;
-
- /*
- * If we are in the global zone, but this dataset is exported
- * to a local zone, do nothing.
- */
- if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned)
- continue;
-
- /* Only do post-processing if it's required */
- if (!cn->cn_needpost)
- continue;
- cn->cn_needpost = B_FALSE;
-
-#ifndef illumos
- zfs_refresh_properties(cn->cn_handle);
-#endif
-
- if (ZFS_IS_VOLUME(cn->cn_handle))
- continue;
-
- /*
- * Remount if previously mounted or mountpoint was legacy,
- * or sharenfs or sharesmb property is set.
- */
- sharenfs = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARENFS,
- shareopts, sizeof (shareopts), NULL, NULL, 0,
- B_FALSE) == 0) && (strcmp(shareopts, "off") != 0));
-
- sharesmb = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARESMB,
- shareopts, sizeof (shareopts), NULL, NULL, 0,
- B_FALSE) == 0) && (strcmp(shareopts, "off") != 0));
-
- mounted = (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) ||
- zfs_is_mounted(cn->cn_handle, NULL);
-
- if (!mounted && (cn->cn_mounted ||
- ((sharenfs || sharesmb || clp->cl_waslegacy) &&
- (zfs_prop_get_int(cn->cn_handle,
- ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) {
-
- if (zfs_mount(cn->cn_handle, NULL, 0) != 0)
- errors++;
- else
- mounted = TRUE;
- }
-
- /*
- * If the file system is mounted we always re-share even
- * if the filesystem is currently shared, so that we can
- * adopt any new options.
- */
- if (sharenfs && mounted)
- errors += zfs_share_nfs(cn->cn_handle);
- else if (cn->cn_shared || clp->cl_waslegacy)
- errors += zfs_unshare_nfs(cn->cn_handle, NULL);
- if (sharesmb && mounted)
- errors += zfs_share_smb(cn->cn_handle);
- else if (cn->cn_shared || clp->cl_waslegacy)
- errors += zfs_unshare_smb(cn->cn_handle, NULL);
- }
-
- return (errors ? -1 : 0);
-}
-
-/*
- * Is this "dataset" a child of "parent"?
- */
-boolean_t
-isa_child_of(const char *dataset, const char *parent)
-{
- int len;
-
- len = strlen(parent);
-
- if (strncmp(dataset, parent, len) == 0 &&
- (dataset[len] == '@' || dataset[len] == '/' ||
- dataset[len] == '\0'))
- return (B_TRUE);
- else
- return (B_FALSE);
-
-}
-
-/*
- * If we rename a filesystem, child filesystem handles are no longer valid
- * since we identify each dataset by its name in the ZFS namespace. As a
- * result, we have to go through and fix up all the names appropriately. We
- * could do this automatically if libzfs kept track of all open handles, but
- * this is a lot less work.
- */
-void
-changelist_rename(prop_changelist_t *clp, const char *src, const char *dst)
-{
- prop_changenode_t *cn;
- char newname[ZFS_MAX_DATASET_NAME_LEN];
-
- for (cn = uu_list_first(clp->cl_list); cn != NULL;
- cn = uu_list_next(clp->cl_list, cn)) {
- /*
- * Do not rename a clone that's not in the source hierarchy.
- */
- if (!isa_child_of(cn->cn_handle->zfs_name, src))
- continue;
-
- /*
- * Destroy the previous mountpoint if needed.
- */
- remove_mountpoint(cn->cn_handle);
-
- (void) strlcpy(newname, dst, sizeof (newname));
- (void) strcat(newname, cn->cn_handle->zfs_name + strlen(src));
-
- (void) strlcpy(cn->cn_handle->zfs_name, newname,
- sizeof (cn->cn_handle->zfs_name));
- }
-}
-
-/*
- * Given a gathered changelist for the 'sharenfs' or 'sharesmb' property,
- * unshare all the datasets in the list.
- */
-int
-changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto)
-{
- prop_changenode_t *cn;
- int ret = 0;
-
- if (clp->cl_prop != ZFS_PROP_SHARENFS &&
- clp->cl_prop != ZFS_PROP_SHARESMB)
- return (0);
-
- for (cn = uu_list_first(clp->cl_list); cn != NULL;
- cn = uu_list_next(clp->cl_list, cn)) {
- if (zfs_unshare_proto(cn->cn_handle, NULL, proto) != 0)
- ret = -1;
- }
-
- return (ret);
-}
-
-/*
- * Check if there is any child exported to a local zone in a given changelist.
- * This information has already been recorded while gathering the changelist
- * via changelist_gather().
- */
-int
-changelist_haszonedchild(prop_changelist_t *clp)
-{
- return (clp->cl_haszonedchild);
-}
-
-/*
- * Remove a node from a gathered list.
- */
-void
-changelist_remove(prop_changelist_t *clp, const char *name)
-{
- prop_changenode_t *cn;
-
- for (cn = uu_list_first(clp->cl_list); cn != NULL;
- cn = uu_list_next(clp->cl_list, cn)) {
-
- if (strcmp(cn->cn_handle->zfs_name, name) == 0) {
- uu_list_remove(clp->cl_list, cn);
- zfs_close(cn->cn_handle);
- free(cn);
- return;
- }
- }
-}
-
-/*
- * Release any memory associated with a changelist.
- */
-void
-changelist_free(prop_changelist_t *clp)
-{
- prop_changenode_t *cn;
- void *cookie;
-
- if (clp->cl_list) {
- cookie = NULL;
- while ((cn = uu_list_teardown(clp->cl_list, &cookie)) != NULL) {
- zfs_close(cn->cn_handle);
- free(cn);
- }
-
- uu_list_destroy(clp->cl_list);
- }
- if (clp->cl_pool)
- uu_list_pool_destroy(clp->cl_pool);
-
- free(clp);
-}
-
-static int
-change_one(zfs_handle_t *zhp, void *data)
-{
- prop_changelist_t *clp = data;
- char property[ZFS_MAXPROPLEN];
- char where[64];
- prop_changenode_t *cn;
- zprop_source_t sourcetype;
- zprop_source_t share_sourcetype;
-
- /*
- * We only want to unmount/unshare those filesystems that may inherit
- * from the target filesystem. If we find any filesystem with a
- * locally set mountpoint, we ignore any children since changing the
- * property will not affect them. If this is a rename, we iterate
- * over all children regardless, since we need them unmounted in
- * order to do the rename. Also, if this is a volume and we're doing
- * a rename, then always add it to the changelist.
- */
-
- if (!(ZFS_IS_VOLUME(zhp) && clp->cl_realprop == ZFS_PROP_NAME) &&
- zfs_prop_get(zhp, clp->cl_prop, property,
- sizeof (property), &sourcetype, where, sizeof (where),
- B_FALSE) != 0) {
- zfs_close(zhp);
- return (0);
- }
-
- /*
- * If we are "watching" sharenfs or sharesmb
- * then check out the companion property which is tracked
- * in cl_shareprop
- */
- if (clp->cl_shareprop != ZPROP_INVAL &&
- zfs_prop_get(zhp, clp->cl_shareprop, property,
- sizeof (property), &share_sourcetype, where, sizeof (where),
- B_FALSE) != 0) {
- zfs_close(zhp);
- return (0);
- }
-
- if (clp->cl_alldependents || clp->cl_allchildren ||
- sourcetype == ZPROP_SRC_DEFAULT ||
- sourcetype == ZPROP_SRC_INHERITED ||
- (clp->cl_shareprop != ZPROP_INVAL &&
- (share_sourcetype == ZPROP_SRC_DEFAULT ||
- share_sourcetype == ZPROP_SRC_INHERITED))) {
- if ((cn = zfs_alloc(zfs_get_handle(zhp),
- sizeof (prop_changenode_t))) == NULL) {
- zfs_close(zhp);
- return (-1);
- }
-
- cn->cn_handle = zhp;
- cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) ||
- zfs_is_mounted(zhp, NULL);
- cn->cn_shared = zfs_is_shared(zhp);
- cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
- cn->cn_needpost = B_TRUE;
-
- /* Indicate if any child is exported to a local zone. */
- if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned)
- clp->cl_haszonedchild = B_TRUE;
-
- uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool);
-
- if (clp->cl_sorted) {
- uu_list_index_t idx;
-
- (void) uu_list_find(clp->cl_list, cn, NULL,
- &idx);
- uu_list_insert(clp->cl_list, cn, idx);
- } else {
- /*
- * Add this child to beginning of the list. Children
- * below this one in the hierarchy will get added above
- * this one in the list. This produces a list in
- * reverse dataset name order.
- * This is necessary when the original mountpoint
- * is legacy or none.
- */
- verify(uu_list_insert_before(clp->cl_list,
- uu_list_first(clp->cl_list), cn) == 0);
- }
-
- if (!clp->cl_alldependents)
- return (zfs_iter_children(zhp, change_one, data));
- } else {
- zfs_close(zhp);
- }
-
- return (0);
-}
-
-/*ARGSUSED*/
-static int
-compare_mountpoints(const void *a, const void *b, void *unused)
-{
- const prop_changenode_t *ca = a;
- const prop_changenode_t *cb = b;
-
- char mounta[MAXPATHLEN];
- char mountb[MAXPATHLEN];
-
- boolean_t hasmounta, hasmountb;
-
- /*
- * When unsharing or unmounting filesystems, we need to do it in
- * mountpoint order. This allows the user to have a mountpoint
- * hierarchy that is different from the dataset hierarchy, and still
- * allow it to be changed. However, if either dataset doesn't have a
- * mountpoint (because it is a volume or a snapshot), we place it at the
- * end of the list, because it doesn't affect our change at all.
- */
- hasmounta = (zfs_prop_get(ca->cn_handle, ZFS_PROP_MOUNTPOINT, mounta,
- sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
- hasmountb = (zfs_prop_get(cb->cn_handle, ZFS_PROP_MOUNTPOINT, mountb,
- sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
-
- if (!hasmounta && hasmountb)
- return (-1);
- else if (hasmounta && !hasmountb)
- return (1);
- else if (!hasmounta && !hasmountb)
- return (0);
- else
- return (strcmp(mountb, mounta));
-}
-
-/*
- * Given a ZFS handle and a property, construct a complete list of datasets
- * that need to be modified as part of this process. For anything but the
- * 'mountpoint' and 'sharenfs' properties, this just returns an empty list.
- * Otherwise, we iterate over all children and look for any datasets that
- * inherit the property. For each such dataset, we add it to the list and
- * mark whether it was shared beforehand.
- */
-prop_changelist_t *
-changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
- int mnt_flags)
-{
- prop_changelist_t *clp;
- prop_changenode_t *cn;
- zfs_handle_t *temp;
- char property[ZFS_MAXPROPLEN];
- uu_compare_fn_t *compare = NULL;
- boolean_t legacy = B_FALSE;
-
- if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL)
- return (NULL);
-
- /*
- * For mountpoint-related tasks, we want to sort everything by
- * mountpoint, so that we mount and unmount them in the appropriate
- * order, regardless of their position in the hierarchy.
- */
- if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED ||
- prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS ||
- prop == ZFS_PROP_SHARESMB) {
-
- if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
- property, sizeof (property),
- NULL, NULL, 0, B_FALSE) == 0 &&
- (strcmp(property, "legacy") == 0 ||
- strcmp(property, "none") == 0)) {
-
- legacy = B_TRUE;
- }
- if (!legacy) {
- compare = compare_mountpoints;
- clp->cl_sorted = B_TRUE;
- }
- }
-
- clp->cl_pool = uu_list_pool_create("changelist_pool",
- sizeof (prop_changenode_t),
- offsetof(prop_changenode_t, cn_listnode),
- compare, 0);
- if (clp->cl_pool == NULL) {
- assert(uu_error() == UU_ERROR_NO_MEMORY);
- (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error");
- changelist_free(clp);
- return (NULL);
- }
-
- clp->cl_list = uu_list_create(clp->cl_pool, NULL,
- clp->cl_sorted ? UU_LIST_SORTED : 0);
- clp->cl_gflags = gather_flags;
- clp->cl_mflags = mnt_flags;
-
- if (clp->cl_list == NULL) {
- assert(uu_error() == UU_ERROR_NO_MEMORY);
- (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error");
- changelist_free(clp);
- return (NULL);
- }
-
- /*
- * If this is a rename or the 'zoned' property, we pretend we're
- * changing the mountpoint and flag it so we can catch all children in
- * change_one().
- *
- * Flag cl_alldependents to catch all children plus the dependents
- * (clones) that are not in the hierarchy.
- */
- if (prop == ZFS_PROP_NAME) {
- clp->cl_prop = ZFS_PROP_MOUNTPOINT;
- clp->cl_alldependents = B_TRUE;
- } else if (prop == ZFS_PROP_ZONED) {
- clp->cl_prop = ZFS_PROP_MOUNTPOINT;
- clp->cl_allchildren = B_TRUE;
- } else if (prop == ZFS_PROP_CANMOUNT) {
- clp->cl_prop = ZFS_PROP_MOUNTPOINT;
- } else if (prop == ZFS_PROP_VOLSIZE) {
- clp->cl_prop = ZFS_PROP_MOUNTPOINT;
- } else {
- clp->cl_prop = prop;
- }
- clp->cl_realprop = prop;
-
- if (clp->cl_prop != ZFS_PROP_MOUNTPOINT &&
- clp->cl_prop != ZFS_PROP_SHARENFS &&
- clp->cl_prop != ZFS_PROP_SHARESMB)
- return (clp);
-
- /*
- * If watching SHARENFS or SHARESMB then
- * also watch its companion property.
- */
- if (clp->cl_prop == ZFS_PROP_SHARENFS)
- clp->cl_shareprop = ZFS_PROP_SHARESMB;
- else if (clp->cl_prop == ZFS_PROP_SHARESMB)
- clp->cl_shareprop = ZFS_PROP_SHARENFS;
-
- if (clp->cl_alldependents) {
- if (zfs_iter_dependents(zhp, B_TRUE, change_one, clp) != 0) {
- changelist_free(clp);
- return (NULL);
- }
- } else if (zfs_iter_children(zhp, change_one, clp) != 0) {
- changelist_free(clp);
- return (NULL);
- }
-
- /*
- * We have to re-open ourselves because we auto-close all the handles
- * and can't tell the difference.
- */
- if ((temp = zfs_open(zhp->zfs_hdl, zfs_get_name(zhp),
- ZFS_TYPE_DATASET)) == NULL) {
- changelist_free(clp);
- return (NULL);
- }
-
- /*
- * Always add ourself to the list. We add ourselves to the end so that
- * we're the last to be unmounted.
- */
- if ((cn = zfs_alloc(zhp->zfs_hdl,
- sizeof (prop_changenode_t))) == NULL) {
- zfs_close(temp);
- changelist_free(clp);
- return (NULL);
- }
-
- cn->cn_handle = temp;
- cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) ||
- zfs_is_mounted(temp, NULL);
- cn->cn_shared = zfs_is_shared(temp);
- cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
- cn->cn_needpost = B_TRUE;
-
- uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool);
- if (clp->cl_sorted) {
- uu_list_index_t idx;
- (void) uu_list_find(clp->cl_list, cn, NULL, &idx);
- uu_list_insert(clp->cl_list, cn, idx);
- } else {
- /*
- * Add the target dataset to the end of the list.
- * The list is not really unsorted. The list will be
- * in reverse dataset name order. This is necessary
- * when the original mountpoint is legacy or none.
- */
- verify(uu_list_insert_after(clp->cl_list,
- uu_list_last(clp->cl_list), cn) == 0);
- }
-
- /*
- * If the mountpoint property was previously 'legacy', or 'none',
- * record it as the behavior of changelist_postfix() will be different.
- */
- if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) {
- /*
- * do not automatically mount ex-legacy datasets if
- * we specifically set canmount to noauto
- */
- if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) !=
- ZFS_CANMOUNT_NOAUTO)
- clp->cl_waslegacy = B_TRUE;
- }
-
- return (clp);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h
@@ -1,44 +0,0 @@
-/*
- * CDDL HEADER SART
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- */
-
-#ifndef _LIBZFS_COMPAT_H
-#define _LIBZFS_COMPAT_H
-
-#include <zfs_ioctl_compat.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int get_zfs_ioctl_version(void);
-int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc);
-
-#define ioctl(fd, ioc, zc) zcmd_ioctl((fd), (ioc), (zc))
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBZFS_COMPAT_H */
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c
@@ -1,121 +0,0 @@
-/*
- * CDDL HEADER SART
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- */
-
-#include "libzfs_compat.h"
-
-int zfs_ioctl_version = ZFS_IOCVER_UNDEF;
-static int zfs_spa_version = -1;
-
-/*
- * Get zfs_ioctl_version
- */
-int
-get_zfs_ioctl_version(void)
-{
- size_t ver_size;
- int ver = ZFS_IOCVER_NONE;
-
- ver_size = sizeof(ver);
- sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0);
-
- return (ver);
-}
-
-/*
- * Get the SPA version
- */
-static int
-get_zfs_spa_version(void)
-{
- size_t ver_size;
- int ver = 0;
-
- ver_size = sizeof(ver);
- sysctlbyname("vfs.zfs.version.spa", &ver, &ver_size, NULL, 0);
-
- return (ver);
-}
-
-/*
- * This is FreeBSD version of ioctl, because Solaris' ioctl() updates
- * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an
- * error is returned zc_nvlist_dst_size won't be updated.
- */
-int
-zcmd_ioctl(int fd, int request, zfs_cmd_t *zc)
-{
- size_t oldsize;
- int ret, cflag = ZFS_CMD_COMPAT_NONE;
-
- if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
- zfs_ioctl_version = get_zfs_ioctl_version();
-
- if (zfs_ioctl_version >= ZFS_IOCVER_DEADMAN) {
- switch (zfs_ioctl_version) {
- case ZFS_IOCVER_INLANES:
- cflag = ZFS_CMD_COMPAT_INLANES;
- break;
- case ZFS_IOCVER_RESUME:
- cflag = ZFS_CMD_COMPAT_RESUME;
- break;
- case ZFS_IOCVER_EDBP:
- cflag = ZFS_CMD_COMPAT_EDBP;
- break;
- case ZFS_IOCVER_ZCMD:
- cflag = ZFS_CMD_COMPAT_ZCMD;
- break;
- case ZFS_IOCVER_LZC:
- cflag = ZFS_CMD_COMPAT_LZC;
- break;
- case ZFS_IOCVER_DEADMAN:
- cflag = ZFS_CMD_COMPAT_DEADMAN;
- break;
- }
- } else {
- /*
- * If vfs.zfs.version.ioctl is not defined, assume we have v28
- * compatible binaries and use vfs.zfs.version.spa to test for v15
- */
- cflag = ZFS_CMD_COMPAT_V28;
-
- if (zfs_spa_version < 0)
- zfs_spa_version = get_zfs_spa_version();
-
- if (zfs_spa_version == SPA_VERSION_15 ||
- zfs_spa_version == SPA_VERSION_14 ||
- zfs_spa_version == SPA_VERSION_13)
- cflag = ZFS_CMD_COMPAT_V15;
- }
-
- oldsize = zc->zc_nvlist_dst_size;
- ret = zcmd_ioctl_compat(fd, request, zc, cflag);
-
- if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) {
- ret = -1;
- errno = ENOMEM;
- }
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c
@@ -1,469 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright (c) 2015 by Syneto S.R.L. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc.
- */
-
-/*
- * The pool configuration repository is stored in /etc/zfs/zpool.cache as a
- * single packed nvlist. While it would be nice to just read in this
- * file from userland, this wouldn't work from a local zone. So we have to have
- * a zpool ioctl to return the complete configuration for all pools. In the
- * global zone, this will be identical to reading the file and unpacking it in
- * userland.
- */
-
-#include <errno.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <string.h>
-#include <unistd.h>
-#include <libintl.h>
-#include <libuutil.h>
-
-#include "libzfs_impl.h"
-
-typedef struct config_node {
- char *cn_name;
- nvlist_t *cn_config;
- uu_avl_node_t cn_avl;
-} config_node_t;
-
-/* ARGSUSED */
-static int
-config_node_compare(const void *a, const void *b, void *unused)
-{
- int ret;
-
- const config_node_t *ca = (config_node_t *)a;
- const config_node_t *cb = (config_node_t *)b;
-
- ret = strcmp(ca->cn_name, cb->cn_name);
-
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
-}
-
-void
-namespace_clear(libzfs_handle_t *hdl)
-{
- if (hdl->libzfs_ns_avl) {
- config_node_t *cn;
- void *cookie = NULL;
-
- while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl,
- &cookie)) != NULL) {
- nvlist_free(cn->cn_config);
- free(cn->cn_name);
- free(cn);
- }
-
- uu_avl_destroy(hdl->libzfs_ns_avl);
- hdl->libzfs_ns_avl = NULL;
- }
-
- if (hdl->libzfs_ns_avlpool) {
- uu_avl_pool_destroy(hdl->libzfs_ns_avlpool);
- hdl->libzfs_ns_avlpool = NULL;
- }
-}
-
-/*
- * Loads the pool namespace, or re-loads it if the cache has changed.
- */
-static int
-namespace_reload(libzfs_handle_t *hdl)
-{
- nvlist_t *config;
- config_node_t *cn;
- nvpair_t *elem;
- zfs_cmd_t zc = { 0 };
- void *cookie;
-
- if (hdl->libzfs_ns_gen == 0) {
- /*
- * This is the first time we've accessed the configuration
- * cache. Initialize the AVL tree and then fall through to the
- * common code.
- */
- if ((hdl->libzfs_ns_avlpool = uu_avl_pool_create("config_pool",
- sizeof (config_node_t),
- offsetof(config_node_t, cn_avl),
- config_node_compare, UU_DEFAULT)) == NULL)
- return (no_memory(hdl));
-
- if ((hdl->libzfs_ns_avl = uu_avl_create(hdl->libzfs_ns_avlpool,
- NULL, UU_DEFAULT)) == NULL)
- return (no_memory(hdl));
- }
-
- if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
- return (-1);
-
- for (;;) {
- zc.zc_cookie = hdl->libzfs_ns_gen;
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_CONFIGS, &zc) != 0) {
- switch (errno) {
- case EEXIST:
- /*
- * The namespace hasn't changed.
- */
- zcmd_free_nvlists(&zc);
- return (0);
-
- case ENOMEM:
- if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- break;
-
- default:
- zcmd_free_nvlists(&zc);
- return (zfs_standard_error(hdl, errno,
- dgettext(TEXT_DOMAIN, "failed to read "
- "pool configuration")));
- }
- } else {
- hdl->libzfs_ns_gen = zc.zc_cookie;
- break;
- }
- }
-
- if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
-
- zcmd_free_nvlists(&zc);
-
- /*
- * Clear out any existing configuration information.
- */
- cookie = NULL;
- while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl, &cookie)) != NULL) {
- nvlist_free(cn->cn_config);
- free(cn->cn_name);
- free(cn);
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(config, elem)) != NULL) {
- nvlist_t *child;
- uu_avl_index_t where;
-
- if ((cn = zfs_alloc(hdl, sizeof (config_node_t))) == NULL) {
- nvlist_free(config);
- return (-1);
- }
-
- if ((cn->cn_name = zfs_strdup(hdl,
- nvpair_name(elem))) == NULL) {
- free(cn);
- nvlist_free(config);
- return (-1);
- }
-
- verify(nvpair_value_nvlist(elem, &child) == 0);
- if (nvlist_dup(child, &cn->cn_config, 0) != 0) {
- free(cn->cn_name);
- free(cn);
- nvlist_free(config);
- return (no_memory(hdl));
- }
- verify(uu_avl_find(hdl->libzfs_ns_avl, cn, NULL, &where)
- == NULL);
-
- uu_avl_insert(hdl->libzfs_ns_avl, cn, where);
- }
-
- nvlist_free(config);
- return (0);
-}
-
-/*
- * Retrieve the configuration for the given pool. The configuration is a nvlist
- * describing the vdevs, as well as the statistics associated with each one.
- */
-nvlist_t *
-zpool_get_config(zpool_handle_t *zhp, nvlist_t **oldconfig)
-{
- if (oldconfig)
- *oldconfig = zhp->zpool_old_config;
- return (zhp->zpool_config);
-}
-
-/*
- * Retrieves a list of enabled features and their refcounts and caches it in
- * the pool handle.
- */
-nvlist_t *
-zpool_get_features(zpool_handle_t *zhp)
-{
- nvlist_t *config, *features;
-
- config = zpool_get_config(zhp, NULL);
-
- if (config == NULL || !nvlist_exists(config,
- ZPOOL_CONFIG_FEATURE_STATS)) {
- int error;
- boolean_t missing = B_FALSE;
-
- error = zpool_refresh_stats(zhp, &missing);
-
- if (error != 0 || missing)
- return (NULL);
-
- config = zpool_get_config(zhp, NULL);
- }
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
- &features) != 0)
- return (NULL);
-
- return (features);
-}
-
-/*
- * Refresh the vdev statistics associated with the given pool. This is used in
- * iostat to show configuration changes and determine the delta from the last
- * time the function was called. This function can fail, in case the pool has
- * been destroyed.
- */
-int
-zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
-{
- zfs_cmd_t zc = { 0 };
- int error;
- nvlist_t *config;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- *missing = B_FALSE;
- (void) strcpy(zc.zc_name, zhp->zpool_name);
-
- if (zhp->zpool_config_size == 0)
- zhp->zpool_config_size = 1 << 16;
-
- if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size) != 0)
- return (-1);
-
- for (;;) {
- if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_POOL_STATS,
- &zc) == 0) {
- /*
- * The real error is returned in the zc_cookie field.
- */
- error = zc.zc_cookie;
- break;
- }
-
- if (errno == ENOMEM) {
- if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- } else {
- zcmd_free_nvlists(&zc);
- if (errno == ENOENT || errno == EINVAL)
- *missing = B_TRUE;
- zhp->zpool_state = POOL_STATE_UNAVAIL;
- return (0);
- }
- }
-
- if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
-
- zcmd_free_nvlists(&zc);
-
- zhp->zpool_config_size = zc.zc_nvlist_dst_size;
-
- if (zhp->zpool_config != NULL) {
- uint64_t oldtxg, newtxg;
-
- verify(nvlist_lookup_uint64(zhp->zpool_config,
- ZPOOL_CONFIG_POOL_TXG, &oldtxg) == 0);
- verify(nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_POOL_TXG, &newtxg) == 0);
-
- nvlist_free(zhp->zpool_old_config);
-
- if (oldtxg != newtxg) {
- nvlist_free(zhp->zpool_config);
- zhp->zpool_old_config = NULL;
- } else {
- zhp->zpool_old_config = zhp->zpool_config;
- }
- }
-
- zhp->zpool_config = config;
- if (error)
- zhp->zpool_state = POOL_STATE_UNAVAIL;
- else
- zhp->zpool_state = POOL_STATE_ACTIVE;
-
- return (0);
-}
-
-/*
- * The following environment variables are undocumented
- * and should be used for testing purposes only:
- *
- * __ZFS_POOL_EXCLUDE - don't iterate over the pools it lists
- * __ZFS_POOL_RESTRICT - iterate only over the pools it lists
- *
- * This function returns B_TRUE if the pool should be skipped
- * during iteration.
- */
-boolean_t
-zpool_skip_pool(const char *poolname)
-{
- static boolean_t initialized = B_FALSE;
- static const char *exclude = NULL;
- static const char *restricted = NULL;
-
- const char *cur, *end;
- int len;
- int namelen = strlen(poolname);
-
- if (!initialized) {
- initialized = B_TRUE;
- exclude = getenv("__ZFS_POOL_EXCLUDE");
- restricted = getenv("__ZFS_POOL_RESTRICT");
- }
-
- if (exclude != NULL) {
- cur = exclude;
- do {
- end = strchr(cur, ' ');
- len = (NULL == end) ? strlen(cur) : (end - cur);
- if (len == namelen && 0 == strncmp(cur, poolname, len))
- return (B_TRUE);
- cur += (len + 1);
- } while (NULL != end);
- }
-
- if (NULL == restricted)
- return (B_FALSE);
-
- cur = restricted;
- do {
- end = strchr(cur, ' ');
- len = (NULL == end) ? strlen(cur) : (end - cur);
-
- if (len == namelen && 0 == strncmp(cur, poolname, len)) {
- return (B_FALSE);
- }
-
- cur += (len + 1);
- } while (NULL != end);
-
- return (B_TRUE);
-}
-
-/*
- * Iterate over all pools in the system.
- */
-int
-zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data)
-{
- config_node_t *cn;
- zpool_handle_t *zhp;
- int ret;
-
- /*
- * If someone makes a recursive call to zpool_iter(), we want to avoid
- * refreshing the namespace because that will invalidate the parent
- * context. We allow recursive calls, but simply re-use the same
- * namespace AVL tree.
- */
- if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0)
- return (-1);
-
- hdl->libzfs_pool_iter++;
- for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
- cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
-
- if (zpool_skip_pool(cn->cn_name))
- continue;
-
- if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) {
- hdl->libzfs_pool_iter--;
- return (-1);
- }
-
- if (zhp == NULL)
- continue;
-
- if ((ret = func(zhp, data)) != 0) {
- hdl->libzfs_pool_iter--;
- return (ret);
- }
- }
- hdl->libzfs_pool_iter--;
-
- return (0);
-}
-
-/*
- * Iterate over root datasets, calling the given function for each. The zfs
- * handle passed each time must be explicitly closed by the callback.
- */
-int
-zfs_iter_root(libzfs_handle_t *hdl, zfs_iter_f func, void *data)
-{
- config_node_t *cn;
- zfs_handle_t *zhp;
- int ret;
-
- if (namespace_reload(hdl) != 0)
- return (-1);
-
- for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
- cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
-
- if (zpool_skip_pool(cn->cn_name))
- continue;
-
- if ((zhp = make_dataset_handle(hdl, cn->cn_name)) == NULL)
- continue;
-
- if ((ret = func(zhp, data)) != 0)
- return (ret);
- }
-
- return (0);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
@@ -1,5284 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
- * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2013 Martin Matuska. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Nexenta Systems, Inc.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- * Copyright 2017-2018 RackTop Systems.
- * Copyright (c) 2019 Datto Inc.
- */
-
-#include <ctype.h>
-#include <errno.h>
-#include <libintl.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <stddef.h>
-#include <zone.h>
-#include <fcntl.h>
-#include <sys/mntent.h>
-#include <sys/mount.h>
-#include <priv.h>
-#include <pwd.h>
-#include <grp.h>
-#include <stddef.h>
-#ifdef illumos
-#include <idmap.h>
-#endif
-
-#include <sys/dnode.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/misc.h>
-#include <libzfs.h>
-
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-#include "libzfs_impl.h"
-#include "zfs_deleg.h"
-
-static int userquota_propname_decode(const char *propname, boolean_t zoned,
- zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
-
-/*
- * Given a single type (not a mask of types), return the type in a human
- * readable form.
- */
-const char *
-zfs_type_to_name(zfs_type_t type)
-{
- switch (type) {
- case ZFS_TYPE_FILESYSTEM:
- return (dgettext(TEXT_DOMAIN, "filesystem"));
- case ZFS_TYPE_SNAPSHOT:
- return (dgettext(TEXT_DOMAIN, "snapshot"));
- case ZFS_TYPE_VOLUME:
- return (dgettext(TEXT_DOMAIN, "volume"));
- case ZFS_TYPE_POOL:
- return (dgettext(TEXT_DOMAIN, "pool"));
- case ZFS_TYPE_BOOKMARK:
- return (dgettext(TEXT_DOMAIN, "bookmark"));
- default:
- assert(!"unhandled zfs_type_t");
- }
-
- return (NULL);
-}
-
-/*
- * Validate a ZFS path. This is used even before trying to open the dataset, to
- * provide a more meaningful error message. We call zfs_error_aux() to
- * explain exactly why the name was not valid.
- */
-int
-zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
- boolean_t modifying)
-{
- namecheck_err_t why;
- char what;
-
- if (entity_namecheck(path, &why, &what) != 0) {
- if (hdl != NULL) {
- switch (why) {
- case NAME_ERR_TOOLONG:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "name is too long"));
- break;
-
- case NAME_ERR_LEADING_SLASH:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "leading slash in name"));
- break;
-
- case NAME_ERR_EMPTY_COMPONENT:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "empty component in name"));
- break;
-
- case NAME_ERR_TRAILING_SLASH:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "trailing slash in name"));
- break;
-
- case NAME_ERR_INVALCHAR:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "invalid character "
- "'%c' in name"), what);
- break;
-
- case NAME_ERR_MULTIPLE_DELIMITERS:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "multiple '@' and/or '#' delimiters in "
- "name"));
- break;
-
- case NAME_ERR_NOLETTER:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool doesn't begin with a letter"));
- break;
-
- case NAME_ERR_RESERVED:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "name is reserved"));
- break;
-
- case NAME_ERR_DISKLIKE:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "reserved disk name"));
- break;
-
- default:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "(%d) not defined"), why);
- break;
- }
- }
-
- return (0);
- }
-
- if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) {
- if (hdl != NULL)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "snapshot delimiter '@' is not expected here"));
- return (0);
- }
-
- if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) {
- if (hdl != NULL)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "missing '@' delimiter in snapshot name"));
- return (0);
- }
-
- if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) {
- if (hdl != NULL)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "bookmark delimiter '#' is not expected here"));
- return (0);
- }
-
- if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) {
- if (hdl != NULL)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "missing '#' delimiter in bookmark name"));
- return (0);
- }
-
- if (modifying && strchr(path, '%') != NULL) {
- if (hdl != NULL)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid character %c in name"), '%');
- return (0);
- }
-
- return (-1);
-}
-
-int
-zfs_name_valid(const char *name, zfs_type_t type)
-{
- if (type == ZFS_TYPE_POOL)
- return (zpool_name_valid(NULL, B_FALSE, name));
- return (zfs_validate_name(NULL, name, type, B_FALSE));
-}
-
-/*
- * This function takes the raw DSL properties, and filters out the user-defined
- * properties into a separate nvlist.
- */
-static nvlist_t *
-process_user_props(zfs_handle_t *zhp, nvlist_t *props)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- nvpair_t *elem;
- nvlist_t *propval;
- nvlist_t *nvl;
-
- if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
- (void) no_memory(hdl);
- return (NULL);
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
- if (!zfs_prop_user(nvpair_name(elem)))
- continue;
-
- verify(nvpair_value_nvlist(elem, &propval) == 0);
- if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) {
- nvlist_free(nvl);
- (void) no_memory(hdl);
- return (NULL);
- }
- }
-
- return (nvl);
-}
-
-static zpool_handle_t *
-zpool_add_handle(zfs_handle_t *zhp, const char *pool_name)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- zpool_handle_t *zph;
-
- if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) {
- if (hdl->libzfs_pool_handles != NULL)
- zph->zpool_next = hdl->libzfs_pool_handles;
- hdl->libzfs_pool_handles = zph;
- }
- return (zph);
-}
-
-static zpool_handle_t *
-zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- zpool_handle_t *zph = hdl->libzfs_pool_handles;
-
- while ((zph != NULL) &&
- (strncmp(pool_name, zpool_get_name(zph), len) != 0))
- zph = zph->zpool_next;
- return (zph);
-}
-
-/*
- * Returns a handle to the pool that contains the provided dataset.
- * If a handle to that pool already exists then that handle is returned.
- * Otherwise, a new handle is created and added to the list of handles.
- */
-static zpool_handle_t *
-zpool_handle(zfs_handle_t *zhp)
-{
- char *pool_name;
- int len;
- zpool_handle_t *zph;
-
- len = strcspn(zhp->zfs_name, "/@#") + 1;
- pool_name = zfs_alloc(zhp->zfs_hdl, len);
- (void) strlcpy(pool_name, zhp->zfs_name, len);
-
- zph = zpool_find_handle(zhp, pool_name, len);
- if (zph == NULL)
- zph = zpool_add_handle(zhp, pool_name);
-
- free(pool_name);
- return (zph);
-}
-
-void
-zpool_free_handles(libzfs_handle_t *hdl)
-{
- zpool_handle_t *next, *zph = hdl->libzfs_pool_handles;
-
- while (zph != NULL) {
- next = zph->zpool_next;
- zpool_close(zph);
- zph = next;
- }
- hdl->libzfs_pool_handles = NULL;
-}
-
-/*
- * Utility function to gather stats (objset and zpl) for the given object.
- */
-static int
-get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
-
- (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
-
- while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) {
- if (errno == ENOMEM) {
- if (zcmd_expand_dst_nvlist(hdl, zc) != 0) {
- return (-1);
- }
- } else {
- return (-1);
- }
- }
- return (0);
-}
-
-/*
- * Utility function to get the received properties of the given object.
- */
-static int
-get_recvd_props_ioctl(zfs_handle_t *zhp)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- nvlist_t *recvdprops;
- zfs_cmd_t zc = { 0 };
- int err;
-
- if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
- return (-1);
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) {
- if (errno == ENOMEM) {
- if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
- return (-1);
- }
- } else {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- }
-
- err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops);
- zcmd_free_nvlists(&zc);
- if (err != 0)
- return (-1);
-
- nvlist_free(zhp->zfs_recvd_props);
- zhp->zfs_recvd_props = recvdprops;
-
- return (0);
-}
-
-static int
-put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
-{
- nvlist_t *allprops, *userprops;
-
- zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */
-
- if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) {
- return (-1);
- }
-
- /*
- * XXX Why do we store the user props separately, in addition to
- * storing them in zfs_props?
- */
- if ((userprops = process_user_props(zhp, allprops)) == NULL) {
- nvlist_free(allprops);
- return (-1);
- }
-
- nvlist_free(zhp->zfs_props);
- nvlist_free(zhp->zfs_user_props);
-
- zhp->zfs_props = allprops;
- zhp->zfs_user_props = userprops;
-
- return (0);
-}
-
-static int
-get_stats(zfs_handle_t *zhp)
-{
- int rc = 0;
- zfs_cmd_t zc = { 0 };
-
- if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
- return (-1);
- if (get_stats_ioctl(zhp, &zc) != 0)
- rc = -1;
- else if (put_stats_zhdl(zhp, &zc) != 0)
- rc = -1;
- zcmd_free_nvlists(&zc);
- return (rc);
-}
-
-/*
- * Refresh the properties currently stored in the handle.
- */
-void
-zfs_refresh_properties(zfs_handle_t *zhp)
-{
- (void) get_stats(zhp);
-}
-
-/*
- * Makes a handle from the given dataset name. Used by zfs_open() and
- * zfs_iter_* to create child handles on the fly.
- */
-static int
-make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
-{
- if (put_stats_zhdl(zhp, zc) != 0)
- return (-1);
-
- /*
- * We've managed to open the dataset and gather statistics. Determine
- * the high-level type.
- */
- if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
- zhp->zfs_head_type = ZFS_TYPE_VOLUME;
- else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
- zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
- else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER)
- return (-1);
- else
- abort();
-
- if (zhp->zfs_dmustats.dds_is_snapshot)
- zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
- else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
- zhp->zfs_type = ZFS_TYPE_VOLUME;
- else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
- zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
- else
- abort(); /* we should never see any other types */
-
- if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL)
- return (-1);
-
- return (0);
-}
-
-zfs_handle_t *
-make_dataset_handle(libzfs_handle_t *hdl, const char *path)
-{
- zfs_cmd_t zc = { 0 };
-
- zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
-
- if (zhp == NULL)
- return (NULL);
-
- zhp->zfs_hdl = hdl;
- (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
- if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) {
- free(zhp);
- return (NULL);
- }
- if (get_stats_ioctl(zhp, &zc) == -1) {
- zcmd_free_nvlists(&zc);
- free(zhp);
- return (NULL);
- }
- if (make_dataset_handle_common(zhp, &zc) == -1) {
- free(zhp);
- zhp = NULL;
- }
- zcmd_free_nvlists(&zc);
- return (zhp);
-}
-
-zfs_handle_t *
-make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
-{
- zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
-
- if (zhp == NULL)
- return (NULL);
-
- zhp->zfs_hdl = hdl;
- (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
- if (make_dataset_handle_common(zhp, zc) == -1) {
- free(zhp);
- return (NULL);
- }
- return (zhp);
-}
-
-zfs_handle_t *
-make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc)
-{
- zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
-
- if (zhp == NULL)
- return (NULL);
-
- zhp->zfs_hdl = pzhp->zfs_hdl;
- (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
- zhp->zfs_head_type = pzhp->zfs_type;
- zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
- zhp->zpool_hdl = zpool_handle(zhp);
- return (zhp);
-}
-
-zfs_handle_t *
-zfs_handle_dup(zfs_handle_t *zhp_orig)
-{
- zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
-
- if (zhp == NULL)
- return (NULL);
-
- zhp->zfs_hdl = zhp_orig->zfs_hdl;
- zhp->zpool_hdl = zhp_orig->zpool_hdl;
- (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name,
- sizeof (zhp->zfs_name));
- zhp->zfs_type = zhp_orig->zfs_type;
- zhp->zfs_head_type = zhp_orig->zfs_head_type;
- zhp->zfs_dmustats = zhp_orig->zfs_dmustats;
- if (zhp_orig->zfs_props != NULL) {
- if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) {
- (void) no_memory(zhp->zfs_hdl);
- zfs_close(zhp);
- return (NULL);
- }
- }
- if (zhp_orig->zfs_user_props != NULL) {
- if (nvlist_dup(zhp_orig->zfs_user_props,
- &zhp->zfs_user_props, 0) != 0) {
- (void) no_memory(zhp->zfs_hdl);
- zfs_close(zhp);
- return (NULL);
- }
- }
- if (zhp_orig->zfs_recvd_props != NULL) {
- if (nvlist_dup(zhp_orig->zfs_recvd_props,
- &zhp->zfs_recvd_props, 0)) {
- (void) no_memory(zhp->zfs_hdl);
- zfs_close(zhp);
- return (NULL);
- }
- }
- zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck;
- if (zhp_orig->zfs_mntopts != NULL) {
- zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl,
- zhp_orig->zfs_mntopts);
- }
- zhp->zfs_props_table = zhp_orig->zfs_props_table;
- return (zhp);
-}
-
-boolean_t
-zfs_bookmark_exists(const char *path)
-{
- nvlist_t *bmarks;
- nvlist_t *props;
- char fsname[ZFS_MAX_DATASET_NAME_LEN];
- char *bmark_name;
- char *pound;
- int err;
- boolean_t rv;
-
-
- (void) strlcpy(fsname, path, sizeof (fsname));
- pound = strchr(fsname, '#');
- if (pound == NULL)
- return (B_FALSE);
-
- *pound = '\0';
- bmark_name = pound + 1;
- props = fnvlist_alloc();
- err = lzc_get_bookmarks(fsname, props, &bmarks);
- nvlist_free(props);
- if (err != 0) {
- nvlist_free(bmarks);
- return (B_FALSE);
- }
-
- rv = nvlist_exists(bmarks, bmark_name);
- nvlist_free(bmarks);
- return (rv);
-}
-
-zfs_handle_t *
-make_bookmark_handle(zfs_handle_t *parent, const char *path,
- nvlist_t *bmark_props)
-{
- zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
-
- if (zhp == NULL)
- return (NULL);
-
- /* Fill in the name. */
- zhp->zfs_hdl = parent->zfs_hdl;
- (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
-
- /* Set the property lists. */
- if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) {
- free(zhp);
- return (NULL);
- }
-
- /* Set the types. */
- zhp->zfs_head_type = parent->zfs_head_type;
- zhp->zfs_type = ZFS_TYPE_BOOKMARK;
-
- if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) {
- nvlist_free(zhp->zfs_props);
- free(zhp);
- return (NULL);
- }
-
- return (zhp);
-}
-
-struct zfs_open_bookmarks_cb_data {
- const char *path;
- zfs_handle_t *zhp;
-};
-
-static int
-zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data)
-{
- struct zfs_open_bookmarks_cb_data *dp = data;
-
- /*
- * Is it the one we are looking for?
- */
- if (strcmp(dp->path, zfs_get_name(zhp)) == 0) {
- /*
- * We found it. Save it and let the caller know we are done.
- */
- dp->zhp = zhp;
- return (EEXIST);
- }
-
- /*
- * Not found. Close the handle and ask for another one.
- */
- zfs_close(zhp);
- return (0);
-}
-
-/*
- * Opens the given snapshot, bookmark, filesystem, or volume. The 'types'
- * argument is a mask of acceptable types. The function will print an
- * appropriate error message and return NULL if it can't be opened.
- */
-zfs_handle_t *
-zfs_open(libzfs_handle_t *hdl, const char *path, int types)
-{
- zfs_handle_t *zhp;
- char errbuf[1024];
- char *bookp;
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
-
- /*
- * Validate the name before we even try to open it.
- */
- if (!zfs_validate_name(hdl, path, types, B_FALSE)) {
- (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
- return (NULL);
- }
-
- /*
- * Bookmarks needs to be handled separately.
- */
- bookp = strchr(path, '#');
- if (bookp == NULL) {
- /*
- * Try to get stats for the dataset, which will tell us if it
- * exists.
- */
- errno = 0;
- if ((zhp = make_dataset_handle(hdl, path)) == NULL) {
- (void) zfs_standard_error(hdl, errno, errbuf);
- return (NULL);
- }
- } else {
- char dsname[ZFS_MAX_DATASET_NAME_LEN];
- zfs_handle_t *pzhp;
- struct zfs_open_bookmarks_cb_data cb_data = {path, NULL};
-
- /*
- * We need to cut out '#' and everything after '#'
- * to get the parent dataset name only.
- */
- assert(bookp - path < sizeof (dsname));
- (void) strncpy(dsname, path, bookp - path);
- dsname[bookp - path] = '\0';
-
- /*
- * Create handle for the parent dataset.
- */
- errno = 0;
- if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) {
- (void) zfs_standard_error(hdl, errno, errbuf);
- return (NULL);
- }
-
- /*
- * Iterate bookmarks to find the right one.
- */
- errno = 0;
- if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb,
- &cb_data) == 0) && (cb_data.zhp == NULL)) {
- (void) zfs_error(hdl, EZFS_NOENT, errbuf);
- zfs_close(pzhp);
- return (NULL);
- }
- if (cb_data.zhp == NULL) {
- (void) zfs_standard_error(hdl, errno, errbuf);
- zfs_close(pzhp);
- return (NULL);
- }
- zhp = cb_data.zhp;
-
- /*
- * Cleanup.
- */
- zfs_close(pzhp);
- }
-
- if (zhp == NULL) {
- char *at = strchr(path, '@');
-
- if (at != NULL)
- *at = '\0';
- errno = 0;
- if ((zhp = make_dataset_handle(hdl, path)) == NULL) {
- (void) zfs_standard_error(hdl, errno, errbuf);
- return (NULL);
- }
- if (at != NULL)
- *at = '@';
- (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
- zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
- }
-
- if (!(types & zhp->zfs_type)) {
- (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
- zfs_close(zhp);
- return (NULL);
- }
-
- return (zhp);
-}
-
-/*
- * Release a ZFS handle. Nothing to do but free the associated memory.
- */
-void
-zfs_close(zfs_handle_t *zhp)
-{
- if (zhp->zfs_mntopts)
- free(zhp->zfs_mntopts);
- nvlist_free(zhp->zfs_props);
- nvlist_free(zhp->zfs_user_props);
- nvlist_free(zhp->zfs_recvd_props);
- free(zhp);
-}
-
-typedef struct mnttab_node {
- struct mnttab mtn_mt;
- avl_node_t mtn_node;
-} mnttab_node_t;
-
-static int
-libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
-{
- const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1;
- const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2;
- int rv;
-
- rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special);
-
- return (AVL_ISIGN(rv));
-}
-
-void
-libzfs_mnttab_init(libzfs_handle_t *hdl)
-{
- pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL);
- assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
- avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
- sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
-}
-
-void
-libzfs_mnttab_update(libzfs_handle_t *hdl)
-{
- struct mnttab entry;
-
- rewind(hdl->libzfs_mnttab);
- while (getmntent(hdl->libzfs_mnttab, &entry) == 0) {
- mnttab_node_t *mtn;
-
- if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
- continue;
- mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
- mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special);
- mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp);
- mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype);
- mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts);
- avl_add(&hdl->libzfs_mnttab_cache, mtn);
- }
-}
-
-void
-libzfs_mnttab_fini(libzfs_handle_t *hdl)
-{
- void *cookie = NULL;
- mnttab_node_t *mtn;
-
- while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie))
- != NULL) {
- free(mtn->mtn_mt.mnt_special);
- free(mtn->mtn_mt.mnt_mountp);
- free(mtn->mtn_mt.mnt_fstype);
- free(mtn->mtn_mt.mnt_mntopts);
- free(mtn);
- }
- avl_destroy(&hdl->libzfs_mnttab_cache);
- (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
-}
-
-void
-libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable)
-{
- hdl->libzfs_mnttab_enable = enable;
-}
-
-int
-libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
- struct mnttab *entry)
-{
- mnttab_node_t find;
- mnttab_node_t *mtn;
- int ret = ENOENT;
-
- if (!hdl->libzfs_mnttab_enable) {
- struct mnttab srch = { 0 };
-
- if (avl_numnodes(&hdl->libzfs_mnttab_cache))
- libzfs_mnttab_fini(hdl);
- rewind(hdl->libzfs_mnttab);
- srch.mnt_special = (char *)fsname;
- srch.mnt_fstype = MNTTYPE_ZFS;
- if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0)
- return (0);
- else
- return (ENOENT);
- }
-
- pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
- if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
- libzfs_mnttab_update(hdl);
-
- find.mtn_mt.mnt_special = (char *)fsname;
- mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
- if (mtn) {
- *entry = mtn->mtn_mt;
- ret = 0;
- }
- pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
- return (ret);
-}
-
-void
-libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
- const char *mountp, const char *mntopts)
-{
- mnttab_node_t *mtn;
-
- pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
- if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
- mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
- mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
- mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
- mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
- mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
- avl_add(&hdl->libzfs_mnttab_cache, mtn);
- }
- pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
-}
-
-void
-libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
-{
- mnttab_node_t find;
- mnttab_node_t *ret;
-
- pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
- find.mtn_mt.mnt_special = (char *)fsname;
- if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
- != NULL) {
- avl_remove(&hdl->libzfs_mnttab_cache, ret);
- free(ret->mtn_mt.mnt_special);
- free(ret->mtn_mt.mnt_mountp);
- free(ret->mtn_mt.mnt_fstype);
- free(ret->mtn_mt.mnt_mntopts);
- free(ret);
- }
- pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
-}
-
-int
-zfs_spa_version(zfs_handle_t *zhp, int *spa_version)
-{
- zpool_handle_t *zpool_handle = zhp->zpool_hdl;
-
- if (zpool_handle == NULL)
- return (-1);
-
- *spa_version = zpool_get_prop_int(zpool_handle,
- ZPOOL_PROP_VERSION, NULL);
- return (0);
-}
-
-/*
- * The choice of reservation property depends on the SPA version.
- */
-static int
-zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop)
-{
- int spa_version;
-
- if (zfs_spa_version(zhp, &spa_version) < 0)
- return (-1);
-
- if (spa_version >= SPA_VERSION_REFRESERVATION)
- *resv_prop = ZFS_PROP_REFRESERVATION;
- else
- *resv_prop = ZFS_PROP_RESERVATION;
-
- return (0);
-}
-
-/*
- * Given an nvlist of properties to set, validates that they are correct, and
- * parses any numeric properties (index, boolean, etc) if they are specified as
- * strings.
- */
-nvlist_t *
-zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
- uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl,
- const char *errbuf)
-{
- nvpair_t *elem;
- uint64_t intval;
- char *strval;
- zfs_prop_t prop;
- nvlist_t *ret;
- int chosen_normal = -1;
- int chosen_utf = -1;
-
- if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
- (void) no_memory(hdl);
- return (NULL);
- }
-
- /*
- * Make sure this property is valid and applies to this type.
- */
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
-
- prop = zfs_name_to_prop(propname);
- if (prop == ZPROP_INVAL && zfs_prop_user(propname)) {
- /*
- * This is a user property: make sure it's a
- * string, and that it's less than ZAP_MAXNAMELEN.
- */
- if (nvpair_type(elem) != DATA_TYPE_STRING) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be a string"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property name '%s' is too long"),
- propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- (void) nvpair_value_string(elem, &strval);
- if (nvlist_add_string(ret, propname, strval) != 0) {
- (void) no_memory(hdl);
- goto error;
- }
- continue;
- }
-
- /*
- * Currently, only user properties can be modified on
- * snapshots.
- */
- if (type == ZFS_TYPE_SNAPSHOT) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "this property can not be modified for snapshots"));
- (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
- goto error;
- }
-
- if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) {
- zfs_userquota_prop_t uqtype;
- char newpropname[128];
- char domain[128];
- uint64_t rid;
- uint64_t valary[3];
-
- if (userquota_propname_decode(propname, zoned,
- &uqtype, domain, sizeof (domain), &rid) != 0) {
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN,
- "'%s' has an invalid user/group name"),
- propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (uqtype != ZFS_PROP_USERQUOTA &&
- uqtype != ZFS_PROP_GROUPQUOTA) {
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "'%s' is readonly"),
- propname);
- (void) zfs_error(hdl, EZFS_PROPREADONLY,
- errbuf);
- goto error;
- }
-
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- (void) nvpair_value_string(elem, &strval);
- if (strcmp(strval, "none") == 0) {
- intval = 0;
- } else if (zfs_nicestrtonum(hdl,
- strval, &intval) != 0) {
- (void) zfs_error(hdl,
- EZFS_BADPROP, errbuf);
- goto error;
- }
- } else if (nvpair_type(elem) ==
- DATA_TYPE_UINT64) {
- (void) nvpair_value_uint64(elem, &intval);
- if (intval == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "use 'none' to disable "
- "userquota/groupquota"));
- goto error;
- }
- } else {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be a number"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- /*
- * Encode the prop name as
- * userquota@<hex-rid>-domain, to make it easy
- * for the kernel to decode.
- */
- (void) snprintf(newpropname, sizeof (newpropname),
- "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype],
- (longlong_t)rid, domain);
- valary[0] = uqtype;
- valary[1] = rid;
- valary[2] = intval;
- if (nvlist_add_uint64_array(ret, newpropname,
- valary, 3) != 0) {
- (void) no_memory(hdl);
- goto error;
- }
- continue;
- } else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' is readonly"),
- propname);
- (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
- goto error;
- }
-
- if (prop == ZPROP_INVAL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid property '%s'"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (!zfs_prop_valid_for_type(prop, type)) {
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "'%s' does not "
- "apply to datasets of this type"), propname);
- (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
- goto error;
- }
-
- if (zfs_prop_readonly(prop) &&
- (!zfs_prop_setonce(prop) || zhp != NULL)) {
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "'%s' is readonly"),
- propname);
- (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
- goto error;
- }
-
- if (zprop_parse_value(hdl, elem, prop, type, ret,
- &strval, &intval, errbuf) != 0)
- goto error;
-
- /*
- * Perform some additional checks for specific properties.
- */
- switch (prop) {
- case ZFS_PROP_VERSION:
- {
- int version;
-
- if (zhp == NULL)
- break;
- version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
- if (intval < version) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Can not downgrade; already at version %u"),
- version);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
- }
-
- case ZFS_PROP_VOLBLOCKSIZE:
- case ZFS_PROP_RECORDSIZE:
- {
- int maxbs = SPA_MAXBLOCKSIZE;
- if (zpool_hdl != NULL) {
- maxbs = zpool_get_prop_int(zpool_hdl,
- ZPOOL_PROP_MAXBLOCKSIZE, NULL);
- }
- /*
- * Volumes are limited to a volblocksize of 128KB,
- * because they typically service workloads with
- * small random writes, which incur a large performance
- * penalty with large blocks.
- */
- if (prop == ZFS_PROP_VOLBLOCKSIZE)
- maxbs = SPA_OLD_MAXBLOCKSIZE;
- /*
- * The value must be a power of two between
- * SPA_MINBLOCKSIZE and maxbs.
- */
- if (intval < SPA_MINBLOCKSIZE ||
- intval > maxbs || !ISP2(intval)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be power of 2 from 512B "
- "to %uKB"), propname, maxbs >> 10);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
- }
-
- case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
- if (zpool_hdl != NULL) {
- char state[64] = "";
-
- /*
- * Issue a warning but do not fail so that
- * tests for setable properties succeed.
- */
- if (zpool_prop_get_feature(zpool_hdl,
- "feature@allocation_classes", state,
- sizeof (state)) != 0 ||
- strcmp(state, ZFS_FEATURE_ACTIVE) != 0) {
- (void) fprintf(stderr, gettext(
- "%s: property requires a special "
- "device in the pool\n"), propname);
- }
- }
- if (intval != 0 &&
- (intval < SPA_MINBLOCKSIZE ||
- intval > SPA_OLD_MAXBLOCKSIZE || !ISP2(intval))) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid '%s=%d' property: must be zero or "
- "a power of 2 from 512B to 128K"), propname,
- intval);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
-
- case ZFS_PROP_MLSLABEL:
- {
-#ifdef illumos
- /*
- * Verify the mlslabel string and convert to
- * internal hex label string.
- */
-
- m_label_t *new_sl;
- char *hex = NULL; /* internal label string */
-
- /* Default value is already OK. */
- if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
- break;
-
- /* Verify the label can be converted to binary form */
- if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) ||
- (str_to_label(strval, &new_sl, MAC_LABEL,
- L_NO_CORRECTION, NULL) == -1)) {
- goto badlabel;
- }
-
- /* Now translate to hex internal label string */
- if (label_to_str(new_sl, &hex, M_INTERNAL,
- DEF_NAMES) != 0) {
- if (hex)
- free(hex);
- goto badlabel;
- }
- m_label_free(new_sl);
-
- /* If string is already in internal form, we're done. */
- if (strcmp(strval, hex) == 0) {
- free(hex);
- break;
- }
-
- /* Replace the label string with the internal form. */
- (void) nvlist_remove(ret, zfs_prop_to_name(prop),
- DATA_TYPE_STRING);
- verify(nvlist_add_string(ret, zfs_prop_to_name(prop),
- hex) == 0);
- free(hex);
-
- break;
-
-badlabel:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid mlslabel '%s'"), strval);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- m_label_free(new_sl); /* OK if null */
-#else /* !illumos */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "mlslabel is not supported on FreeBSD"));
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
-#endif /* illumos */
- goto error;
-
- }
-
- case ZFS_PROP_MOUNTPOINT:
- {
- namecheck_err_t why;
-
- if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 ||
- strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0)
- break;
-
- if (mountpoint_namecheck(strval, &why)) {
- switch (why) {
- case NAME_ERR_LEADING_SLASH:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN,
- "'%s' must be an absolute path, "
- "'none', or 'legacy'"), propname);
- break;
- case NAME_ERR_TOOLONG:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN,
- "component of '%s' is too long"),
- propname);
- break;
-
- default:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN,
- "(%d) not defined"),
- why);
- break;
- }
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- }
-
- /*FALLTHRU*/
-
- case ZFS_PROP_SHARESMB:
- case ZFS_PROP_SHARENFS:
- /*
- * For the mountpoint and sharenfs or sharesmb
- * properties, check if it can be set in a
- * global/non-global zone based on
- * the zoned property value:
- *
- * global zone non-global zone
- * --------------------------------------------------
- * zoned=on mountpoint (no) mountpoint (yes)
- * sharenfs (no) sharenfs (no)
- * sharesmb (no) sharesmb (no)
- *
- * zoned=off mountpoint (yes) N/A
- * sharenfs (yes)
- * sharesmb (yes)
- */
- if (zoned) {
- if (getzoneid() == GLOBAL_ZONEID) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' cannot be set on "
- "dataset in a non-global zone"),
- propname);
- (void) zfs_error(hdl, EZFS_ZONED,
- errbuf);
- goto error;
- } else if (prop == ZFS_PROP_SHARENFS ||
- prop == ZFS_PROP_SHARESMB) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' cannot be set in "
- "a non-global zone"), propname);
- (void) zfs_error(hdl, EZFS_ZONED,
- errbuf);
- goto error;
- }
- } else if (getzoneid() != GLOBAL_ZONEID) {
- /*
- * If zoned property is 'off', this must be in
- * a global zone. If not, something is wrong.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' cannot be set while dataset "
- "'zoned' property is set"), propname);
- (void) zfs_error(hdl, EZFS_ZONED, errbuf);
- goto error;
- }
-
- /*
- * At this point, it is legitimate to set the
- * property. Now we want to make sure that the
- * property value is valid if it is sharenfs.
- */
- if ((prop == ZFS_PROP_SHARENFS ||
- prop == ZFS_PROP_SHARESMB) &&
- strcmp(strval, "on") != 0 &&
- strcmp(strval, "off") != 0) {
- zfs_share_proto_t proto;
-
- if (prop == ZFS_PROP_SHARESMB)
- proto = PROTO_SMB;
- else
- proto = PROTO_NFS;
-
- /*
- * Must be an valid sharing protocol
- * option string so init the libshare
- * in order to enable the parser and
- * then parse the options. We use the
- * control API since we don't care about
- * the current configuration and don't
- * want the overhead of loading it
- * until we actually do something.
- */
-
- if (zfs_init_libshare(hdl,
- SA_INIT_CONTROL_API) != SA_OK) {
- /*
- * An error occurred so we can't do
- * anything
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' cannot be set: problem "
- "in share initialization"),
- propname);
- (void) zfs_error(hdl, EZFS_BADPROP,
- errbuf);
- goto error;
- }
-
- if (zfs_parse_options(strval, proto) != SA_OK) {
- /*
- * There was an error in parsing so
- * deal with it by issuing an error
- * message and leaving after
- * uninitializing the the libshare
- * interface.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' cannot be set to invalid "
- "options"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP,
- errbuf);
- zfs_uninit_libshare(hdl);
- goto error;
- }
- zfs_uninit_libshare(hdl);
- }
-
- break;
-
- case ZFS_PROP_UTF8ONLY:
- chosen_utf = (int)intval;
- break;
-
- case ZFS_PROP_NORMALIZE:
- chosen_normal = (int)intval;
- break;
-
- default:
- break;
- }
-
- /*
- * For changes to existing volumes, we have some additional
- * checks to enforce.
- */
- if (type == ZFS_TYPE_VOLUME && zhp != NULL) {
- uint64_t volsize = zfs_prop_get_int(zhp,
- ZFS_PROP_VOLSIZE);
- uint64_t blocksize = zfs_prop_get_int(zhp,
- ZFS_PROP_VOLBLOCKSIZE);
- char buf[64];
-
- switch (prop) {
- case ZFS_PROP_RESERVATION:
- if (intval > volsize) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' is greater than current "
- "volume size"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP,
- errbuf);
- goto error;
- }
- break;
-
- case ZFS_PROP_REFRESERVATION:
- if (intval > volsize && intval != UINT64_MAX) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' is greater than current "
- "volume size"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP,
- errbuf);
- goto error;
- }
- break;
-
- case ZFS_PROP_VOLSIZE:
- if (intval % blocksize != 0) {
- zfs_nicenum(blocksize, buf,
- sizeof (buf));
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be a multiple of "
- "volume block size (%s)"),
- propname, buf);
- (void) zfs_error(hdl, EZFS_BADPROP,
- errbuf);
- goto error;
- }
-
- if (intval == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' cannot be zero"),
- propname);
- (void) zfs_error(hdl, EZFS_BADPROP,
- errbuf);
- goto error;
- }
- break;
-
- default:
- break;
- }
- }
- }
-
- /*
- * If normalization was chosen, but no UTF8 choice was made,
- * enforce rejection of non-UTF8 names.
- *
- * If normalization was chosen, but rejecting non-UTF8 names
- * was explicitly not chosen, it is an error.
- */
- if (chosen_normal > 0 && chosen_utf < 0) {
- if (nvlist_add_uint64(ret,
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) {
- (void) no_memory(hdl);
- goto error;
- }
- } else if (chosen_normal > 0 && chosen_utf == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be set 'on' if normalization chosen"),
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- return (ret);
-
-error:
- nvlist_free(ret);
- return (NULL);
-}
-
-int
-zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
-{
- uint64_t old_volsize;
- uint64_t new_volsize;
- uint64_t old_reservation;
- uint64_t new_reservation;
- zfs_prop_t resv_prop;
- nvlist_t *props;
-
- /*
- * If this is an existing volume, and someone is setting the volsize,
- * make sure that it matches the reservation, or add it if necessary.
- */
- old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
- if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
- return (-1);
- old_reservation = zfs_prop_get_int(zhp, resv_prop);
-
- props = fnvlist_alloc();
- fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
-
- if ((zvol_volsize_to_reservation(old_volsize, props) !=
- old_reservation) || nvlist_exists(nvl,
- zfs_prop_to_name(resv_prop))) {
- fnvlist_free(props);
- return (0);
- }
- if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- &new_volsize) != 0) {
- fnvlist_free(props);
- return (-1);
- }
- new_reservation = zvol_volsize_to_reservation(new_volsize, props);
- fnvlist_free(props);
-
- if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
- new_reservation) != 0) {
- (void) no_memory(zhp->zfs_hdl);
- return (-1);
- }
- return (1);
-}
-
-/*
- * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after
- * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinal value.
- * Return codes must match zfs_add_synthetic_resv().
- */
-static int
-zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
-{
- uint64_t volsize;
- uint64_t resvsize;
- zfs_prop_t prop;
- nvlist_t *props;
-
- if (!ZFS_IS_VOLUME(zhp)) {
- return (0);
- }
-
- if (zfs_which_resv_prop(zhp, &prop) != 0) {
- return (-1);
- }
-
- if (prop != ZFS_PROP_REFRESERVATION) {
- return (0);
- }
-
- if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) {
- /* No value being set, so it can't be "auto" */
- return (0);
- }
- if (resvsize != UINT64_MAX) {
- /* Being set to a value other than "auto" */
- return (0);
- }
-
- props = fnvlist_alloc();
-
- fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
-
- if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- &volsize) != 0) {
- volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
- }
-
- resvsize = zvol_volsize_to_reservation(volsize, props);
- fnvlist_free(props);
-
- (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
- if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) {
- (void) no_memory(zhp->zfs_hdl);
- return (-1);
- }
- return (1);
-}
-
-void
-zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
- char *errbuf)
-{
- switch (err) {
-
- case ENOSPC:
- /*
- * For quotas and reservations, ENOSPC indicates
- * something different; setting a quota or reservation
- * doesn't use any disk space.
- */
- switch (prop) {
- case ZFS_PROP_QUOTA:
- case ZFS_PROP_REFQUOTA:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "size is less than current used or "
- "reserved space"));
- (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
- break;
-
- case ZFS_PROP_RESERVATION:
- case ZFS_PROP_REFRESERVATION:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "size is greater than available space"));
- (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
- break;
-
- default:
- (void) zfs_standard_error(hdl, err, errbuf);
- break;
- }
- break;
-
- case EBUSY:
- (void) zfs_standard_error(hdl, EBUSY, errbuf);
- break;
-
- case EROFS:
- (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
- break;
-
- case E2BIG:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property value too long"));
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- break;
-
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool and or dataset must be upgraded to set this "
- "property or value"));
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
-
- case ERANGE:
- case EDOM:
- if (prop == ZFS_PROP_COMPRESSION ||
- prop == ZFS_PROP_RECORDSIZE) {
- (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property setting is not allowed on "
- "bootable datasets"));
- (void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
- } else if (prop == ZFS_PROP_CHECKSUM ||
- prop == ZFS_PROP_DEDUP) {
- (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property setting is not allowed on "
- "root pools"));
- (void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
- } else {
- (void) zfs_standard_error(hdl, err, errbuf);
- }
- break;
-
- case EINVAL:
- if (prop == ZPROP_INVAL) {
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- } else {
- (void) zfs_standard_error(hdl, err, errbuf);
- }
- break;
-
- case EOVERFLOW:
- /*
- * This platform can't address a volume this big.
- */
-#ifdef _ILP32
- if (prop == ZFS_PROP_VOLSIZE) {
- (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
- break;
- }
-#endif
- /* FALLTHROUGH */
- default:
- (void) zfs_standard_error(hdl, err, errbuf);
- }
-}
-
-/*
- * Given a property name and value, set the property for the given dataset.
- */
-int
-zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
-{
- int ret = -1;
- char errbuf[1024];
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- nvlist_t *nvl = NULL;
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
- zhp->zfs_name);
-
- if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
- nvlist_add_string(nvl, propname, propval) != 0) {
- (void) no_memory(hdl);
- goto error;
- }
-
- ret = zfs_prop_set_list(zhp, nvl);
-
-error:
- nvlist_free(nvl);
- return (ret);
-}
-
-
-
-/*
- * Given an nvlist of property names and values, set the properties for the
- * given dataset.
- */
-int
-zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
-{
- zfs_cmd_t zc = { 0 };
- int ret = -1;
- prop_changelist_t **cls = NULL;
- int cl_idx;
- char errbuf[1024];
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- nvlist_t *nvl;
- int nvl_len;
- int added_resv = 0;
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
- zhp->zfs_name);
-
- if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props,
- zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl,
- errbuf)) == NULL)
- goto error;
-
- /*
- * We have to check for any extra properties which need to be added
- * before computing the length of the nvlist.
- */
- for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
- elem != NULL;
- elem = nvlist_next_nvpair(nvl, elem)) {
- if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE &&
- (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) {
- goto error;
- }
- }
-
- if (added_resv != 1 &&
- (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) {
- goto error;
- }
-
- /*
- * Check how many properties we're setting and allocate an array to
- * store changelist pointers for postfix().
- */
- nvl_len = 0;
- for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
- elem != NULL;
- elem = nvlist_next_nvpair(nvl, elem))
- nvl_len++;
- if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL)
- goto error;
-
- cl_idx = 0;
- for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
- elem != NULL;
- elem = nvlist_next_nvpair(nvl, elem)) {
-
- zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
-
- assert(cl_idx < nvl_len);
- /*
- * We don't want to unmount & remount the dataset when changing
- * its canmount property to 'on' or 'noauto'. We only use
- * the changelist logic to unmount when setting canmount=off.
- */
- if (prop != ZFS_PROP_CANMOUNT ||
- (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF &&
- zfs_is_mounted(zhp, NULL))) {
- cls[cl_idx] = changelist_gather(zhp, prop, 0, 0);
- if (cls[cl_idx] == NULL)
- goto error;
- }
-
- if (prop == ZFS_PROP_MOUNTPOINT &&
- changelist_haszonedchild(cls[cl_idx])) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "child dataset with inherited mountpoint is used "
- "in a non-global zone"));
- ret = zfs_error(hdl, EZFS_ZONED, errbuf);
- goto error;
- }
-
- /* We don't support those properties on FreeBSD. */
- switch (prop) {
- case ZFS_PROP_DEVICES:
- case ZFS_PROP_ISCSIOPTIONS:
- case ZFS_PROP_XATTR:
- case ZFS_PROP_VSCAN:
- case ZFS_PROP_NBMAND:
- case ZFS_PROP_MLSLABEL:
- (void) snprintf(errbuf, sizeof (errbuf),
- "property '%s' not supported on FreeBSD",
- nvpair_name(elem));
- ret = zfs_error(hdl, EZFS_PERM, errbuf);
- goto error;
- }
-
- if (cls[cl_idx] != NULL &&
- (ret = changelist_prefix(cls[cl_idx])) != 0)
- goto error;
-
- cl_idx++;
- }
- assert(cl_idx == nvl_len);
-
- /*
- * Execute the corresponding ioctl() to set this list of properties.
- */
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 ||
- (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0)
- goto error;
-
- ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
-
- if (ret != 0) {
- if (zc.zc_nvlist_dst_filled == B_FALSE) {
- (void) zfs_standard_error(hdl, errno, errbuf);
- goto error;
- }
-
- /* Get the list of unset properties back and report them. */
- nvlist_t *errorprops = NULL;
- if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0)
- goto error;
- for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL);
- elem != NULL;
- elem = nvlist_next_nvpair(errorprops, elem)) {
- zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
- zfs_setprop_error(hdl, prop, errno, errbuf);
- }
- nvlist_free(errorprops);
-
- if (added_resv && errno == ENOSPC) {
- /* clean up the volsize property we tried to set */
- uint64_t old_volsize = zfs_prop_get_int(zhp,
- ZFS_PROP_VOLSIZE);
- nvlist_free(nvl);
- nvl = NULL;
- zcmd_free_nvlists(&zc);
-
- if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
- goto error;
- if (nvlist_add_uint64(nvl,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- old_volsize) != 0)
- goto error;
- if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
- goto error;
- (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
- }
- } else {
- for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
- if (cls[cl_idx] != NULL) {
- int clp_err = changelist_postfix(cls[cl_idx]);
- if (clp_err != 0)
- ret = clp_err;
- }
- }
-
- /*
- * Refresh the statistics so the new property value
- * is reflected.
- */
- if (ret == 0)
- (void) get_stats(zhp);
- }
-
-error:
- nvlist_free(nvl);
- zcmd_free_nvlists(&zc);
- if (cls != NULL) {
- for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
- if (cls[cl_idx] != NULL)
- changelist_free(cls[cl_idx]);
- }
- free(cls);
- }
- return (ret);
-}
-
-/*
- * Given a property, inherit the value from the parent dataset, or if received
- * is TRUE, revert to the received value, if any.
- */
-int
-zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
-{
- zfs_cmd_t zc = { 0 };
- int ret;
- prop_changelist_t *cl;
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- char errbuf[1024];
- zfs_prop_t prop;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot inherit %s for '%s'"), propname, zhp->zfs_name);
-
- zc.zc_cookie = received;
- if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
- /*
- * For user properties, the amount of work we have to do is very
- * small, so just do it here.
- */
- if (!zfs_prop_user(propname)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid property"));
- return (zfs_error(hdl, EZFS_BADPROP, errbuf));
- }
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
-
- if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0)
- return (zfs_standard_error(hdl, errno, errbuf));
-
- return (0);
- }
-
- /*
- * Verify that this property is inheritable.
- */
- if (zfs_prop_readonly(prop))
- return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
-
- if (!zfs_prop_inheritable(prop) && !received)
- return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
-
- /*
- * Check to see if the value applies to this type
- */
- if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
- return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
-
- /*
- * Normalize the name, to get rid of shorthand abbreviations.
- */
- propname = zfs_prop_to_name(prop);
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
-
- if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
- zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "dataset is used in a non-global zone"));
- return (zfs_error(hdl, EZFS_ZONED, errbuf));
- }
-
- /*
- * Determine datasets which will be affected by this change, if any.
- */
- if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
- return (-1);
-
- if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "child dataset with inherited mountpoint is used "
- "in a non-global zone"));
- ret = zfs_error(hdl, EZFS_ZONED, errbuf);
- goto error;
- }
-
- if ((ret = changelist_prefix(cl)) != 0)
- goto error;
-
- if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) {
- return (zfs_standard_error(hdl, errno, errbuf));
- } else {
-
- if ((ret = changelist_postfix(cl)) != 0)
- goto error;
-
- /*
- * Refresh the statistics so the new property is reflected.
- */
- (void) get_stats(zhp);
- }
-
-error:
- changelist_free(cl);
- return (ret);
-}
-
-/*
- * True DSL properties are stored in an nvlist. The following two functions
- * extract them appropriately.
- */
-static uint64_t
-getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
-{
- nvlist_t *nv;
- uint64_t value;
-
- *source = NULL;
- if (nvlist_lookup_nvlist(zhp->zfs_props,
- zfs_prop_to_name(prop), &nv) == 0) {
- verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
- (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
- } else {
- verify(!zhp->zfs_props_table ||
- zhp->zfs_props_table[prop] == B_TRUE);
- value = zfs_prop_default_numeric(prop);
- *source = "";
- }
-
- return (value);
-}
-
-static const char *
-getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
-{
- nvlist_t *nv;
- const char *value;
-
- *source = NULL;
- if (nvlist_lookup_nvlist(zhp->zfs_props,
- zfs_prop_to_name(prop), &nv) == 0) {
- value = fnvlist_lookup_string(nv, ZPROP_VALUE);
- (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
- } else {
- verify(!zhp->zfs_props_table ||
- zhp->zfs_props_table[prop] == B_TRUE);
- value = zfs_prop_default_string(prop);
- *source = "";
- }
-
- return (value);
-}
-
-static boolean_t
-zfs_is_recvd_props_mode(zfs_handle_t *zhp)
-{
- return (zhp->zfs_props == zhp->zfs_recvd_props);
-}
-
-static void
-zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
-{
- *cookie = (uint64_t)(uintptr_t)zhp->zfs_props;
- zhp->zfs_props = zhp->zfs_recvd_props;
-}
-
-static void
-zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
-{
- zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie;
- *cookie = 0;
-}
-
-/*
- * Internal function for getting a numeric property. Both zfs_prop_get() and
- * zfs_prop_get_int() are built using this interface.
- *
- * Certain properties can be overridden using 'mount -o'. In this case, scan
- * the contents of the /etc/mnttab entry, searching for the appropriate options.
- * If they differ from the on-disk values, report the current values and mark
- * the source "temporary".
- */
-static int
-get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
- char **source, uint64_t *val)
-{
- zfs_cmd_t zc = { 0 };
- nvlist_t *zplprops = NULL;
- struct mnttab mnt;
- char *mntopt_on = NULL;
- char *mntopt_off = NULL;
- boolean_t received = zfs_is_recvd_props_mode(zhp);
-
- *source = NULL;
-
- switch (prop) {
- case ZFS_PROP_ATIME:
- mntopt_on = MNTOPT_ATIME;
- mntopt_off = MNTOPT_NOATIME;
- break;
-
- case ZFS_PROP_DEVICES:
- mntopt_on = MNTOPT_DEVICES;
- mntopt_off = MNTOPT_NODEVICES;
- break;
-
- case ZFS_PROP_EXEC:
- mntopt_on = MNTOPT_EXEC;
- mntopt_off = MNTOPT_NOEXEC;
- break;
-
- case ZFS_PROP_READONLY:
- mntopt_on = MNTOPT_RO;
- mntopt_off = MNTOPT_RW;
- break;
-
- case ZFS_PROP_SETUID:
- mntopt_on = MNTOPT_SETUID;
- mntopt_off = MNTOPT_NOSETUID;
- break;
-
- case ZFS_PROP_XATTR:
- mntopt_on = MNTOPT_XATTR;
- mntopt_off = MNTOPT_NOXATTR;
- break;
-
- case ZFS_PROP_NBMAND:
- mntopt_on = MNTOPT_NBMAND;
- mntopt_off = MNTOPT_NONBMAND;
- break;
-
- default:
- break;
- }
-
- /*
- * Because looking up the mount options is potentially expensive
- * (iterating over all of /etc/mnttab), we defer its calculation until
- * we're looking up a property which requires its presence.
- */
- if (!zhp->zfs_mntcheck &&
- (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- struct mnttab entry;
-
- if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) {
- zhp->zfs_mntopts = zfs_strdup(hdl,
- entry.mnt_mntopts);
- if (zhp->zfs_mntopts == NULL)
- return (-1);
- }
-
- zhp->zfs_mntcheck = B_TRUE;
- }
-
- if (zhp->zfs_mntopts == NULL)
- mnt.mnt_mntopts = "";
- else
- mnt.mnt_mntopts = zhp->zfs_mntopts;
-
- switch (prop) {
- case ZFS_PROP_ATIME:
- case ZFS_PROP_DEVICES:
- case ZFS_PROP_EXEC:
- case ZFS_PROP_READONLY:
- case ZFS_PROP_SETUID:
- case ZFS_PROP_XATTR:
- case ZFS_PROP_NBMAND:
- *val = getprop_uint64(zhp, prop, source);
-
- if (received)
- break;
-
- if (hasmntopt(&mnt, mntopt_on) && !*val) {
- *val = B_TRUE;
- if (src)
- *src = ZPROP_SRC_TEMPORARY;
- } else if (hasmntopt(&mnt, mntopt_off) && *val) {
- *val = B_FALSE;
- if (src)
- *src = ZPROP_SRC_TEMPORARY;
- }
- break;
-
- case ZFS_PROP_CANMOUNT:
- case ZFS_PROP_VOLSIZE:
- case ZFS_PROP_QUOTA:
- case ZFS_PROP_REFQUOTA:
- case ZFS_PROP_RESERVATION:
- case ZFS_PROP_REFRESERVATION:
- case ZFS_PROP_FILESYSTEM_LIMIT:
- case ZFS_PROP_SNAPSHOT_LIMIT:
- case ZFS_PROP_FILESYSTEM_COUNT:
- case ZFS_PROP_SNAPSHOT_COUNT:
- *val = getprop_uint64(zhp, prop, source);
-
- if (*source == NULL) {
- /* not default, must be local */
- *source = zhp->zfs_name;
- }
- break;
-
- case ZFS_PROP_MOUNTED:
- *val = (zhp->zfs_mntopts != NULL);
- break;
-
- case ZFS_PROP_NUMCLONES:
- *val = zhp->zfs_dmustats.dds_num_clones;
- break;
-
- case ZFS_PROP_VERSION:
- case ZFS_PROP_NORMALIZE:
- case ZFS_PROP_UTF8ONLY:
- case ZFS_PROP_CASE:
- if (!zfs_prop_valid_for_type(prop, zhp->zfs_head_type) ||
- zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
- return (-1);
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
- nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
- val) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- nvlist_free(zplprops);
- zcmd_free_nvlists(&zc);
- break;
-
- case ZFS_PROP_INCONSISTENT:
- *val = zhp->zfs_dmustats.dds_inconsistent;
- break;
-
- default:
- switch (zfs_prop_get_type(prop)) {
- case PROP_TYPE_NUMBER:
- case PROP_TYPE_INDEX:
- *val = getprop_uint64(zhp, prop, source);
- /*
- * If we tried to use a default value for a
- * readonly property, it means that it was not
- * present. Note this only applies to "truly"
- * readonly properties, not set-once properties
- * like volblocksize.
- */
- if (zfs_prop_readonly(prop) &&
- !zfs_prop_setonce(prop) &&
- *source != NULL && (*source)[0] == '\0') {
- *source = NULL;
- return (-1);
- }
- break;
-
- case PROP_TYPE_STRING:
- default:
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "cannot get non-numeric property"));
- return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP,
- dgettext(TEXT_DOMAIN, "internal error")));
- }
- }
-
- return (0);
-}
-
-/*
- * Calculate the source type, given the raw source string.
- */
-static void
-get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
- char *statbuf, size_t statlen)
-{
- if (statbuf == NULL || *srctype == ZPROP_SRC_TEMPORARY)
- return;
-
- if (source == NULL) {
- *srctype = ZPROP_SRC_NONE;
- } else if (source[0] == '\0') {
- *srctype = ZPROP_SRC_DEFAULT;
- } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) {
- *srctype = ZPROP_SRC_RECEIVED;
- } else {
- if (strcmp(source, zhp->zfs_name) == 0) {
- *srctype = ZPROP_SRC_LOCAL;
- } else {
- (void) strlcpy(statbuf, source, statlen);
- *srctype = ZPROP_SRC_INHERITED;
- }
- }
-
-}
-
-int
-zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf,
- size_t proplen, boolean_t literal)
-{
- zfs_prop_t prop;
- int err = 0;
-
- if (zhp->zfs_recvd_props == NULL)
- if (get_recvd_props_ioctl(zhp) != 0)
- return (-1);
-
- prop = zfs_name_to_prop(propname);
-
- if (prop != ZPROP_INVAL) {
- uint64_t cookie;
- if (!nvlist_exists(zhp->zfs_recvd_props, propname))
- return (-1);
- zfs_set_recvd_props_mode(zhp, &cookie);
- err = zfs_prop_get(zhp, prop, propbuf, proplen,
- NULL, NULL, 0, literal);
- zfs_unset_recvd_props_mode(zhp, &cookie);
- } else {
- nvlist_t *propval;
- char *recvdval;
- if (nvlist_lookup_nvlist(zhp->zfs_recvd_props,
- propname, &propval) != 0)
- return (-1);
- verify(nvlist_lookup_string(propval, ZPROP_VALUE,
- &recvdval) == 0);
- (void) strlcpy(propbuf, recvdval, proplen);
- }
-
- return (err == 0 ? 0 : -1);
-}
-
-static int
-get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen)
-{
- nvlist_t *value;
- nvpair_t *pair;
-
- value = zfs_get_clones_nvl(zhp);
- if (value == NULL)
- return (-1);
-
- propbuf[0] = '\0';
- for (pair = nvlist_next_nvpair(value, NULL); pair != NULL;
- pair = nvlist_next_nvpair(value, pair)) {
- if (propbuf[0] != '\0')
- (void) strlcat(propbuf, ",", proplen);
- (void) strlcat(propbuf, nvpair_name(pair), proplen);
- }
-
- return (0);
-}
-
-struct get_clones_arg {
- uint64_t numclones;
- nvlist_t *value;
- const char *origin;
- char buf[ZFS_MAX_DATASET_NAME_LEN];
-};
-
-int
-get_clones_cb(zfs_handle_t *zhp, void *arg)
-{
- struct get_clones_arg *gca = arg;
-
- if (gca->numclones == 0) {
- zfs_close(zhp);
- return (0);
- }
-
- if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf),
- NULL, NULL, 0, B_TRUE) != 0)
- goto out;
- if (strcmp(gca->buf, gca->origin) == 0) {
- fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
- gca->numclones--;
- }
-
-out:
- (void) zfs_iter_children(zhp, get_clones_cb, gca);
- zfs_close(zhp);
- return (0);
-}
-
-nvlist_t *
-zfs_get_clones_nvl(zfs_handle_t *zhp)
-{
- nvlist_t *nv, *value;
-
- if (nvlist_lookup_nvlist(zhp->zfs_props,
- zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) {
- struct get_clones_arg gca;
-
- /*
- * if this is a snapshot, then the kernel wasn't able
- * to get the clones. Do it by slowly iterating.
- */
- if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT)
- return (NULL);
- if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0)
- return (NULL);
- if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) {
- nvlist_free(nv);
- return (NULL);
- }
-
- gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES);
- gca.value = value;
- gca.origin = zhp->zfs_name;
-
- if (gca.numclones != 0) {
- zfs_handle_t *root;
- char pool[ZFS_MAX_DATASET_NAME_LEN];
- char *cp = pool;
-
- /* get the pool name */
- (void) strlcpy(pool, zhp->zfs_name, sizeof (pool));
- (void) strsep(&cp, "/@");
- root = zfs_open(zhp->zfs_hdl, pool,
- ZFS_TYPE_FILESYSTEM);
-
- (void) get_clones_cb(root, &gca);
- }
-
- if (gca.numclones != 0 ||
- nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 ||
- nvlist_add_nvlist(zhp->zfs_props,
- zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) {
- nvlist_free(nv);
- nvlist_free(value);
- return (NULL);
- }
- nvlist_free(nv);
- nvlist_free(value);
- verify(0 == nvlist_lookup_nvlist(zhp->zfs_props,
- zfs_prop_to_name(ZFS_PROP_CLONES), &nv));
- }
-
- verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0);
-
- return (value);
-}
-
-/*
- * Accepts a property and value and checks that the value
- * matches the one found by the channel program. If they are
- * not equal, print both of them.
- */
-void
-zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval,
- const char *strval)
-{
- if (!zhp->zfs_hdl->libzfs_prop_debug)
- return;
- int error;
- char *poolname = zhp->zpool_hdl->zpool_name;
- const char *program =
- "args = ...\n"
- "ds = args['dataset']\n"
- "prop = args['property']\n"
- "value, setpoint = zfs.get_prop(ds, prop)\n"
- "return {value=value, setpoint=setpoint}\n";
- nvlist_t *outnvl;
- nvlist_t *retnvl;
- nvlist_t *argnvl = fnvlist_alloc();
-
- fnvlist_add_string(argnvl, "dataset", zhp->zfs_name);
- fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop));
-
- error = lzc_channel_program_nosync(poolname, program,
- 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl);
-
- if (error == 0) {
- retnvl = fnvlist_lookup_nvlist(outnvl, "return");
- if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) {
- int64_t ans;
- error = nvlist_lookup_int64(retnvl, "value", &ans);
- if (error != 0) {
- (void) fprintf(stderr, "zcp check error: %u\n",
- error);
- return;
- }
- if (ans != intval) {
- (void) fprintf(stderr,
- "%s: zfs found %lld, but zcp found %lld\n",
- zfs_prop_to_name(prop),
- (longlong_t)intval, (longlong_t)ans);
- }
- } else {
- char *str_ans;
- error = nvlist_lookup_string(retnvl, "value", &str_ans);
- if (error != 0) {
- (void) fprintf(stderr, "zcp check error: %u\n",
- error);
- return;
- }
- if (strcmp(strval, str_ans) != 0) {
- (void) fprintf(stderr,
- "%s: zfs found %s, but zcp found %s\n",
- zfs_prop_to_name(prop),
- strval, str_ans);
- }
- }
- } else {
- (void) fprintf(stderr,
- "zcp check failed, channel program error: %u\n", error);
- }
- nvlist_free(argnvl);
- nvlist_free(outnvl);
-}
-
-/*
- * Retrieve a property from the given object. If 'literal' is specified, then
- * numbers are left as exact values. Otherwise, numbers are converted to a
- * human-readable form.
- *
- * Returns 0 on success, or -1 on error.
- */
-int
-zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
- zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal)
-{
- char *source = NULL;
- uint64_t val;
- const char *str;
- const char *strval;
- boolean_t received = zfs_is_recvd_props_mode(zhp);
-
- /*
- * Check to see if this property applies to our object
- */
- if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
- return (-1);
-
- if (received && zfs_prop_readonly(prop))
- return (-1);
-
- if (src)
- *src = ZPROP_SRC_NONE;
-
- switch (prop) {
- case ZFS_PROP_CREATION:
- /*
- * 'creation' is a time_t stored in the statistics. We convert
- * this into a string unless 'literal' is specified.
- */
- {
- val = getprop_uint64(zhp, prop, &source);
- time_t time = (time_t)val;
- struct tm t;
-
- if (literal ||
- localtime_r(&time, &t) == NULL ||
- strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
- &t) == 0)
- (void) snprintf(propbuf, proplen, "%llu", val);
- }
- zcp_check(zhp, prop, val, NULL);
- break;
-
- case ZFS_PROP_MOUNTPOINT:
- /*
- * Getting the precise mountpoint can be tricky.
- *
- * - for 'none' or 'legacy', return those values.
- * - for inherited mountpoints, we want to take everything
- * after our ancestor and append it to the inherited value.
- *
- * If the pool has an alternate root, we want to prepend that
- * root to any values we return.
- */
-
- str = getprop_string(zhp, prop, &source);
-
- if (str[0] == '/') {
- char buf[MAXPATHLEN];
- char *root = buf;
- const char *relpath;
-
- /*
- * If we inherit the mountpoint, even from a dataset
- * with a received value, the source will be the path of
- * the dataset we inherit from. If source is
- * ZPROP_SOURCE_VAL_RECVD, the received value is not
- * inherited.
- */
- if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
- relpath = "";
- } else {
- relpath = zhp->zfs_name + strlen(source);
- if (relpath[0] == '/')
- relpath++;
- }
-
- if ((zpool_get_prop(zhp->zpool_hdl,
- ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL,
- B_FALSE)) || (strcmp(root, "-") == 0))
- root[0] = '\0';
- /*
- * Special case an alternate root of '/'. This will
- * avoid having multiple leading slashes in the
- * mountpoint path.
- */
- if (strcmp(root, "/") == 0)
- root++;
-
- /*
- * If the mountpoint is '/' then skip over this
- * if we are obtaining either an alternate root or
- * an inherited mountpoint.
- */
- if (str[1] == '\0' && (root[0] != '\0' ||
- relpath[0] != '\0'))
- str++;
-
- if (relpath[0] == '\0')
- (void) snprintf(propbuf, proplen, "%s%s",
- root, str);
- else
- (void) snprintf(propbuf, proplen, "%s%s%s%s",
- root, str, relpath[0] == '@' ? "" : "/",
- relpath);
- } else {
- /* 'legacy' or 'none' */
- (void) strlcpy(propbuf, str, proplen);
- }
- zcp_check(zhp, prop, NULL, propbuf);
- break;
-
- case ZFS_PROP_ORIGIN:
- str = getprop_string(zhp, prop, &source);
- if (str == NULL)
- return (-1);
- (void) strlcpy(propbuf, str, proplen);
- zcp_check(zhp, prop, NULL, str);
- break;
-
- case ZFS_PROP_CLONES:
- if (get_clones_string(zhp, propbuf, proplen) != 0)
- return (-1);
- break;
-
- case ZFS_PROP_QUOTA:
- case ZFS_PROP_REFQUOTA:
- case ZFS_PROP_RESERVATION:
- case ZFS_PROP_REFRESERVATION:
-
- if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
- return (-1);
- /*
- * If quota or reservation is 0, we translate this into 'none'
- * (unless literal is set), and indicate that it's the default
- * value. Otherwise, we print the number nicely and indicate
- * that its set locally.
- */
- if (val == 0) {
- if (literal)
- (void) strlcpy(propbuf, "0", proplen);
- else
- (void) strlcpy(propbuf, "none", proplen);
- } else {
- if (literal)
- (void) snprintf(propbuf, proplen, "%llu",
- (u_longlong_t)val);
- else
- zfs_nicenum(val, propbuf, proplen);
- }
- zcp_check(zhp, prop, val, NULL);
- break;
-
- case ZFS_PROP_FILESYSTEM_LIMIT:
- case ZFS_PROP_SNAPSHOT_LIMIT:
- case ZFS_PROP_FILESYSTEM_COUNT:
- case ZFS_PROP_SNAPSHOT_COUNT:
-
- if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
- return (-1);
-
- /*
- * If limit is UINT64_MAX, we translate this into 'none' (unless
- * literal is set), and indicate that it's the default value.
- * Otherwise, we print the number nicely and indicate that it's
- * set locally.
- */
- if (literal) {
- (void) snprintf(propbuf, proplen, "%llu",
- (u_longlong_t)val);
- } else if (val == UINT64_MAX) {
- (void) strlcpy(propbuf, "none", proplen);
- } else {
- zfs_nicenum(val, propbuf, proplen);
- }
-
- zcp_check(zhp, prop, val, NULL);
- break;
-
- case ZFS_PROP_REFRATIO:
- case ZFS_PROP_COMPRESSRATIO:
- if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
- return (-1);
- (void) snprintf(propbuf, proplen, "%llu.%02llux",
- (u_longlong_t)(val / 100),
- (u_longlong_t)(val % 100));
- zcp_check(zhp, prop, val, NULL);
- break;
-
- case ZFS_PROP_TYPE:
- switch (zhp->zfs_type) {
- case ZFS_TYPE_FILESYSTEM:
- str = "filesystem";
- break;
- case ZFS_TYPE_VOLUME:
- str = "volume";
- break;
- case ZFS_TYPE_SNAPSHOT:
- str = "snapshot";
- break;
- case ZFS_TYPE_BOOKMARK:
- str = "bookmark";
- break;
- default:
- abort();
- }
- (void) snprintf(propbuf, proplen, "%s", str);
- zcp_check(zhp, prop, NULL, propbuf);
- break;
-
- case ZFS_PROP_MOUNTED:
- /*
- * The 'mounted' property is a pseudo-property that described
- * whether the filesystem is currently mounted. Even though
- * it's a boolean value, the typical values of "on" and "off"
- * don't make sense, so we translate to "yes" and "no".
- */
- if (get_numeric_property(zhp, ZFS_PROP_MOUNTED,
- src, &source, &val) != 0)
- return (-1);
- if (val)
- (void) strlcpy(propbuf, "yes", proplen);
- else
- (void) strlcpy(propbuf, "no", proplen);
- break;
-
- case ZFS_PROP_NAME:
- /*
- * The 'name' property is a pseudo-property derived from the
- * dataset name. It is presented as a real property to simplify
- * consumers.
- */
- (void) strlcpy(propbuf, zhp->zfs_name, proplen);
- zcp_check(zhp, prop, NULL, propbuf);
- break;
-
- case ZFS_PROP_MLSLABEL:
- {
-#ifdef illumos
- m_label_t *new_sl = NULL;
- char *ascii = NULL; /* human readable label */
-
- (void) strlcpy(propbuf,
- getprop_string(zhp, prop, &source), proplen);
-
- if (literal || (strcasecmp(propbuf,
- ZFS_MLSLABEL_DEFAULT) == 0))
- break;
-
- /*
- * Try to translate the internal hex string to
- * human-readable output. If there are any
- * problems just use the hex string.
- */
-
- if (str_to_label(propbuf, &new_sl, MAC_LABEL,
- L_NO_CORRECTION, NULL) == -1) {
- m_label_free(new_sl);
- break;
- }
-
- if (label_to_str(new_sl, &ascii, M_LABEL,
- DEF_NAMES) != 0) {
- if (ascii)
- free(ascii);
- m_label_free(new_sl);
- break;
- }
- m_label_free(new_sl);
-
- (void) strlcpy(propbuf, ascii, proplen);
- free(ascii);
-#else /* !illumos */
- propbuf[0] = '\0';
-#endif /* illumos */
- }
- break;
-
- case ZFS_PROP_GUID:
- case ZFS_PROP_CREATETXG:
- /*
- * GUIDs are stored as numbers, but they are identifiers.
- * We don't want them to be pretty printed, because pretty
- * printing mangles the ID into a truncated and useless value.
- */
- if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
- return (-1);
- (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val);
- zcp_check(zhp, prop, val, NULL);
- break;
-
- default:
- switch (zfs_prop_get_type(prop)) {
- case PROP_TYPE_NUMBER:
- if (get_numeric_property(zhp, prop, src,
- &source, &val) != 0) {
- return (-1);
- }
-
- if (literal) {
- (void) snprintf(propbuf, proplen, "%llu",
- (u_longlong_t)val);
- } else {
- zfs_nicenum(val, propbuf, proplen);
- }
- zcp_check(zhp, prop, val, NULL);
- break;
-
- case PROP_TYPE_STRING:
- str = getprop_string(zhp, prop, &source);
- if (str == NULL)
- return (-1);
-
- (void) strlcpy(propbuf, str, proplen);
- zcp_check(zhp, prop, NULL, str);
- break;
-
- case PROP_TYPE_INDEX:
- if (get_numeric_property(zhp, prop, src,
- &source, &val) != 0)
- return (-1);
- if (zfs_prop_index_to_string(prop, val, &strval) != 0)
- return (-1);
-
- (void) strlcpy(propbuf, strval, proplen);
- zcp_check(zhp, prop, NULL, strval);
- break;
-
- default:
- abort();
- }
- }
-
- get_source(zhp, src, source, statbuf, statlen);
-
- return (0);
-}
-
-/*
- * Utility function to get the given numeric property. Does no validation that
- * the given property is the appropriate type; should only be used with
- * hard-coded property types.
- */
-uint64_t
-zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
-{
- char *source;
- uint64_t val;
-
- (void) get_numeric_property(zhp, prop, NULL, &source, &val);
-
- return (val);
-}
-
-int
-zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val)
-{
- char buf[64];
-
- (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val);
- return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf));
-}
-
-/*
- * Similar to zfs_prop_get(), but returns the value as an integer.
- */
-int
-zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
- zprop_source_t *src, char *statbuf, size_t statlen)
-{
- char *source;
-
- /*
- * Check to see if this property applies to our object
- */
- if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
- return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE,
- dgettext(TEXT_DOMAIN, "cannot get property '%s'"),
- zfs_prop_to_name(prop)));
- }
-
- if (src)
- *src = ZPROP_SRC_NONE;
-
- if (get_numeric_property(zhp, prop, src, &source, value) != 0)
- return (-1);
-
- get_source(zhp, src, source, statbuf, statlen);
-
- return (0);
-}
-
-static int
-idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
- char **domainp, idmap_rid_t *ridp)
-{
-#ifdef illumos
- idmap_get_handle_t *get_hdl = NULL;
- idmap_stat status;
- int err = EINVAL;
-
- if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS)
- goto out;
-
- if (isuser) {
- err = idmap_get_sidbyuid(get_hdl, id,
- IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
- } else {
- err = idmap_get_sidbygid(get_hdl, id,
- IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
- }
- if (err == IDMAP_SUCCESS &&
- idmap_get_mappings(get_hdl) == IDMAP_SUCCESS &&
- status == IDMAP_SUCCESS)
- err = 0;
- else
- err = EINVAL;
-out:
- if (get_hdl)
- idmap_get_destroy(get_hdl);
- return (err);
-#else /* !illumos */
- assert(!"invalid code path");
- return (EINVAL); // silence compiler warning
-#endif /* illumos */
-}
-
-/*
- * convert the propname into parameters needed by kernel
- * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829
- * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789
- */
-static int
-userquota_propname_decode(const char *propname, boolean_t zoned,
- zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp)
-{
- zfs_userquota_prop_t type;
- char *cp, *end;
- char *numericsid = NULL;
- boolean_t isuser;
-
- domain[0] = '\0';
- *ridp = 0;
- /* Figure out the property type ({user|group}{quota|space}) */
- for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
- if (strncmp(propname, zfs_userquota_prop_prefixes[type],
- strlen(zfs_userquota_prop_prefixes[type])) == 0)
- break;
- }
- if (type == ZFS_NUM_USERQUOTA_PROPS)
- return (EINVAL);
- *typep = type;
-
- isuser = (type == ZFS_PROP_USERQUOTA ||
- type == ZFS_PROP_USERUSED);
-
- cp = strchr(propname, '@') + 1;
-
- if (strchr(cp, '@')) {
-#ifdef illumos
- /*
- * It's a SID name (eg "user@domain") that needs to be
- * turned into S-1-domainID-RID.
- */
- int flag = 0;
- idmap_stat stat, map_stat;
- uid_t pid;
- idmap_rid_t rid;
- idmap_get_handle_t *gh = NULL;
-
- stat = idmap_get_create(&gh);
- if (stat != IDMAP_SUCCESS) {
- idmap_get_destroy(gh);
- return (ENOMEM);
- }
- if (zoned && getzoneid() == GLOBAL_ZONEID)
- return (ENOENT);
- if (isuser) {
- stat = idmap_getuidbywinname(cp, NULL, flag, &pid);
- if (stat < 0)
- return (ENOENT);
- stat = idmap_get_sidbyuid(gh, pid, flag, &numericsid,
- &rid, &map_stat);
- } else {
- stat = idmap_getgidbywinname(cp, NULL, flag, &pid);
- if (stat < 0)
- return (ENOENT);
- stat = idmap_get_sidbygid(gh, pid, flag, &numericsid,
- &rid, &map_stat);
- }
- if (stat < 0) {
- idmap_get_destroy(gh);
- return (ENOENT);
- }
- stat = idmap_get_mappings(gh);
- idmap_get_destroy(gh);
-
- if (stat < 0) {
- return (ENOENT);
- }
- if (numericsid == NULL)
- return (ENOENT);
- cp = numericsid;
- *ridp = rid;
- /* will be further decoded below */
-#else /* !illumos */
- return (ENOENT);
-#endif /* illumos */
- }
-
- if (strncmp(cp, "S-1-", 4) == 0) {
- /* It's a numeric SID (eg "S-1-234-567-89") */
- (void) strlcpy(domain, cp, domainlen);
- errno = 0;
- if (*ridp == 0) {
- cp = strrchr(domain, '-');
- *cp = '\0';
- cp++;
- *ridp = strtoull(cp, &end, 10);
- } else {
- end = "";
- }
- if (numericsid) {
- free(numericsid);
- numericsid = NULL;
- }
- if (errno != 0 || *end != '\0')
- return (EINVAL);
- } else if (!isdigit(*cp)) {
- /*
- * It's a user/group name (eg "user") that needs to be
- * turned into a uid/gid
- */
- if (zoned && getzoneid() == GLOBAL_ZONEID)
- return (ENOENT);
- if (isuser) {
- struct passwd *pw;
- pw = getpwnam(cp);
- if (pw == NULL)
- return (ENOENT);
- *ridp = pw->pw_uid;
- } else {
- struct group *gr;
- gr = getgrnam(cp);
- if (gr == NULL)
- return (ENOENT);
- *ridp = gr->gr_gid;
- }
- } else {
- /* It's a user/group ID (eg "12345"). */
- uid_t id = strtoul(cp, &end, 10);
- idmap_rid_t rid;
- char *mapdomain;
-
- if (*end != '\0')
- return (EINVAL);
- if (id > MAXUID) {
- /* It's an ephemeral ID. */
- if (idmap_id_to_numeric_domain_rid(id, isuser,
- &mapdomain, &rid) != 0)
- return (ENOENT);
- (void) strlcpy(domain, mapdomain, domainlen);
- *ridp = rid;
- } else {
- *ridp = id;
- }
- }
-
- ASSERT3P(numericsid, ==, NULL);
- return (0);
-}
-
-static int
-zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname,
- uint64_t *propvalue, zfs_userquota_prop_t *typep)
-{
- int err;
- zfs_cmd_t zc = { 0 };
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- err = userquota_propname_decode(propname,
- zfs_prop_get_int(zhp, ZFS_PROP_ZONED),
- typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid);
- zc.zc_objset_type = *typep;
- if (err)
- return (err);
-
- err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc);
- if (err)
- return (err);
-
- *propvalue = zc.zc_cookie;
- return (0);
-}
-
-int
-zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
- uint64_t *propvalue)
-{
- zfs_userquota_prop_t type;
-
- return (zfs_prop_get_userquota_common(zhp, propname, propvalue,
- &type));
-}
-
-int
-zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
- char *propbuf, int proplen, boolean_t literal)
-{
- int err;
- uint64_t propvalue;
- zfs_userquota_prop_t type;
-
- err = zfs_prop_get_userquota_common(zhp, propname, &propvalue,
- &type);
-
- if (err)
- return (err);
-
- if (literal) {
- (void) snprintf(propbuf, proplen, "%llu", propvalue);
- } else if (propvalue == 0 &&
- (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) {
- (void) strlcpy(propbuf, "none", proplen);
- } else {
- zfs_nicenum(propvalue, propbuf, proplen);
- }
- return (0);
-}
-
-int
-zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
- uint64_t *propvalue)
-{
- int err;
- zfs_cmd_t zc = { 0 };
- const char *snapname;
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- snapname = strchr(propname, '@') + 1;
- if (strchr(snapname, '@')) {
- (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
- } else {
- /* snapname is the short name, append it to zhp's fsname */
- char *cp;
-
- (void) strlcpy(zc.zc_value, zhp->zfs_name,
- sizeof (zc.zc_value));
- cp = strchr(zc.zc_value, '@');
- if (cp != NULL)
- *cp = '\0';
- (void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value));
- (void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value));
- }
-
- err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc);
- if (err)
- return (err);
-
- *propvalue = zc.zc_cookie;
- return (0);
-}
-
-int
-zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
- char *propbuf, int proplen, boolean_t literal)
-{
- int err;
- uint64_t propvalue;
-
- err = zfs_prop_get_written_int(zhp, propname, &propvalue);
-
- if (err)
- return (err);
-
- if (literal) {
- (void) snprintf(propbuf, proplen, "%llu", propvalue);
- } else {
- zfs_nicenum(propvalue, propbuf, proplen);
- }
- return (0);
-}
-
-/*
- * Returns the name of the given zfs handle.
- */
-const char *
-zfs_get_name(const zfs_handle_t *zhp)
-{
- return (zhp->zfs_name);
-}
-
-/*
- * Returns the name of the parent pool for the given zfs handle.
- */
-const char *
-zfs_get_pool_name(const zfs_handle_t *zhp)
-{
- return (zhp->zpool_hdl->zpool_name);
-}
-
-/*
- * Returns the type of the given zfs handle.
- */
-zfs_type_t
-zfs_get_type(const zfs_handle_t *zhp)
-{
- return (zhp->zfs_type);
-}
-
-/*
- * Is one dataset name a child dataset of another?
- *
- * Needs to handle these cases:
- * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo"
- * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar"
- * Descendant? No. No. No. Yes.
- */
-static boolean_t
-is_descendant(const char *ds1, const char *ds2)
-{
- size_t d1len = strlen(ds1);
-
- /* ds2 can't be a descendant if it's smaller */
- if (strlen(ds2) < d1len)
- return (B_FALSE);
-
- /* otherwise, compare strings and verify that there's a '/' char */
- return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0));
-}
-
-/*
- * Given a complete name, return just the portion that refers to the parent.
- * Will return -1 if there is no parent (path is just the name of the
- * pool).
- */
-static int
-parent_name(const char *path, char *buf, size_t buflen)
-{
- char *slashp;
-
- (void) strlcpy(buf, path, buflen);
-
- if ((slashp = strrchr(buf, '/')) == NULL)
- return (-1);
- *slashp = '\0';
-
- return (0);
-}
-
-/*
- * If accept_ancestor is false, then check to make sure that the given path has
- * a parent, and that it exists. If accept_ancestor is true, then find the
- * closest existing ancestor for the given path. In prefixlen return the
- * length of already existing prefix of the given path. We also fetch the
- * 'zoned' property, which is used to validate property settings when creating
- * new datasets.
- */
-static int
-check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
- boolean_t accept_ancestor, int *prefixlen)
-{
- zfs_cmd_t zc = { 0 };
- char parent[ZFS_MAX_DATASET_NAME_LEN];
- char *slash;
- zfs_handle_t *zhp;
- char errbuf[1024];
- uint64_t is_zoned;
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
-
- /* get parent, and check to see if this is just a pool */
- if (parent_name(path, parent, sizeof (parent)) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "missing dataset name"));
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
-
- /* check to see if the pool exists */
- if ((slash = strchr(parent, '/')) == NULL)
- slash = parent + strlen(parent);
- (void) strncpy(zc.zc_name, parent, slash - parent);
- zc.zc_name[slash - parent] = '\0';
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
- errno == ENOENT) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "no such pool '%s'"), zc.zc_name);
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
- }
-
- /* check to see if the parent dataset exists */
- while ((zhp = make_dataset_handle(hdl, parent)) == NULL) {
- if (errno == ENOENT && accept_ancestor) {
- /*
- * Go deeper to find an ancestor, give up on top level.
- */
- if (parent_name(parent, parent, sizeof (parent)) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "no such pool '%s'"), zc.zc_name);
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
- }
- } else if (errno == ENOENT) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "parent does not exist"));
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
- } else
- return (zfs_standard_error(hdl, errno, errbuf));
- }
-
- is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
- if (zoned != NULL)
- *zoned = is_zoned;
-
- /* we are in a non-global zone, but parent is in the global zone */
- if (getzoneid() != GLOBAL_ZONEID && !is_zoned) {
- (void) zfs_standard_error(hdl, EPERM, errbuf);
- zfs_close(zhp);
- return (-1);
- }
-
- /* make sure parent is a filesystem */
- if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "parent is not a filesystem"));
- (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
- zfs_close(zhp);
- return (-1);
- }
-
- zfs_close(zhp);
- if (prefixlen != NULL)
- *prefixlen = strlen(parent);
- return (0);
-}
-
-/*
- * Finds whether the dataset of the given type(s) exists.
- */
-boolean_t
-zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types)
-{
- zfs_handle_t *zhp;
-
- if (!zfs_validate_name(hdl, path, types, B_FALSE))
- return (B_FALSE);
-
- /*
- * Try to get stats for the dataset, which will tell us if it exists.
- */
- if ((zhp = make_dataset_handle(hdl, path)) != NULL) {
- int ds_type = zhp->zfs_type;
-
- zfs_close(zhp);
- if (types & ds_type)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Given a path to 'target', create all the ancestors between
- * the prefixlen portion of the path, and the target itself.
- * Fail if the initial prefixlen-ancestor does not already exist.
- */
-int
-create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
-{
- zfs_handle_t *h;
- char *cp;
- const char *opname;
-
- /* make sure prefix exists */
- cp = target + prefixlen;
- if (*cp != '/') {
- assert(strchr(cp, '/') == NULL);
- h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
- } else {
- *cp = '\0';
- h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
- *cp = '/';
- }
- if (h == NULL)
- return (-1);
- zfs_close(h);
-
- /*
- * Attempt to create, mount, and share any ancestor filesystems,
- * up to the prefixlen-long one.
- */
- for (cp = target + prefixlen + 1;
- (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) {
-
- *cp = '\0';
-
- h = make_dataset_handle(hdl, target);
- if (h) {
- /* it already exists, nothing to do here */
- zfs_close(h);
- continue;
- }
-
- if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
- NULL) != 0) {
- opname = dgettext(TEXT_DOMAIN, "create");
- goto ancestorerr;
- }
-
- h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
- if (h == NULL) {
- opname = dgettext(TEXT_DOMAIN, "open");
- goto ancestorerr;
- }
-
- if (zfs_mount(h, NULL, 0) != 0) {
- opname = dgettext(TEXT_DOMAIN, "mount");
- goto ancestorerr;
- }
-
- if (zfs_share(h) != 0) {
- opname = dgettext(TEXT_DOMAIN, "share");
- goto ancestorerr;
- }
-
- zfs_close(h);
- }
-
- return (0);
-
-ancestorerr:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "failed to %s ancestor '%s'"), opname, target);
- return (-1);
-}
-
-/*
- * Creates non-existing ancestors of the given path.
- */
-int
-zfs_create_ancestors(libzfs_handle_t *hdl, const char *path)
-{
- int prefix;
- char *path_copy;
- char errbuf[1024];
- int rc = 0;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot create '%s'"), path);
-
- /*
- * Check that we are not passing the nesting limit
- * before we start creating any ancestors.
- */
- if (dataset_nestcheck(path) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "maximum name nesting depth exceeded"));
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
-
- if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0)
- return (-1);
-
- if ((path_copy = strdup(path)) != NULL) {
- rc = create_parents(hdl, path_copy, prefix);
- free(path_copy);
- }
- if (path_copy == NULL || rc != 0)
- return (-1);
-
- return (0);
-}
-
-/*
- * Create a new filesystem or volume.
- */
-int
-zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
- nvlist_t *props)
-{
- int ret;
- uint64_t size = 0;
- uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
- char errbuf[1024];
- uint64_t zoned;
- enum lzc_dataset_type ost;
- zpool_handle_t *zpool_handle;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot create '%s'"), path);
-
- /* validate the path, taking care to note the extended error message */
- if (!zfs_validate_name(hdl, path, type, B_TRUE))
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-
- if (dataset_nestcheck(path) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "maximum name nesting depth exceeded"));
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
-
- /* validate parents exist */
- if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0)
- return (-1);
-
- /*
- * The failure modes when creating a dataset of a different type over
- * one that already exists is a little strange. In particular, if you
- * try to create a dataset on top of an existing dataset, the ioctl()
- * will return ENOENT, not EEXIST. To prevent this from happening, we
- * first try to see if the dataset exists.
- */
- if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "dataset already exists"));
- return (zfs_error(hdl, EZFS_EXISTS, errbuf));
- }
-
- if (type == ZFS_TYPE_VOLUME)
- ost = LZC_DATSET_TYPE_ZVOL;
- else
- ost = LZC_DATSET_TYPE_ZFS;
-
- /* open zpool handle for prop validation */
- char pool_path[ZFS_MAX_DATASET_NAME_LEN];
- (void) strlcpy(pool_path, path, sizeof (pool_path));
-
- /* truncate pool_path at first slash */
- char *p = strchr(pool_path, '/');
- if (p != NULL)
- *p = '\0';
-
- if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL)
- return (-1);
-
- if (props && (props = zfs_valid_proplist(hdl, type, props,
- zoned, NULL, zpool_handle, errbuf)) == 0) {
- zpool_close(zpool_handle);
- return (-1);
- }
- zpool_close(zpool_handle);
-
- if (type == ZFS_TYPE_VOLUME) {
- /*
- * If we are creating a volume, the size and block size must
- * satisfy a few restraints. First, the blocksize must be a
- * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the
- * volsize must be a multiple of the block size, and cannot be
- * zero.
- */
- if (props == NULL || nvlist_lookup_uint64(props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) {
- nvlist_free(props);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "missing volume size"));
- return (zfs_error(hdl, EZFS_BADPROP, errbuf));
- }
-
- if ((ret = nvlist_lookup_uint64(props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- &blocksize)) != 0) {
- if (ret == ENOENT) {
- blocksize = zfs_prop_default_numeric(
- ZFS_PROP_VOLBLOCKSIZE);
- } else {
- nvlist_free(props);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "missing volume block size"));
- return (zfs_error(hdl, EZFS_BADPROP, errbuf));
- }
- }
-
- if (size == 0) {
- nvlist_free(props);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "volume size cannot be zero"));
- return (zfs_error(hdl, EZFS_BADPROP, errbuf));
- }
-
- if (size % blocksize != 0) {
- nvlist_free(props);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "volume size must be a multiple of volume block "
- "size"));
- return (zfs_error(hdl, EZFS_BADPROP, errbuf));
- }
- }
-
- /* create the dataset */
- ret = lzc_create(path, ost, props);
- nvlist_free(props);
-
- /* check for failure */
- if (ret != 0) {
- char parent[ZFS_MAX_DATASET_NAME_LEN];
- (void) parent_name(path, parent, sizeof (parent));
-
- switch (errno) {
- case ENOENT:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "no such parent '%s'"), parent);
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
-
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded to set this "
- "property or value"));
- return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
- case ERANGE:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid property value(s) specified"));
- return (zfs_error(hdl, EZFS_BADPROP, errbuf));
-#ifdef _ILP32
- case EOVERFLOW:
- /*
- * This platform can't address a volume this big.
- */
- if (type == ZFS_TYPE_VOLUME)
- return (zfs_error(hdl, EZFS_VOLTOOBIG,
- errbuf));
-#endif
- /* FALLTHROUGH */
- default:
- return (zfs_standard_error(hdl, errno, errbuf));
- }
- }
-
- return (0);
-}
-
-/*
- * Destroys the given dataset. The caller must make sure that the filesystem
- * isn't mounted, and that there are no active dependents. If the file system
- * does not exist this function does nothing.
- */
-int
-zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
-{
- int error;
-
- if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer)
- return (EINVAL);
-
- if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) {
- nvlist_t *nv = fnvlist_alloc();
- fnvlist_add_boolean(nv, zhp->zfs_name);
- error = lzc_destroy_bookmarks(nv, NULL);
- fnvlist_free(nv);
- if (error != 0) {
- return (zfs_standard_error_fmt(zhp->zfs_hdl, error,
- dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
- zhp->zfs_name));
- }
- return (0);
- }
-
- if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
- nvlist_t *nv = fnvlist_alloc();
- fnvlist_add_boolean(nv, zhp->zfs_name);
- error = lzc_destroy_snaps(nv, defer, NULL);
- fnvlist_free(nv);
- } else {
- error = lzc_destroy(zhp->zfs_name);
- }
-
- if (error != 0 && error != ENOENT) {
- return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
- zhp->zfs_name));
- }
-
- remove_mountpoint(zhp);
-
- return (0);
-}
-
-struct destroydata {
- nvlist_t *nvl;
- const char *snapname;
-};
-
-static int
-zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
-{
- struct destroydata *dd = arg;
- char name[ZFS_MAX_DATASET_NAME_LEN];
- int rv = 0;
-
- (void) snprintf(name, sizeof (name),
- "%s@%s", zhp->zfs_name, dd->snapname);
-
- if (lzc_exists(name))
- verify(nvlist_add_boolean(dd->nvl, name) == 0);
-
- rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd);
- zfs_close(zhp);
- return (rv);
-}
-
-/*
- * Destroys all snapshots with the given name in zhp & descendants.
- */
-int
-zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
-{
- int ret;
- struct destroydata dd = { 0 };
-
- dd.snapname = snapname;
- verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0);
- (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd);
-
- if (nvlist_empty(dd.nvl)) {
- ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
- dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
- zhp->zfs_name, snapname);
- } else {
- ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
- }
- nvlist_free(dd.nvl);
- return (ret);
-}
-
-/*
- * Destroys all the snapshots named in the nvlist.
- */
-int
-zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
-{
- int ret;
- nvlist_t *errlist = NULL;
-
- ret = lzc_destroy_snaps(snaps, defer, &errlist);
-
- if (ret == 0) {
- nvlist_free(errlist);
- return (0);
- }
-
- if (nvlist_empty(errlist)) {
- char errbuf[1024];
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));
-
- ret = zfs_standard_error(hdl, ret, errbuf);
- }
- for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL);
- pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
- char errbuf[1024];
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
- nvpair_name(pair));
-
- switch (fnvpair_value_int32(pair)) {
- case EEXIST:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "snapshot is cloned"));
- ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
- break;
- default:
- ret = zfs_standard_error(hdl, errno, errbuf);
- break;
- }
- }
-
- nvlist_free(errlist);
- return (ret);
-}
-
-/*
- * Clones the given dataset. The target must be of the same type as the source.
- */
-int
-zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
-{
- char parent[ZFS_MAX_DATASET_NAME_LEN];
- int ret;
- char errbuf[1024];
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- uint64_t zoned;
-
- assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot create '%s'"), target);
-
- /* validate the target/clone name */
- if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-
- /* validate parents exist */
- if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0)
- return (-1);
-
- (void) parent_name(target, parent, sizeof (parent));
-
- /* do the clone */
-
- if (props) {
- zfs_type_t type;
-
- if (ZFS_IS_VOLUME(zhp)) {
- type = ZFS_TYPE_VOLUME;
- } else {
- type = ZFS_TYPE_FILESYSTEM;
- }
- if ((props = zfs_valid_proplist(hdl, type, props, zoned,
- zhp, zhp->zpool_hdl, errbuf)) == NULL)
- return (-1);
- if (zfs_fix_auto_resv(zhp, props) == -1) {
- nvlist_free(props);
- return (-1);
- }
- }
-
- ret = lzc_clone(target, zhp->zfs_name, props);
- nvlist_free(props);
-
- if (ret != 0) {
- switch (errno) {
-
- case ENOENT:
- /*
- * The parent doesn't exist. We should have caught this
- * above, but there may a race condition that has since
- * destroyed the parent.
- *
- * At this point, we don't know whether it's the source
- * that doesn't exist anymore, or whether the target
- * dataset doesn't exist.
- */
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "no such parent '%s'"), parent);
- return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
-
- case EXDEV:
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "source and target pools differ"));
- return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET,
- errbuf));
-
- default:
- return (zfs_standard_error(zhp->zfs_hdl, errno,
- errbuf));
- }
- }
-
- return (ret);
-}
-
-/*
- * Promotes the given clone fs to be the clone parent.
- */
-int
-zfs_promote(zfs_handle_t *zhp)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- int ret;
- char errbuf[1024];
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot promote '%s'"), zhp->zfs_name);
-
- if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "snapshots can not be promoted"));
- return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
- }
-
- if (zhp->zfs_dmustats.dds_origin[0] == '\0') {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "not a cloned filesystem"));
- return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
- }
-
- if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE))
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-
- ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname));
-
- if (ret != 0) {
- switch (ret) {
- case EEXIST:
- /* There is a conflicting snapshot name. */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "conflicting snapshot '%s' from parent '%s'"),
- snapname, zhp->zfs_dmustats.dds_origin);
- return (zfs_error(hdl, EZFS_EXISTS, errbuf));
-
- default:
- return (zfs_standard_error(hdl, ret, errbuf));
- }
- }
- return (ret);
-}
-
-typedef struct snapdata {
- nvlist_t *sd_nvl;
- const char *sd_snapname;
-} snapdata_t;
-
-static int
-zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
-{
- snapdata_t *sd = arg;
- char name[ZFS_MAX_DATASET_NAME_LEN];
- int rv = 0;
-
- if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) {
- (void) snprintf(name, sizeof (name),
- "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
-
- fnvlist_add_boolean(sd->sd_nvl, name);
-
- rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
- }
- zfs_close(zhp);
-
- return (rv);
-}
-
-int
-zfs_remap_indirects(libzfs_handle_t *hdl, const char *fs)
-{
- int err;
- char errbuf[1024];
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot remap dataset '%s'"), fs);
-
- err = lzc_remap(fs);
-
- if (err != 0) {
- switch (err) {
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded"));
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
- case EINVAL:
- (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
- break;
- default:
- (void) zfs_standard_error(hdl, err, errbuf);
- break;
- }
- }
-
- return (err);
-}
-
-/*
- * Creates snapshots. The keys in the snaps nvlist are the snapshots to be
- * created.
- */
-int
-zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props)
-{
- int ret;
- char errbuf[1024];
- nvpair_t *elem;
- nvlist_t *errors;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot create snapshots "));
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) {
- const char *snapname = nvpair_name(elem);
-
- /* validate the target name */
- if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT,
- B_TRUE)) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot create snapshot '%s'"), snapname);
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
- }
-
- /*
- * get pool handle for prop validation. assumes all snaps are in the
- * same pool, as does lzc_snapshot (below).
- */
- char pool[ZFS_MAX_DATASET_NAME_LEN];
- elem = nvlist_next_nvpair(snaps, NULL);
- (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
- pool[strcspn(pool, "/@")] = '\0';
- zpool_handle_t *zpool_hdl = zpool_open(hdl, pool);
-
- if (props != NULL &&
- (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
- props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) {
- zpool_close(zpool_hdl);
- return (-1);
- }
- zpool_close(zpool_hdl);
-
- ret = lzc_snapshot(snaps, props, &errors);
-
- if (ret != 0) {
- boolean_t printed = B_FALSE;
- for (elem = nvlist_next_nvpair(errors, NULL);
- elem != NULL;
- elem = nvlist_next_nvpair(errors, elem)) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot create snapshot '%s'"), nvpair_name(elem));
- (void) zfs_standard_error(hdl,
- fnvpair_value_int32(elem), errbuf);
- printed = B_TRUE;
- }
- if (!printed) {
- switch (ret) {
- case EXDEV:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "multiple snapshots of same "
- "fs not allowed"));
- (void) zfs_error(hdl, EZFS_EXISTS, errbuf);
-
- break;
- default:
- (void) zfs_standard_error(hdl, ret, errbuf);
- }
- }
- }
-
- nvlist_free(props);
- nvlist_free(errors);
- return (ret);
-}
-
-int
-zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
- nvlist_t *props)
-{
- int ret;
- snapdata_t sd = { 0 };
- char fsname[ZFS_MAX_DATASET_NAME_LEN];
- char *cp;
- zfs_handle_t *zhp;
- char errbuf[1024];
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot snapshot %s"), path);
-
- if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-
- (void) strlcpy(fsname, path, sizeof (fsname));
- cp = strchr(fsname, '@');
- *cp = '\0';
- sd.sd_snapname = cp + 1;
-
- if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM |
- ZFS_TYPE_VOLUME)) == NULL) {
- return (-1);
- }
-
- verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0);
- if (recursive) {
- (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd);
- } else {
- fnvlist_add_boolean(sd.sd_nvl, path);
- }
-
- ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props);
- nvlist_free(sd.sd_nvl);
- zfs_close(zhp);
- return (ret);
-}
-
-/*
- * Destroy any more recent snapshots. We invoke this callback on any dependents
- * of the snapshot first. If the 'cb_dependent' member is non-zero, then this
- * is a dependent and we should just destroy it without checking the transaction
- * group.
- */
-typedef struct rollback_data {
- const char *cb_target; /* the snapshot */
- uint64_t cb_create; /* creation time reference */
- boolean_t cb_error;
- boolean_t cb_force;
-} rollback_data_t;
-
-static int
-rollback_destroy_dependent(zfs_handle_t *zhp, void *data)
-{
- rollback_data_t *cbp = data;
- prop_changelist_t *clp;
-
- /* We must destroy this clone; first unmount it */
- clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
- cbp->cb_force ? MS_FORCE: 0);
- if (clp == NULL || changelist_prefix(clp) != 0) {
- cbp->cb_error = B_TRUE;
- zfs_close(zhp);
- return (0);
- }
- if (zfs_destroy(zhp, B_FALSE) != 0)
- cbp->cb_error = B_TRUE;
- else
- changelist_remove(clp, zhp->zfs_name);
- (void) changelist_postfix(clp);
- changelist_free(clp);
-
- zfs_close(zhp);
- return (0);
-}
-
-static int
-rollback_destroy(zfs_handle_t *zhp, void *data)
-{
- rollback_data_t *cbp = data;
-
- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
- cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE,
- rollback_destroy_dependent, cbp);
-
- cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
- }
-
- zfs_close(zhp);
- return (0);
-}
-
-/*
- * Given a dataset, rollback to a specific snapshot, discarding any
- * data changes since then and making it the active dataset.
- *
- * Any snapshots and bookmarks more recent than the target are
- * destroyed, along with their dependents (i.e. clones).
- */
-int
-zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
-{
- rollback_data_t cb = { 0 };
- int err;
- boolean_t restore_resv = 0;
- uint64_t min_txg = 0, old_volsize = 0, new_volsize;
- zfs_prop_t resv_prop;
-
- assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
- zhp->zfs_type == ZFS_TYPE_VOLUME);
-
- /*
- * Destroy all recent snapshots and their dependents.
- */
- cb.cb_force = force;
- cb.cb_target = snap->zfs_name;
- cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
-
- if (cb.cb_create > 0)
- min_txg = cb.cb_create;
-
- (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb,
- min_txg, 0);
-
- (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb);
-
- if (cb.cb_error)
- return (-1);
-
- /*
- * Now that we have verified that the snapshot is the latest,
- * rollback to the given snapshot.
- */
-
- if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
- if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
- return (-1);
- old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
- restore_resv =
- (old_volsize == zfs_prop_get_int(zhp, resv_prop));
- }
-
- /*
- * Pass both the filesystem and the wanted snapshot names,
- * we would get an error back if the snapshot is destroyed or
- * a new snapshot is created before this request is processed.
- */
- err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name);
- if (err != 0) {
- char errbuf[1024];
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
- zhp->zfs_name);
- switch (err) {
- case EEXIST:
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "there is a snapshot or bookmark more recent "
- "than '%s'"), snap->zfs_name);
- (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf);
- break;
- case ESRCH:
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "'%s' is not found among snapshots of '%s'"),
- snap->zfs_name, zhp->zfs_name);
- (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf);
- break;
- case EINVAL:
- (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf);
- break;
- default:
- (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf);
- }
- return (err);
- }
-
- /*
- * For volumes, if the pre-rollback volsize matched the pre-
- * rollback reservation and the volsize has changed then set
- * the reservation property to the post-rollback volsize.
- * Make a new handle since the rollback closed the dataset.
- */
- if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
- (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
- if (restore_resv) {
- new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
- if (old_volsize != new_volsize)
- err = zfs_prop_set_int(zhp, resv_prop,
- new_volsize);
- }
- zfs_close(zhp);
- }
- return (err);
-}
-
-/*
- * Renames the given dataset.
- */
-int
-zfs_rename(zfs_handle_t *zhp, const char *source, const char *target,
- renameflags_t flags)
-{
- int ret = 0;
- zfs_cmd_t zc = { 0 };
- char *delim;
- prop_changelist_t *cl = NULL;
- zfs_handle_t *zhrp = NULL;
- char *parentname = NULL;
- char parent[ZFS_MAX_DATASET_NAME_LEN];
- char property[ZFS_MAXPROPLEN];
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- char errbuf[1024];
-
- /* if we have the same exact name, just return success */
- if (strcmp(zhp->zfs_name, target) == 0)
- return (0);
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot rename to '%s'"), target);
-
- if (source != NULL) {
- /*
- * This is recursive snapshots rename, put snapshot name
- * (that might not exist) into zfs_name.
- */
- assert(flags.recurse);
-
- (void) strlcat(zhp->zfs_name, "@", sizeof(zhp->zfs_name));
- (void) strlcat(zhp->zfs_name, source, sizeof(zhp->zfs_name));
- zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
- }
-
- /* make sure source name is valid */
- if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE))
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-
- /*
- * Make sure the target name is valid
- */
- if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT ||
- zhp->zfs_type == ZFS_TYPE_BOOKMARK) {
- const char sep = zhp->zfs_type == ZFS_TYPE_SNAPSHOT ? '@' : '#';
-
- if ((strchr(target, sep) == NULL) || *target == sep) {
- /*
- * Snapshot target name is abbreviated,
- * reconstruct full dataset name
- */
- (void) strlcpy(parent, zhp->zfs_name, sizeof (parent));
- delim = strchr(parent, sep);
- if (strchr(target, sep) == NULL)
- *(++delim) = '\0';
- else
- *delim = '\0';
- (void) strlcat(parent, target, sizeof (parent));
- target = parent;
- } else {
- /*
- * Make sure we're renaming within the same dataset.
- */
- delim = strchr(target, sep);
- if (strncmp(zhp->zfs_name, target, delim - target)
- != 0 || zhp->zfs_name[delim - target] != sep) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "%s must be part of same dataset"),
- zhp->zfs_type == ZFS_TYPE_SNAPSHOT ?
- "snapshots" : "bookmarks");
- return (zfs_error(hdl, EZFS_CROSSTARGET,
- errbuf));
- }
- }
-
- if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- } else {
- if (flags.recurse) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "recursive rename must be a snapshot"));
- return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
- }
-
- if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-
- /* validate parents */
- if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0)
- return (-1);
-
- /* make sure we're in the same pool */
- verify((delim = strchr(target, '/')) != NULL);
- if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
- zhp->zfs_name[delim - target] != '/') {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "datasets must be within same pool"));
- return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
- }
-
- /* new name cannot be a child of the current dataset name */
- if (is_descendant(zhp->zfs_name, target)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "New dataset name cannot be a descendant of "
- "current dataset name"));
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
- }
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name);
-
- if (getzoneid() == GLOBAL_ZONEID &&
- zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "dataset is used in a non-global zone"));
- return (zfs_error(hdl, EZFS_ZONED, errbuf));
- }
-
- /*
- * Avoid unmounting file systems with mountpoint property set to
- * 'legacy' or 'none' even if -u option is not given.
- */
- if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
- !flags.recurse && !flags.nounmount &&
- zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property,
- sizeof (property), NULL, NULL, 0, B_FALSE) == 0 &&
- (strcmp(property, "legacy") == 0 ||
- strcmp(property, "none") == 0)) {
- flags.nounmount = B_TRUE;
- }
- if (flags.recurse) {
- parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
- if (parentname == NULL) {
- ret = -1;
- goto error;
- }
- delim = strchr(parentname, '@');
- *delim = '\0';
- zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET);
- if (zhrp == NULL) {
- ret = -1;
- goto error;
- }
- } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT &&
- zhp->zfs_type != ZFS_TYPE_BOOKMARK) {
- if ((cl = changelist_gather(zhp, ZFS_PROP_NAME,
- flags.nounmount ? CL_GATHER_DONT_UNMOUNT : 0,
- flags.forceunmount ? MS_FORCE : 0)) == NULL) {
- return (-1);
- }
-
- if (changelist_haszonedchild(cl)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "child dataset with inherited mountpoint is used "
- "in a non-global zone"));
- (void) zfs_error(hdl, EZFS_ZONED, errbuf);
- ret = -1;
- goto error;
- }
-
- if ((ret = changelist_prefix(cl)) != 0)
- goto error;
- }
-
- if (ZFS_IS_VOLUME(zhp))
- zc.zc_objset_type = DMU_OST_ZVOL;
- else
- zc.zc_objset_type = DMU_OST_ZFS;
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
-
- zc.zc_cookie = flags.recurse ? 1 : 0;
- if (flags.nounmount)
- zc.zc_cookie |= 2;
-
- if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) {
- /*
- * if it was recursive, the one that actually failed will
- * be in zc.zc_name
- */
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot rename '%s'"), zc.zc_name);
-
- if (flags.recurse && errno == EEXIST) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "a child dataset already has a snapshot "
- "with the new name"));
- (void) zfs_error(hdl, EZFS_EXISTS, errbuf);
- } else if (errno == EINVAL) {
- (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
- } else {
- (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
- }
-
- /*
- * On failure, we still want to remount any filesystems that
- * were previously mounted, so we don't alter the system state.
- */
- if (cl != NULL)
- (void) changelist_postfix(cl);
- } else {
- if (cl != NULL) {
- changelist_rename(cl, zfs_get_name(zhp), target);
- ret = changelist_postfix(cl);
- }
- }
-
-error:
- if (parentname != NULL) {
- free(parentname);
- }
- if (zhrp != NULL) {
- zfs_close(zhrp);
- }
- if (cl != NULL) {
- changelist_free(cl);
- }
- return (ret);
-}
-
-nvlist_t *
-zfs_get_user_props(zfs_handle_t *zhp)
-{
- return (zhp->zfs_user_props);
-}
-
-nvlist_t *
-zfs_get_recvd_props(zfs_handle_t *zhp)
-{
- if (zhp->zfs_recvd_props == NULL)
- if (get_recvd_props_ioctl(zhp) != 0)
- return (NULL);
- return (zhp->zfs_recvd_props);
-}
-
-/*
- * This function is used by 'zfs list' to determine the exact set of columns to
- * display, and their maximum widths. This does two main things:
- *
- * - If this is a list of all properties, then expand the list to include
- * all native properties, and set a flag so that for each dataset we look
- * for new unique user properties and add them to the list.
- *
- * - For non fixed-width properties, keep track of the maximum width seen
- * so that we can size the column appropriately. If the user has
- * requested received property values, we also need to compute the width
- * of the RECEIVED column.
- */
-int
-zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received,
- boolean_t literal)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- zprop_list_t *entry;
- zprop_list_t **last, **start;
- nvlist_t *userprops, *propval;
- nvpair_t *elem;
- char *strval;
- char buf[ZFS_MAXPROPLEN];
-
- if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0)
- return (-1);
-
- userprops = zfs_get_user_props(zhp);
-
- entry = *plp;
- if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) {
- /*
- * Go through and add any user properties as necessary. We
- * start by incrementing our list pointer to the first
- * non-native property.
- */
- start = plp;
- while (*start != NULL) {
- if ((*start)->pl_prop == ZPROP_INVAL)
- break;
- start = &(*start)->pl_next;
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) {
- /*
- * See if we've already found this property in our list.
- */
- for (last = start; *last != NULL;
- last = &(*last)->pl_next) {
- if (strcmp((*last)->pl_user_prop,
- nvpair_name(elem)) == 0)
- break;
- }
-
- if (*last == NULL) {
- if ((entry = zfs_alloc(hdl,
- sizeof (zprop_list_t))) == NULL ||
- ((entry->pl_user_prop = zfs_strdup(hdl,
- nvpair_name(elem)))) == NULL) {
- free(entry);
- return (-1);
- }
-
- entry->pl_prop = ZPROP_INVAL;
- entry->pl_width = strlen(nvpair_name(elem));
- entry->pl_all = B_TRUE;
- *last = entry;
- }
- }
- }
-
- /*
- * Now go through and check the width of any non-fixed columns
- */
- for (entry = *plp; entry != NULL; entry = entry->pl_next) {
- if (entry->pl_fixed && !literal)
- continue;
-
- if (entry->pl_prop != ZPROP_INVAL) {
- if (zfs_prop_get(zhp, entry->pl_prop,
- buf, sizeof (buf), NULL, NULL, 0, literal) == 0) {
- if (strlen(buf) > entry->pl_width)
- entry->pl_width = strlen(buf);
- }
- if (received && zfs_prop_get_recvd(zhp,
- zfs_prop_to_name(entry->pl_prop),
- buf, sizeof (buf), literal) == 0)
- if (strlen(buf) > entry->pl_recvd_width)
- entry->pl_recvd_width = strlen(buf);
- } else {
- if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop,
- &propval) == 0) {
- verify(nvlist_lookup_string(propval,
- ZPROP_VALUE, &strval) == 0);
- if (strlen(strval) > entry->pl_width)
- entry->pl_width = strlen(strval);
- }
- if (received && zfs_prop_get_recvd(zhp,
- entry->pl_user_prop,
- buf, sizeof (buf), literal) == 0)
- if (strlen(buf) > entry->pl_recvd_width)
- entry->pl_recvd_width = strlen(buf);
- }
- }
-
- return (0);
-}
-
-int
-zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
- char *resource, void *export, void *sharetab,
- int sharemax, zfs_share_op_t operation)
-{
- zfs_cmd_t zc = { 0 };
- int error;
-
- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
- (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
- if (resource)
- (void) strlcpy(zc.zc_string, resource, sizeof (zc.zc_string));
- zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab;
- zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export;
- zc.zc_share.z_sharetype = operation;
- zc.zc_share.z_sharemax = sharemax;
- error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc);
- return (error);
-}
-
-void
-zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
-{
- nvpair_t *curr;
-
- /*
- * Keep a reference to the props-table against which we prune the
- * properties.
- */
- zhp->zfs_props_table = props;
-
- curr = nvlist_next_nvpair(zhp->zfs_props, NULL);
-
- while (curr) {
- zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr));
- nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr);
-
- /*
- * User properties will result in ZPROP_INVAL, and since we
- * only know how to prune standard ZFS properties, we always
- * leave these in the list. This can also happen if we
- * encounter an unknown DSL property (when running older
- * software, for example).
- */
- if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE)
- (void) nvlist_remove(zhp->zfs_props,
- nvpair_name(curr), nvpair_type(curr));
- curr = next;
- }
-}
-
-#ifdef illumos
-static int
-zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path,
- zfs_smb_acl_op_t cmd, char *resource1, char *resource2)
-{
- zfs_cmd_t zc = { 0 };
- nvlist_t *nvlist = NULL;
- int error;
-
- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
- (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
- zc.zc_cookie = (uint64_t)cmd;
-
- if (cmd == ZFS_SMB_ACL_RENAME) {
- if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) {
- (void) no_memory(hdl);
- return (0);
- }
- }
-
- switch (cmd) {
- case ZFS_SMB_ACL_ADD:
- case ZFS_SMB_ACL_REMOVE:
- (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string));
- break;
- case ZFS_SMB_ACL_RENAME:
- if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC,
- resource1) != 0) {
- (void) no_memory(hdl);
- return (-1);
- }
- if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET,
- resource2) != 0) {
- (void) no_memory(hdl);
- return (-1);
- }
- if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) {
- nvlist_free(nvlist);
- return (-1);
- }
- break;
- case ZFS_SMB_ACL_PURGE:
- break;
- default:
- return (-1);
- }
- error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc);
- nvlist_free(nvlist);
- return (error);
-}
-
-int
-zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset,
- char *path, char *resource)
-{
- return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD,
- resource, NULL));
-}
-
-int
-zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset,
- char *path, char *resource)
-{
- return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE,
- resource, NULL));
-}
-
-int
-zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path)
-{
- return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE,
- NULL, NULL));
-}
-
-int
-zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path,
- char *oldname, char *newname)
-{
- return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME,
- oldname, newname));
-}
-#endif /* illumos */
-
-int
-zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
- zfs_userspace_cb_t func, void *arg)
-{
- zfs_cmd_t zc = { 0 };
- zfs_useracct_t buf[100];
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- int ret;
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- zc.zc_objset_type = type;
- zc.zc_nvlist_dst = (uintptr_t)buf;
-
- for (;;) {
- zfs_useracct_t *zua = buf;
-
- zc.zc_nvlist_dst_size = sizeof (buf);
- if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) {
- char errbuf[1024];
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot get used/quota for %s"), zc.zc_name);
- return (zfs_standard_error_fmt(hdl, errno, errbuf));
- }
- if (zc.zc_nvlist_dst_size == 0)
- break;
-
- while (zc.zc_nvlist_dst_size > 0) {
- if ((ret = func(arg, zua->zu_domain, zua->zu_rid,
- zua->zu_space)) != 0)
- return (ret);
- zua++;
- zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t);
- }
- }
-
- return (0);
-}
-
-struct holdarg {
- nvlist_t *nvl;
- const char *snapname;
- const char *tag;
- boolean_t recursive;
- int error;
-};
-
-static int
-zfs_hold_one(zfs_handle_t *zhp, void *arg)
-{
- struct holdarg *ha = arg;
- char name[ZFS_MAX_DATASET_NAME_LEN];
- int rv = 0;
-
- (void) snprintf(name, sizeof (name),
- "%s@%s", zhp->zfs_name, ha->snapname);
-
- if (lzc_exists(name))
- fnvlist_add_string(ha->nvl, name, ha->tag);
-
- if (ha->recursive)
- rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha);
- zfs_close(zhp);
- return (rv);
-}
-
-int
-zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
- boolean_t recursive, int cleanup_fd)
-{
- int ret;
- struct holdarg ha;
-
- ha.nvl = fnvlist_alloc();
- ha.snapname = snapname;
- ha.tag = tag;
- ha.recursive = recursive;
- (void) zfs_hold_one(zfs_handle_dup(zhp), &ha);
-
- if (nvlist_empty(ha.nvl)) {
- char errbuf[1024];
-
- fnvlist_free(ha.nvl);
- ret = ENOENT;
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot hold snapshot '%s@%s'"),
- zhp->zfs_name, snapname);
- (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf);
- return (ret);
- }
-
- ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl);
- fnvlist_free(ha.nvl);
-
- return (ret);
-}
-
-int
-zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds)
-{
- int ret;
- nvlist_t *errors;
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- char errbuf[1024];
- nvpair_t *elem;
-
- errors = NULL;
- ret = lzc_hold(holds, cleanup_fd, &errors);
-
- if (ret == 0) {
- /* There may be errors even in the success case. */
- fnvlist_free(errors);
- return (0);
- }
-
- if (nvlist_empty(errors)) {
- /* no hold-specific errors */
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot hold"));
- switch (ret) {
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded"));
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
- case EINVAL:
- (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
- break;
- default:
- (void) zfs_standard_error(hdl, ret, errbuf);
- }
- }
-
- for (elem = nvlist_next_nvpair(errors, NULL);
- elem != NULL;
- elem = nvlist_next_nvpair(errors, elem)) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot hold snapshot '%s'"), nvpair_name(elem));
- switch (fnvpair_value_int32(elem)) {
- case E2BIG:
- /*
- * Temporary tags wind up having the ds object id
- * prepended. So even if we passed the length check
- * above, it's still possible for the tag to wind
- * up being slightly too long.
- */
- (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf);
- break;
- case EINVAL:
- (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
- break;
- case EEXIST:
- (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf);
- break;
- default:
- (void) zfs_standard_error(hdl,
- fnvpair_value_int32(elem), errbuf);
- }
- }
-
- fnvlist_free(errors);
- return (ret);
-}
-
-static int
-zfs_release_one(zfs_handle_t *zhp, void *arg)
-{
- struct holdarg *ha = arg;
- char name[ZFS_MAX_DATASET_NAME_LEN];
- int rv = 0;
- nvlist_t *existing_holds;
-
- (void) snprintf(name, sizeof (name),
- "%s@%s", zhp->zfs_name, ha->snapname);
-
- if (lzc_get_holds(name, &existing_holds) != 0) {
- ha->error = ENOENT;
- } else if (!nvlist_exists(existing_holds, ha->tag)) {
- ha->error = ESRCH;
- } else {
- nvlist_t *torelease = fnvlist_alloc();
- fnvlist_add_boolean(torelease, ha->tag);
- fnvlist_add_nvlist(ha->nvl, name, torelease);
- fnvlist_free(torelease);
- }
-
- if (ha->recursive)
- rv = zfs_iter_filesystems(zhp, zfs_release_one, ha);
- zfs_close(zhp);
- return (rv);
-}
-
-int
-zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
- boolean_t recursive)
-{
- int ret;
- struct holdarg ha;
- nvlist_t *errors = NULL;
- nvpair_t *elem;
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- char errbuf[1024];
-
- ha.nvl = fnvlist_alloc();
- ha.snapname = snapname;
- ha.tag = tag;
- ha.recursive = recursive;
- ha.error = 0;
- (void) zfs_release_one(zfs_handle_dup(zhp), &ha);
-
- if (nvlist_empty(ha.nvl)) {
- fnvlist_free(ha.nvl);
- ret = ha.error;
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot release hold from snapshot '%s@%s'"),
- zhp->zfs_name, snapname);
- if (ret == ESRCH) {
- (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
- } else {
- (void) zfs_standard_error(hdl, ret, errbuf);
- }
- return (ret);
- }
-
- ret = lzc_release(ha.nvl, &errors);
- fnvlist_free(ha.nvl);
-
- if (ret == 0) {
- /* There may be errors even in the success case. */
- fnvlist_free(errors);
- return (0);
- }
-
- if (nvlist_empty(errors)) {
- /* no hold-specific errors */
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot release"));
- switch (errno) {
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded"));
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
- default:
- (void) zfs_standard_error_fmt(hdl, errno, errbuf);
- }
- }
-
- for (elem = nvlist_next_nvpair(errors, NULL);
- elem != NULL;
- elem = nvlist_next_nvpair(errors, elem)) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot release hold from snapshot '%s'"),
- nvpair_name(elem));
- switch (fnvpair_value_int32(elem)) {
- case ESRCH:
- (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
- break;
- case EINVAL:
- (void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
- break;
- default:
- (void) zfs_standard_error_fmt(hdl,
- fnvpair_value_int32(elem), errbuf);
- }
- }
-
- fnvlist_free(errors);
- return (ret);
-}
-
-int
-zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
-{
- zfs_cmd_t zc = { 0 };
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- int nvsz = 2048;
- void *nvbuf;
- int err = 0;
- char errbuf[1024];
-
- assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
- zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
-
-tryagain:
-
- nvbuf = malloc(nvsz);
- if (nvbuf == NULL) {
- err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
- goto out;
- }
-
- zc.zc_nvlist_dst_size = nvsz;
- zc.zc_nvlist_dst = (uintptr_t)nvbuf;
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
- zc.zc_name);
- switch (errno) {
- case ENOMEM:
- free(nvbuf);
- nvsz = zc.zc_nvlist_dst_size;
- goto tryagain;
-
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded"));
- err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
- case EINVAL:
- err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
- break;
- case ENOENT:
- err = zfs_error(hdl, EZFS_NOENT, errbuf);
- break;
- default:
- err = zfs_standard_error_fmt(hdl, errno, errbuf);
- break;
- }
- } else {
- /* success */
- int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
- if (rc) {
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(
- TEXT_DOMAIN, "cannot get permissions on '%s'"),
- zc.zc_name);
- err = zfs_standard_error_fmt(hdl, rc, errbuf);
- }
- }
-
- free(nvbuf);
-out:
- return (err);
-}
-
-int
-zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
-{
- zfs_cmd_t zc = { 0 };
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- char *nvbuf;
- char errbuf[1024];
- size_t nvsz;
- int err;
-
- assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
- zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
-
- err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
- assert(err == 0);
-
- nvbuf = malloc(nvsz);
-
- err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
- assert(err == 0);
-
- zc.zc_nvlist_src_size = nvsz;
- zc.zc_nvlist_src = (uintptr_t)nvbuf;
- zc.zc_perm_action = un;
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
- zc.zc_name);
- switch (errno) {
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded"));
- err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
- case EINVAL:
- err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
- break;
- case ENOENT:
- err = zfs_error(hdl, EZFS_NOENT, errbuf);
- break;
- default:
- err = zfs_standard_error_fmt(hdl, errno, errbuf);
- break;
- }
- }
-
- free(nvbuf);
-
- return (err);
-}
-
-int
-zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
-{
- int err;
- char errbuf[1024];
-
- err = lzc_get_holds(zhp->zfs_name, nvl);
-
- if (err != 0) {
- libzfs_handle_t *hdl = zhp->zfs_hdl;
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
- zhp->zfs_name);
- switch (err) {
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded"));
- err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
- case EINVAL:
- err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
- break;
- case ENOENT:
- err = zfs_error(hdl, EZFS_NOENT, errbuf);
- break;
- default:
- err = zfs_standard_error_fmt(hdl, errno, errbuf);
- break;
- }
- }
-
- return (err);
-}
-
-/*
- * Convert the zvol's volume size to an appropriate reservation.
- * Note: If this routine is updated, it is necessary to update the ZFS test
- * suite's shell version in reservation.kshlib.
- */
-uint64_t
-zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
-{
- uint64_t numdb;
- uint64_t nblocks, volblocksize;
- int ncopies;
- char *strval;
-
- if (nvlist_lookup_string(props,
- zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0)
- ncopies = atoi(strval);
- else
- ncopies = 1;
- if (nvlist_lookup_uint64(props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- &volblocksize) != 0)
- volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
- nblocks = volsize/volblocksize;
- /* start with metadnode L0-L6 */
- numdb = 7;
- /* calculate number of indirects */
- while (nblocks > 1) {
- nblocks += DNODES_PER_LEVEL - 1;
- nblocks /= DNODES_PER_LEVEL;
- numdb += nblocks;
- }
- numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1);
- volsize *= ncopies;
- /*
- * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't
- * compressed, but in practice they compress down to about
- * 1100 bytes
- */
- numdb *= 1ULL << DN_MAX_INDBLKSHIFT;
- volsize += numdb;
- return (volsize);
-}
-
-/*
- * Attach/detach the given filesystem to/from the given jail.
- */
-int
-zfs_jail(zfs_handle_t *zhp, int jailid, int attach)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- zfs_cmd_t zc = { 0 };
- char errbuf[1024];
- unsigned long cmd;
- int ret;
-
- if (attach) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name);
- } else {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name);
- }
-
- switch (zhp->zfs_type) {
- case ZFS_TYPE_VOLUME:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "volumes can not be jailed"));
- return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
- case ZFS_TYPE_SNAPSHOT:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "snapshots can not be jailed"));
- return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
- }
- assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- zc.zc_objset_type = DMU_OST_ZFS;
- zc.zc_jailid = jailid;
-
- cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL;
- if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0)
- zfs_standard_error(hdl, errno, errbuf);
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c
@@ -1,834 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- */
-
-/*
- * zfs diff support
- */
-#include <ctype.h>
-#include <errno.h>
-#include <libintl.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <pthread.h>
-#include <sys/zfs_ioctl.h>
-#include <libzfs.h>
-#include "libzfs_impl.h"
-
-#define ZDIFF_SNAPDIR "/.zfs/snapshot/"
-#define ZDIFF_SHARESDIR "/.zfs/shares/"
-#define ZDIFF_PREFIX "zfs-diff-%d"
-
-#define ZDIFF_ADDED '+'
-#define ZDIFF_MODIFIED 'M'
-#define ZDIFF_REMOVED '-'
-#define ZDIFF_RENAMED 'R'
-
-typedef struct differ_info {
- zfs_handle_t *zhp;
- char *fromsnap;
- char *frommnt;
- char *tosnap;
- char *tomnt;
- char *ds;
- char *dsmnt;
- char *tmpsnap;
- char errbuf[1024];
- boolean_t isclone;
- boolean_t scripted;
- boolean_t classify;
- boolean_t timestamped;
- uint64_t shares;
- int zerr;
- int cleanupfd;
- int outputfd;
- int datafd;
-} differ_info_t;
-
-/*
- * Given a {dsname, object id}, get the object path
- */
-static int
-get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj,
- char *pn, int maxlen, zfs_stat_t *sb)
-{
- zfs_cmd_t zc = { 0 };
- int error;
-
- (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
- zc.zc_obj = obj;
-
- errno = 0;
- error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc);
- di->zerr = errno;
-
- /* we can get stats even if we failed to get a path */
- (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t));
- if (error == 0) {
- ASSERT(di->zerr == 0);
- (void) strlcpy(pn, zc.zc_value, maxlen);
- return (0);
- }
-
- if (di->zerr == ESTALE) {
- (void) snprintf(pn, maxlen, "(on_delete_queue)");
- return (0);
- } else if (di->zerr == EPERM) {
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "The sys_config privilege or diff delegated permission "
- "is needed\nto discover path names"));
- return (-1);
- } else {
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "Unable to determine path or stats for "
- "object %jd in %s"), (uintmax_t)obj, dsname);
- return (-1);
- }
-}
-
-/*
- * stream_bytes
- *
- * Prints a file name out a character at a time. If the character is
- * not in the range of what we consider "printable" ASCII, display it
- * as an escaped 3-digit octal value. ASCII values less than a space
- * are all control characters and we declare the upper end as the
- * DELete character. This also is the last 7-bit ASCII character.
- * We choose to treat all 8-bit ASCII as not printable for this
- * application.
- */
-static void
-stream_bytes(FILE *fp, const char *string)
-{
- char c;
-
- while ((c = *string++) != '\0') {
- if (c > ' ' && c != '\\' && c < '\177') {
- (void) fprintf(fp, "%c", c);
- } else {
- (void) fprintf(fp, "\\%03o", (uint8_t)c);
- }
- }
-}
-
-static void
-print_what(FILE *fp, mode_t what)
-{
- char symbol;
-
- switch (what & S_IFMT) {
- case S_IFBLK:
- symbol = 'B';
- break;
- case S_IFCHR:
- symbol = 'C';
- break;
- case S_IFDIR:
- symbol = '/';
- break;
-#ifdef S_IFDOOR
- case S_IFDOOR:
- symbol = '>';
- break;
-#endif
- case S_IFIFO:
- symbol = '|';
- break;
- case S_IFLNK:
- symbol = '@';
- break;
-#ifdef S_IFPORT
- case S_IFPORT:
- symbol = 'P';
- break;
-#endif
- case S_IFSOCK:
- symbol = '=';
- break;
- case S_IFREG:
- symbol = 'F';
- break;
- default:
- symbol = '?';
- break;
- }
- (void) fprintf(fp, "%c", symbol);
-}
-
-static void
-print_cmn(FILE *fp, differ_info_t *di, const char *file)
-{
- stream_bytes(fp, di->dsmnt);
- stream_bytes(fp, file);
-}
-
-static void
-print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new,
- zfs_stat_t *isb)
-{
- if (di->timestamped)
- (void) fprintf(fp, "%10lld.%09lld\t",
- (longlong_t)isb->zs_ctime[0],
- (longlong_t)isb->zs_ctime[1]);
- (void) fprintf(fp, "%c\t", ZDIFF_RENAMED);
- if (di->classify) {
- print_what(fp, isb->zs_mode);
- (void) fprintf(fp, "\t");
- }
- print_cmn(fp, di, old);
- if (di->scripted)
- (void) fprintf(fp, "\t");
- else
- (void) fprintf(fp, " -> ");
- print_cmn(fp, di, new);
- (void) fprintf(fp, "\n");
-}
-
-static void
-print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file,
- zfs_stat_t *isb)
-{
- if (di->timestamped)
- (void) fprintf(fp, "%10lld.%09lld\t",
- (longlong_t)isb->zs_ctime[0],
- (longlong_t)isb->zs_ctime[1]);
- (void) fprintf(fp, "%c\t", ZDIFF_MODIFIED);
- if (di->classify) {
- print_what(fp, isb->zs_mode);
- (void) fprintf(fp, "\t");
- }
- print_cmn(fp, di, file);
- (void) fprintf(fp, "\t(%+d)", delta);
- (void) fprintf(fp, "\n");
-}
-
-static void
-print_file(FILE *fp, differ_info_t *di, char type, const char *file,
- zfs_stat_t *isb)
-{
- if (di->timestamped)
- (void) fprintf(fp, "%10lld.%09lld\t",
- (longlong_t)isb->zs_ctime[0],
- (longlong_t)isb->zs_ctime[1]);
- (void) fprintf(fp, "%c\t", type);
- if (di->classify) {
- print_what(fp, isb->zs_mode);
- (void) fprintf(fp, "\t");
- }
- print_cmn(fp, di, file);
- (void) fprintf(fp, "\n");
-}
-
-static int
-write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj)
-{
- struct zfs_stat fsb, tsb;
- mode_t fmode, tmode;
- char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN];
- int fobjerr, tobjerr;
- int change;
-
- if (dobj == di->shares)
- return (0);
-
- /*
- * Check the from and to snapshots for info on the object. If
- * we get ENOENT, then the object just didn't exist in that
- * snapshot. If we get ENOTSUP, then we tried to get
- * info on a non-ZPL object, which we don't care about anyway.
- */
- fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname,
- MAXPATHLEN, &fsb);
- if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
- return (-1);
-
- tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname,
- MAXPATHLEN, &tsb);
- if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
- return (-1);
-
- /*
- * Unallocated object sharing the same meta dnode block
- */
- if (fobjerr && tobjerr) {
- ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP);
- di->zerr = 0;
- return (0);
- }
-
- di->zerr = 0; /* negate get_stats_for_obj() from side that failed */
- fmode = fsb.zs_mode & S_IFMT;
- tmode = tsb.zs_mode & S_IFMT;
- if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 ||
- tsb.zs_links == 0)
- change = 0;
- else
- change = tsb.zs_links - fsb.zs_links;
-
- if (fobjerr) {
- if (change) {
- print_link_change(fp, di, change, tobjname, &tsb);
- return (0);
- }
- print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
- return (0);
- } else if (tobjerr) {
- if (change) {
- print_link_change(fp, di, change, fobjname, &fsb);
- return (0);
- }
- print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
- return (0);
- }
-
- if (fmode != tmode && fsb.zs_gen == tsb.zs_gen)
- tsb.zs_gen++; /* Force a generational difference */
-
- /* Simple modification or no change */
- if (fsb.zs_gen == tsb.zs_gen) {
- /* No apparent changes. Could we assert !this? */
- if (fsb.zs_ctime[0] == tsb.zs_ctime[0] &&
- fsb.zs_ctime[1] == tsb.zs_ctime[1])
- return (0);
- if (change) {
- print_link_change(fp, di, change,
- change > 0 ? fobjname : tobjname, &tsb);
- } else if (strcmp(fobjname, tobjname) == 0) {
- print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb);
- } else {
- print_rename(fp, di, fobjname, tobjname, &tsb);
- }
- return (0);
- } else {
- /* file re-created or object re-used */
- print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
- print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
- return (0);
- }
-}
-
-static int
-write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
-{
- uint64_t o;
- int err;
-
- for (o = dr->ddr_first; o <= dr->ddr_last; o++) {
- if ((err = write_inuse_diffs_one(fp, di, o)) != 0)
- return (err);
- }
- return (0);
-}
-
-static int
-describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf,
- int maxlen)
-{
- struct zfs_stat sb;
-
- if (get_stats_for_obj(di, di->fromsnap, object, namebuf,
- maxlen, &sb) != 0) {
- return (-1);
- }
- /* Don't print if in the delete queue on from side */
- if (di->zerr == ESTALE) {
- di->zerr = 0;
- return (0);
- }
-
- print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb);
- return (0);
-}
-
-static int
-write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
-{
- zfs_cmd_t zc = { 0 };
- libzfs_handle_t *lhdl = di->zhp->zfs_hdl;
- char fobjname[MAXPATHLEN];
-
- (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name));
- zc.zc_obj = dr->ddr_first - 1;
-
- ASSERT(di->zerr == 0);
-
- while (zc.zc_obj < dr->ddr_last) {
- int err;
-
- err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc);
- if (err == 0) {
- if (zc.zc_obj == di->shares) {
- zc.zc_obj++;
- continue;
- }
- if (zc.zc_obj > dr->ddr_last) {
- break;
- }
- err = describe_free(fp, di, zc.zc_obj, fobjname,
- MAXPATHLEN);
- if (err)
- break;
- } else if (errno == ESRCH) {
- break;
- } else {
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "next allocated object (> %jd) find failure"),
- (uintmax_t)zc.zc_obj);
- di->zerr = errno;
- break;
- }
- }
- if (di->zerr)
- return (-1);
- return (0);
-}
-
-static void *
-differ(void *arg)
-{
- differ_info_t *di = arg;
- dmu_diff_record_t dr;
- FILE *ofp;
- int err = 0;
-
- if ((ofp = fdopen(di->outputfd, "w")) == NULL) {
- di->zerr = errno;
- (void) strerror_r(errno, di->errbuf, sizeof (di->errbuf));
- (void) close(di->datafd);
- return ((void *)-1);
- }
-
- for (;;) {
- char *cp = (char *)&dr;
- int len = sizeof (dr);
- int rv;
-
- do {
- rv = read(di->datafd, cp, len);
- cp += rv;
- len -= rv;
- } while (len > 0 && rv > 0);
-
- if (rv < 0 || (rv == 0 && len != sizeof (dr))) {
- di->zerr = EPIPE;
- break;
- } else if (rv == 0) {
- /* end of file at a natural breaking point */
- break;
- }
-
- switch (dr.ddr_type) {
- case DDR_FREE:
- err = write_free_diffs(ofp, di, &dr);
- break;
- case DDR_INUSE:
- err = write_inuse_diffs(ofp, di, &dr);
- break;
- default:
- di->zerr = EPIPE;
- break;
- }
-
- if (err || di->zerr)
- break;
- }
-
- (void) fclose(ofp);
- (void) close(di->datafd);
- if (err)
- return ((void *)-1);
- if (di->zerr) {
- ASSERT(di->zerr == EPIPE);
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "Internal error: bad data from diff IOCTL"));
- return ((void *)-1);
- }
- return ((void *)0);
-}
-
-static int
-find_shares_object(differ_info_t *di)
-{
- char fullpath[MAXPATHLEN];
- struct stat64 sb = { 0 };
-
- (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN);
- (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN);
-
- if (stat64(fullpath, &sb) != 0) {
-#ifdef illumos
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath);
- return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf));
-#else
- return (0);
-#endif
- }
-
- di->shares = (uint64_t)sb.st_ino;
- return (0);
-}
-
-static int
-make_temp_snapshot(differ_info_t *di)
-{
- libzfs_handle_t *hdl = di->zhp->zfs_hdl;
- zfs_cmd_t zc = { 0 };
-
- (void) snprintf(zc.zc_value, sizeof (zc.zc_value),
- ZDIFF_PREFIX, getpid());
- (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name));
- zc.zc_cleanup_fd = di->cleanupfd;
-
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) {
- int err = errno;
- if (err == EPERM) {
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN, "The diff delegated "
- "permission is needed in order\nto create a "
- "just-in-time snapshot for diffing\n"));
- return (zfs_error(hdl, EZFS_DIFF, di->errbuf));
- } else {
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN, "Cannot create just-in-time "
- "snapshot of '%s'"), zc.zc_name);
- return (zfs_standard_error(hdl, err, di->errbuf));
- }
- }
-
- di->tmpsnap = zfs_strdup(hdl, zc.zc_value);
- di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap);
- return (0);
-}
-
-static void
-teardown_differ_info(differ_info_t *di)
-{
- free(di->ds);
- free(di->dsmnt);
- free(di->fromsnap);
- free(di->frommnt);
- free(di->tosnap);
- free(di->tmpsnap);
- free(di->tomnt);
- (void) close(di->cleanupfd);
-}
-
-static int
-get_snapshot_names(differ_info_t *di, const char *fromsnap,
- const char *tosnap)
-{
- libzfs_handle_t *hdl = di->zhp->zfs_hdl;
- char *atptrf = NULL;
- char *atptrt = NULL;
- int fdslen, fsnlen;
- int tdslen, tsnlen;
-
- /*
- * Can accept
- * dataset@snap1
- * dataset@snap1 dataset@snap2
- * dataset@snap1 @snap2
- * dataset@snap1 dataset
- * @snap1 dataset@snap2
- */
- if (tosnap == NULL) {
- /* only a from snapshot given, must be valid */
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "Badly formed snapshot name %s"), fromsnap);
-
- if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT,
- B_FALSE)) {
- return (zfs_error(hdl, EZFS_INVALIDNAME,
- di->errbuf));
- }
-
- atptrf = strchr(fromsnap, '@');
- ASSERT(atptrf != NULL);
- fdslen = atptrf - fromsnap;
-
- di->fromsnap = zfs_strdup(hdl, fromsnap);
- di->ds = zfs_strdup(hdl, fromsnap);
- di->ds[fdslen] = '\0';
-
- /* the to snap will be a just-in-time snap of the head */
- return (make_temp_snapshot(di));
- }
-
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "Unable to determine which snapshots to compare"));
-
- atptrf = strchr(fromsnap, '@');
- atptrt = strchr(tosnap, '@');
- fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap);
- tdslen = atptrt ? atptrt - tosnap : strlen(tosnap);
- fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */
- tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */
-
- if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) ||
- (fsnlen == 0 && tsnlen == 0)) {
- return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
- } else if ((fdslen > 0 && tdslen > 0) &&
- ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) {
- /*
- * not the same dataset name, might be okay if
- * tosnap is a clone of a fromsnap descendant.
- */
- char origin[ZFS_MAX_DATASET_NAME_LEN];
- zprop_source_t src;
- zfs_handle_t *zhp;
-
- di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1);
- (void) strncpy(di->ds, tosnap, tdslen);
- di->ds[tdslen] = '\0';
-
- zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM);
- while (zhp != NULL) {
- if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin,
- sizeof (origin), &src, NULL, 0, B_FALSE) != 0) {
- (void) zfs_close(zhp);
- zhp = NULL;
- break;
- }
- if (strncmp(origin, fromsnap, fsnlen) == 0)
- break;
-
- (void) zfs_close(zhp);
- zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM);
- }
-
- if (zhp == NULL) {
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "Not an earlier snapshot from the same fs"));
- return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
- } else {
- (void) zfs_close(zhp);
- }
-
- di->isclone = B_TRUE;
- di->fromsnap = zfs_strdup(hdl, fromsnap);
- if (tsnlen) {
- di->tosnap = zfs_strdup(hdl, tosnap);
- } else {
- return (make_temp_snapshot(di));
- }
- } else {
- int dslen = fdslen ? fdslen : tdslen;
-
- di->ds = zfs_alloc(hdl, dslen + 1);
- (void) strncpy(di->ds, fdslen ? fromsnap : tosnap, dslen);
- di->ds[dslen] = '\0';
-
- di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf);
- if (tsnlen) {
- di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt);
- } else {
- return (make_temp_snapshot(di));
- }
- }
- return (0);
-}
-
-static int
-get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt)
-{
- boolean_t mounted;
-
- mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt);
- if (mounted == B_FALSE) {
- (void) snprintf(di->errbuf, sizeof (di->errbuf),
- dgettext(TEXT_DOMAIN,
- "Cannot diff an unmounted snapshot"));
- return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf));
- }
-
- /* Avoid a double slash at the beginning of root-mounted datasets */
- if (**mntpt == '/' && *(*mntpt + 1) == '\0')
- **mntpt = '\0';
- return (0);
-}
-
-static int
-get_mountpoints(differ_info_t *di)
-{
- char *strptr;
- char *frommntpt;
-
- /*
- * first get the mountpoint for the parent dataset
- */
- if (get_mountpoint(di, di->ds, &di->dsmnt) != 0)
- return (-1);
-
- strptr = strchr(di->tosnap, '@');
- ASSERT3P(strptr, !=, NULL);
- di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt,
- ZDIFF_SNAPDIR, ++strptr);
-
- strptr = strchr(di->fromsnap, '@');
- ASSERT3P(strptr, !=, NULL);
-
- frommntpt = di->dsmnt;
- if (di->isclone) {
- char *mntpt;
- int err;
-
- *strptr = '\0';
- err = get_mountpoint(di, di->fromsnap, &mntpt);
- *strptr = '@';
- if (err != 0)
- return (-1);
- frommntpt = mntpt;
- }
-
- di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt,
- ZDIFF_SNAPDIR, ++strptr);
-
- if (di->isclone)
- free(frommntpt);
-
- return (0);
-}
-
-static int
-setup_differ_info(zfs_handle_t *zhp, const char *fromsnap,
- const char *tosnap, differ_info_t *di)
-{
- di->zhp = zhp;
-
- di->cleanupfd = open(ZFS_DEV, O_RDWR|O_EXCL);
- VERIFY(di->cleanupfd >= 0);
-
- if (get_snapshot_names(di, fromsnap, tosnap) != 0)
- return (-1);
-
- if (get_mountpoints(di) != 0)
- return (-1);
-
- if (find_shares_object(di) != 0)
- return (-1);
-
- return (0);
-}
-
-int
-zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap,
- const char *tosnap, int flags)
-{
- zfs_cmd_t zc = { 0 };
- char errbuf[1024];
- differ_info_t di = { 0 };
- pthread_t tid;
- int pipefd[2];
- int iocerr;
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "zfs diff failed"));
-
- if (setup_differ_info(zhp, fromsnap, tosnap, &di)) {
- teardown_differ_info(&di);
- return (-1);
- }
-
- if (pipe(pipefd)) {
- zfs_error_aux(zhp->zfs_hdl, strerror(errno));
- teardown_differ_info(&di);
- return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf));
- }
-
- di.scripted = (flags & ZFS_DIFF_PARSEABLE);
- di.classify = (flags & ZFS_DIFF_CLASSIFY);
- di.timestamped = (flags & ZFS_DIFF_TIMESTAMP);
-
- di.outputfd = outfd;
- di.datafd = pipefd[0];
-
- if (pthread_create(&tid, NULL, differ, &di)) {
- zfs_error_aux(zhp->zfs_hdl, strerror(errno));
- (void) close(pipefd[0]);
- (void) close(pipefd[1]);
- teardown_differ_info(&di);
- return (zfs_error(zhp->zfs_hdl,
- EZFS_THREADCREATEFAILED, errbuf));
- }
-
- /* do the ioctl() */
- (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1);
- (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1);
- zc.zc_cookie = pipefd[1];
-
- iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc);
- if (iocerr != 0) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "Unable to obtain diffs"));
- if (errno == EPERM) {
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "\n The sys_mount privilege or diff delegated "
- "permission is needed\n to execute the "
- "diff ioctl"));
- } else if (errno == EXDEV) {
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "\n Not an earlier snapshot from the same fs"));
- } else if (errno != EPIPE || di.zerr == 0) {
- zfs_error_aux(zhp->zfs_hdl, strerror(errno));
- }
- (void) close(pipefd[1]);
- (void) pthread_cancel(tid);
- (void) pthread_join(tid, NULL);
- teardown_differ_info(&di);
- if (di.zerr != 0 && di.zerr != EPIPE) {
- zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
- return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
- } else {
- return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf));
- }
- }
-
- (void) close(pipefd[1]);
- (void) pthread_join(tid, NULL);
-
- if (di.zerr != 0) {
- zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
- return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
- }
- teardown_differ_info(&di);
- return (0);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c
@@ -1,452 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <dlfcn.h>
-#include <errno.h>
-#include <libintl.h>
-#include <link.h>
-#include <pthread.h>
-#include <strings.h>
-#include <unistd.h>
-
-#include <libzfs.h>
-
-#include <fm/libtopo.h>
-#include <sys/fm/protocol.h>
-#include <sys/systeminfo.h>
-
-#include "libzfs_impl.h"
-
-/*
- * This file is responsible for determining the relationship between I/O
- * devices paths and physical locations. In the world of MPxIO and external
- * enclosures, the device path is not synonymous with the physical location.
- * If you remove a drive and insert it into a different slot, it will end up
- * with the same path under MPxIO. If you recable storage enclosures, the
- * device paths may change. All of this makes it difficult to implement the
- * 'autoreplace' property, which is supposed to automatically manage disk
- * replacement based on physical slot.
- *
- * In order to work around these limitations, we have a per-vdev FRU property
- * that is the libtopo path (minus disk-specific authority information) to the
- * physical location of the device on the system. This is an optional
- * property, and is only needed when using the 'autoreplace' property or when
- * generating FMA faults against vdevs.
- */
-
-/*
- * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case
- * it is not present. We only need this once per library instance, so it is
- * not part of the libzfs handle.
- */
-static void *_topo_dlhandle;
-static topo_hdl_t *(*_topo_open)(int, const char *, int *);
-static void (*_topo_close)(topo_hdl_t *);
-static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *);
-static void (*_topo_snap_release)(topo_hdl_t *);
-static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *,
- topo_walk_cb_t, void *, int *);
-static int (*_topo_walk_step)(topo_walk_t *, int);
-static void (*_topo_walk_fini)(topo_walk_t *);
-static void (*_topo_hdl_strfree)(topo_hdl_t *, char *);
-static char *(*_topo_node_name)(tnode_t *);
-static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *,
- char **, int *);
-static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *);
-static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *);
-static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *,
- const char *);
-
-#define ZFS_FRU_HASH_SIZE 257
-
-static size_t
-fru_strhash(const char *key)
-{
- ulong_t g, h = 0;
- const char *p;
-
- for (p = key; *p != '\0'; p++) {
- h = (h << 4) + *p;
-
- if ((g = (h & 0xf0000000)) != 0) {
- h ^= (g >> 24);
- h ^= g;
- }
- }
-
- return (h % ZFS_FRU_HASH_SIZE);
-}
-
-static int
-libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg)
-{
- libzfs_handle_t *hdl = arg;
- nvlist_t *fru;
- char *devpath, *frustr;
- int err;
- libzfs_fru_t *frup;
- size_t idx;
-
- /*
- * If this is the chassis node, and we don't yet have the system
- * chassis ID, then fill in this value now.
- */
- if (hdl->libzfs_chassis_id[0] == '\0' &&
- strcmp(_topo_node_name(tn), "chassis") == 0) {
- if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY,
- FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0)
- (void) strlcpy(hdl->libzfs_chassis_id, devpath,
- sizeof (hdl->libzfs_chassis_id));
- }
-
- /*
- * Skip non-disk nodes.
- */
- if (strcmp(_topo_node_name(tn), "disk") != 0)
- return (TOPO_WALK_NEXT);
-
- /*
- * Get the devfs path and FRU.
- */
- if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0)
- return (TOPO_WALK_NEXT);
-
- if (libzfs_fru_lookup(hdl, devpath) != NULL) {
- _topo_hdl_strfree(thp, devpath);
- return (TOPO_WALK_NEXT);
- }
-
- if (_topo_node_fru(tn, &fru, NULL, &err) != 0) {
- _topo_hdl_strfree(thp, devpath);
- return (TOPO_WALK_NEXT);
- }
-
- /*
- * Convert the FRU into a string.
- */
- if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) {
- nvlist_free(fru);
- _topo_hdl_strfree(thp, devpath);
- return (TOPO_WALK_NEXT);
- }
-
- nvlist_free(fru);
-
- /*
- * Finally, we have a FRU string and device path. Add it to the hash.
- */
- if ((frup = calloc(sizeof (libzfs_fru_t), 1)) == NULL) {
- _topo_hdl_strfree(thp, devpath);
- _topo_hdl_strfree(thp, frustr);
- return (TOPO_WALK_NEXT);
- }
-
- if ((frup->zf_device = strdup(devpath)) == NULL ||
- (frup->zf_fru = strdup(frustr)) == NULL) {
- free(frup->zf_device);
- free(frup);
- _topo_hdl_strfree(thp, devpath);
- _topo_hdl_strfree(thp, frustr);
- return (TOPO_WALK_NEXT);
- }
-
- _topo_hdl_strfree(thp, devpath);
- _topo_hdl_strfree(thp, frustr);
-
- idx = fru_strhash(frup->zf_device);
- frup->zf_chain = hdl->libzfs_fru_hash[idx];
- hdl->libzfs_fru_hash[idx] = frup;
- frup->zf_next = hdl->libzfs_fru_list;
- hdl->libzfs_fru_list = frup;
-
- return (TOPO_WALK_NEXT);
-}
-
-/*
- * Called during initialization to setup the dynamic libtopo connection.
- */
-#pragma init(libzfs_init_fru)
-static void
-libzfs_init_fru(void)
-{
- char path[MAXPATHLEN];
- char isa[257];
-
-#if defined(_LP64)
- if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0)
- isa[0] = '\0';
-#else
- isa[0] = '\0';
-#endif
- (void) snprintf(path, sizeof (path),
- "/usr/lib/fm/%s/libtopo.so", isa);
-
- if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL)
- return;
-
- _topo_open = (topo_hdl_t *(*)())
- dlsym(_topo_dlhandle, "topo_open");
- _topo_close = (void (*)())
- dlsym(_topo_dlhandle, "topo_close");
- _topo_snap_hold = (char *(*)())
- dlsym(_topo_dlhandle, "topo_snap_hold");
- _topo_snap_release = (void (*)())
- dlsym(_topo_dlhandle, "topo_snap_release");
- _topo_walk_init = (topo_walk_t *(*)())
- dlsym(_topo_dlhandle, "topo_walk_init");
- _topo_walk_step = (int (*)())
- dlsym(_topo_dlhandle, "topo_walk_step");
- _topo_walk_fini = (void (*)())
- dlsym(_topo_dlhandle, "topo_walk_fini");
- _topo_hdl_strfree = (void (*)())
- dlsym(_topo_dlhandle, "topo_hdl_strfree");
- _topo_node_name = (char *(*)())
- dlsym(_topo_dlhandle, "topo_node_name");
- _topo_prop_get_string = (int (*)())
- dlsym(_topo_dlhandle, "topo_prop_get_string");
- _topo_node_fru = (int (*)())
- dlsym(_topo_dlhandle, "topo_node_fru");
- _topo_fmri_nvl2str = (int (*)())
- dlsym(_topo_dlhandle, "topo_fmri_nvl2str");
- _topo_fmri_strcmp_noauth = (int (*)())
- dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth");
-
- if (_topo_open == NULL || _topo_close == NULL ||
- _topo_snap_hold == NULL || _topo_snap_release == NULL ||
- _topo_walk_init == NULL || _topo_walk_step == NULL ||
- _topo_walk_fini == NULL || _topo_hdl_strfree == NULL ||
- _topo_node_name == NULL || _topo_prop_get_string == NULL ||
- _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL ||
- _topo_fmri_strcmp_noauth == NULL) {
- (void) dlclose(_topo_dlhandle);
- _topo_dlhandle = NULL;
- }
-}
-
-/*
- * Refresh the mappings from device path -> FMRI. We do this by walking the
- * hc topology looking for disk nodes, and recording the io/devfs-path and FRU.
- * Note that we strip out the disk-specific authority information (serial,
- * part, revision, etc) so that we are left with only the identifying
- * characteristics of the slot (hc path and chassis-id).
- */
-void
-libzfs_fru_refresh(libzfs_handle_t *hdl)
-{
- int err;
- char *uuid;
- topo_hdl_t *thp;
- topo_walk_t *twp;
-
- if (_topo_dlhandle == NULL)
- return;
-
- /*
- * Clear the FRU hash and initialize our basic structures.
- */
- libzfs_fru_clear(hdl, B_FALSE);
-
- if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION,
- NULL, &err)) == NULL)
- return;
-
- thp = hdl->libzfs_topo_hdl;
-
- if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL)
- return;
-
- _topo_hdl_strfree(thp, uuid);
-
- if (hdl->libzfs_fru_hash == NULL &&
- (hdl->libzfs_fru_hash =
- calloc(ZFS_FRU_HASH_SIZE, sizeof (void *))) == NULL)
- return;
-
- /*
- * We now have a topo snapshot, so iterate over the hc topology looking
- * for disks to add to the hash.
- */
- twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC,
- libzfs_fru_gather, hdl, &err);
- if (twp != NULL) {
- (void) _topo_walk_step(twp, TOPO_WALK_CHILD);
- _topo_walk_fini(twp);
- }
-}
-
-/*
- * Given a devfs path, return the FRU for the device, if known. This will
- * automatically call libzfs_fru_refresh() if it hasn't already been called by
- * the consumer. The string returned is valid until the next call to
- * libzfs_fru_refresh().
- */
-const char *
-libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath)
-{
- size_t idx = fru_strhash(devpath);
- libzfs_fru_t *frup;
-
- if (hdl->libzfs_fru_hash == NULL)
- libzfs_fru_refresh(hdl);
-
- if (hdl->libzfs_fru_hash == NULL)
- return (NULL);
-
- for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
- frup = frup->zf_chain) {
- if (strcmp(devpath, frup->zf_device) == 0)
- return (frup->zf_fru);
- }
-
- return (NULL);
-}
-
-/*
- * Given a fru path, return the device path. This will automatically call
- * libzfs_fru_refresh() if it hasn't already been called by the consumer. The
- * string returned is valid until the next call to libzfs_fru_refresh().
- */
-const char *
-libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru)
-{
- libzfs_fru_t *frup;
- size_t idx;
-
- if (hdl->libzfs_fru_hash == NULL)
- libzfs_fru_refresh(hdl);
-
- if (hdl->libzfs_fru_hash == NULL)
- return (NULL);
-
- for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) {
- for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
- frup = frup->zf_next) {
- if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl,
- fru, frup->zf_fru))
- return (frup->zf_device);
- }
- }
-
- return (NULL);
-}
-
-/*
- * Change the stored FRU for the given vdev.
- */
-int
-zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru)
-{
- zfs_cmd_t zc = { 0 };
-
- (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- (void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value));
- zc.zc_guid = vdev_guid;
-
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0)
- return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot set FRU")));
-
- return (0);
-}
-
-/*
- * Compare to two FRUs, ignoring any authority information.
- */
-boolean_t
-libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b)
-{
- if (hdl->libzfs_fru_hash == NULL)
- libzfs_fru_refresh(hdl);
-
- if (hdl->libzfs_fru_hash == NULL)
- return (strcmp(a, b) == 0);
-
- return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b));
-}
-
-/*
- * This special function checks to see whether the FRU indicates it's supposed
- * to be in the system chassis, but the chassis-id doesn't match. This can
- * happen in a clustered case, where both head nodes have the same logical
- * disk, but opening the device on the other head node is meaningless.
- */
-boolean_t
-libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru)
-{
- const char *chassisid;
- size_t len;
-
- if (hdl->libzfs_fru_hash == NULL)
- libzfs_fru_refresh(hdl);
-
- if (hdl->libzfs_chassis_id[0] == '\0')
- return (B_FALSE);
-
- if (strstr(fru, "/chassis=0/") == NULL)
- return (B_FALSE);
-
- if ((chassisid = strstr(fru, ":chassis-id=")) == NULL)
- return (B_FALSE);
-
- chassisid += 12;
- len = strlen(hdl->libzfs_chassis_id);
- if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 &&
- (chassisid[len] == '/' || chassisid[len] == ':'))
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Clear memory associated with the FRU hash.
- */
-void
-libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final)
-{
- libzfs_fru_t *frup;
-
- while ((frup = hdl->libzfs_fru_list) != NULL) {
- hdl->libzfs_fru_list = frup->zf_next;
- free(frup->zf_device);
- free(frup->zf_fru);
- free(frup);
- }
-
- hdl->libzfs_fru_list = NULL;
-
- if (hdl->libzfs_topo_hdl != NULL) {
- _topo_snap_release(hdl->libzfs_topo_hdl);
- _topo_close(hdl->libzfs_topo_hdl);
- hdl->libzfs_topo_hdl = NULL;
- }
-
- if (final) {
- free(hdl->libzfs_fru_hash);
- } else if (hdl->libzfs_fru_hash != NULL) {
- bzero(hdl->libzfs_fru_hash,
- ZFS_FRU_HASH_SIZE * sizeof (void *));
- }
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
@@ -1,228 +0,0 @@
-/*
- * CDDL HEADER SART
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- */
-
-#ifndef _LIBZFS_IMPL_H
-#define _LIBZFS_IMPL_H
-
-#include <sys/fs/zfs.h>
-#include <sys/spa.h>
-#include <sys/nvpair.h>
-#include <sys/dmu.h>
-#include <sys/zfs_ioctl.h>
-
-#include <libshare.h>
-#include <libuutil.h>
-#include <libzfs.h>
-#include <libzfs_core.h>
-#include <libzfs_compat.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef VERIFY
-#undef VERIFY
-#endif
-#define VERIFY verify
-
-typedef struct libzfs_fru {
- char *zf_device;
- char *zf_fru;
- struct libzfs_fru *zf_chain;
- struct libzfs_fru *zf_next;
-} libzfs_fru_t;
-
-struct libzfs_handle {
- int libzfs_error;
- int libzfs_fd;
- FILE *libzfs_mnttab;
- FILE *libzfs_sharetab;
- zpool_handle_t *libzfs_pool_handles;
- uu_avl_pool_t *libzfs_ns_avlpool;
- uu_avl_t *libzfs_ns_avl;
- uint64_t libzfs_ns_gen;
- int libzfs_desc_active;
- char libzfs_action[1024];
- char libzfs_desc[1024];
- int libzfs_printerr;
- int libzfs_storeerr; /* stuff error messages into buffer */
- void *libzfs_sharehdl; /* libshare handle */
- boolean_t libzfs_mnttab_enable;
- /*
- * We need a lock to handle the case where parallel mount
- * threads are populating the mnttab cache simultaneously. The
- * lock only protects the integrity of the avl tree, and does
- * not protect the contents of the mnttab entries themselves.
- */
- pthread_mutex_t libzfs_mnttab_cache_lock;
- avl_tree_t libzfs_mnttab_cache;
- int libzfs_pool_iter;
- libzfs_fru_t **libzfs_fru_hash;
- libzfs_fru_t *libzfs_fru_list;
- char libzfs_chassis_id[256];
- boolean_t libzfs_prop_debug;
-};
-
-struct zfs_handle {
- libzfs_handle_t *zfs_hdl;
- zpool_handle_t *zpool_hdl;
- char zfs_name[ZFS_MAX_DATASET_NAME_LEN];
- zfs_type_t zfs_type; /* type including snapshot */
- zfs_type_t zfs_head_type; /* type excluding snapshot */
- dmu_objset_stats_t zfs_dmustats;
- nvlist_t *zfs_props;
- nvlist_t *zfs_user_props;
- nvlist_t *zfs_recvd_props;
- boolean_t zfs_mntcheck;
- char *zfs_mntopts;
- uint8_t *zfs_props_table;
-};
-
-/*
- * This is different from checking zfs_type, because it will also catch
- * snapshots of volumes.
- */
-#define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME)
-
-struct zpool_handle {
- libzfs_handle_t *zpool_hdl;
- zpool_handle_t *zpool_next;
- char zpool_name[ZFS_MAX_DATASET_NAME_LEN];
- int zpool_state;
- size_t zpool_config_size;
- nvlist_t *zpool_config;
- nvlist_t *zpool_old_config;
- nvlist_t *zpool_props;
- diskaddr_t zpool_start_block;
-};
-
-typedef enum {
- PROTO_NFS = 0,
- PROTO_SMB = 1,
- PROTO_END = 2
-} zfs_share_proto_t;
-
-/*
- * The following can be used as a bitmask and any new values
- * added must preserve that capability.
- */
-typedef enum {
- SHARED_NOT_SHARED = 0x0,
- SHARED_NFS = 0x2,
- SHARED_SMB = 0x4
-} zfs_share_type_t;
-
-#define CONFIG_BUF_MINSIZE 262144
-
-int zfs_error(libzfs_handle_t *, int, const char *);
-int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...);
-void zfs_error_aux(libzfs_handle_t *, const char *, ...);
-void *zfs_alloc(libzfs_handle_t *, size_t);
-void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t);
-char *zfs_asprintf(libzfs_handle_t *, const char *, ...);
-char *zfs_strdup(libzfs_handle_t *, const char *);
-int no_memory(libzfs_handle_t *);
-
-int zfs_standard_error(libzfs_handle_t *, int, const char *);
-int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...);
-int zpool_standard_error(libzfs_handle_t *, int, const char *);
-int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...);
-
-int get_dependents(libzfs_handle_t *, boolean_t, const char *, char ***,
- size_t *);
-zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *);
-zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *);
-
-int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t,
- nvlist_t *, char **, uint64_t *, const char *);
-int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp,
- zfs_type_t type);
-
-/*
- * Use this changelist_gather() flag to force attempting mounts
- * on each change node regardless of whether or not it is currently
- * mounted.
- */
-#define CL_GATHER_MOUNT_ALWAYS 0x01
-/*
- * Use this changelist_gather() flag to prevent unmounting of file systems.
- */
-#define CL_GATHER_DONT_UNMOUNT 0x02
-
-typedef struct prop_changelist prop_changelist_t;
-
-int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t);
-int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *);
-int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *);
-int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *);
-int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **);
-void zcmd_free_nvlists(zfs_cmd_t *);
-
-int changelist_prefix(prop_changelist_t *);
-int changelist_postfix(prop_changelist_t *);
-void changelist_rename(prop_changelist_t *, const char *, const char *);
-void changelist_remove(prop_changelist_t *, const char *);
-void changelist_free(prop_changelist_t *);
-prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int);
-int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *);
-int changelist_haszonedchild(prop_changelist_t *);
-
-void remove_mountpoint(zfs_handle_t *);
-int create_parents(libzfs_handle_t *, char *, int);
-boolean_t isa_child_of(const char *dataset, const char *parent);
-
-zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *);
-zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *,
- nvlist_t *props);
-
-int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);
-
-boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);
-
-int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
- boolean_t modifying);
-
-void namespace_clear(libzfs_handle_t *);
-
-/*
- * libshare (sharemgr) interfaces used internally.
- */
-
-extern int zfs_init_libshare(libzfs_handle_t *, int);
-extern int zfs_parse_options(char *, zfs_share_proto_t);
-
-extern int zfs_unshare_proto(zfs_handle_t *,
- const char *, zfs_share_proto_t *);
-
-extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBZFS_IMPL_H */
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c
@@ -1,1929 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright 2015 RackTop Systems.
- * Copyright 2016 Nexenta Systems, Inc.
- */
-
-/*
- * Pool import support functions.
- *
- * To import a pool, we rely on reading the configuration information from the
- * ZFS label of each device. If we successfully read the label, then we
- * organize the configuration information in the following hierarchy:
- *
- * pool guid -> toplevel vdev guid -> label txg
- *
- * Duplicate entries matching this same tuple will be discarded. Once we have
- * examined every device, we pick the best label txg config for each toplevel
- * vdev. We then arrange these toplevel vdevs into a complete pool config, and
- * update any paths that have changed. Finally, we attempt to import the pool
- * using our derived config, and record the results.
- */
-
-#include <aio.h>
-#include <ctype.h>
-#include <devid.h>
-#include <dirent.h>
-#include <errno.h>
-#include <libintl.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <thread_pool.h>
-#include <libgeom.h>
-
-#include <sys/vdev_impl.h>
-
-#include "libzfs.h"
-#include "libzfs_impl.h"
-
-/*
- * Intermediate structures used to gather configuration information.
- */
-typedef struct config_entry {
- uint64_t ce_txg;
- nvlist_t *ce_config;
- struct config_entry *ce_next;
-} config_entry_t;
-
-typedef struct vdev_entry {
- uint64_t ve_guid;
- config_entry_t *ve_configs;
- struct vdev_entry *ve_next;
-} vdev_entry_t;
-
-typedef struct pool_entry {
- uint64_t pe_guid;
- vdev_entry_t *pe_vdevs;
- struct pool_entry *pe_next;
-} pool_entry_t;
-
-typedef struct name_entry {
- char *ne_name;
- uint64_t ne_guid;
- struct name_entry *ne_next;
-} name_entry_t;
-
-typedef struct pool_list {
- pool_entry_t *pools;
- name_entry_t *names;
-} pool_list_t;
-
-static char *
-get_devid(const char *path)
-{
-#ifdef have_devid
- int fd;
- ddi_devid_t devid;
- char *minor, *ret;
-
- if ((fd = open(path, O_RDONLY)) < 0)
- return (NULL);
-
- minor = NULL;
- ret = NULL;
- if (devid_get(fd, &devid) == 0) {
- if (devid_get_minor_name(fd, &minor) == 0)
- ret = devid_str_encode(devid, minor);
- if (minor != NULL)
- devid_str_free(minor);
- devid_free(devid);
- }
- (void) close(fd);
-
- return (ret);
-#else
- return (NULL);
-#endif
-}
-
-
-/*
- * Go through and fix up any path and/or devid information for the given vdev
- * configuration.
- */
-static int
-fix_paths(nvlist_t *nv, name_entry_t *names)
-{
- nvlist_t **child;
- uint_t c, children;
- uint64_t guid;
- name_entry_t *ne, *best;
- char *path, *devid;
- int matched;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- if (fix_paths(child[c], names) != 0)
- return (-1);
- return (0);
- }
-
- /*
- * This is a leaf (file or disk) vdev. In either case, go through
- * the name list and see if we find a matching guid. If so, replace
- * the path and see if we can calculate a new devid.
- *
- * There may be multiple names associated with a particular guid, in
- * which case we have overlapping slices or multiple paths to the same
- * disk. If this is the case, then we want to pick the path that is
- * the most similar to the original, where "most similar" is the number
- * of matching characters starting from the end of the path. This will
- * preserve slice numbers even if the disks have been reorganized, and
- * will also catch preferred disk names if multiple paths exist.
- */
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
- path = NULL;
-
- matched = 0;
- best = NULL;
- for (ne = names; ne != NULL; ne = ne->ne_next) {
- if (ne->ne_guid == guid) {
- const char *src, *dst;
- int count;
-
- if (path == NULL) {
- best = ne;
- break;
- }
-
- src = ne->ne_name + strlen(ne->ne_name) - 1;
- dst = path + strlen(path) - 1;
- for (count = 0; src >= ne->ne_name && dst >= path;
- src--, dst--, count++)
- if (*src != *dst)
- break;
-
- /*
- * At this point, 'count' is the number of characters
- * matched from the end.
- */
- if (count > matched || best == NULL) {
- best = ne;
- matched = count;
- }
- }
- }
-
- if (best == NULL)
- return (0);
-
- if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
- return (-1);
-
- if ((devid = get_devid(best->ne_name)) == NULL) {
- (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
- } else {
- if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) {
- devid_str_free(devid);
- return (-1);
- }
- devid_str_free(devid);
- }
-
- return (0);
-}
-
-/*
- * Add the given configuration to the list of known devices.
- */
-static int
-add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
- nvlist_t *config)
-{
- uint64_t pool_guid, vdev_guid, top_guid, txg, state;
- pool_entry_t *pe;
- vdev_entry_t *ve;
- config_entry_t *ce;
- name_entry_t *ne;
-
- /*
- * If this is a hot spare not currently in use or level 2 cache
- * device, add it to the list of names to translate, but don't do
- * anything else.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &state) == 0 &&
- (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
- if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
- return (-1);
-
- if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
- free(ne);
- return (-1);
- }
-
- ne->ne_guid = vdev_guid;
- ne->ne_next = pl->names;
- pl->names = ne;
-
- return (0);
- }
-
- /*
- * If we have a valid config but cannot read any of these fields, then
- * it means we have a half-initialized label. In vdev_label_init()
- * we write a label with txg == 0 so that we can identify the device
- * in case the user refers to the same disk later on. If we fail to
- * create the pool, we'll be left with a label in this state
- * which should not be considered part of a valid pool.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &pool_guid) != 0 ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
- &vdev_guid) != 0 ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
- &top_guid) != 0 ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0 || txg == 0) {
- return (0);
- }
-
- /*
- * First, see if we know about this pool. If not, then add it to the
- * list of known pools.
- */
- for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
- if (pe->pe_guid == pool_guid)
- break;
- }
-
- if (pe == NULL) {
- if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
- return (-1);
- }
- pe->pe_guid = pool_guid;
- pe->pe_next = pl->pools;
- pl->pools = pe;
- }
-
- /*
- * Second, see if we know about this toplevel vdev. Add it if its
- * missing.
- */
- for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
- if (ve->ve_guid == top_guid)
- break;
- }
-
- if (ve == NULL) {
- if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
- return (-1);
- }
- ve->ve_guid = top_guid;
- ve->ve_next = pe->pe_vdevs;
- pe->pe_vdevs = ve;
- }
-
- /*
- * Third, see if we have a config with a matching transaction group. If
- * so, then we do nothing. Otherwise, add it to the list of known
- * configs.
- */
- for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
- if (ce->ce_txg == txg)
- break;
- }
-
- if (ce == NULL) {
- if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) {
- return (-1);
- }
- ce->ce_txg = txg;
- ce->ce_config = fnvlist_dup(config);
- ce->ce_next = ve->ve_configs;
- ve->ve_configs = ce;
- }
-
- /*
- * At this point we've successfully added our config to the list of
- * known configs. The last thing to do is add the vdev guid -> path
- * mappings so that we can fix up the configuration as necessary before
- * doing the import.
- */
- if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
- return (-1);
-
- if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
- free(ne);
- return (-1);
- }
-
- ne->ne_guid = vdev_guid;
- ne->ne_next = pl->names;
- pl->names = ne;
-
- return (0);
-}
-
-/*
- * Returns true if the named pool matches the given GUID.
- */
-static int
-pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid,
- boolean_t *isactive)
-{
- zpool_handle_t *zhp;
- uint64_t theguid;
-
- if (zpool_open_silent(hdl, name, &zhp) != 0)
- return (-1);
-
- if (zhp == NULL) {
- *isactive = B_FALSE;
- return (0);
- }
-
- verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID,
- &theguid) == 0);
-
- zpool_close(zhp);
-
- *isactive = (theguid == guid);
- return (0);
-}
-
-static nvlist_t *
-refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
-{
- nvlist_t *nvl;
- zfs_cmd_t zc = { 0 };
- int err, dstbuf_size;
-
- if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0)
- return (NULL);
-
- dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4);
-
- if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) {
- zcmd_free_nvlists(&zc);
- return (NULL);
- }
-
- while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT,
- &zc)) != 0 && errno == ENOMEM) {
- if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
- zcmd_free_nvlists(&zc);
- return (NULL);
- }
- }
-
- if (err) {
- zcmd_free_nvlists(&zc);
- return (NULL);
- }
-
- if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) {
- zcmd_free_nvlists(&zc);
- return (NULL);
- }
-
- zcmd_free_nvlists(&zc);
- return (nvl);
-}
-
-/*
- * Determine if the vdev id is a hole in the namespace.
- */
-boolean_t
-vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
-{
- for (int c = 0; c < holes; c++) {
-
- /* Top-level is a hole */
- if (hole_array[c] == id)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Convert our list of pools into the definitive set of configurations. We
- * start by picking the best config for each toplevel vdev. Once that's done,
- * we assemble the toplevel vdevs into a full config for the pool. We make a
- * pass to fix up any incorrect paths, and then add it to the main list to
- * return to the user.
- */
-static nvlist_t *
-get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
- nvlist_t *policy)
-{
- pool_entry_t *pe;
- vdev_entry_t *ve;
- config_entry_t *ce;
- nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
- nvlist_t **spares, **l2cache;
- uint_t i, nspares, nl2cache;
- boolean_t config_seen;
- uint64_t best_txg;
- char *name, *hostname = NULL;
- uint64_t guid;
- uint_t children = 0;
- nvlist_t **child = NULL;
- uint_t holes;
- uint64_t *hole_array, max_id;
- uint_t c;
- boolean_t isactive;
- uint64_t hostid;
- nvlist_t *nvl;
- boolean_t found_one = B_FALSE;
- boolean_t valid_top_config = B_FALSE;
-
- if (nvlist_alloc(&ret, 0, 0) != 0)
- goto nomem;
-
- for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
- uint64_t id, max_txg = 0;
-
- if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
- goto nomem;
- config_seen = B_FALSE;
-
- /*
- * Iterate over all toplevel vdevs. Grab the pool configuration
- * from the first one we find, and then go through the rest and
- * add them as necessary to the 'vdevs' member of the config.
- */
- for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
-
- /*
- * Determine the best configuration for this vdev by
- * selecting the config with the latest transaction
- * group.
- */
- best_txg = 0;
- for (ce = ve->ve_configs; ce != NULL;
- ce = ce->ce_next) {
-
- if (ce->ce_txg > best_txg) {
- tmp = ce->ce_config;
- best_txg = ce->ce_txg;
- }
- }
-
- /*
- * We rely on the fact that the max txg for the
- * pool will contain the most up-to-date information
- * about the valid top-levels in the vdev namespace.
- */
- if (best_txg > max_txg) {
- (void) nvlist_remove(config,
- ZPOOL_CONFIG_VDEV_CHILDREN,
- DATA_TYPE_UINT64);
- (void) nvlist_remove(config,
- ZPOOL_CONFIG_HOLE_ARRAY,
- DATA_TYPE_UINT64_ARRAY);
-
- max_txg = best_txg;
- hole_array = NULL;
- holes = 0;
- max_id = 0;
- valid_top_config = B_FALSE;
-
- if (nvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
- verify(nvlist_add_uint64(config,
- ZPOOL_CONFIG_VDEV_CHILDREN,
- max_id) == 0);
- valid_top_config = B_TRUE;
- }
-
- if (nvlist_lookup_uint64_array(tmp,
- ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
- &holes) == 0) {
- verify(nvlist_add_uint64_array(config,
- ZPOOL_CONFIG_HOLE_ARRAY,
- hole_array, holes) == 0);
- }
- }
-
- if (!config_seen) {
- /*
- * Copy the relevant pieces of data to the pool
- * configuration:
- *
- * version
- * pool guid
- * name
- * comment (if available)
- * pool state
- * hostid (if available)
- * hostname (if available)
- */
- uint64_t state, version;
- char *comment = NULL;
-
- version = fnvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_VERSION);
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_VERSION, version);
- guid = fnvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_POOL_GUID);
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_POOL_GUID, guid);
- name = fnvlist_lookup_string(tmp,
- ZPOOL_CONFIG_POOL_NAME);
- fnvlist_add_string(config,
- ZPOOL_CONFIG_POOL_NAME, name);
-
- if (nvlist_lookup_string(tmp,
- ZPOOL_CONFIG_COMMENT, &comment) == 0)
- fnvlist_add_string(config,
- ZPOOL_CONFIG_COMMENT, comment);
-
- state = fnvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_POOL_STATE);
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_POOL_STATE, state);
-
- hostid = 0;
- if (nvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_HOSTID, hostid);
- hostname = fnvlist_lookup_string(tmp,
- ZPOOL_CONFIG_HOSTNAME);
- fnvlist_add_string(config,
- ZPOOL_CONFIG_HOSTNAME, hostname);
- }
-
- config_seen = B_TRUE;
- }
-
- /*
- * Add this top-level vdev to the child array.
- */
- verify(nvlist_lookup_nvlist(tmp,
- ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
- verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
- &id) == 0);
-
- if (id >= children) {
- nvlist_t **newchild;
-
- newchild = zfs_alloc(hdl, (id + 1) *
- sizeof (nvlist_t *));
- if (newchild == NULL)
- goto nomem;
-
- for (c = 0; c < children; c++)
- newchild[c] = child[c];
-
- free(child);
- child = newchild;
- children = id + 1;
- }
- if (nvlist_dup(nvtop, &child[id], 0) != 0)
- goto nomem;
-
- }
-
- /*
- * If we have information about all the top-levels then
- * clean up the nvlist which we've constructed. This
- * means removing any extraneous devices that are
- * beyond the valid range or adding devices to the end
- * of our array which appear to be missing.
- */
- if (valid_top_config) {
- if (max_id < children) {
- for (c = max_id; c < children; c++)
- nvlist_free(child[c]);
- children = max_id;
- } else if (max_id > children) {
- nvlist_t **newchild;
-
- newchild = zfs_alloc(hdl, (max_id) *
- sizeof (nvlist_t *));
- if (newchild == NULL)
- goto nomem;
-
- for (c = 0; c < children; c++)
- newchild[c] = child[c];
-
- free(child);
- child = newchild;
- children = max_id;
- }
- }
-
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &guid) == 0);
-
- /*
- * The vdev namespace may contain holes as a result of
- * device removal. We must add them back into the vdev
- * tree before we process any missing devices.
- */
- if (holes > 0) {
- ASSERT(valid_top_config);
-
- for (c = 0; c < children; c++) {
- nvlist_t *holey;
-
- if (child[c] != NULL ||
- !vdev_is_hole(hole_array, holes, c))
- continue;
-
- if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
- 0) != 0)
- goto nomem;
-
- /*
- * Holes in the namespace are treated as
- * "hole" top-level vdevs and have a
- * special flag set on them.
- */
- if (nvlist_add_string(holey,
- ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_HOLE) != 0 ||
- nvlist_add_uint64(holey,
- ZPOOL_CONFIG_ID, c) != 0 ||
- nvlist_add_uint64(holey,
- ZPOOL_CONFIG_GUID, 0ULL) != 0) {
- nvlist_free(holey);
- goto nomem;
- }
- child[c] = holey;
- }
- }
-
- /*
- * Look for any missing top-level vdevs. If this is the case,
- * create a faked up 'missing' vdev as a placeholder. We cannot
- * simply compress the child array, because the kernel performs
- * certain checks to make sure the vdev IDs match their location
- * in the configuration.
- */
- for (c = 0; c < children; c++) {
- if (child[c] == NULL) {
- nvlist_t *missing;
- if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
- 0) != 0)
- goto nomem;
- if (nvlist_add_string(missing,
- ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_MISSING) != 0 ||
- nvlist_add_uint64(missing,
- ZPOOL_CONFIG_ID, c) != 0 ||
- nvlist_add_uint64(missing,
- ZPOOL_CONFIG_GUID, 0ULL) != 0) {
- nvlist_free(missing);
- goto nomem;
- }
- child[c] = missing;
- }
- }
-
- /*
- * Put all of this pool's top-level vdevs into a root vdev.
- */
- if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
- goto nomem;
- if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_ROOT) != 0 ||
- nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
- nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
- nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- child, children) != 0) {
- nvlist_free(nvroot);
- goto nomem;
- }
-
- for (c = 0; c < children; c++)
- nvlist_free(child[c]);
- free(child);
- children = 0;
- child = NULL;
-
- /*
- * Go through and fix up any paths and/or devids based on our
- * known list of vdev GUID -> path mappings.
- */
- if (fix_paths(nvroot, pl->names) != 0) {
- nvlist_free(nvroot);
- goto nomem;
- }
-
- /*
- * Add the root vdev to this pool's configuration.
- */
- if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- nvroot) != 0) {
- nvlist_free(nvroot);
- goto nomem;
- }
- nvlist_free(nvroot);
-
- /*
- * zdb uses this path to report on active pools that were
- * imported or created using -R.
- */
- if (active_ok)
- goto add_pool;
-
- /*
- * Determine if this pool is currently active, in which case we
- * can't actually import it.
- */
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &name) == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &guid) == 0);
-
- if (pool_active(hdl, name, guid, &isactive) != 0)
- goto error;
-
- if (isactive) {
- nvlist_free(config);
- config = NULL;
- continue;
- }
-
- if (policy != NULL) {
- if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
- policy) != 0)
- goto nomem;
- }
-
- if ((nvl = refresh_config(hdl, config)) == NULL) {
- nvlist_free(config);
- config = NULL;
- continue;
- }
-
- nvlist_free(config);
- config = nvl;
-
- /*
- * Go through and update the paths for spares, now that we have
- * them.
- */
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- if (fix_paths(spares[i], pl->names) != 0)
- goto nomem;
- }
- }
-
- /*
- * Update the paths for l2cache devices.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache) == 0) {
- for (i = 0; i < nl2cache; i++) {
- if (fix_paths(l2cache[i], pl->names) != 0)
- goto nomem;
- }
- }
-
- /*
- * Restore the original information read from the actual label.
- */
- (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
- DATA_TYPE_UINT64);
- (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
- DATA_TYPE_STRING);
- if (hostid != 0) {
- verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
- hostid) == 0);
- verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
- hostname) == 0);
- }
-
-add_pool:
- /*
- * Add this pool to the list of configs.
- */
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &name) == 0);
- if (nvlist_add_nvlist(ret, name, config) != 0)
- goto nomem;
-
- found_one = B_TRUE;
- nvlist_free(config);
- config = NULL;
- }
-
- if (!found_one) {
- nvlist_free(ret);
- ret = NULL;
- }
-
- return (ret);
-
-nomem:
- (void) no_memory(hdl);
-error:
- nvlist_free(config);
- nvlist_free(ret);
- for (c = 0; c < children; c++)
- nvlist_free(child[c]);
- free(child);
-
- return (NULL);
-}
-
-/*
- * Return the offset of the given label.
- */
-static uint64_t
-label_offset(uint64_t size, int l)
-{
- ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
- return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
- 0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
-}
-
-/*
- * Given a file descriptor, read the label information and return an nvlist
- * describing the configuration, if there is one.
- * Return 0 on success, or -1 on failure
- */
-int
-zpool_read_label(int fd, nvlist_t **config)
-{
- struct stat64 statbuf;
- int l;
- vdev_label_t *label;
- uint64_t state, txg, size;
-
- *config = NULL;
-
- if (fstat64(fd, &statbuf) == -1)
- return (-1);
- size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
-
- if ((label = malloc(sizeof (vdev_label_t))) == NULL)
- return (-1);
-
- for (l = 0; l < VDEV_LABELS; l++) {
- if (pread64(fd, label, sizeof (vdev_label_t),
- label_offset(size, l)) != sizeof (vdev_label_t))
- continue;
-
- if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
- sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
- continue;
-
- if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 || state > POOL_STATE_L2CACHE) {
- nvlist_free(*config);
- continue;
- }
-
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
- (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0 || txg == 0)) {
- nvlist_free(*config);
- continue;
- }
-
- free(label);
- return (0);
- }
-
- free(label);
- *config = NULL;
- errno = ENOENT;
- return (-1);
-}
-
-/*
- * Given a file descriptor, read the label information and return an nvlist
- * describing the configuration, if there is one.
- * returns the number of valid labels found
- * If a label is found, returns it via config. The caller is responsible for
- * freeing it.
- */
-int
-zpool_read_all_labels(int fd, nvlist_t **config)
-{
- struct stat64 statbuf;
- struct aiocb aiocbs[VDEV_LABELS];
- struct aiocb *aiocbps[VDEV_LABELS];
- int l;
- vdev_phys_t *labels;
- uint64_t state, txg, size;
- int nlabels = 0;
-
- *config = NULL;
-
- if (fstat64(fd, &statbuf) == -1)
- return (0);
- size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
-
- if ((labels = calloc(VDEV_LABELS, sizeof (vdev_phys_t))) == NULL)
- return (0);
-
- memset(aiocbs, 0, sizeof(aiocbs));
- for (l = 0; l < VDEV_LABELS; l++) {
- aiocbs[l].aio_fildes = fd;
- aiocbs[l].aio_offset = label_offset(size, l) + VDEV_SKIP_SIZE;
- aiocbs[l].aio_buf = &labels[l];
- aiocbs[l].aio_nbytes = sizeof(vdev_phys_t);
- aiocbs[l].aio_lio_opcode = LIO_READ;
- aiocbps[l] = &aiocbs[l];
- }
-
- if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) {
- if (errno == EAGAIN || errno == EINTR || errno == EIO) {
- for (l = 0; l < VDEV_LABELS; l++) {
- errno = 0;
- int r = aio_error(&aiocbs[l]);
- if (r != EINVAL)
- (void)aio_return(&aiocbs[l]);
- }
- }
- free(labels);
- return (0);
- }
-
- for (l = 0; l < VDEV_LABELS; l++) {
- nvlist_t *temp = NULL;
-
- if (aio_return(&aiocbs[l]) != sizeof(vdev_phys_t))
- continue;
-
- if (nvlist_unpack(labels[l].vp_nvlist,
- sizeof (labels[l].vp_nvlist), &temp, 0) != 0)
- continue;
-
- if (nvlist_lookup_uint64(temp, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 || state > POOL_STATE_L2CACHE) {
- nvlist_free(temp);
- temp = NULL;
- continue;
- }
-
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
- (nvlist_lookup_uint64(temp, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0 || txg == 0)) {
- nvlist_free(temp);
- temp = NULL;
- continue;
- }
- if (temp)
- *config = temp;
-
- nlabels++;
- }
-
- free(labels);
- return (nlabels);
-}
-
-typedef struct rdsk_node {
- char *rn_name;
- int rn_dfd;
- libzfs_handle_t *rn_hdl;
- nvlist_t *rn_config;
- avl_tree_t *rn_avl;
- avl_node_t rn_node;
- boolean_t rn_nozpool;
-} rdsk_node_t;
-
-static int
-slice_cache_compare(const void *arg1, const void *arg2)
-{
- const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
- const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
- char *nm1slice, *nm2slice;
- int rv;
-
- /*
- * slices zero and two are the most likely to provide results,
- * so put those first
- */
- nm1slice = strstr(nm1, "s0");
- nm2slice = strstr(nm2, "s0");
- if (nm1slice && !nm2slice) {
- return (-1);
- }
- if (!nm1slice && nm2slice) {
- return (1);
- }
- nm1slice = strstr(nm1, "s2");
- nm2slice = strstr(nm2, "s2");
- if (nm1slice && !nm2slice) {
- return (-1);
- }
- if (!nm1slice && nm2slice) {
- return (1);
- }
-
- rv = strcmp(nm1, nm2);
- if (rv == 0)
- return (0);
- return (rv > 0 ? 1 : -1);
-}
-
-#ifdef illumos
-static void
-check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
- diskaddr_t size, uint_t blksz)
-{
- rdsk_node_t tmpnode;
- rdsk_node_t *node;
- char sname[MAXNAMELEN];
-
- tmpnode.rn_name = &sname[0];
- (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
- diskname, partno);
- /*
- * protect against division by zero for disk labels that
- * contain a bogus sector size
- */
- if (blksz == 0)
- blksz = DEV_BSIZE;
- /* too small to contain a zpool? */
- if ((size < (SPA_MINDEVSIZE / blksz)) &&
- (node = avl_find(r, &tmpnode, NULL)))
- node->rn_nozpool = B_TRUE;
-}
-#endif /* illumos */
-
-static void
-nozpool_all_slices(avl_tree_t *r, const char *sname)
-{
-#ifdef illumos
- char diskname[MAXNAMELEN];
- char *ptr;
- int i;
-
- (void) strncpy(diskname, sname, MAXNAMELEN);
- if (((ptr = strrchr(diskname, 's')) == NULL) &&
- ((ptr = strrchr(diskname, 'p')) == NULL))
- return;
- ptr[0] = 's';
- ptr[1] = '\0';
- for (i = 0; i < NDKMAP; i++)
- check_one_slice(r, diskname, i, 0, 1);
- ptr[0] = 'p';
- for (i = 0; i <= FD_NUMPART; i++)
- check_one_slice(r, diskname, i, 0, 1);
-#endif /* illumos */
-}
-
-#ifdef illumos
-static void
-check_slices(avl_tree_t *r, int fd, const char *sname)
-{
- struct extvtoc vtoc;
- struct dk_gpt *gpt;
- char diskname[MAXNAMELEN];
- char *ptr;
- int i;
-
- (void) strncpy(diskname, sname, MAXNAMELEN);
- if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
- return;
- ptr[1] = '\0';
-
- if (read_extvtoc(fd, &vtoc) >= 0) {
- for (i = 0; i < NDKMAP; i++)
- check_one_slice(r, diskname, i,
- vtoc.v_part[i].p_size, vtoc.v_sectorsz);
- } else if (efi_alloc_and_read(fd, &gpt) >= 0) {
- /*
- * on x86 we'll still have leftover links that point
- * to slices s[9-15], so use NDKMAP instead
- */
- for (i = 0; i < NDKMAP; i++)
- check_one_slice(r, diskname, i,
- gpt->efi_parts[i].p_size, gpt->efi_lbasize);
- /* nodes p[1-4] are never used with EFI labels */
- ptr[0] = 'p';
- for (i = 1; i <= FD_NUMPART; i++)
- check_one_slice(r, diskname, i, 0, 1);
- efi_free(gpt);
- }
-}
-#endif /* illumos */
-
-static void
-zpool_open_func(void *arg)
-{
- rdsk_node_t *rn = arg;
- struct stat64 statbuf;
- nvlist_t *config;
- int fd;
-
- if (rn->rn_nozpool)
- return;
- if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
- /* symlink to a device that's no longer there */
- if (errno == ENOENT)
- nozpool_all_slices(rn->rn_avl, rn->rn_name);
- return;
- }
- /*
- * Ignore failed stats. We only want regular
- * files, character devs and block devs.
- */
- if (fstat64(fd, &statbuf) != 0 ||
- (!S_ISREG(statbuf.st_mode) &&
- !S_ISCHR(statbuf.st_mode) &&
- !S_ISBLK(statbuf.st_mode))) {
- (void) close(fd);
- return;
- }
- /* this file is too small to hold a zpool */
-#ifdef illumos
- if (S_ISREG(statbuf.st_mode) &&
- statbuf.st_size < SPA_MINDEVSIZE) {
- (void) close(fd);
- return;
- } else if (!S_ISREG(statbuf.st_mode)) {
- /*
- * Try to read the disk label first so we don't have to
- * open a bunch of minor nodes that can't have a zpool.
- */
- check_slices(rn->rn_avl, fd, rn->rn_name);
- }
-#else /* !illumos */
- if (statbuf.st_size < SPA_MINDEVSIZE) {
- (void) close(fd);
- return;
- }
-#endif /* illumos */
-
- if ((zpool_read_label(fd, &config)) != 0 && errno == ENOMEM) {
- (void) close(fd);
- (void) no_memory(rn->rn_hdl);
- return;
- }
- (void) close(fd);
-
- rn->rn_config = config;
-}
-
-/*
- * Given a file descriptor, clear (zero) the label information.
- */
-int
-zpool_clear_label(int fd)
-{
- struct stat64 statbuf;
- int l;
- vdev_label_t *label;
- uint64_t size;
-
- if (fstat64(fd, &statbuf) == -1)
- return (0);
- size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
-
- if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL)
- return (-1);
-
- for (l = 0; l < VDEV_LABELS; l++) {
- if (pwrite64(fd, label, sizeof (vdev_label_t),
- label_offset(size, l)) != sizeof (vdev_label_t)) {
- free(label);
- return (-1);
- }
- }
-
- free(label);
- return (0);
-}
-
-/*
- * Given a list of directories to search, find all pools stored on disk. This
- * includes partial pools which are not available to import. If no args are
- * given (argc is 0), then the default directory (/dev/dsk) is searched.
- * poolname or guid (but not both) are provided by the caller when trying
- * to import a specific pool.
- */
-static nvlist_t *
-zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
-{
- int i, dirs = iarg->paths;
- struct dirent64 *dp;
- char path[MAXPATHLEN];
- char *end, **dir = iarg->path;
- size_t pathleft;
- nvlist_t *ret = NULL;
- static char *default_dir = "/dev";
- pool_list_t pools = { 0 };
- pool_entry_t *pe, *penext;
- vdev_entry_t *ve, *venext;
- config_entry_t *ce, *cenext;
- name_entry_t *ne, *nenext;
- avl_tree_t slice_cache;
- rdsk_node_t *slice;
- void *cookie;
- boolean_t skip_zvols = B_FALSE;
- int value;
- size_t size = sizeof(value);
-
- if (dirs == 0) {
- dirs = 1;
- dir = &default_dir;
- }
-
- if (sysctlbyname("vfs.zfs.vol.recursive", &value, &size, NULL, 0) == 0
- && value == 0) {
- skip_zvols = B_TRUE;
- }
-
- /*
- * Go through and read the label configuration information from every
- * possible device, organizing the information according to pool GUID
- * and toplevel GUID.
- */
- for (i = 0; i < dirs; i++) {
- tpool_t *t;
- char rdsk[MAXPATHLEN];
- int dfd;
- boolean_t config_failed = B_FALSE;
- DIR *dirp;
-
- /* use realpath to normalize the path */
- if (realpath(dir[i], path) == 0) {
- (void) zfs_error_fmt(hdl, EZFS_BADPATH,
- dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]);
- goto error;
- }
- end = &path[strlen(path)];
- *end++ = '/';
- *end = 0;
- pathleft = &path[sizeof (path)] - end;
-
-#ifdef illumos
- /*
- * Using raw devices instead of block devices when we're
- * reading the labels skips a bunch of slow operations during
- * close(2) processing, so we replace /dev/dsk with /dev/rdsk.
- */
- if (strcmp(path, ZFS_DISK_ROOTD) == 0)
- (void) strlcpy(rdsk, ZFS_RDISK_ROOTD, sizeof (rdsk));
- else
-#endif
- (void) strlcpy(rdsk, path, sizeof (rdsk));
-
- if ((dfd = open64(rdsk, O_RDONLY)) < 0 ||
- (dirp = fdopendir(dfd)) == NULL) {
- if (dfd >= 0)
- (void) close(dfd);
- zfs_error_aux(hdl, strerror(errno));
- (void) zfs_error_fmt(hdl, EZFS_BADPATH,
- dgettext(TEXT_DOMAIN, "cannot open '%s'"),
- rdsk);
- goto error;
- }
-
- avl_create(&slice_cache, slice_cache_compare,
- sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
-
- if (strcmp(rdsk, "/dev/") == 0) {
- struct gmesh mesh;
- struct gclass *mp;
- struct ggeom *gp;
- struct gprovider *pp;
-
- errno = geom_gettree(&mesh);
- if (errno != 0) {
- zfs_error_aux(hdl, strerror(errno));
- (void) zfs_error_fmt(hdl, EZFS_BADPATH,
- dgettext(TEXT_DOMAIN, "cannot get GEOM tree"));
- goto error;
- }
-
- LIST_FOREACH(mp, &mesh.lg_class, lg_class) {
- if (skip_zvols &&
- strcmp(mp->lg_name, "ZFS::ZVOL") == 0) {
- continue;
- }
- LIST_FOREACH(gp, &mp->lg_geom, lg_geom) {
- LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
- slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
- slice->rn_name = zfs_strdup(hdl, pp->lg_name);
- slice->rn_avl = &slice_cache;
- slice->rn_dfd = dfd;
- slice->rn_hdl = hdl;
- slice->rn_nozpool = B_FALSE;
- avl_add(&slice_cache, slice);
- }
- }
- }
-
- geom_deletetree(&mesh);
- goto skipdir;
- }
-
- /*
- * This is not MT-safe, but we have no MT consumers of libzfs
- */
- while ((dp = readdir64(dirp)) != NULL) {
- const char *name = dp->d_name;
- if (name[0] == '.' &&
- (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
- continue;
-
- slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
- slice->rn_name = zfs_strdup(hdl, name);
- slice->rn_avl = &slice_cache;
- slice->rn_dfd = dfd;
- slice->rn_hdl = hdl;
- slice->rn_nozpool = B_FALSE;
- avl_add(&slice_cache, slice);
- }
-skipdir:
- /*
- * create a thread pool to do all of this in parallel;
- * rn_nozpool is not protected, so this is racy in that
- * multiple tasks could decide that the same slice can
- * not hold a zpool, which is benign. Also choose
- * double the number of processors; we hold a lot of
- * locks in the kernel, so going beyond this doesn't
- * buy us much.
- */
- t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
- 0, NULL);
- for (slice = avl_first(&slice_cache); slice;
- (slice = avl_walk(&slice_cache, slice,
- AVL_AFTER)))
- (void) tpool_dispatch(t, zpool_open_func, slice);
- tpool_wait(t);
- tpool_destroy(t);
-
- cookie = NULL;
- while ((slice = avl_destroy_nodes(&slice_cache,
- &cookie)) != NULL) {
- if (slice->rn_config != NULL && !config_failed) {
- nvlist_t *config = slice->rn_config;
- boolean_t matched = B_TRUE;
-
- if (iarg->poolname != NULL) {
- char *pname;
-
- matched = nvlist_lookup_string(config,
- ZPOOL_CONFIG_POOL_NAME,
- &pname) == 0 &&
- strcmp(iarg->poolname, pname) == 0;
- } else if (iarg->guid != 0) {
- uint64_t this_guid;
-
- matched = nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_POOL_GUID,
- &this_guid) == 0 &&
- iarg->guid == this_guid;
- }
- if (matched) {
- /*
- * use the non-raw path for the config
- */
- (void) strlcpy(end, slice->rn_name,
- pathleft);
- if (add_config(hdl, &pools, path,
- config) != 0)
- config_failed = B_TRUE;
- }
- nvlist_free(config);
- }
- free(slice->rn_name);
- free(slice);
- }
- avl_destroy(&slice_cache);
-
- (void) closedir(dirp);
-
- if (config_failed)
- goto error;
- }
-
- ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
-
-error:
- for (pe = pools.pools; pe != NULL; pe = penext) {
- penext = pe->pe_next;
- for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
- venext = ve->ve_next;
- for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
- cenext = ce->ce_next;
- nvlist_free(ce->ce_config);
- free(ce);
- }
- free(ve);
- }
- free(pe);
- }
-
- for (ne = pools.names; ne != NULL; ne = nenext) {
- nenext = ne->ne_next;
- free(ne->ne_name);
- free(ne);
- }
-
- return (ret);
-}
-
-nvlist_t *
-zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
-{
- importargs_t iarg = { 0 };
-
- iarg.paths = argc;
- iarg.path = argv;
-
- return (zpool_find_import_impl(hdl, &iarg));
-}
-
-/*
- * Given a cache file, return the contents as a list of importable pools.
- * poolname or guid (but not both) are provided by the caller when trying
- * to import a specific pool.
- */
-nvlist_t *
-zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
- char *poolname, uint64_t guid)
-{
- char *buf;
- int fd;
- struct stat64 statbuf;
- nvlist_t *raw, *src, *dst;
- nvlist_t *pools;
- nvpair_t *elem;
- char *name;
- uint64_t this_guid;
- boolean_t active;
-
- verify(poolname == NULL || guid == 0);
-
- if ((fd = open(cachefile, O_RDONLY)) < 0) {
- zfs_error_aux(hdl, "%s", strerror(errno));
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN, "failed to open cache file"));
- return (NULL);
- }
-
- if (fstat64(fd, &statbuf) != 0) {
- zfs_error_aux(hdl, "%s", strerror(errno));
- (void) close(fd);
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
- return (NULL);
- }
-
- if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) {
- (void) close(fd);
- return (NULL);
- }
-
- if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
- (void) close(fd);
- free(buf);
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN,
- "failed to read cache file contents"));
- return (NULL);
- }
-
- (void) close(fd);
-
- if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
- free(buf);
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN,
- "invalid or corrupt cache file contents"));
- return (NULL);
- }
-
- free(buf);
-
- /*
- * Go through and get the current state of the pools and refresh their
- * state.
- */
- if (nvlist_alloc(&pools, 0, 0) != 0) {
- (void) no_memory(hdl);
- nvlist_free(raw);
- return (NULL);
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
- src = fnvpair_value_nvlist(elem);
-
- name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
- if (poolname != NULL && strcmp(poolname, name) != 0)
- continue;
-
- this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
- if (guid != 0 && guid != this_guid)
- continue;
-
- if (pool_active(hdl, name, this_guid, &active) != 0) {
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
-
- if (active)
- continue;
-
- if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
- cachefile) != 0) {
- (void) no_memory(hdl);
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
-
- if ((dst = refresh_config(hdl, src)) == NULL) {
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
-
- if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
- (void) no_memory(hdl);
- nvlist_free(dst);
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
- nvlist_free(dst);
- }
-
- nvlist_free(raw);
- return (pools);
-}
-
-static int
-name_or_guid_exists(zpool_handle_t *zhp, void *data)
-{
- importargs_t *import = data;
- int found = 0;
-
- if (import->poolname != NULL) {
- char *pool_name;
-
- verify(nvlist_lookup_string(zhp->zpool_config,
- ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
- if (strcmp(pool_name, import->poolname) == 0)
- found = 1;
- } else {
- uint64_t pool_guid;
-
- verify(nvlist_lookup_uint64(zhp->zpool_config,
- ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
- if (pool_guid == import->guid)
- found = 1;
- }
-
- zpool_close(zhp);
- return (found);
-}
-
-nvlist_t *
-zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
-{
- nvlist_t *pools = NULL;
-
- verify(import->poolname == NULL || import->guid == 0);
-
- if (import->unique)
- import->exists = zpool_iter(hdl, name_or_guid_exists, import);
-
- if (import->cachefile != NULL)
- pools = zpool_find_import_cached(hdl, import->cachefile,
- import->poolname, import->guid);
- else
- pools = zpool_find_import_impl(hdl, import);
-
- return (pools);
-}
-
-static boolean_t
-pool_match(nvlist_t *cfg, char *tgt)
-{
- uint64_t v, guid = strtoull(tgt, NULL, 0);
- char *s;
-
- if (guid != 0) {
- if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
- return (v == guid);
- } else {
- if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
- return (strcmp(s, tgt) == 0);
- }
- return (B_FALSE);
-}
-
-int
-zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp,
- importargs_t *args)
-{
- nvlist_t *pools;
- nvlist_t *match = NULL;
- nvlist_t *config = NULL;
- char *sepp = NULL;
- int count = 0;
- char *targetdup = strdup(target);
-
- *configp = NULL;
-
- if ((sepp = strpbrk(targetdup, "/@")) != NULL) {
- *sepp = '\0';
- }
-
- pools = zpool_search_import(hdl, args);
-
- if (pools != NULL) {
- nvpair_t *elem = NULL;
- while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
- VERIFY0(nvpair_value_nvlist(elem, &config));
- if (pool_match(config, targetdup)) {
- count++;
- if (match != NULL) {
- /* multiple matches found */
- continue;
- } else {
- match = config;
- }
- }
- }
- }
-
- if (count == 0) {
- free(targetdup);
- return (ENOENT);
- }
-
- if (count > 1) {
- free(targetdup);
- return (EINVAL);
- }
-
- *configp = match;
- free(targetdup);
-
- return (0);
-}
-
-boolean_t
-find_guid(nvlist_t *nv, uint64_t guid)
-{
- uint64_t tmp;
- nvlist_t **child;
- uint_t c, children;
-
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0);
- if (tmp == guid)
- return (B_TRUE);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- if (find_guid(child[c], guid))
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-typedef struct aux_cbdata {
- const char *cb_type;
- uint64_t cb_guid;
- zpool_handle_t *cb_zhp;
-} aux_cbdata_t;
-
-static int
-find_aux(zpool_handle_t *zhp, void *data)
-{
- aux_cbdata_t *cbp = data;
- nvlist_t **list;
- uint_t i, count;
- uint64_t guid;
- nvlist_t *nvroot;
-
- verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
-
- if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type,
- &list, &count) == 0) {
- for (i = 0; i < count; i++) {
- verify(nvlist_lookup_uint64(list[i],
- ZPOOL_CONFIG_GUID, &guid) == 0);
- if (guid == cbp->cb_guid) {
- cbp->cb_zhp = zhp;
- return (1);
- }
- }
- }
-
- zpool_close(zhp);
- return (0);
-}
-
-/*
- * Determines if the pool is in use. If so, it returns true and the state of
- * the pool as well as the name of the pool. Both strings are allocated and
- * must be freed by the caller.
- */
-int
-zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
- boolean_t *inuse)
-{
- nvlist_t *config;
- char *name;
- boolean_t ret;
- uint64_t guid, vdev_guid;
- zpool_handle_t *zhp;
- nvlist_t *pool_config;
- uint64_t stateval, isspare;
- aux_cbdata_t cb = { 0 };
- boolean_t isactive;
-
- *inuse = B_FALSE;
-
- if (zpool_read_label(fd, &config) != 0 && errno == ENOMEM) {
- (void) no_memory(hdl);
- return (-1);
- }
-
- if (config == NULL)
- return (0);
-
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &stateval) == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
- &vdev_guid) == 0);
-
- if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) {
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &name) == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &guid) == 0);
- }
-
- switch (stateval) {
- case POOL_STATE_EXPORTED:
- /*
- * A pool with an exported state may in fact be imported
- * read-only, so check the in-core state to see if it's
- * active and imported read-only. If it is, set
- * its state to active.
- */
- if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
- (zhp = zpool_open_canfail(hdl, name)) != NULL) {
- if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
- stateval = POOL_STATE_ACTIVE;
-
- /*
- * All we needed the zpool handle for is the
- * readonly prop check.
- */
- zpool_close(zhp);
- }
-
- ret = B_TRUE;
- break;
-
- case POOL_STATE_ACTIVE:
- /*
- * For an active pool, we have to determine if it's really part
- * of a currently active pool (in which case the pool will exist
- * and the guid will be the same), or whether it's part of an
- * active pool that was disconnected without being explicitly
- * exported.
- */
- if (pool_active(hdl, name, guid, &isactive) != 0) {
- nvlist_free(config);
- return (-1);
- }
-
- if (isactive) {
- /*
- * Because the device may have been removed while
- * offlined, we only report it as active if the vdev is
- * still present in the config. Otherwise, pretend like
- * it's not in use.
- */
- if ((zhp = zpool_open_canfail(hdl, name)) != NULL &&
- (pool_config = zpool_get_config(zhp, NULL))
- != NULL) {
- nvlist_t *nvroot;
-
- verify(nvlist_lookup_nvlist(pool_config,
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- ret = find_guid(nvroot, vdev_guid);
- } else {
- ret = B_FALSE;
- }
-
- /*
- * If this is an active spare within another pool, we
- * treat it like an unused hot spare. This allows the
- * user to create a pool with a hot spare that currently
- * in use within another pool. Since we return B_TRUE,
- * libdiskmgt will continue to prevent generic consumers
- * from using the device.
- */
- if (ret && nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
- stateval = POOL_STATE_SPARE;
-
- if (zhp != NULL)
- zpool_close(zhp);
- } else {
- stateval = POOL_STATE_POTENTIALLY_ACTIVE;
- ret = B_TRUE;
- }
- break;
-
- case POOL_STATE_SPARE:
- /*
- * For a hot spare, it can be either definitively in use, or
- * potentially active. To determine if it's in use, we iterate
- * over all pools in the system and search for one with a spare
- * with a matching guid.
- *
- * Due to the shared nature of spares, we don't actually report
- * the potentially active case as in use. This means the user
- * can freely create pools on the hot spares of exported pools,
- * but to do otherwise makes the resulting code complicated, and
- * we end up having to deal with this case anyway.
- */
- cb.cb_zhp = NULL;
- cb.cb_guid = vdev_guid;
- cb.cb_type = ZPOOL_CONFIG_SPARES;
- if (zpool_iter(hdl, find_aux, &cb) == 1) {
- name = (char *)zpool_get_name(cb.cb_zhp);
- ret = B_TRUE;
- } else {
- ret = B_FALSE;
- }
- break;
-
- case POOL_STATE_L2CACHE:
-
- /*
- * Check if any pool is currently using this l2cache device.
- */
- cb.cb_zhp = NULL;
- cb.cb_guid = vdev_guid;
- cb.cb_type = ZPOOL_CONFIG_L2CACHE;
- if (zpool_iter(hdl, find_aux, &cb) == 1) {
- name = (char *)zpool_get_name(cb.cb_zhp);
- ret = B_TRUE;
- } else {
- ret = B_FALSE;
- }
- break;
-
- default:
- ret = B_FALSE;
- }
-
-
- if (ret) {
- if ((*namestr = zfs_strdup(hdl, name)) == NULL) {
- if (cb.cb_zhp)
- zpool_close(cb.cb_zhp);
- nvlist_free(config);
- return (-1);
- }
- *state = (pool_state_t)stateval;
- }
-
- if (cb.cb_zhp)
- zpool_close(cb.cb_zhp);
-
- nvlist_free(config);
- *inuse = ret;
- return (0);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_iter.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_iter.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_iter.c
@@ -1,546 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2019 Datto Inc.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <stddef.h>
-#include <libintl.h>
-#include <libzfs.h>
-
-#include "libzfs_impl.h"
-
-int
-zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data)
-{
- nvlist_t *nvl = zfs_get_clones_nvl(zhp);
- nvpair_t *pair;
-
- if (nvl == NULL)
- return (0);
-
- for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
- pair = nvlist_next_nvpair(nvl, pair)) {
- zfs_handle_t *clone = zfs_open(zhp->zfs_hdl, nvpair_name(pair),
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (clone != NULL) {
- int err = func(clone, data);
- if (err != 0)
- return (err);
- }
- }
- return (0);
-}
-
-static int
-zfs_do_list_ioctl(zfs_handle_t *zhp, unsigned long arg, zfs_cmd_t *zc)
-{
- int rc;
- uint64_t orig_cookie;
-
- orig_cookie = zc->zc_cookie;
-top:
- (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
- rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc);
-
- if (rc == -1) {
- switch (errno) {
- case ENOMEM:
- /* expand nvlist memory and try again */
- if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) {
- zcmd_free_nvlists(zc);
- return (-1);
- }
- zc->zc_cookie = orig_cookie;
- goto top;
- /*
- * An errno value of ESRCH indicates normal completion.
- * If ENOENT is returned, then the underlying dataset
- * has been removed since we obtained the handle.
- */
- case ESRCH:
- case ENOENT:
- rc = 1;
- break;
- default:
- rc = zfs_standard_error(zhp->zfs_hdl, errno,
- dgettext(TEXT_DOMAIN,
- "cannot iterate filesystems"));
- break;
- }
- }
- return (rc);
-}
-
-/*
- * Iterate over all child filesystems
- */
-int
-zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
-{
- zfs_cmd_t zc = { 0 };
- zfs_handle_t *nzhp;
- int ret;
-
- if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM)
- return (0);
-
- if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
- return (-1);
-
- while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT,
- &zc)) == 0) {
- /*
- * Silently ignore errors, as the only plausible explanation is
- * that the pool has since been removed.
- */
- if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
- &zc)) == NULL) {
- continue;
- }
-
- if ((ret = func(nzhp, data)) != 0) {
- zcmd_free_nvlists(&zc);
- return (ret);
- }
- }
- zcmd_free_nvlists(&zc);
- return ((ret < 0) ? ret : 0);
-}
-
-/*
- * Iterate over all snapshots
- */
-int
-zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func,
- void *data, uint64_t min_txg, uint64_t max_txg)
-{
- zfs_cmd_t zc = { 0 };
- zfs_handle_t *nzhp;
- int ret;
- nvlist_t *range_nvl = NULL;
-
- if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT ||
- zhp->zfs_type == ZFS_TYPE_BOOKMARK)
- return (0);
-
- zc.zc_simple = simple;
-
- if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
- return (-1);
-
- if (min_txg != 0) {
- range_nvl = fnvlist_alloc();
- fnvlist_add_uint64(range_nvl, SNAP_ITER_MIN_TXG, min_txg);
- }
- if (max_txg != 0) {
- if (range_nvl == NULL)
- range_nvl = fnvlist_alloc();
- fnvlist_add_uint64(range_nvl, SNAP_ITER_MAX_TXG, max_txg);
- }
-
- if (range_nvl != NULL &&
- zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, range_nvl) != 0) {
- zcmd_free_nvlists(&zc);
- fnvlist_free(range_nvl);
- return (-1);
- }
-
- while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT,
- &zc)) == 0) {
-
- if (simple)
- nzhp = make_dataset_simple_handle_zc(zhp, &zc);
- else
- nzhp = make_dataset_handle_zc(zhp->zfs_hdl, &zc);
- if (nzhp == NULL)
- continue;
-
- if ((ret = func(nzhp, data)) != 0) {
- zcmd_free_nvlists(&zc);
- fnvlist_free(range_nvl);
- return (ret);
- }
- }
- zcmd_free_nvlists(&zc);
- fnvlist_free(range_nvl);
- return ((ret < 0) ? ret : 0);
-}
-
-/*
- * Iterate over all bookmarks
- */
-int
-zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data)
-{
- zfs_handle_t *nzhp;
- nvlist_t *props = NULL;
- nvlist_t *bmarks = NULL;
- int err;
-
- if ((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) != 0)
- return (0);
-
- /* Setup the requested properties nvlist. */
- props = fnvlist_alloc();
- fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID));
- fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG));
- fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATION));
-
- if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0)
- goto out;
-
- for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
- pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
- char name[ZFS_MAX_DATASET_NAME_LEN];
- char *bmark_name;
- nvlist_t *bmark_props;
-
- bmark_name = nvpair_name(pair);
- bmark_props = fnvpair_value_nvlist(pair);
-
- (void) snprintf(name, sizeof (name), "%s#%s", zhp->zfs_name,
- bmark_name);
-
- nzhp = make_bookmark_handle(zhp, name, bmark_props);
- if (nzhp == NULL)
- continue;
-
- if ((err = func(nzhp, data)) != 0)
- goto out;
- }
-
-out:
- fnvlist_free(props);
- fnvlist_free(bmarks);
-
- return (err);
-}
-
-/*
- * Routines for dealing with the sorted snapshot functionality
- */
-typedef struct zfs_node {
- zfs_handle_t *zn_handle;
- avl_node_t zn_avlnode;
-} zfs_node_t;
-
-static int
-zfs_sort_snaps(zfs_handle_t *zhp, void *data)
-{
- avl_tree_t *avl = data;
- zfs_node_t *node;
- zfs_node_t search;
-
- search.zn_handle = zhp;
- node = avl_find(avl, &search, NULL);
- if (node) {
- /*
- * If this snapshot was renamed while we were creating the
- * AVL tree, it's possible that we already inserted it under
- * its old name. Remove the old handle before adding the new
- * one.
- */
- zfs_close(node->zn_handle);
- avl_remove(avl, node);
- free(node);
- }
-
- node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
- node->zn_handle = zhp;
- avl_add(avl, node);
-
- return (0);
-}
-
-static int
-zfs_snapshot_compare(const void *larg, const void *rarg)
-{
- zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
- zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
- uint64_t lcreate, rcreate;
-
- /*
- * Sort them according to creation time. We use the hidden
- * CREATETXG property to get an absolute ordering of snapshots.
- */
- lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
- rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
-
- return (AVL_CMP(lcreate, rcreate));
-}
-
-int
-zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data,
- uint64_t min_txg, uint64_t max_txg)
-{
- int ret = 0;
- zfs_node_t *node;
- avl_tree_t avl;
- void *cookie = NULL;
-
- avl_create(&avl, zfs_snapshot_compare,
- sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode));
-
- ret = zfs_iter_snapshots(zhp, B_FALSE, zfs_sort_snaps, &avl, min_txg,
- max_txg);
-
- for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node))
- ret |= callback(node->zn_handle, data);
-
- while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL)
- free(node);
-
- avl_destroy(&avl);
-
- return (ret);
-}
-
-typedef struct {
- char *ssa_first;
- char *ssa_last;
- boolean_t ssa_seenfirst;
- boolean_t ssa_seenlast;
- zfs_iter_f ssa_func;
- void *ssa_arg;
-} snapspec_arg_t;
-
-static int
-snapspec_cb(zfs_handle_t *zhp, void *arg)
-{
- snapspec_arg_t *ssa = arg;
- const char *shortsnapname;
- int err = 0;
-
- if (ssa->ssa_seenlast)
- return (0);
-
- shortsnapname = strchr(zfs_get_name(zhp), '@') + 1;
- if (!ssa->ssa_seenfirst && strcmp(shortsnapname, ssa->ssa_first) == 0)
- ssa->ssa_seenfirst = B_TRUE;
- if (strcmp(shortsnapname, ssa->ssa_last) == 0)
- ssa->ssa_seenlast = B_TRUE;
-
- if (ssa->ssa_seenfirst) {
- err = ssa->ssa_func(zhp, ssa->ssa_arg);
- } else {
- zfs_close(zhp);
- }
-
- return (err);
-}
-
-/*
- * spec is a string like "A,B%C,D"
- *
- * <snaps>, where <snaps> can be:
- * <snap> (single snapshot)
- * <snap>%<snap> (range of snapshots, inclusive)
- * %<snap> (range of snapshots, starting with earliest)
- * <snap>% (range of snapshots, ending with last)
- * % (all snapshots)
- * <snaps>[,...] (comma separated list of the above)
- *
- * If a snapshot can not be opened, continue trying to open the others, but
- * return ENOENT at the end.
- */
-int
-zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig,
- zfs_iter_f func, void *arg)
-{
- char *buf, *comma_separated, *cp;
- int err = 0;
- int ret = 0;
-
- buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig);
- cp = buf;
-
- while ((comma_separated = strsep(&cp, ",")) != NULL) {
- char *pct = strchr(comma_separated, '%');
- if (pct != NULL) {
- snapspec_arg_t ssa = { 0 };
- ssa.ssa_func = func;
- ssa.ssa_arg = arg;
-
- if (pct == comma_separated)
- ssa.ssa_seenfirst = B_TRUE;
- else
- ssa.ssa_first = comma_separated;
- *pct = '\0';
- ssa.ssa_last = pct + 1;
-
- /*
- * If there is a lastname specified, make sure it
- * exists.
- */
- if (ssa.ssa_last[0] != '\0') {
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- (void) snprintf(snapname, sizeof (snapname),
- "%s@%s", zfs_get_name(fs_zhp),
- ssa.ssa_last);
- if (!zfs_dataset_exists(fs_zhp->zfs_hdl,
- snapname, ZFS_TYPE_SNAPSHOT)) {
- ret = ENOENT;
- continue;
- }
- }
-
- err = zfs_iter_snapshots_sorted(fs_zhp,
- snapspec_cb, &ssa, 0, 0);
- if (ret == 0)
- ret = err;
- if (ret == 0 && (!ssa.ssa_seenfirst ||
- (ssa.ssa_last[0] != '\0' && !ssa.ssa_seenlast))) {
- ret = ENOENT;
- }
- } else {
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- zfs_handle_t *snap_zhp;
- (void) snprintf(snapname, sizeof (snapname), "%s@%s",
- zfs_get_name(fs_zhp), comma_separated);
- snap_zhp = make_dataset_handle(fs_zhp->zfs_hdl,
- snapname);
- if (snap_zhp == NULL) {
- ret = ENOENT;
- continue;
- }
- err = func(snap_zhp, arg);
- if (ret == 0)
- ret = err;
- }
- }
-
- free(buf);
- return (ret);
-}
-
-/*
- * Iterate over all children, snapshots and filesystems
- * Process snapshots before filesystems because they are nearer the input
- * handle: this is extremely important when used with zfs_iter_f functions
- * looking for data, following the logic that we would like to find it as soon
- * and as close as possible.
- */
-int
-zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
-{
- int ret;
-
- if ((ret = zfs_iter_snapshots(zhp, B_FALSE, func, data, 0, 0)) != 0)
- return (ret);
-
- return (zfs_iter_filesystems(zhp, func, data));
-}
-
-
-typedef struct iter_stack_frame {
- struct iter_stack_frame *next;
- zfs_handle_t *zhp;
-} iter_stack_frame_t;
-
-typedef struct iter_dependents_arg {
- boolean_t first;
- boolean_t allowrecursion;
- iter_stack_frame_t *stack;
- zfs_iter_f func;
- void *data;
-} iter_dependents_arg_t;
-
-static int
-iter_dependents_cb(zfs_handle_t *zhp, void *arg)
-{
- iter_dependents_arg_t *ida = arg;
- int err = 0;
- boolean_t first = ida->first;
- ida->first = B_FALSE;
-
- if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
- err = zfs_iter_clones(zhp, iter_dependents_cb, ida);
- } else if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) {
- iter_stack_frame_t isf;
- iter_stack_frame_t *f;
-
- /*
- * check if there is a cycle by seeing if this fs is already
- * on the stack.
- */
- for (f = ida->stack; f != NULL; f = f->next) {
- if (f->zhp->zfs_dmustats.dds_guid ==
- zhp->zfs_dmustats.dds_guid) {
- if (ida->allowrecursion) {
- zfs_close(zhp);
- return (0);
- } else {
- zfs_error_aux(zhp->zfs_hdl,
- dgettext(TEXT_DOMAIN,
- "recursive dependency at '%s'"),
- zfs_get_name(zhp));
- err = zfs_error(zhp->zfs_hdl,
- EZFS_RECURSIVE,
- dgettext(TEXT_DOMAIN,
- "cannot determine dependent "
- "datasets"));
- zfs_close(zhp);
- return (err);
- }
- }
- }
-
- isf.zhp = zhp;
- isf.next = ida->stack;
- ida->stack = &isf;
- err = zfs_iter_filesystems(zhp, iter_dependents_cb, ida);
- if (err == 0) {
- err = zfs_iter_snapshots(zhp, B_FALSE,
- iter_dependents_cb, ida, 0, 0);
- }
- ida->stack = isf.next;
- }
-
- if (!first && err == 0)
- err = ida->func(zhp, ida->data);
- else
- zfs_close(zhp);
-
- return (err);
-}
-
-int
-zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion,
- zfs_iter_f func, void *data)
-{
- iter_dependents_arg_t ida;
- ida.allowrecursion = allowrecursion;
- ida.stack = NULL;
- ida.func = func;
- ida.data = data;
- ida.first = B_TRUE;
- return (iter_dependents_cb(zfs_handle_dup(zhp), &ida));
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
@@ -1,1734 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- * Copyright 2017 Joyent, Inc.
- * Copyright 2017 RackTop Systems.
- * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
- */
-
-/*
- * Routines to manage ZFS mounts. We separate all the nasty routines that have
- * to deal with the OS. The following functions are the main entry points --
- * they are used by mount and unmount and when changing a filesystem's
- * mountpoint.
- *
- * zfs_is_mounted()
- * zfs_mount()
- * zfs_unmount()
- * zfs_unmountall()
- *
- * This file also contains the functions used to manage sharing filesystems via
- * NFS and iSCSI:
- *
- * zfs_is_shared()
- * zfs_share()
- * zfs_unshare()
- *
- * zfs_is_shared_nfs()
- * zfs_is_shared_smb()
- * zfs_share_proto()
- * zfs_shareall();
- * zfs_unshare_nfs()
- * zfs_unshare_smb()
- * zfs_unshareall_nfs()
- * zfs_unshareall_smb()
- * zfs_unshareall()
- * zfs_unshareall_bypath()
- *
- * The following functions are available for pool consumers, and will
- * mount/unmount and share/unshare all datasets within pool:
- *
- * zpool_enable_datasets()
- * zpool_disable_datasets()
- */
-
-#include <dirent.h>
-#include <dlfcn.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <libgen.h>
-#include <libintl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <zone.h>
-#include <sys/mntent.h>
-#include <sys/mount.h>
-#include <sys/stat.h>
-#include <sys/statvfs.h>
-
-#include <libzfs.h>
-
-#include "libzfs_impl.h"
-#include <thread_pool.h>
-
-#include <libshare.h>
-#define MAXISALEN 257 /* based on sysinfo(2) man page */
-
-static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */
-
-static void zfs_mount_task(void *);
-static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
-zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
- zfs_share_proto_t);
-
-/*
- * The share protocols table must be in the same order as the zfs_share_proto_t
- * enum in libzfs_impl.h
- */
-typedef struct {
- zfs_prop_t p_prop;
- char *p_name;
- int p_share_err;
- int p_unshare_err;
-} proto_table_t;
-
-proto_table_t proto_table[PROTO_END] = {
- {ZFS_PROP_SHARENFS, "nfs", EZFS_SHARENFSFAILED, EZFS_UNSHARENFSFAILED},
- {ZFS_PROP_SHARESMB, "smb", EZFS_SHARESMBFAILED, EZFS_UNSHARESMBFAILED},
-};
-
-zfs_share_proto_t nfs_only[] = {
- PROTO_NFS,
- PROTO_END
-};
-
-zfs_share_proto_t smb_only[] = {
- PROTO_SMB,
- PROTO_END
-};
-zfs_share_proto_t share_all_proto[] = {
- PROTO_NFS,
- PROTO_SMB,
- PROTO_END
-};
-
-/*
- * Search the sharetab for the given mountpoint and protocol, returning
- * a zfs_share_type_t value.
- */
-static zfs_share_type_t
-is_shared(libzfs_handle_t *hdl, const char *mountpoint, zfs_share_proto_t proto)
-{
- char buf[MAXPATHLEN], *tab;
- char *ptr;
-
- if (hdl->libzfs_sharetab == NULL)
- return (SHARED_NOT_SHARED);
-
- (void) fseek(hdl->libzfs_sharetab, 0, SEEK_SET);
-
- while (fgets(buf, sizeof (buf), hdl->libzfs_sharetab) != NULL) {
-
- /* the mountpoint is the first entry on each line */
- if ((tab = strchr(buf, '\t')) == NULL)
- continue;
-
- *tab = '\0';
- if (strcmp(buf, mountpoint) == 0) {
-#ifdef illumos
- /*
- * the protocol field is the third field
- * skip over second field
- */
- ptr = ++tab;
- if ((tab = strchr(ptr, '\t')) == NULL)
- continue;
- ptr = ++tab;
- if ((tab = strchr(ptr, '\t')) == NULL)
- continue;
- *tab = '\0';
- if (strcmp(ptr,
- proto_table[proto].p_name) == 0) {
- switch (proto) {
- case PROTO_NFS:
- return (SHARED_NFS);
- case PROTO_SMB:
- return (SHARED_SMB);
- default:
- return (0);
- }
- }
-#else
- if (proto == PROTO_NFS)
- return (SHARED_NFS);
-#endif
- }
- }
-
- return (SHARED_NOT_SHARED);
-}
-
-#ifdef illumos
-static boolean_t
-dir_is_empty_stat(const char *dirname)
-{
- struct stat st;
-
- /*
- * We only want to return false if the given path is a non empty
- * directory, all other errors are handled elsewhere.
- */
- if (stat(dirname, &st) < 0 || !S_ISDIR(st.st_mode)) {
- return (B_TRUE);
- }
-
- /*
- * An empty directory will still have two entries in it, one
- * entry for each of "." and "..".
- */
- if (st.st_size > 2) {
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-static boolean_t
-dir_is_empty_readdir(const char *dirname)
-{
- DIR *dirp;
- struct dirent64 *dp;
- int dirfd;
-
- if ((dirfd = openat(AT_FDCWD, dirname,
- O_RDONLY | O_NDELAY | O_LARGEFILE | O_CLOEXEC, 0)) < 0) {
- return (B_TRUE);
- }
-
- if ((dirp = fdopendir(dirfd)) == NULL) {
- (void) close(dirfd);
- return (B_TRUE);
- }
-
- while ((dp = readdir64(dirp)) != NULL) {
-
- if (strcmp(dp->d_name, ".") == 0 ||
- strcmp(dp->d_name, "..") == 0)
- continue;
-
- (void) closedir(dirp);
- return (B_FALSE);
- }
-
- (void) closedir(dirp);
- return (B_TRUE);
-}
-
-/*
- * Returns true if the specified directory is empty. If we can't open the
- * directory at all, return true so that the mount can fail with a more
- * informative error message.
- */
-static boolean_t
-dir_is_empty(const char *dirname)
-{
- struct statvfs64 st;
-
- /*
- * If the statvfs call fails or the filesystem is not a ZFS
- * filesystem, fall back to the slow path which uses readdir.
- */
- if ((statvfs64(dirname, &st) != 0) ||
- (strcmp(st.f_basetype, "zfs") != 0)) {
- return (dir_is_empty_readdir(dirname));
- }
-
- /*
- * At this point, we know the provided path is on a ZFS
- * filesystem, so we can use stat instead of readdir to
- * determine if the directory is empty or not. We try to avoid
- * using readdir because that requires opening "dirname"; this
- * open file descriptor can potentially end up in a child
- * process if there's a concurrent fork, thus preventing the
- * zfs_mount() from otherwise succeeding (the open file
- * descriptor inherited by the child process will cause the
- * parent's mount to fail with EBUSY). The performance
- * implications of replacing the open, read, and close with a
- * single stat is nice; but is not the main motivation for the
- * added complexity.
- */
- return (dir_is_empty_stat(dirname));
-}
-#endif
-
-/*
- * Checks to see if the mount is active. If the filesystem is mounted, we fill
- * in 'where' with the current mountpoint, and return 1. Otherwise, we return
- * 0.
- */
-boolean_t
-is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where)
-{
- struct mnttab entry;
-
- if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0)
- return (B_FALSE);
-
- if (where != NULL)
- *where = zfs_strdup(zfs_hdl, entry.mnt_mountp);
-
- return (B_TRUE);
-}
-
-boolean_t
-zfs_is_mounted(zfs_handle_t *zhp, char **where)
-{
- return (is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where));
-}
-
-static boolean_t
-zfs_is_mountable_internal(zfs_handle_t *zhp, const char *mountpoint)
-{
-
- if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) &&
- getzoneid() == GLOBAL_ZONEID)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Returns true if the given dataset is mountable, false otherwise. Returns the
- * mountpoint in 'buf'.
- */
-static boolean_t
-zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen,
- zprop_source_t *source)
-{
- char sourceloc[MAXNAMELEN];
- zprop_source_t sourcetype;
-
- if (!zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type))
- return (B_FALSE);
-
- verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, buf, buflen,
- &sourcetype, sourceloc, sizeof (sourceloc), B_FALSE) == 0);
-
- if (strcmp(buf, ZFS_MOUNTPOINT_NONE) == 0 ||
- strcmp(buf, ZFS_MOUNTPOINT_LEGACY) == 0)
- return (B_FALSE);
-
- if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_OFF)
- return (B_FALSE);
-
- if (!zfs_is_mountable_internal(zhp, buf))
- return (B_FALSE);
-
- if (source)
- *source = sourcetype;
-
- return (B_TRUE);
-}
-
-/*
- * Mount the given filesystem.
- */
-int
-zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
-{
- char mountpoint[ZFS_MAXPROPLEN];
-
- if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
- return (0);
-
- return (zfs_mount_at(zhp, options, flags, mountpoint));
-}
-
-int
-zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags,
- const char *mountpoint)
-{
- struct stat buf;
- char mntopts[MNT_LINE_MAX];
- libzfs_handle_t *hdl = zhp->zfs_hdl;
-
- if (options == NULL)
- mntopts[0] = '\0';
- else
- (void) strlcpy(mntopts, options, sizeof (mntopts));
-
- /*
- * If the pool is imported read-only then all mounts must be read-only
- */
- if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL))
- flags |= MS_RDONLY;
-
- if (!zfs_is_mountable_internal(zhp, mountpoint))
- return (B_FALSE);
-
- /* Create the directory if it doesn't already exist */
- if (lstat(mountpoint, &buf) != 0) {
- if (mkdirp(mountpoint, 0755) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "failed to create mountpoint"));
- return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
- dgettext(TEXT_DOMAIN, "cannot mount '%s'"),
- mountpoint));
- }
- }
-
-#ifdef illumos /* FreeBSD: overlay mounts are not checked. */
- /*
- * Determine if the mountpoint is empty. If so, refuse to perform the
- * mount. We don't perform this check if MS_OVERLAY is specified, which
- * would defeat the point. We also avoid this check if 'remount' is
- * specified.
- */
- if ((flags & MS_OVERLAY) == 0 &&
- strstr(mntopts, MNTOPT_REMOUNT) == NULL &&
- !dir_is_empty(mountpoint)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "directory is not empty"));
- return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
- dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint));
- }
-#endif
-
- /* perform the mount */
- if (zmount(zfs_get_name(zhp), mountpoint, flags,
- MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) {
- /*
- * Generic errors are nasty, but there are just way too many
- * from mount(), and they're well-understood. We pick a few
- * common ones to improve upon.
- */
- if (errno == EBUSY) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "mountpoint or dataset is busy"));
- } else if (errno == EPERM) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Insufficient privileges"));
- } else if (errno == ENOTSUP) {
- char buf[256];
- int spa_version;
-
- VERIFY(zfs_spa_version(zhp, &spa_version) == 0);
- (void) snprintf(buf, sizeof (buf),
- dgettext(TEXT_DOMAIN, "Can't mount a version %lld "
- "file system on a version %d pool. Pool must be"
- " upgraded to mount this file system."),
- (u_longlong_t)zfs_prop_get_int(zhp,
- ZFS_PROP_VERSION), spa_version);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf));
- } else {
- zfs_error_aux(hdl, strerror(errno));
- }
- return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
- dgettext(TEXT_DOMAIN, "cannot mount '%s'"),
- zhp->zfs_name));
- }
-
- /* add the mounted entry into our cache */
- libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint,
- mntopts);
- return (0);
-}
-
-/*
- * Unmount a single filesystem.
- */
-static int
-unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
-{
- if (umount2(mountpoint, flags) != 0) {
- zfs_error_aux(hdl, strerror(errno));
- return (zfs_error_fmt(hdl, EZFS_UMOUNTFAILED,
- dgettext(TEXT_DOMAIN, "cannot unmount '%s'"),
- mountpoint));
- }
-
- return (0);
-}
-
-/*
- * Unmount the given filesystem.
- */
-int
-zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- struct mnttab entry;
- char *mntpt = NULL;
-
- /* check to see if we need to unmount the filesystem */
- if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
- libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) {
- /*
- * mountpoint may have come from a call to
- * getmnt/getmntany if it isn't NULL. If it is NULL,
- * we know it comes from libzfs_mnttab_find which can
- * then get freed later. We strdup it to play it safe.
- */
- if (mountpoint == NULL)
- mntpt = zfs_strdup(hdl, entry.mnt_mountp);
- else
- mntpt = zfs_strdup(hdl, mountpoint);
-
- /*
- * Unshare and unmount the filesystem
- */
- if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0)
- return (-1);
-
- if (unmount_one(hdl, mntpt, flags) != 0) {
- free(mntpt);
- (void) zfs_shareall(zhp);
- return (-1);
- }
- libzfs_mnttab_remove(hdl, zhp->zfs_name);
- free(mntpt);
- }
-
- return (0);
-}
-
-/*
- * Unmount this filesystem and any children inheriting the mountpoint property.
- * To do this, just act like we're changing the mountpoint property, but don't
- * remount the filesystems afterwards.
- */
-int
-zfs_unmountall(zfs_handle_t *zhp, int flags)
-{
- prop_changelist_t *clp;
- int ret;
-
- clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags);
- if (clp == NULL)
- return (-1);
-
- ret = changelist_prefix(clp);
- changelist_free(clp);
-
- return (ret);
-}
-
-boolean_t
-zfs_is_shared(zfs_handle_t *zhp)
-{
- zfs_share_type_t rc = 0;
- zfs_share_proto_t *curr_proto;
-
- if (ZFS_IS_VOLUME(zhp))
- return (B_FALSE);
-
- for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
- curr_proto++)
- rc |= zfs_is_shared_proto(zhp, NULL, *curr_proto);
-
- return (rc ? B_TRUE : B_FALSE);
-}
-
-int
-zfs_share(zfs_handle_t *zhp)
-{
- assert(!ZFS_IS_VOLUME(zhp));
- return (zfs_share_proto(zhp, share_all_proto));
-}
-
-int
-zfs_unshare(zfs_handle_t *zhp)
-{
- assert(!ZFS_IS_VOLUME(zhp));
- return (zfs_unshareall(zhp));
-}
-
-/*
- * Check to see if the filesystem is currently shared.
- */
-zfs_share_type_t
-zfs_is_shared_proto(zfs_handle_t *zhp, char **where, zfs_share_proto_t proto)
-{
- char *mountpoint;
- zfs_share_type_t rc;
-
- if (!zfs_is_mounted(zhp, &mountpoint))
- return (SHARED_NOT_SHARED);
-
- if ((rc = is_shared(zhp->zfs_hdl, mountpoint, proto))
- != SHARED_NOT_SHARED) {
- if (where != NULL)
- *where = mountpoint;
- else
- free(mountpoint);
- return (rc);
- } else {
- free(mountpoint);
- return (SHARED_NOT_SHARED);
- }
-}
-
-boolean_t
-zfs_is_shared_nfs(zfs_handle_t *zhp, char **where)
-{
- return (zfs_is_shared_proto(zhp, where,
- PROTO_NFS) != SHARED_NOT_SHARED);
-}
-
-boolean_t
-zfs_is_shared_smb(zfs_handle_t *zhp, char **where)
-{
- return (zfs_is_shared_proto(zhp, where,
- PROTO_SMB) != SHARED_NOT_SHARED);
-}
-
-/*
- * Make sure things will work if libshare isn't installed by using
- * wrapper functions that check to see that the pointers to functions
- * initialized in _zfs_init_libshare() are actually present.
- */
-
-#ifdef illumos
-static sa_handle_t (*_sa_init)(int);
-static sa_handle_t (*_sa_init_arg)(int, void *);
-static void (*_sa_fini)(sa_handle_t);
-static sa_share_t (*_sa_find_share)(sa_handle_t, char *);
-static int (*_sa_enable_share)(sa_share_t, char *);
-static int (*_sa_disable_share)(sa_share_t, char *);
-static char *(*_sa_errorstr)(int);
-static int (*_sa_parse_legacy_options)(sa_group_t, char *, char *);
-static boolean_t (*_sa_needs_refresh)(sa_handle_t *);
-static libzfs_handle_t *(*_sa_get_zfs_handle)(sa_handle_t);
-static int (*_sa_zfs_process_share)(sa_handle_t, sa_group_t, sa_share_t,
- char *, char *, zprop_source_t, char *, char *, char *);
-static void (*_sa_update_sharetab_ts)(sa_handle_t);
-#endif
-
-/*
- * _zfs_init_libshare()
- *
- * Find the libshare.so.1 entry points that we use here and save the
- * values to be used later. This is triggered by the runtime loader.
- * Make sure the correct ISA version is loaded.
- */
-
-#pragma init(_zfs_init_libshare)
-static void
-_zfs_init_libshare(void)
-{
-#ifdef illumos
- void *libshare;
- char path[MAXPATHLEN];
- char isa[MAXISALEN];
-
-#if defined(_LP64)
- if (sysinfo(SI_ARCHITECTURE_64, isa, MAXISALEN) == -1)
- isa[0] = '\0';
-#else
- isa[0] = '\0';
-#endif
- (void) snprintf(path, MAXPATHLEN,
- "/usr/lib/%s/libshare.so.1", isa);
-
- if ((libshare = dlopen(path, RTLD_LAZY | RTLD_GLOBAL)) != NULL) {
- _sa_init = (sa_handle_t (*)(int))dlsym(libshare, "sa_init");
- _sa_init_arg = (sa_handle_t (*)(int, void *))dlsym(libshare,
- "sa_init_arg");
- _sa_fini = (void (*)(sa_handle_t))dlsym(libshare, "sa_fini");
- _sa_find_share = (sa_share_t (*)(sa_handle_t, char *))
- dlsym(libshare, "sa_find_share");
- _sa_enable_share = (int (*)(sa_share_t, char *))dlsym(libshare,
- "sa_enable_share");
- _sa_disable_share = (int (*)(sa_share_t, char *))dlsym(libshare,
- "sa_disable_share");
- _sa_errorstr = (char *(*)(int))dlsym(libshare, "sa_errorstr");
- _sa_parse_legacy_options = (int (*)(sa_group_t, char *, char *))
- dlsym(libshare, "sa_parse_legacy_options");
- _sa_needs_refresh = (boolean_t (*)(sa_handle_t *))
- dlsym(libshare, "sa_needs_refresh");
- _sa_get_zfs_handle = (libzfs_handle_t *(*)(sa_handle_t))
- dlsym(libshare, "sa_get_zfs_handle");
- _sa_zfs_process_share = (int (*)(sa_handle_t, sa_group_t,
- sa_share_t, char *, char *, zprop_source_t, char *,
- char *, char *))dlsym(libshare, "sa_zfs_process_share");
- _sa_update_sharetab_ts = (void (*)(sa_handle_t))
- dlsym(libshare, "sa_update_sharetab_ts");
- if (_sa_init == NULL || _sa_init_arg == NULL ||
- _sa_fini == NULL || _sa_find_share == NULL ||
- _sa_enable_share == NULL || _sa_disable_share == NULL ||
- _sa_errorstr == NULL || _sa_parse_legacy_options == NULL ||
- _sa_needs_refresh == NULL || _sa_get_zfs_handle == NULL ||
- _sa_zfs_process_share == NULL ||
- _sa_update_sharetab_ts == NULL) {
- _sa_init = NULL;
- _sa_init_arg = NULL;
- _sa_fini = NULL;
- _sa_disable_share = NULL;
- _sa_enable_share = NULL;
- _sa_errorstr = NULL;
- _sa_parse_legacy_options = NULL;
- (void) dlclose(libshare);
- _sa_needs_refresh = NULL;
- _sa_get_zfs_handle = NULL;
- _sa_zfs_process_share = NULL;
- _sa_update_sharetab_ts = NULL;
- }
- }
-#endif
-}
-
-/*
- * zfs_init_libshare(zhandle, service)
- *
- * Initialize the libshare API if it hasn't already been initialized.
- * In all cases it returns 0 if it succeeded and an error if not. The
- * service value is which part(s) of the API to initialize and is a
- * direct map to the libshare sa_init(service) interface.
- */
-static int
-zfs_init_libshare_impl(libzfs_handle_t *zhandle, int service, void *arg)
-{
-#ifdef illumos
- /*
- * libshare is either not installed or we're in a branded zone. The
- * rest of the wrapper functions around the libshare calls already
- * handle NULL function pointers, but we don't want the callers of
- * zfs_init_libshare() to fail prematurely if libshare is not available.
- */
- if (_sa_init == NULL)
- return (SA_OK);
-
- /*
- * Attempt to refresh libshare. This is necessary if there was a cache
- * miss for a new ZFS dataset that was just created, or if state of the
- * sharetab file has changed since libshare was last initialized. We
- * want to make sure so check timestamps to see if a different process
- * has updated any of the configuration. If there was some non-ZFS
- * change, we need to re-initialize the internal cache.
- */
- if (_sa_needs_refresh != NULL &&
- _sa_needs_refresh(zhandle->libzfs_sharehdl)) {
- zfs_uninit_libshare(zhandle);
- zhandle->libzfs_sharehdl = _sa_init_arg(service, arg);
- }
-
- if (zhandle && zhandle->libzfs_sharehdl == NULL)
- zhandle->libzfs_sharehdl = _sa_init_arg(service, arg);
-
- if (zhandle->libzfs_sharehdl == NULL)
- return (SA_NO_MEMORY);
-#endif
-
- return (SA_OK);
-}
-int
-zfs_init_libshare(libzfs_handle_t *zhandle, int service)
-{
- return (zfs_init_libshare_impl(zhandle, service, NULL));
-}
-
-int
-zfs_init_libshare_arg(libzfs_handle_t *zhandle, int service, void *arg)
-{
- return (zfs_init_libshare_impl(zhandle, service, arg));
-}
-
-
-/*
- * zfs_uninit_libshare(zhandle)
- *
- * Uninitialize the libshare API if it hasn't already been
- * uninitialized. It is OK to call multiple times.
- */
-void
-zfs_uninit_libshare(libzfs_handle_t *zhandle)
-{
- if (zhandle != NULL && zhandle->libzfs_sharehdl != NULL) {
-#ifdef illumos
- if (_sa_fini != NULL)
- _sa_fini(zhandle->libzfs_sharehdl);
-#endif
- zhandle->libzfs_sharehdl = NULL;
- }
-}
-
-/*
- * zfs_parse_options(options, proto)
- *
- * Call the legacy parse interface to get the protocol specific
- * options using the NULL arg to indicate that this is a "parse" only.
- */
-int
-zfs_parse_options(char *options, zfs_share_proto_t proto)
-{
-#ifdef illumos
- if (_sa_parse_legacy_options != NULL) {
- return (_sa_parse_legacy_options(NULL, options,
- proto_table[proto].p_name));
- }
- return (SA_CONFIG_ERR);
-#else
- return (SA_OK);
-#endif
-}
-
-#ifdef illumos
-/*
- * zfs_sa_find_share(handle, path)
- *
- * wrapper around sa_find_share to find a share path in the
- * configuration.
- */
-static sa_share_t
-zfs_sa_find_share(sa_handle_t handle, char *path)
-{
- if (_sa_find_share != NULL)
- return (_sa_find_share(handle, path));
- return (NULL);
-}
-
-/*
- * zfs_sa_enable_share(share, proto)
- *
- * Wrapper for sa_enable_share which enables a share for a specified
- * protocol.
- */
-static int
-zfs_sa_enable_share(sa_share_t share, char *proto)
-{
- if (_sa_enable_share != NULL)
- return (_sa_enable_share(share, proto));
- return (SA_CONFIG_ERR);
-}
-
-/*
- * zfs_sa_disable_share(share, proto)
- *
- * Wrapper for sa_enable_share which disables a share for a specified
- * protocol.
- */
-static int
-zfs_sa_disable_share(sa_share_t share, char *proto)
-{
- if (_sa_disable_share != NULL)
- return (_sa_disable_share(share, proto));
- return (SA_CONFIG_ERR);
-}
-#endif /* illumos */
-
-/*
- * Share the given filesystem according to the options in the specified
- * protocol specific properties (sharenfs, sharesmb). We rely
- * on "libshare" to the dirty work for us.
- */
-static int
-zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto)
-{
- char mountpoint[ZFS_MAXPROPLEN];
- char shareopts[ZFS_MAXPROPLEN];
- char sourcestr[ZFS_MAXPROPLEN];
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- zfs_share_proto_t *curr_proto;
- zprop_source_t sourcetype;
- int error, ret;
-
- if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
- return (0);
-
- for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) {
- /*
- * Return success if there are no share options.
- */
- if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop,
- shareopts, sizeof (shareopts), &sourcetype, sourcestr,
- ZFS_MAXPROPLEN, B_FALSE) != 0 ||
- strcmp(shareopts, "off") == 0)
- continue;
-#ifdef illumos
- ret = zfs_init_libshare_arg(hdl, SA_INIT_ONE_SHARE_FROM_HANDLE,
- zhp);
- if (ret != SA_OK) {
- (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED,
- dgettext(TEXT_DOMAIN, "cannot share '%s': %s"),
- zfs_get_name(zhp), _sa_errorstr != NULL ?
- _sa_errorstr(ret) : "");
- return (-1);
- }
-#endif
-
- /*
- * If the 'zoned' property is set, then zfs_is_mountable()
- * will have already bailed out if we are in the global zone.
- * But local zones cannot be NFS servers, so we ignore it for
- * local zones as well.
- */
- if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED))
- continue;
-
-#ifdef illumos
- share = zfs_sa_find_share(hdl->libzfs_sharehdl, mountpoint);
- if (share == NULL) {
- /*
- * This may be a new file system that was just
- * created so isn't in the internal cache
- * (second time through). Rather than
- * reloading the entire configuration, we can
- * assume ZFS has done the checking and it is
- * safe to add this to the internal
- * configuration.
- */
- if (_sa_zfs_process_share(hdl->libzfs_sharehdl,
- NULL, NULL, mountpoint,
- proto_table[*curr_proto].p_name, sourcetype,
- shareopts, sourcestr, zhp->zfs_name) != SA_OK) {
- (void) zfs_error_fmt(hdl,
- proto_table[*curr_proto].p_share_err,
- dgettext(TEXT_DOMAIN, "cannot share '%s'"),
- zfs_get_name(zhp));
- return (-1);
- }
- share = zfs_sa_find_share(hdl->libzfs_sharehdl,
- mountpoint);
- }
- if (share != NULL) {
- int err;
- err = zfs_sa_enable_share(share,
- proto_table[*curr_proto].p_name);
- if (err != SA_OK) {
- (void) zfs_error_fmt(hdl,
- proto_table[*curr_proto].p_share_err,
- dgettext(TEXT_DOMAIN, "cannot share '%s'"),
- zfs_get_name(zhp));
- return (-1);
- }
- } else
-#else
- if (*curr_proto != PROTO_NFS) {
- fprintf(stderr, "Unsupported share protocol: %d.\n",
- *curr_proto);
- continue;
- }
-
- if (strcmp(shareopts, "on") == 0)
- error = fsshare(ZFS_EXPORTS_PATH, mountpoint, "");
- else
- error = fsshare(ZFS_EXPORTS_PATH, mountpoint, shareopts);
- if (error != 0)
-#endif
- {
- (void) zfs_error_fmt(hdl,
- proto_table[*curr_proto].p_share_err,
- dgettext(TEXT_DOMAIN, "cannot share '%s'"),
- zfs_get_name(zhp));
- return (-1);
- }
-
- }
- return (0);
-}
-
-
-int
-zfs_share_nfs(zfs_handle_t *zhp)
-{
- return (zfs_share_proto(zhp, nfs_only));
-}
-
-int
-zfs_share_smb(zfs_handle_t *zhp)
-{
- return (zfs_share_proto(zhp, smb_only));
-}
-
-int
-zfs_shareall(zfs_handle_t *zhp)
-{
- return (zfs_share_proto(zhp, share_all_proto));
-}
-
-/*
- * Unshare a filesystem by mountpoint.
- */
-static int
-unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint,
- zfs_share_proto_t proto)
-{
-#ifdef illumos
- sa_share_t share;
- int err;
- char *mntpt;
-
- /*
- * Mountpoint could get trashed if libshare calls getmntany
- * which it does during API initialization, so strdup the
- * value.
- */
- mntpt = zfs_strdup(hdl, mountpoint);
-
- /*
- * make sure libshare initialized, initialize everything because we
- * don't know what other unsharing may happen later. Functions up the
- * stack are allowed to initialize instead a subset of shares at the
- * time the set is known.
- */
- if ((err = zfs_init_libshare_arg(hdl, SA_INIT_ONE_SHARE_FROM_NAME,
- (void *)name)) != SA_OK) {
- free(mntpt); /* don't need the copy anymore */
- return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err,
- dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"),
- name, _sa_errorstr(err)));
- }
-
- share = zfs_sa_find_share(hdl->libzfs_sharehdl, mntpt);
- free(mntpt); /* don't need the copy anymore */
-
- if (share != NULL) {
- err = zfs_sa_disable_share(share, proto_table[proto].p_name);
- if (err != SA_OK) {
- return (zfs_error_fmt(hdl,
- proto_table[proto].p_unshare_err,
- dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"),
- name, _sa_errorstr(err)));
- }
- } else {
- return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err,
- dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"),
- name));
- }
-#else
- char buf[MAXPATHLEN];
- FILE *fp;
- int err;
-
- if (proto != PROTO_NFS) {
- fprintf(stderr, "No SMB support in FreeBSD yet.\n");
- return (EOPNOTSUPP);
- }
-
- err = fsunshare(ZFS_EXPORTS_PATH, mountpoint);
- if (err != 0) {
- zfs_error_aux(hdl, "%s", strerror(err));
- return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED,
- dgettext(TEXT_DOMAIN,
- "cannot unshare '%s'"), name));
- }
-#endif
- return (0);
-}
-
-/*
- * Unshare the given filesystem.
- */
-int
-zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint,
- zfs_share_proto_t *proto)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- struct mnttab entry;
- char *mntpt = NULL;
-
- /* check to see if need to unmount the filesystem */
- rewind(zhp->zfs_hdl->libzfs_mnttab);
- if (mountpoint != NULL)
- mountpoint = mntpt = zfs_strdup(hdl, mountpoint);
-
- if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
- libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) {
- zfs_share_proto_t *curr_proto;
-
- if (mountpoint == NULL)
- mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp);
-
- for (curr_proto = proto; *curr_proto != PROTO_END;
- curr_proto++) {
-
- if (is_shared(hdl, mntpt, *curr_proto) &&
- unshare_one(hdl, zhp->zfs_name,
- mntpt, *curr_proto) != 0) {
- if (mntpt != NULL)
- free(mntpt);
- return (-1);
- }
- }
- }
- if (mntpt != NULL)
- free(mntpt);
-
- return (0);
-}
-
-int
-zfs_unshare_nfs(zfs_handle_t *zhp, const char *mountpoint)
-{
- return (zfs_unshare_proto(zhp, mountpoint, nfs_only));
-}
-
-int
-zfs_unshare_smb(zfs_handle_t *zhp, const char *mountpoint)
-{
- return (zfs_unshare_proto(zhp, mountpoint, smb_only));
-}
-
-/*
- * Same as zfs_unmountall(), but for NFS and SMB unshares.
- */
-int
-zfs_unshareall_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto)
-{
- prop_changelist_t *clp;
- int ret;
-
- clp = changelist_gather(zhp, ZFS_PROP_SHARENFS, 0, 0);
- if (clp == NULL)
- return (-1);
-
- ret = changelist_unshare(clp, proto);
- changelist_free(clp);
-
- return (ret);
-}
-
-int
-zfs_unshareall_nfs(zfs_handle_t *zhp)
-{
- return (zfs_unshareall_proto(zhp, nfs_only));
-}
-
-int
-zfs_unshareall_smb(zfs_handle_t *zhp)
-{
- return (zfs_unshareall_proto(zhp, smb_only));
-}
-
-int
-zfs_unshareall(zfs_handle_t *zhp)
-{
- return (zfs_unshareall_proto(zhp, share_all_proto));
-}
-
-int
-zfs_unshareall_bypath(zfs_handle_t *zhp, const char *mountpoint)
-{
- return (zfs_unshare_proto(zhp, mountpoint, share_all_proto));
-}
-
-/*
- * Remove the mountpoint associated with the current dataset, if necessary.
- * We only remove the underlying directory if:
- *
- * - The mountpoint is not 'none' or 'legacy'
- * - The mountpoint is non-empty
- * - The mountpoint is the default or inherited
- * - The 'zoned' property is set, or we're in a local zone
- *
- * Any other directories we leave alone.
- */
-void
-remove_mountpoint(zfs_handle_t *zhp)
-{
- char mountpoint[ZFS_MAXPROPLEN];
- zprop_source_t source;
-
- if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint),
- &source))
- return;
-
- if (source == ZPROP_SRC_DEFAULT ||
- source == ZPROP_SRC_INHERITED) {
- /*
- * Try to remove the directory, silently ignoring any errors.
- * The filesystem may have since been removed or moved around,
- * and this error isn't really useful to the administrator in
- * any way.
- */
- (void) rmdir(mountpoint);
- }
-}
-
-/*
- * Add the given zfs handle to the cb_handles array, dynamically reallocating
- * the array if it is out of space
- */
-void
-libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
-{
- if (cbp->cb_alloc == cbp->cb_used) {
- size_t newsz;
- zfs_handle_t **newhandles;
-
- newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
- newhandles = zfs_realloc(zhp->zfs_hdl,
- cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
- newsz * sizeof (zfs_handle_t *));
- cbp->cb_handles = newhandles;
- cbp->cb_alloc = newsz;
- }
- cbp->cb_handles[cbp->cb_used++] = zhp;
-}
-
-/*
- * Recursive helper function used during file system enumeration
- */
-static int
-zfs_iter_cb(zfs_handle_t *zhp, void *data)
-{
- get_all_cb_t *cbp = data;
-
- if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) {
- zfs_close(zhp);
- return (0);
- }
-
- if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) {
- zfs_close(zhp);
- return (0);
- }
-
- /*
- * If this filesystem is inconsistent and has a receive resume
- * token, we can not mount it.
- */
- if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
- zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
- NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
- zfs_close(zhp);
- return (0);
- }
-
- libzfs_add_handle(cbp, zhp);
- if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
- zfs_close(zhp);
- return (-1);
- }
- return (0);
-}
-
-/*
- * Sort comparator that compares two mountpoint paths. We sort these paths so
- * that subdirectories immediately follow their parents. This means that we
- * effectively treat the '/' character as the lowest value non-nul char.
- * Since filesystems from non-global zones can have the same mountpoint
- * as other filesystems, the comparator sorts global zone filesystems to
- * the top of the list. This means that the global zone will traverse the
- * filesystem list in the correct order and can stop when it sees the
- * first zoned filesystem. In a non-global zone, only the delegated
- * filesystems are seen.
- *
- * An example sorted list using this comparator would look like:
- *
- * /foo
- * /foo/bar
- * /foo/bar/baz
- * /foo/baz
- * /foo.bar
- * /foo (NGZ1)
- * /foo (NGZ2)
- *
- * The mount code depend on this ordering to deterministically iterate
- * over filesystems in order to spawn parallel mount tasks.
- */
-static int
-mountpoint_cmp(const void *arga, const void *argb)
-{
- zfs_handle_t *const *zap = arga;
- zfs_handle_t *za = *zap;
- zfs_handle_t *const *zbp = argb;
- zfs_handle_t *zb = *zbp;
- char mounta[MAXPATHLEN];
- char mountb[MAXPATHLEN];
- const char *a = mounta;
- const char *b = mountb;
- boolean_t gota, gotb;
- uint64_t zoneda, zonedb;
-
- zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED);
- zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED);
- if (zoneda && !zonedb)
- return (1);
- if (!zoneda && zonedb)
- return (-1);
- gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
- if (gota)
- verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
- sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
- gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
- if (gotb)
- verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
- sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
-
- if (gota && gotb) {
- while (*a != '\0' && (*a == *b)) {
- a++;
- b++;
- }
- if (*a == *b)
- return (0);
- if (*a == '\0')
- return (-1);
- if (*b == '\0')
- return (1);
- if (*a == '/')
- return (-1);
- if (*b == '/')
- return (1);
- return (*a < *b ? -1 : *a > *b);
- }
-
- if (gota)
- return (-1);
- if (gotb)
- return (1);
-
- /*
- * If neither filesystem has a mountpoint, revert to sorting by
- * datset name.
- */
- return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
-}
-
-/*
- * Return true if path2 is a child of path1 or path2 equals path1 or
- * path1 is "/" (path2 is always a child of "/").
- */
-static boolean_t
-libzfs_path_contains(const char *path1, const char *path2)
-{
- return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 ||
- (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'));
-}
-
-
-static int
-non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
-{
- char parent[ZFS_MAXPROPLEN];
- char child[ZFS_MAXPROPLEN];
- int i;
-
- verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
- sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
-
- for (i = idx + 1; i < num_handles; i++) {
- verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
- sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
- if (!libzfs_path_contains(parent, child))
- break;
- }
- return (i);
-}
-
-typedef struct mnt_param {
- libzfs_handle_t *mnt_hdl;
- tpool_t *mnt_tp;
- zfs_handle_t **mnt_zhps; /* filesystems to mount */
- size_t mnt_num_handles;
- int mnt_idx; /* Index of selected entry to mount */
- zfs_iter_f mnt_func;
- void *mnt_data;
-} mnt_param_t;
-
-/*
- * Allocate and populate the parameter struct for mount function, and
- * schedule mounting of the entry selected by idx.
- */
-static void
-zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
- size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp)
-{
- mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
-
- mnt_param->mnt_hdl = hdl;
- mnt_param->mnt_tp = tp;
- mnt_param->mnt_zhps = handles;
- mnt_param->mnt_num_handles = num_handles;
- mnt_param->mnt_idx = idx;
- mnt_param->mnt_func = func;
- mnt_param->mnt_data = data;
-
- (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param);
-}
-
-/*
- * This is the structure used to keep state of mounting or sharing operations
- * during a call to zpool_enable_datasets().
- */
-typedef struct mount_state {
- /*
- * ms_mntstatus is set to -1 if any mount fails. While multiple threads
- * could update this variable concurrently, no synchronization is
- * needed as it's only ever set to -1.
- */
- int ms_mntstatus;
- int ms_mntflags;
- const char *ms_mntopts;
-} mount_state_t;
-
-static int
-zfs_mount_one(zfs_handle_t *zhp, void *arg)
-{
- mount_state_t *ms = arg;
- int ret = 0;
-
- if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
- ret = ms->ms_mntstatus = -1;
- return (ret);
-}
-
-static int
-zfs_share_one(zfs_handle_t *zhp, void *arg)
-{
- mount_state_t *ms = arg;
- int ret = 0;
-
- if (zfs_share(zhp) != 0)
- ret = ms->ms_mntstatus = -1;
- return (ret);
-}
-
-/*
- * Thread pool function to mount one file system. On completion, it finds and
- * schedules its children to be mounted. This depends on the sorting done in
- * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
- * each descending from the previous) will have no parallelism since we always
- * have to wait for the parent to finish mounting before we can schedule
- * its children.
- */
-static void
-zfs_mount_task(void *arg)
-{
- mnt_param_t *mp = arg;
- int idx = mp->mnt_idx;
- zfs_handle_t **handles = mp->mnt_zhps;
- size_t num_handles = mp->mnt_num_handles;
- char mountpoint[ZFS_MAXPROPLEN];
-
- verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
- sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
-
- if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
- return;
-
- /*
- * We dispatch tasks to mount filesystems with mountpoints underneath
- * this one. We do this by dispatching the next filesystem with a
- * descendant mountpoint of the one we just mounted, then skip all of
- * its descendants, dispatch the next descendant mountpoint, and so on.
- * The non_descendant_idx() function skips over filesystems that are
- * descendants of the filesystem we just dispatched.
- */
- for (int i = idx + 1; i < num_handles;
- i = non_descendant_idx(handles, num_handles, i)) {
- char child[ZFS_MAXPROPLEN];
- verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
- child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
-
- if (!libzfs_path_contains(mountpoint, child))
- break; /* not a descendant, return */
- zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
- mp->mnt_func, mp->mnt_data, mp->mnt_tp);
- }
- free(mp);
-}
-
-/*
- * Issue the func callback for each ZFS handle contained in the handles
- * array. This function is used to mount all datasets, and so this function
- * guarantees that filesystems for parent mountpoints are called before their
- * children. As such, before issuing any callbacks, we first sort the array
- * of handles by mountpoint.
- *
- * Callbacks are issued in one of two ways:
- *
- * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT
- * environment variable is set, then we issue callbacks sequentially.
- *
- * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT
- * environment variable is not set, then we use a tpool to dispatch threads
- * to mount filesystems in parallel. This function dispatches tasks to mount
- * the filesystems at the top-level mountpoints, and these tasks in turn
- * are responsible for recursively mounting filesystems in their children
- * mountpoints.
- */
-void
-zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
- size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
-{
- zoneid_t zoneid = getzoneid();
-
- /*
- * The ZFS_SERIAL_MOUNT environment variable is an undocumented
- * variable that can be used as a convenience to do a/b comparison
- * of serial vs. parallel mounting.
- */
- boolean_t serial_mount = !parallel ||
- (getenv("ZFS_SERIAL_MOUNT") != NULL);
-
- /*
- * Sort the datasets by mountpoint. See mountpoint_cmp for details
- * of how these are sorted.
- */
- qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
-
- if (serial_mount) {
- for (int i = 0; i < num_handles; i++) {
- func(handles[i], data);
- }
- return;
- }
-
- /*
- * Issue the callback function for each dataset using a parallel
- * algorithm that uses a thread pool to manage threads.
- */
- tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL);
-
- /*
- * There may be multiple "top level" mountpoints outside of the pool's
- * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of
- * these.
- */
- for (int i = 0; i < num_handles;
- i = non_descendant_idx(handles, num_handles, i)) {
- /*
- * Since the mountpoints have been sorted so that the zoned
- * filesystems are at the end, a zoned filesystem seen from
- * the global zone means that we're done.
- */
- if (zoneid == GLOBAL_ZONEID &&
- zfs_prop_get_int(handles[i], ZFS_PROP_ZONED))
- break;
- zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
- tp);
- }
-
- tpool_wait(tp); /* wait for all scheduled mounts to complete */
- tpool_destroy(tp);
-}
-
-/*
- * Mount and share all datasets within the given pool. This assumes that no
- * datasets within the pool are currently mounted.
- */
-#pragma weak zpool_mount_datasets = zpool_enable_datasets
-int
-zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
-{
- get_all_cb_t cb = { 0 };
- mount_state_t ms = { 0 };
- zfs_handle_t *zfsp;
- int ret = 0;
-
- if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
- ZFS_TYPE_DATASET)) == NULL)
- goto out;
-
- /*
- * Gather all non-snapshot datasets within the pool. Start by adding
- * the root filesystem for this pool to the list, and then iterate
- * over all child filesystems.
- */
- libzfs_add_handle(&cb, zfsp);
- if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
- goto out;
-
- /*
- * Mount all filesystems
- */
- ms.ms_mntopts = mntopts;
- ms.ms_mntflags = flags;
- zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
- zfs_mount_one, &ms, B_TRUE);
- if (ms.ms_mntstatus != 0)
- ret = ms.ms_mntstatus;
-
- /*
- * Share all filesystems that need to be shared. This needs to be
- * a separate pass because libshare is not mt-safe, and so we need
- * to share serially.
- */
- ms.ms_mntstatus = 0;
- zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
- zfs_share_one, &ms, B_FALSE);
- if (ms.ms_mntstatus != 0)
- ret = ms.ms_mntstatus;
-
-out:
- for (int i = 0; i < cb.cb_used; i++)
- zfs_close(cb.cb_handles[i]);
- free(cb.cb_handles);
-
- return (ret);
-}
-
-static int
-mountpoint_compare(const void *a, const void *b)
-{
- const char *mounta = *((char **)a);
- const char *mountb = *((char **)b);
-
- return (strcmp(mountb, mounta));
-}
-
-/* alias for 2002/240 */
-#pragma weak zpool_unmount_datasets = zpool_disable_datasets
-/*
- * Unshare and unmount all datasets within the given pool. We don't want to
- * rely on traversing the DSL to discover the filesystems within the pool,
- * because this may be expensive (if not all of them are mounted), and can fail
- * arbitrarily (on I/O error, for example). Instead, we walk /etc/mnttab and
- * gather all the filesystems that are currently mounted.
- */
-int
-zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
-{
- int used, alloc;
- struct mnttab entry;
- size_t namelen;
- char **mountpoints = NULL;
- zfs_handle_t **datasets = NULL;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- int i;
- int ret = -1;
- int flags = (force ? MS_FORCE : 0);
-#ifdef illumos
- sa_init_selective_arg_t sharearg;
-#endif
-
- namelen = strlen(zhp->zpool_name);
-
- rewind(hdl->libzfs_mnttab);
- used = alloc = 0;
- while (getmntent(hdl->libzfs_mnttab, &entry) == 0) {
- /*
- * Ignore non-ZFS entries.
- */
- if (entry.mnt_fstype == NULL ||
- strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
- continue;
-
- /*
- * Ignore filesystems not within this pool.
- */
- if (entry.mnt_mountp == NULL ||
- strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 ||
- (entry.mnt_special[namelen] != '/' &&
- entry.mnt_special[namelen] != '\0'))
- continue;
-
- /*
- * At this point we've found a filesystem within our pool. Add
- * it to our growing list.
- */
- if (used == alloc) {
- if (alloc == 0) {
- if ((mountpoints = zfs_alloc(hdl,
- 8 * sizeof (void *))) == NULL)
- goto out;
-
- if ((datasets = zfs_alloc(hdl,
- 8 * sizeof (void *))) == NULL)
- goto out;
-
- alloc = 8;
- } else {
- void *ptr;
-
- if ((ptr = zfs_realloc(hdl, mountpoints,
- alloc * sizeof (void *),
- alloc * 2 * sizeof (void *))) == NULL)
- goto out;
- mountpoints = ptr;
-
- if ((ptr = zfs_realloc(hdl, datasets,
- alloc * sizeof (void *),
- alloc * 2 * sizeof (void *))) == NULL)
- goto out;
- datasets = ptr;
-
- alloc *= 2;
- }
- }
-
- if ((mountpoints[used] = zfs_strdup(hdl,
- entry.mnt_mountp)) == NULL)
- goto out;
-
- /*
- * This is allowed to fail, in case there is some I/O error. It
- * is only used to determine if we need to remove the underlying
- * mountpoint, so failure is not fatal.
- */
- datasets[used] = make_dataset_handle(hdl, entry.mnt_special);
-
- used++;
- }
-
- /*
- * At this point, we have the entire list of filesystems, so sort it by
- * mountpoint.
- */
-#ifdef illumos
- sharearg.zhandle_arr = datasets;
- sharearg.zhandle_len = used;
- ret = zfs_init_libshare_arg(hdl, SA_INIT_SHARE_API_SELECTIVE,
- &sharearg);
- if (ret != 0)
- goto out;
-#endif
- qsort(mountpoints, used, sizeof (char *), mountpoint_compare);
-
- /*
- * Walk through and first unshare everything.
- */
- for (i = 0; i < used; i++) {
- zfs_share_proto_t *curr_proto;
- for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
- curr_proto++) {
- if (is_shared(hdl, mountpoints[i], *curr_proto) &&
- unshare_one(hdl, mountpoints[i],
- mountpoints[i], *curr_proto) != 0)
- goto out;
- }
- }
-
- /*
- * Now unmount everything, removing the underlying directories as
- * appropriate.
- */
- for (i = 0; i < used; i++) {
- if (unmount_one(hdl, mountpoints[i], flags) != 0)
- goto out;
- }
-
- for (i = 0; i < used; i++) {
- if (datasets[i])
- remove_mountpoint(datasets[i]);
- }
-
- ret = 0;
-out:
- for (i = 0; i < used; i++) {
- if (datasets[i])
- zfs_close(datasets[i]);
- free(mountpoints[i]);
- }
- free(datasets);
- free(mountpoints);
-
- return (ret);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
@@ -1,4669 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- * Copyright (c) 2017 Datto Inc.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <ctype.h>
-#include <errno.h>
-#include <devid.h>
-#include <fcntl.h>
-#include <libintl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <libgen.h>
-#include <sys/zfs_ioctl.h>
-#include <dlfcn.h>
-
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-#include "libzfs_impl.h"
-#include "zfs_comutil.h"
-#include "zfeature_common.h"
-
-static int read_efi_label(nvlist_t *, diskaddr_t *, boolean_t *);
-static boolean_t zpool_vdev_is_interior(const char *name);
-
-#define BACKUP_SLICE "s2"
-
-typedef struct prop_flags {
- int create:1; /* Validate property on creation */
- int import:1; /* Validate property on import */
-} prop_flags_t;
-
-/*
- * ====================================================================
- * zpool property functions
- * ====================================================================
- */
-
-static int
-zpool_get_all_props(zpool_handle_t *zhp)
-{
- zfs_cmd_t zc = { 0 };
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-
- if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
- return (-1);
-
- while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) {
- if (errno == ENOMEM) {
- if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- } else {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- }
-
- if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
-
- zcmd_free_nvlists(&zc);
-
- return (0);
-}
-
-static int
-zpool_props_refresh(zpool_handle_t *zhp)
-{
- nvlist_t *old_props;
-
- old_props = zhp->zpool_props;
-
- if (zpool_get_all_props(zhp) != 0)
- return (-1);
-
- nvlist_free(old_props);
- return (0);
-}
-
-static char *
-zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop,
- zprop_source_t *src)
-{
- nvlist_t *nv, *nvl;
- uint64_t ival;
- char *value;
- zprop_source_t source;
-
- nvl = zhp->zpool_props;
- if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
- verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0);
- source = ival;
- verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
- } else {
- source = ZPROP_SRC_DEFAULT;
- if ((value = (char *)zpool_prop_default_string(prop)) == NULL)
- value = "-";
- }
-
- if (src)
- *src = source;
-
- return (value);
-}
-
-uint64_t
-zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src)
-{
- nvlist_t *nv, *nvl;
- uint64_t value;
- zprop_source_t source;
-
- if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) {
- /*
- * zpool_get_all_props() has most likely failed because
- * the pool is faulted, but if all we need is the top level
- * vdev's guid then get it from the zhp config nvlist.
- */
- if ((prop == ZPOOL_PROP_GUID) &&
- (nvlist_lookup_nvlist(zhp->zpool_config,
- ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) &&
- (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value)
- == 0)) {
- return (value);
- }
- return (zpool_prop_default_numeric(prop));
- }
-
- nvl = zhp->zpool_props;
- if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
- verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0);
- source = value;
- verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
- } else {
- source = ZPROP_SRC_DEFAULT;
- value = zpool_prop_default_numeric(prop);
- }
-
- if (src)
- *src = source;
-
- return (value);
-}
-
-/*
- * Map VDEV STATE to printed strings.
- */
-const char *
-zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
-{
- switch (state) {
- case VDEV_STATE_CLOSED:
- case VDEV_STATE_OFFLINE:
- return (gettext("OFFLINE"));
- case VDEV_STATE_REMOVED:
- return (gettext("REMOVED"));
- case VDEV_STATE_CANT_OPEN:
- if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
- return (gettext("FAULTED"));
- else if (aux == VDEV_AUX_SPLIT_POOL)
- return (gettext("SPLIT"));
- else
- return (gettext("UNAVAIL"));
- case VDEV_STATE_FAULTED:
- return (gettext("FAULTED"));
- case VDEV_STATE_DEGRADED:
- return (gettext("DEGRADED"));
- case VDEV_STATE_HEALTHY:
- return (gettext("ONLINE"));
-
- default:
- break;
- }
-
- return (gettext("UNKNOWN"));
-}
-
-/*
- * Map POOL STATE to printed strings.
- */
-const char *
-zpool_pool_state_to_name(pool_state_t state)
-{
- switch (state) {
- case POOL_STATE_ACTIVE:
- return (gettext("ACTIVE"));
- case POOL_STATE_EXPORTED:
- return (gettext("EXPORTED"));
- case POOL_STATE_DESTROYED:
- return (gettext("DESTROYED"));
- case POOL_STATE_SPARE:
- return (gettext("SPARE"));
- case POOL_STATE_L2CACHE:
- return (gettext("L2CACHE"));
- case POOL_STATE_UNINITIALIZED:
- return (gettext("UNINITIALIZED"));
- case POOL_STATE_UNAVAIL:
- return (gettext("UNAVAIL"));
- case POOL_STATE_POTENTIALLY_ACTIVE:
- return (gettext("POTENTIALLY_ACTIVE"));
- }
-
- return (gettext("UNKNOWN"));
-}
-
-/*
- * Get a zpool property value for 'prop' and return the value in
- * a pre-allocated buffer.
- */
-int
-zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
- zprop_source_t *srctype, boolean_t literal)
-{
- uint64_t intval;
- const char *strval;
- zprop_source_t src = ZPROP_SRC_NONE;
- nvlist_t *nvroot;
- vdev_stat_t *vs;
- uint_t vsc;
-
- if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- switch (prop) {
- case ZPOOL_PROP_NAME:
- (void) strlcpy(buf, zpool_get_name(zhp), len);
- break;
-
- case ZPOOL_PROP_HEALTH:
- (void) strlcpy(buf,
- zpool_pool_state_to_name(POOL_STATE_UNAVAIL), len);
- break;
-
- case ZPOOL_PROP_GUID:
- intval = zpool_get_prop_int(zhp, prop, &src);
- (void) snprintf(buf, len, "%llu", intval);
- break;
-
- case ZPOOL_PROP_ALTROOT:
- case ZPOOL_PROP_CACHEFILE:
- case ZPOOL_PROP_COMMENT:
- if (zhp->zpool_props != NULL ||
- zpool_get_all_props(zhp) == 0) {
- (void) strlcpy(buf,
- zpool_get_prop_string(zhp, prop, &src),
- len);
- break;
- }
- /* FALLTHROUGH */
- default:
- (void) strlcpy(buf, "-", len);
- break;
- }
-
- if (srctype != NULL)
- *srctype = src;
- return (0);
- }
-
- if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) &&
- prop != ZPOOL_PROP_NAME)
- return (-1);
-
- switch (zpool_prop_get_type(prop)) {
- case PROP_TYPE_STRING:
- (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src),
- len);
- break;
-
- case PROP_TYPE_NUMBER:
- intval = zpool_get_prop_int(zhp, prop, &src);
-
- switch (prop) {
- case ZPOOL_PROP_SIZE:
- case ZPOOL_PROP_ALLOCATED:
- case ZPOOL_PROP_FREE:
- case ZPOOL_PROP_FREEING:
- case ZPOOL_PROP_LEAKED:
- if (literal) {
- (void) snprintf(buf, len, "%llu",
- (u_longlong_t)intval);
- } else {
- (void) zfs_nicenum(intval, buf, len);
- }
- break;
- case ZPOOL_PROP_BOOTSIZE:
- case ZPOOL_PROP_EXPANDSZ:
- case ZPOOL_PROP_CHECKPOINT:
- if (intval == 0) {
- (void) strlcpy(buf, "-", len);
- } else if (literal) {
- (void) snprintf(buf, len, "%llu",
- (u_longlong_t)intval);
- } else {
- (void) zfs_nicenum(intval, buf, len);
- }
- break;
- case ZPOOL_PROP_CAPACITY:
- if (literal) {
- (void) snprintf(buf, len, "%llu",
- (u_longlong_t)intval);
- } else {
- (void) snprintf(buf, len, "%llu%%",
- (u_longlong_t)intval);
- }
- break;
- case ZPOOL_PROP_FRAGMENTATION:
- if (intval == UINT64_MAX) {
- (void) strlcpy(buf, "-", len);
- } else {
- (void) snprintf(buf, len, "%llu%%",
- (u_longlong_t)intval);
- }
- break;
- case ZPOOL_PROP_DEDUPRATIO:
- (void) snprintf(buf, len, "%llu.%02llux",
- (u_longlong_t)(intval / 100),
- (u_longlong_t)(intval % 100));
- break;
- case ZPOOL_PROP_HEALTH:
- verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
- == 0);
-
- (void) strlcpy(buf, zpool_state_to_name(intval,
- vs->vs_aux), len);
- break;
- case ZPOOL_PROP_VERSION:
- if (intval >= SPA_VERSION_FEATURES) {
- (void) snprintf(buf, len, "-");
- break;
- }
- /* FALLTHROUGH */
- default:
- (void) snprintf(buf, len, "%llu", intval);
- }
- break;
-
- case PROP_TYPE_INDEX:
- intval = zpool_get_prop_int(zhp, prop, &src);
- if (zpool_prop_index_to_string(prop, intval, &strval)
- != 0)
- return (-1);
- (void) strlcpy(buf, strval, len);
- break;
-
- default:
- abort();
- }
-
- if (srctype)
- *srctype = src;
-
- return (0);
-}
-
-/*
- * Check if the bootfs name has the same pool name as it is set to.
- * Assuming bootfs is a valid dataset name.
- */
-static boolean_t
-bootfs_name_valid(const char *pool, const char *bootfs)
-{
- int len = strlen(pool);
-
- if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT))
- return (B_FALSE);
-
- if (strncmp(pool, bootfs, len) == 0 &&
- (bootfs[len] == '/' || bootfs[len] == '\0'))
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-boolean_t
-zpool_is_bootable(zpool_handle_t *zhp)
-{
- char bootfs[ZFS_MAX_DATASET_NAME_LEN];
-
- return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs,
- sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-",
- sizeof (bootfs)) != 0);
-}
-
-
-/*
- * Given an nvlist of zpool properties to be set, validate that they are
- * correct, and parse any numeric properties (index, boolean, etc) if they are
- * specified as strings.
- */
-static nvlist_t *
-zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
- nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
-{
- nvpair_t *elem;
- nvlist_t *retprops;
- zpool_prop_t prop;
- char *strval;
- uint64_t intval;
- char *slash, *check;
- struct stat64 statbuf;
- zpool_handle_t *zhp;
-
- if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) {
- (void) no_memory(hdl);
- return (NULL);
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
-
- prop = zpool_name_to_prop(propname);
- if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) {
- int err;
- char *fname = strchr(propname, '@') + 1;
-
- err = zfeature_lookup_name(fname, NULL);
- if (err != 0) {
- ASSERT3U(err, ==, ENOENT);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid feature '%s'"), fname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (nvpair_type(elem) != DATA_TYPE_STRING) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be a string"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- (void) nvpair_value_string(elem, &strval);
- if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' can only be set to "
- "'enabled'"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (nvlist_add_uint64(retprops, propname, 0) != 0) {
- (void) no_memory(hdl);
- goto error;
- }
- continue;
- }
-
- /*
- * Make sure this property is valid and applies to this type.
- */
- if (prop == ZPOOL_PROP_INVAL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid property '%s'"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (zpool_prop_readonly(prop)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
- "is readonly"), propname);
- (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
- goto error;
- }
-
- if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops,
- &strval, &intval, errbuf) != 0)
- goto error;
-
- /*
- * Perform additional checking for specific properties.
- */
- switch (prop) {
- case ZPOOL_PROP_VERSION:
- if (intval < version ||
- !SPA_VERSION_IS_SUPPORTED(intval)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' number %d is invalid."),
- propname, intval);
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- goto error;
- }
- break;
-
- case ZPOOL_PROP_BOOTSIZE:
- if (!flags.create) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' can only be set during pool "
- "creation"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
-
- case ZPOOL_PROP_BOOTFS:
- if (flags.create || flags.import) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' cannot be set at creation "
- "or import time"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (version < SPA_VERSION_BOOTFS) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded to support "
- "'%s' property"), propname);
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- goto error;
- }
-
- /*
- * bootfs property value has to be a dataset name and
- * the dataset has to be in the same pool as it sets to.
- */
- if (strval[0] != '\0' && !bootfs_name_valid(poolname,
- strval)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
- "is an invalid name"), strval);
- (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
- goto error;
- }
-
- if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "could not open pool '%s'"), poolname);
- (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
- goto error;
- }
- zpool_close(zhp);
- break;
-
- case ZPOOL_PROP_ALTROOT:
- if (!flags.create && !flags.import) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' can only be set during pool "
- "creation or import"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
-
- if (strval[0] != '/') {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "bad alternate root '%s'"), strval);
- (void) zfs_error(hdl, EZFS_BADPATH, errbuf);
- goto error;
- }
- break;
-
- case ZPOOL_PROP_CACHEFILE:
- if (strval[0] == '\0')
- break;
-
- if (strcmp(strval, "none") == 0)
- break;
-
- if (strval[0] != '/') {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' must be empty, an "
- "absolute path, or 'none'"), propname);
- (void) zfs_error(hdl, EZFS_BADPATH, errbuf);
- goto error;
- }
-
- slash = strrchr(strval, '/');
-
- if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
- strcmp(slash, "/..") == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' is not a valid file"), strval);
- (void) zfs_error(hdl, EZFS_BADPATH, errbuf);
- goto error;
- }
-
- *slash = '\0';
-
- if (strval[0] != '\0' &&
- (stat64(strval, &statbuf) != 0 ||
- !S_ISDIR(statbuf.st_mode))) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' is not a valid directory"),
- strval);
- (void) zfs_error(hdl, EZFS_BADPATH, errbuf);
- goto error;
- }
-
- *slash = '/';
- break;
-
- case ZPOOL_PROP_COMMENT:
- for (check = strval; *check != '\0'; check++) {
- if (!isprint(*check)) {
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN,
- "comment may only have printable "
- "characters"));
- (void) zfs_error(hdl, EZFS_BADPROP,
- errbuf);
- goto error;
- }
- }
- if (strlen(strval) > ZPROP_MAX_COMMENT) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "comment must not exceed %d characters"),
- ZPROP_MAX_COMMENT);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
-
- case ZPOOL_PROP_READONLY:
- if (!flags.import) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' can only be set at "
- "import time"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
-
- case ZPOOL_PROP_TNAME:
- if (!flags.create) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s' can only be set at "
- "creation time"), propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
-
- case ZPOOL_PROP_MULTIHOST:
- if (get_system_hostid() == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "requires a non-zero system hostid"));
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
- }
- break;
-
- default:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property '%s'(%d) not defined"), propname, prop);
- break;
- }
- }
-
- return (retprops);
-error:
- nvlist_free(retprops);
- return (NULL);
-}
-
-/*
- * Set zpool property : propname=propval.
- */
-int
-zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
-{
- zfs_cmd_t zc = { 0 };
- int ret = -1;
- char errbuf[1024];
- nvlist_t *nvl = NULL;
- nvlist_t *realprops;
- uint64_t version;
- prop_flags_t flags = { 0 };
-
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
- zhp->zpool_name);
-
- if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
- return (no_memory(zhp->zpool_hdl));
-
- if (nvlist_add_string(nvl, propname, propval) != 0) {
- nvlist_free(nvl);
- return (no_memory(zhp->zpool_hdl));
- }
-
- version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
- if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
- zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) {
- nvlist_free(nvl);
- return (-1);
- }
-
- nvlist_free(nvl);
- nvl = realprops;
-
- /*
- * Execute the corresponding ioctl() to set this property.
- */
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-
- if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) {
- nvlist_free(nvl);
- return (-1);
- }
-
- ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc);
-
- zcmd_free_nvlists(&zc);
- nvlist_free(nvl);
-
- if (ret)
- (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
- else
- (void) zpool_props_refresh(zhp);
-
- return (ret);
-}
-
-int
-zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
-{
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- zprop_list_t *entry;
- char buf[ZFS_MAXPROPLEN];
- nvlist_t *features = NULL;
- zprop_list_t **last;
- boolean_t firstexpand = (NULL == *plp);
-
- if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
- return (-1);
-
- last = plp;
- while (*last != NULL)
- last = &(*last)->pl_next;
-
- if ((*plp)->pl_all)
- features = zpool_get_features(zhp);
-
- if ((*plp)->pl_all && firstexpand) {
- for (int i = 0; i < SPA_FEATURES; i++) {
- zprop_list_t *entry = zfs_alloc(hdl,
- sizeof (zprop_list_t));
- entry->pl_prop = ZPROP_INVAL;
- entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s",
- spa_feature_table[i].fi_uname);
- entry->pl_width = strlen(entry->pl_user_prop);
- entry->pl_all = B_TRUE;
-
- *last = entry;
- last = &entry->pl_next;
- }
- }
-
- /* add any unsupported features */
- for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL);
- nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
- char *propname;
- boolean_t found;
- zprop_list_t *entry;
-
- if (zfeature_is_supported(nvpair_name(nvp)))
- continue;
-
- propname = zfs_asprintf(hdl, "unsupported@%s",
- nvpair_name(nvp));
-
- /*
- * Before adding the property to the list make sure that no
- * other pool already added the same property.
- */
- found = B_FALSE;
- entry = *plp;
- while (entry != NULL) {
- if (entry->pl_user_prop != NULL &&
- strcmp(propname, entry->pl_user_prop) == 0) {
- found = B_TRUE;
- break;
- }
- entry = entry->pl_next;
- }
- if (found) {
- free(propname);
- continue;
- }
-
- entry = zfs_alloc(hdl, sizeof (zprop_list_t));
- entry->pl_prop = ZPROP_INVAL;
- entry->pl_user_prop = propname;
- entry->pl_width = strlen(entry->pl_user_prop);
- entry->pl_all = B_TRUE;
-
- *last = entry;
- last = &entry->pl_next;
- }
-
- for (entry = *plp; entry != NULL; entry = entry->pl_next) {
-
- if (entry->pl_fixed)
- continue;
-
- if (entry->pl_prop != ZPROP_INVAL &&
- zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf),
- NULL, B_FALSE) == 0) {
- if (strlen(buf) > entry->pl_width)
- entry->pl_width = strlen(buf);
- }
- }
-
- return (0);
-}
-
-/*
- * Get the state for the given feature on the given ZFS pool.
- */
-int
-zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
- size_t len)
-{
- uint64_t refcount;
- boolean_t found = B_FALSE;
- nvlist_t *features = zpool_get_features(zhp);
- boolean_t supported;
- const char *feature = strchr(propname, '@') + 1;
-
- supported = zpool_prop_feature(propname);
- ASSERT(supported || zpool_prop_unsupported(propname));
-
- /*
- * Convert from feature name to feature guid. This conversion is
- * unecessary for unsupported@... properties because they already
- * use guids.
- */
- if (supported) {
- int ret;
- spa_feature_t fid;
-
- ret = zfeature_lookup_name(feature, &fid);
- if (ret != 0) {
- (void) strlcpy(buf, "-", len);
- return (ENOTSUP);
- }
- feature = spa_feature_table[fid].fi_guid;
- }
-
- if (nvlist_lookup_uint64(features, feature, &refcount) == 0)
- found = B_TRUE;
-
- if (supported) {
- if (!found) {
- (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len);
- } else {
- if (refcount == 0)
- (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len);
- else
- (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len);
- }
- } else {
- if (found) {
- if (refcount == 0) {
- (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE);
- } else {
- (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY);
- }
- } else {
- (void) strlcpy(buf, "-", len);
- return (ENOTSUP);
- }
- }
-
- return (0);
-}
-
-/*
- * Don't start the slice at the default block of 34; many storage
- * devices will use a stripe width of 128k, so start there instead.
- */
-#define NEW_START_BLOCK 256
-
-/*
- * Validate the given pool name, optionally putting an extended error message in
- * 'buf'.
- */
-boolean_t
-zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
-{
- namecheck_err_t why;
- char what;
- int ret;
-
- ret = pool_namecheck(pool, &why, &what);
-
- /*
- * The rules for reserved pool names were extended at a later point.
- * But we need to support users with existing pools that may now be
- * invalid. So we only check for this expanded set of names during a
- * create (or import), and only in userland.
- */
- if (ret == 0 && !isopen &&
- (strncmp(pool, "mirror", 6) == 0 ||
- strncmp(pool, "raidz", 5) == 0 ||
- strncmp(pool, "spare", 5) == 0 ||
- strcmp(pool, "log") == 0)) {
- if (hdl != NULL)
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "name is reserved"));
- return (B_FALSE);
- }
-
-
- if (ret != 0) {
- if (hdl != NULL) {
- switch (why) {
- case NAME_ERR_TOOLONG:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "name is too long"));
- break;
-
- case NAME_ERR_INVALCHAR:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "invalid character "
- "'%c' in pool name"), what);
- break;
-
- case NAME_ERR_NOLETTER:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "name must begin with a letter"));
- break;
-
- case NAME_ERR_RESERVED:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "name is reserved"));
- break;
-
- case NAME_ERR_DISKLIKE:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool name is reserved"));
- break;
-
- case NAME_ERR_LEADING_SLASH:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "leading slash in name"));
- break;
-
- case NAME_ERR_EMPTY_COMPONENT:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "empty component in name"));
- break;
-
- case NAME_ERR_TRAILING_SLASH:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "trailing slash in name"));
- break;
-
- case NAME_ERR_MULTIPLE_DELIMITERS:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "multiple '@' and/or '#' delimiters in "
- "name"));
- break;
-
- default:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "(%d) not defined"), why);
- break;
- }
- }
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-/*
- * Open a handle to the given pool, even if the pool is currently in the FAULTED
- * state.
- */
-zpool_handle_t *
-zpool_open_canfail(libzfs_handle_t *hdl, const char *pool)
-{
- zpool_handle_t *zhp;
- boolean_t missing;
-
- /*
- * Make sure the pool name is valid.
- */
- if (!zpool_name_valid(hdl, B_TRUE, pool)) {
- (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME,
- dgettext(TEXT_DOMAIN, "cannot open '%s'"),
- pool);
- return (NULL);
- }
-
- if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
- return (NULL);
-
- zhp->zpool_hdl = hdl;
- (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
-
- if (zpool_refresh_stats(zhp, &missing) != 0) {
- zpool_close(zhp);
- return (NULL);
- }
-
- if (missing) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool"));
- (void) zfs_error_fmt(hdl, EZFS_NOENT,
- dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool);
- zpool_close(zhp);
- return (NULL);
- }
-
- return (zhp);
-}
-
-/*
- * Like the above, but silent on error. Used when iterating over pools (because
- * the configuration cache may be out of date).
- */
-int
-zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret)
-{
- zpool_handle_t *zhp;
- boolean_t missing;
-
- if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
- return (-1);
-
- zhp->zpool_hdl = hdl;
- (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
-
- if (zpool_refresh_stats(zhp, &missing) != 0) {
- zpool_close(zhp);
- return (-1);
- }
-
- if (missing) {
- zpool_close(zhp);
- *ret = NULL;
- return (0);
- }
-
- *ret = zhp;
- return (0);
-}
-
-/*
- * Similar to zpool_open_canfail(), but refuses to open pools in the faulted
- * state.
- */
-zpool_handle_t *
-zpool_open(libzfs_handle_t *hdl, const char *pool)
-{
- zpool_handle_t *zhp;
-
- if ((zhp = zpool_open_canfail(hdl, pool)) == NULL)
- return (NULL);
-
- if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
- (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
- dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name);
- zpool_close(zhp);
- return (NULL);
- }
-
- return (zhp);
-}
-
-/*
- * Close the handle. Simply frees the memory associated with the handle.
- */
-void
-zpool_close(zpool_handle_t *zhp)
-{
- nvlist_free(zhp->zpool_config);
- nvlist_free(zhp->zpool_old_config);
- nvlist_free(zhp->zpool_props);
- free(zhp);
-}
-
-/*
- * Return the name of the pool.
- */
-const char *
-zpool_get_name(zpool_handle_t *zhp)
-{
- return (zhp->zpool_name);
-}
-
-
-/*
- * Return the state of the pool (ACTIVE or UNAVAILABLE)
- */
-int
-zpool_get_state(zpool_handle_t *zhp)
-{
- return (zhp->zpool_state);
-}
-
-/*
- * Check if vdev list contains a special vdev
- */
-static boolean_t
-zpool_has_special_vdev(nvlist_t *nvroot)
-{
- nvlist_t **child;
- uint_t children;
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child,
- &children) == 0) {
- for (uint_t c = 0; c < children; c++) {
- char *bias;
-
- if (nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 &&
- strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
- return (B_TRUE);
- }
- }
- }
- return (B_FALSE);
-}
-
-/*
- * Create the named pool, using the provided vdev list. It is assumed
- * that the consumer has already validated the contents of the nvlist, so we
- * don't have to worry about error semantics.
- */
-int
-zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
- nvlist_t *props, nvlist_t *fsprops)
-{
- zfs_cmd_t zc = { 0 };
- nvlist_t *zc_fsprops = NULL;
- nvlist_t *zc_props = NULL;
- char msg[1024];
- int ret = -1;
-
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot create '%s'"), pool);
-
- if (!zpool_name_valid(hdl, B_FALSE, pool))
- return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
-
- if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
- return (-1);
-
- if (props) {
- prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };
-
- if ((zc_props = zpool_valid_proplist(hdl, pool, props,
- SPA_VERSION_1, flags, msg)) == NULL) {
- goto create_failed;
- }
- }
-
- if (fsprops) {
- uint64_t zoned;
- char *zonestr;
-
- zoned = ((nvlist_lookup_string(fsprops,
- zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) &&
- strcmp(zonestr, "on") == 0);
-
- if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM,
- fsprops, zoned, NULL, NULL, msg)) == NULL) {
- goto create_failed;
- }
-
- if (nvlist_exists(zc_fsprops,
- zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) &&
- !zpool_has_special_vdev(nvroot)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "%s property requires a special vdev"),
- zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS));
- (void) zfs_error(hdl, EZFS_BADPROP, msg);
- goto create_failed;
- }
-
- if (!zc_props &&
- (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
- goto create_failed;
- }
- if (nvlist_add_nvlist(zc_props,
- ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) {
- goto create_failed;
- }
- }
-
- if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
- goto create_failed;
-
- (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
-
- if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) {
-
- zcmd_free_nvlists(&zc);
- nvlist_free(zc_props);
- nvlist_free(zc_fsprops);
-
- switch (errno) {
- case EBUSY:
- /*
- * This can happen if the user has specified the same
- * device multiple times. We can't reliably detect this
- * until we try to add it and see we already have a
- * label.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "one or more vdevs refer to the same device"));
- return (zfs_error(hdl, EZFS_BADDEV, msg));
-
- case ERANGE:
- /*
- * This happens if the record size is smaller or larger
- * than the allowed size range, or not a power of 2.
- *
- * NOTE: although zfs_valid_proplist is called earlier,
- * this case may have slipped through since the
- * pool does not exist yet and it is therefore
- * impossible to read properties e.g. max blocksize
- * from the pool.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "record size invalid"));
- return (zfs_error(hdl, EZFS_BADPROP, msg));
-
- case EOVERFLOW:
- /*
- * This occurs when one of the devices is below
- * SPA_MINDEVSIZE. Unfortunately, we can't detect which
- * device was the problem device since there's no
- * reliable way to determine device size from userland.
- */
- {
- char buf[64];
-
- zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
-
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "one or more devices is less than the "
- "minimum size (%s)"), buf);
- }
- return (zfs_error(hdl, EZFS_BADDEV, msg));
-
- case ENOSPC:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "one or more devices is out of space"));
- return (zfs_error(hdl, EZFS_BADDEV, msg));
-
- case ENOTBLK:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "cache device must be a disk or disk slice"));
- return (zfs_error(hdl, EZFS_BADDEV, msg));
-
- default:
- return (zpool_standard_error(hdl, errno, msg));
- }
- }
-
-create_failed:
- zcmd_free_nvlists(&zc);
- nvlist_free(zc_props);
- nvlist_free(zc_fsprops);
- return (ret);
-}
-
-/*
- * Destroy the given pool. It is up to the caller to ensure that there are no
- * datasets left in the pool.
- */
-int
-zpool_destroy(zpool_handle_t *zhp, const char *log_str)
-{
- zfs_cmd_t zc = { 0 };
- zfs_handle_t *zfp = NULL;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- char msg[1024];
-
- if (zhp->zpool_state == POOL_STATE_ACTIVE &&
- (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
- return (-1);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_history = (uint64_t)(uintptr_t)log_str;
-
- if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot destroy '%s'"), zhp->zpool_name);
-
- if (errno == EROFS) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "one or more devices is read only"));
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- } else {
- (void) zpool_standard_error(hdl, errno, msg);
- }
-
- if (zfp)
- zfs_close(zfp);
- return (-1);
- }
-
- if (zfp) {
- remove_mountpoint(zfp);
- zfs_close(zfp);
- }
-
- return (0);
-}
-
-/*
- * Create a checkpoint in the given pool.
- */
-int
-zpool_checkpoint(zpool_handle_t *zhp)
-{
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- char msg[1024];
- int error;
-
- error = lzc_pool_checkpoint(zhp->zpool_name);
- if (error != 0) {
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot checkpoint '%s'"), zhp->zpool_name);
- (void) zpool_standard_error(hdl, error, msg);
- return (-1);
- }
-
- return (0);
-}
-
-/*
- * Discard the checkpoint from the given pool.
- */
-int
-zpool_discard_checkpoint(zpool_handle_t *zhp)
-{
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- char msg[1024];
- int error;
-
- error = lzc_pool_checkpoint_discard(zhp->zpool_name);
- if (error != 0) {
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot discard checkpoint in '%s'"), zhp->zpool_name);
- (void) zpool_standard_error(hdl, error, msg);
- return (-1);
- }
-
- return (0);
-}
-
-/*
- * Add the given vdevs to the pool. The caller must have already performed the
- * necessary verification to ensure that the vdev specification is well-formed.
- */
-int
-zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
-{
- zfs_cmd_t zc = { 0 };
- int ret;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- char msg[1024];
- nvlist_t **spares, **l2cache;
- uint_t nspares, nl2cache;
-
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot add to '%s'"), zhp->zpool_name);
-
- if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
- SPA_VERSION_SPARES &&
- nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
- "upgraded to add hot spares"));
- return (zfs_error(hdl, EZFS_BADVERSION, msg));
- }
-
- if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
- SPA_VERSION_L2CACHE &&
- nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache) == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
- "upgraded to add cache devices"));
- return (zfs_error(hdl, EZFS_BADVERSION, msg));
- }
-
- if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
- return (-1);
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-
- if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
- switch (errno) {
- case EBUSY:
- /*
- * This can happen if the user has specified the same
- * device multiple times. We can't reliably detect this
- * until we try to add it and see we already have a
- * label.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "one or more vdevs refer to the same device"));
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- break;
-
- case EINVAL:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid config; a pool with removing/removed "
- "vdevs does not support adding raidz vdevs"));
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- break;
-
- case EOVERFLOW:
- /*
- * This occurrs when one of the devices is below
- * SPA_MINDEVSIZE. Unfortunately, we can't detect which
- * device was the problem device since there's no
- * reliable way to determine device size from userland.
- */
- {
- char buf[64];
-
- zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
-
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "device is less than the minimum "
- "size (%s)"), buf);
- }
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- break;
-
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded to add these vdevs"));
- (void) zfs_error(hdl, EZFS_BADVERSION, msg);
- break;
-
- case EDOM:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "root pool can not have multiple vdevs"
- " or separate logs"));
- (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg);
- break;
-
- case ENOTBLK:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "cache device must be a disk or disk slice"));
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- break;
-
- default:
- (void) zpool_standard_error(hdl, errno, msg);
- }
-
- ret = -1;
- } else {
- ret = 0;
- }
-
- zcmd_free_nvlists(&zc);
-
- return (ret);
-}
-
-/*
- * Exports the pool from the system. The caller must ensure that there are no
- * mounted datasets in the pool.
- */
-static int
-zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
- const char *log_str)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
-
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot export '%s'"), zhp->zpool_name);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_cookie = force;
- zc.zc_guid = hardforce;
- zc.zc_history = (uint64_t)(uintptr_t)log_str;
-
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
- switch (errno) {
- case EXDEV:
- zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
- "use '-f' to override the following errors:\n"
- "'%s' has an active shared spare which could be"
- " used by other pools once '%s' is exported."),
- zhp->zpool_name, zhp->zpool_name);
- return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE,
- msg));
- default:
- return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
- msg));
- }
- }
-
- return (0);
-}
-
-int
-zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
-{
- return (zpool_export_common(zhp, force, B_FALSE, log_str));
-}
-
-int
-zpool_export_force(zpool_handle_t *zhp, const char *log_str)
-{
- return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str));
-}
-
-static void
-zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
- nvlist_t *config)
-{
- nvlist_t *nv = NULL;
- uint64_t rewindto;
- int64_t loss = -1;
- struct tm t;
- char timestr[128];
-
- if (!hdl->libzfs_printerr || config == NULL)
- return;
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
- nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) {
- return;
- }
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
- return;
- (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
-
- if (localtime_r((time_t *)&rewindto, &t) != NULL &&
- strftime(timestr, 128, 0, &t) != 0) {
- if (dryrun) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "Would be able to return %s "
- "to its state as of %s.\n"),
- name, timestr);
- } else {
- (void) printf(dgettext(TEXT_DOMAIN,
- "Pool %s returned to its state as of %s.\n"),
- name, timestr);
- }
- if (loss > 120) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "%s approximately %lld "),
- dryrun ? "Would discard" : "Discarded",
- (loss + 30) / 60);
- (void) printf(dgettext(TEXT_DOMAIN,
- "minutes of transactions.\n"));
- } else if (loss > 0) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "%s approximately %lld "),
- dryrun ? "Would discard" : "Discarded", loss);
- (void) printf(dgettext(TEXT_DOMAIN,
- "seconds of transactions.\n"));
- }
- }
-}
-
-void
-zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
- nvlist_t *config)
-{
- nvlist_t *nv = NULL;
- int64_t loss = -1;
- uint64_t edata = UINT64_MAX;
- uint64_t rewindto;
- struct tm t;
- char timestr[128];
-
- if (!hdl->libzfs_printerr)
- return;
-
- if (reason >= 0)
- (void) printf(dgettext(TEXT_DOMAIN, "action: "));
- else
- (void) printf(dgettext(TEXT_DOMAIN, "\t"));
-
- /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
- nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
- goto no_info;
-
- (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
- &edata);
-
- (void) printf(dgettext(TEXT_DOMAIN,
- "Recovery is possible, but will result in some data loss.\n"));
-
- if (localtime_r((time_t *)&rewindto, &t) != NULL &&
- strftime(timestr, 128, 0, &t) != 0) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "\tReturning the pool to its state as of %s\n"
- "\tshould correct the problem. "),
- timestr);
- } else {
- (void) printf(dgettext(TEXT_DOMAIN,
- "\tReverting the pool to an earlier state "
- "should correct the problem.\n\t"));
- }
-
- if (loss > 120) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "Approximately %lld minutes of data\n"
- "\tmust be discarded, irreversibly. "), (loss + 30) / 60);
- } else if (loss > 0) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "Approximately %lld seconds of data\n"
- "\tmust be discarded, irreversibly. "), loss);
- }
- if (edata != 0 && edata != UINT64_MAX) {
- if (edata == 1) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "After rewind, at least\n"
- "\tone persistent user-data error will remain. "));
- } else {
- (void) printf(dgettext(TEXT_DOMAIN,
- "After rewind, several\n"
- "\tpersistent user-data errors will remain. "));
- }
- }
- (void) printf(dgettext(TEXT_DOMAIN,
- "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "),
- reason >= 0 ? "clear" : "import", name);
-
- (void) printf(dgettext(TEXT_DOMAIN,
- "A scrub of the pool\n"
- "\tis strongly recommended after recovery.\n"));
- return;
-
-no_info:
- (void) printf(dgettext(TEXT_DOMAIN,
- "Destroy and re-create the pool from\n\ta backup source.\n"));
-}
-
-/*
- * zpool_import() is a contracted interface. Should be kept the same
- * if possible.
- *
- * Applications should use zpool_import_props() to import a pool with
- * new properties value to be set.
- */
-int
-zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
- char *altroot)
-{
- nvlist_t *props = NULL;
- int ret;
-
- if (altroot != NULL) {
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
- return (zfs_error_fmt(hdl, EZFS_NOMEM,
- dgettext(TEXT_DOMAIN, "cannot import '%s'"),
- newname));
- }
-
- if (nvlist_add_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
- nvlist_add_string(props,
- zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) {
- nvlist_free(props);
- return (zfs_error_fmt(hdl, EZFS_NOMEM,
- dgettext(TEXT_DOMAIN, "cannot import '%s'"),
- newname));
- }
- }
-
- ret = zpool_import_props(hdl, config, newname, props,
- ZFS_IMPORT_NORMAL);
- nvlist_free(props);
- return (ret);
-}
-
-static void
-print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
- int indent)
-{
- nvlist_t **child;
- uint_t c, children;
- char *vname;
- uint64_t is_log = 0;
-
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG,
- &is_log);
-
- if (name != NULL)
- (void) printf("\t%*s%s%s\n", indent, "", name,
- is_log ? " [log]" : "");
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0)
- return;
-
- for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID);
- print_vdev_tree(hdl, vname, child[c], indent + 2);
- free(vname);
- }
-}
-
-void
-zpool_print_unsup_feat(nvlist_t *config)
-{
- nvlist_t *nvinfo, *unsup_feat;
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) ==
- 0);
- verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT,
- &unsup_feat) == 0);
-
- for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL;
- nvp = nvlist_next_nvpair(unsup_feat, nvp)) {
- char *desc;
-
- verify(nvpair_type(nvp) == DATA_TYPE_STRING);
- verify(nvpair_value_string(nvp, &desc) == 0);
-
- if (strlen(desc) > 0)
- (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc);
- else
- (void) printf("\t%s\n", nvpair_name(nvp));
- }
-}
-
-/*
- * Import the given pool using the known configuration and a list of
- * properties to be set. The configuration should have come from
- * zpool_find_import(). The 'newname' parameters control whether the pool
- * is imported with a different name.
- */
-int
-zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
- nvlist_t *props, int flags)
-{
- zfs_cmd_t zc = { 0 };
- zpool_load_policy_t policy;
- nvlist_t *nv = NULL;
- nvlist_t *nvinfo = NULL;
- nvlist_t *missing = NULL;
- char *thename;
- char *origname;
- int ret;
- int error = 0;
- char errbuf[1024];
-
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &origname) == 0);
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot import pool '%s'"), origname);
-
- if (newname != NULL) {
- if (!zpool_name_valid(hdl, B_FALSE, newname))
- return (zfs_error_fmt(hdl, EZFS_INVALIDNAME,
- dgettext(TEXT_DOMAIN, "cannot import '%s'"),
- newname));
- thename = (char *)newname;
- } else {
- thename = origname;
- }
-
- if (props != NULL) {
- uint64_t version;
- prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
-
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) == 0);
-
- if ((props = zpool_valid_proplist(hdl, origname,
- props, version, flags, errbuf)) == NULL)
- return (-1);
- if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
- nvlist_free(props);
- return (-1);
- }
- nvlist_free(props);
- }
-
- (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));
-
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &zc.zc_guid) == 0);
-
- if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
-
- zc.zc_cookie = flags;
- while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
- errno == ENOMEM) {
- if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- }
- if (ret != 0)
- error = errno;
-
- (void) zcmd_read_dst_nvlist(hdl, &zc, &nv);
-
- zcmd_free_nvlists(&zc);
-
- zpool_get_load_policy(config, &policy);
-
- if (error) {
- char desc[1024];
- char aux[256];
-
- /*
- * Dry-run failed, but we print out what success
- * looks like if we found a best txg
- */
- if (policy.zlp_rewind & ZPOOL_TRY_REWIND) {
- zpool_rewind_exclaim(hdl, newname ? origname : thename,
- B_TRUE, nv);
- nvlist_free(nv);
- return (-1);
- }
-
- if (newname == NULL)
- (void) snprintf(desc, sizeof (desc),
- dgettext(TEXT_DOMAIN, "cannot import '%s'"),
- thename);
- else
- (void) snprintf(desc, sizeof (desc),
- dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
- origname, thename);
-
- switch (error) {
- case ENOTSUP:
- if (nv != NULL && nvlist_lookup_nvlist(nv,
- ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
- nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) {
- (void) printf(dgettext(TEXT_DOMAIN, "This "
- "pool uses the following feature(s) not "
- "supported by this system:\n"));
- zpool_print_unsup_feat(nv);
- if (nvlist_exists(nvinfo,
- ZPOOL_CONFIG_CAN_RDONLY)) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "All unsupported features are only "
- "required for writing to the pool."
- "\nThe pool can be imported using "
- "'-o readonly=on'.\n"));
- }
- }
- /*
- * Unsupported version.
- */
- (void) zfs_error(hdl, EZFS_BADVERSION, desc);
- break;
-
- case EREMOTEIO:
- if (nv != NULL && nvlist_lookup_nvlist(nv,
- ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) {
- char *hostname = "<unknown>";
- uint64_t hostid = 0;
- mmp_state_t mmp_state;
-
- mmp_state = fnvlist_lookup_uint64(nvinfo,
- ZPOOL_CONFIG_MMP_STATE);
-
- if (nvlist_exists(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTNAME))
- hostname = fnvlist_lookup_string(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTNAME);
-
- if (nvlist_exists(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTID))
- hostid = fnvlist_lookup_uint64(nvinfo,
- ZPOOL_CONFIG_MMP_HOSTID);
-
- if (mmp_state == MMP_STATE_ACTIVE) {
- (void) snprintf(aux, sizeof (aux),
- dgettext(TEXT_DOMAIN, "pool is imp"
- "orted on host '%s' (hostid=%lx).\n"
- "Export the pool on the other "
- "system, then run 'zpool import'."),
- hostname, (unsigned long) hostid);
- } else if (mmp_state == MMP_STATE_NO_HOSTID) {
- (void) snprintf(aux, sizeof (aux),
- dgettext(TEXT_DOMAIN, "pool has "
- "the multihost property on and "
- "the\nsystem's hostid is not "
- "set.\n"));
- }
-
- (void) zfs_error_aux(hdl, aux);
- }
- (void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc);
- break;
-
- case EINVAL:
- (void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
- break;
-
- case EROFS:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "one or more devices is read only"));
- (void) zfs_error(hdl, EZFS_BADDEV, desc);
- break;
-
- case ENXIO:
- if (nv && nvlist_lookup_nvlist(nv,
- ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
- nvlist_lookup_nvlist(nvinfo,
- ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
- (void) printf(dgettext(TEXT_DOMAIN,
- "The devices below are missing or "
- "corrupted, use '-m' to import the pool "
- "anyway:\n"));
- print_vdev_tree(hdl, NULL, missing, 2);
- (void) printf("\n");
- }
- (void) zpool_standard_error(hdl, error, desc);
- break;
-
- case EEXIST:
- (void) zpool_standard_error(hdl, error, desc);
- break;
- case ENAMETOOLONG:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "new name of at least one dataset is longer than "
- "the maximum allowable length"));
- (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc);
- break;
- default:
- (void) zpool_standard_error(hdl, error, desc);
- zpool_explain_recover(hdl,
- newname ? origname : thename, -error, nv);
- break;
- }
-
- nvlist_free(nv);
- ret = -1;
- } else {
- zpool_handle_t *zhp;
-
- /*
- * This should never fail, but play it safe anyway.
- */
- if (zpool_open_silent(hdl, thename, &zhp) != 0)
- ret = -1;
- else if (zhp != NULL)
- zpool_close(zhp);
- if (policy.zlp_rewind &
- (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
- zpool_rewind_exclaim(hdl, newname ? origname : thename,
- ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv);
- }
- nvlist_free(nv);
- return (0);
- }
-
- return (ret);
-}
-
-/*
- * Scan the pool.
- */
-int
-zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- int err;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_cookie = func;
- zc.zc_flags = cmd;
-
- if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
- return (0);
-
- err = errno;
-
- /* ECANCELED on a scrub means we resumed a paused scrub */
- if (err == ECANCELED && func == POOL_SCAN_SCRUB &&
- cmd == POOL_SCRUB_NORMAL)
- return (0);
-
- if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL)
- return (0);
-
- if (func == POOL_SCAN_SCRUB) {
- if (cmd == POOL_SCRUB_PAUSE) {
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot pause scrubbing %s"), zc.zc_name);
- } else {
- assert(cmd == POOL_SCRUB_NORMAL);
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot scrub %s"), zc.zc_name);
- }
- } else if (func == POOL_SCAN_NONE) {
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
- zc.zc_name);
- } else {
- assert(!"unexpected result");
- }
-
- if (err == EBUSY) {
- nvlist_t *nvroot;
- pool_scan_stat_t *ps = NULL;
- uint_t psc;
-
- verify(nvlist_lookup_nvlist(zhp->zpool_config,
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- (void) nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
- if (ps && ps->pss_func == POOL_SCAN_SCRUB) {
- if (cmd == POOL_SCRUB_PAUSE)
- return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg));
- else
- return (zfs_error(hdl, EZFS_SCRUBBING, msg));
- } else {
- return (zfs_error(hdl, EZFS_RESILVERING, msg));
- }
- } else if (err == ENOENT) {
- return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
- } else {
- return (zpool_standard_error(hdl, err, msg));
- }
-}
-
-static int
-xlate_init_err(int err)
-{
- switch (err) {
- case ENODEV:
- return (EZFS_NODEVICE);
- case EINVAL:
- case EROFS:
- return (EZFS_BADDEV);
- case EBUSY:
- return (EZFS_INITIALIZING);
- case ESRCH:
- return (EZFS_NO_INITIALIZE);
- }
- return (err);
-}
-
-/*
- * Begin, suspend, or cancel the initialization (initializing of all free
- * blocks) for the given vdevs in the given pool.
- */
-int
-zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
- nvlist_t *vds)
-{
- char msg[1024];
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- nvlist_t *errlist;
-
- /* translate vdev names to guids */
- nvlist_t *vdev_guids = fnvlist_alloc();
- nvlist_t *guids_to_paths = fnvlist_alloc();
- boolean_t spare, cache;
- nvlist_t *tgt;
- nvpair_t *elem;
-
- for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
- elem = nvlist_next_nvpair(vds, elem)) {
- char *vd_path = nvpair_name(elem);
- tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL);
-
- if ((tgt == NULL) || cache || spare) {
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot initialize '%s'"),
- vd_path);
- int err = (tgt == NULL) ? EZFS_NODEVICE :
- (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
- fnvlist_free(vdev_guids);
- fnvlist_free(guids_to_paths);
- return (zfs_error(hdl, err, msg));
- }
-
- uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
- fnvlist_add_uint64(vdev_guids, vd_path, guid);
-
- (void) snprintf(msg, sizeof (msg), "%llu", guid);
- fnvlist_add_string(guids_to_paths, msg, vd_path);
- }
-
- int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids,
- &errlist);
- fnvlist_free(vdev_guids);
-
- if (err == 0) {
- fnvlist_free(guids_to_paths);
- return (0);
- }
-
- nvlist_t *vd_errlist = NULL;
- if (errlist != NULL) {
- vd_errlist = fnvlist_lookup_nvlist(errlist,
- ZPOOL_INITIALIZE_VDEVS);
- }
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "operation failed"));
-
- for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
- elem = nvlist_next_nvpair(vd_errlist, elem)) {
- int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
- char *path = fnvlist_lookup_string(guids_to_paths,
- nvpair_name(elem));
- (void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'",
- path);
- }
-
- fnvlist_free(guids_to_paths);
- if (vd_errlist != NULL)
- return (-1);
-
- return (zpool_standard_error(hdl, err, msg));
-}
-
-#ifdef illumos
-/*
- * This provides a very minimal check whether a given string is likely a
- * c#t#d# style string. Users of this are expected to do their own
- * verification of the s# part.
- */
-#define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1]))
-
-/*
- * More elaborate version for ones which may start with "/dev/dsk/"
- * and the like.
- */
-static int
-ctd_check_path(char *str)
-{
- /*
- * If it starts with a slash, check the last component.
- */
- if (str && str[0] == '/') {
- char *tmp = strrchr(str, '/');
-
- /*
- * If it ends in "/old", check the second-to-last
- * component of the string instead.
- */
- if (tmp != str && strcmp(tmp, "/old") == 0) {
- for (tmp--; *tmp != '/'; tmp--)
- ;
- }
- str = tmp + 1;
- }
- return (CTD_CHECK(str));
-}
-#endif
-
-/*
- * Find a vdev that matches the search criteria specified. We use the
- * the nvpair name to determine how we should look for the device.
- * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
- * spare; but FALSE if its an INUSE spare.
- */
-static nvlist_t *
-vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
- boolean_t *l2cache, boolean_t *log)
-{
- uint_t c, children;
- nvlist_t **child;
- nvlist_t *ret;
- uint64_t is_log;
- char *srchkey;
- nvpair_t *pair = nvlist_next_nvpair(search, NULL);
-
- /* Nothing to look for */
- if (search == NULL || pair == NULL)
- return (NULL);
-
- /* Obtain the key we will use to search */
- srchkey = nvpair_name(pair);
-
- switch (nvpair_type(pair)) {
- case DATA_TYPE_UINT64:
- if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
- uint64_t srchval, theguid;
-
- verify(nvpair_value_uint64(pair, &srchval) == 0);
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
- &theguid) == 0);
- if (theguid == srchval)
- return (nv);
- }
- break;
-
- case DATA_TYPE_STRING: {
- char *srchval, *val;
-
- verify(nvpair_value_string(pair, &srchval) == 0);
- if (nvlist_lookup_string(nv, srchkey, &val) != 0)
- break;
-
- /*
- * Search for the requested value. Special cases:
- *
- * - ZPOOL_CONFIG_PATH for whole disk entries. To support
- * UEFI boot, these end in "s0" or "s0/old" or "s1" or
- * "s1/old". The "s0" or "s1" part is hidden from the user,
- * but included in the string, so this matches around it.
- * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
- *
- * Otherwise, all other searches are simple string compares.
- */
-#ifdef illumos
- if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
- ctd_check_path(val)) {
- uint64_t wholedisk = 0;
-
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &wholedisk);
- if (wholedisk) {
- int slen = strlen(srchval);
- int vlen = strlen(val);
-
- if (slen != vlen - 2)
- break;
-
- /*
- * make_leaf_vdev() should only set
- * wholedisk for ZPOOL_CONFIG_PATHs which
- * will include "/dev/dsk/", giving plenty of
- * room for the indices used next.
- */
- ASSERT(vlen >= 6);
-
- /*
- * strings identical except trailing "s0"
- */
- if ((strcmp(&val[vlen - 2], "s0") == 0 ||
- strcmp(&val[vlen - 2], "s1") == 0) &&
- strncmp(srchval, val, slen) == 0)
- return (nv);
-
- /*
- * strings identical except trailing "s0/old"
- */
- if ((strcmp(&val[vlen - 6], "s0/old") == 0 ||
- strcmp(&val[vlen - 6], "s1/old") == 0) &&
- strcmp(&srchval[slen - 4], "/old") == 0 &&
- strncmp(srchval, val, slen - 4) == 0)
- return (nv);
-
- break;
- }
- } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
-#else
- if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
-#endif
- char *type, *idx, *end, *p;
- uint64_t id, vdev_id;
-
- /*
- * Determine our vdev type, keeping in mind
- * that the srchval is composed of a type and
- * vdev id pair (i.e. mirror-4).
- */
- if ((type = strdup(srchval)) == NULL)
- return (NULL);
-
- if ((p = strrchr(type, '-')) == NULL) {
- free(type);
- break;
- }
- idx = p + 1;
- *p = '\0';
-
- /*
- * If the types don't match then keep looking.
- */
- if (strncmp(val, type, strlen(val)) != 0) {
- free(type);
- break;
- }
-
- verify(zpool_vdev_is_interior(type));
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
- &id) == 0);
-
- errno = 0;
- vdev_id = strtoull(idx, &end, 10);
-
- free(type);
- if (errno != 0)
- return (NULL);
-
- /*
- * Now verify that we have the correct vdev id.
- */
- if (vdev_id == id)
- return (nv);
- }
-
- /*
- * Common case
- */
- if (strcmp(srchval, val) == 0)
- return (nv);
- break;
- }
-
- default:
- break;
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0)
- return (NULL);
-
- for (c = 0; c < children; c++) {
- if ((ret = vdev_to_nvlist_iter(child[c], search,
- avail_spare, l2cache, NULL)) != NULL) {
- /*
- * The 'is_log' value is only set for the toplevel
- * vdev, not the leaf vdevs. So we always lookup the
- * log device from the root of the vdev tree (where
- * 'log' is non-NULL).
- */
- if (log != NULL &&
- nvlist_lookup_uint64(child[c],
- ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
- is_log) {
- *log = B_TRUE;
- }
- return (ret);
- }
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
- &child, &children) == 0) {
- for (c = 0; c < children; c++) {
- if ((ret = vdev_to_nvlist_iter(child[c], search,
- avail_spare, l2cache, NULL)) != NULL) {
- *avail_spare = B_TRUE;
- return (ret);
- }
- }
- }
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
- &child, &children) == 0) {
- for (c = 0; c < children; c++) {
- if ((ret = vdev_to_nvlist_iter(child[c], search,
- avail_spare, l2cache, NULL)) != NULL) {
- *l2cache = B_TRUE;
- return (ret);
- }
- }
- }
-
- return (NULL);
-}
-
-/*
- * Given a physical path (minus the "/devices" prefix), find the
- * associated vdev.
- */
-nvlist_t *
-zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
- boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
-{
- nvlist_t *search, *nvroot, *ret;
-
- verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
-
- verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
-
- *avail_spare = B_FALSE;
- *l2cache = B_FALSE;
- if (log != NULL)
- *log = B_FALSE;
- ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
- nvlist_free(search);
-
- return (ret);
-}
-
-/*
- * Determine if we have an "interior" top-level vdev (i.e mirror/raidz).
- */
-static boolean_t
-zpool_vdev_is_interior(const char *name)
-{
- if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
- strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 ||
- strncmp(name,
- VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 ||
- strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
- return (B_TRUE);
- return (B_FALSE);
-}
-
-nvlist_t *
-zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
- boolean_t *l2cache, boolean_t *log)
-{
- char buf[MAXPATHLEN];
- char *end;
- nvlist_t *nvroot, *search, *ret;
- uint64_t guid;
-
- verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- guid = strtoull(path, &end, 10);
- if (guid != 0 && *end == '\0') {
- verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
- } else if (zpool_vdev_is_interior(path)) {
- verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
- } else if (path[0] != '/') {
- (void) snprintf(buf, sizeof (buf), "%s%s", _PATH_DEV, path);
- verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
- } else {
- verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
- }
-
- verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
-
- *avail_spare = B_FALSE;
- *l2cache = B_FALSE;
- if (log != NULL)
- *log = B_FALSE;
- ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
- nvlist_free(search);
-
- return (ret);
-}
-
-static int
-vdev_is_online(nvlist_t *nv)
-{
- uint64_t ival;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
- return (0);
-
- return (1);
-}
-
-/*
- * Helper function for zpool_get_physpaths().
- */
-static int
-vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
- size_t *bytes_written)
-{
- size_t bytes_left, pos, rsz;
- char *tmppath;
- const char *format;
-
- if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
- &tmppath) != 0)
- return (EZFS_NODEVICE);
-
- pos = *bytes_written;
- bytes_left = physpath_size - pos;
- format = (pos == 0) ? "%s" : " %s";
-
- rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
- *bytes_written += rsz;
-
- if (rsz >= bytes_left) {
- /* if physpath was not copied properly, clear it */
- if (bytes_left != 0) {
- physpath[pos] = 0;
- }
- return (EZFS_NOSPC);
- }
- return (0);
-}
-
-static int
-vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
- size_t *rsz, boolean_t is_spare)
-{
- char *type;
- int ret;
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
- return (EZFS_INVALCONFIG);
-
- if (strcmp(type, VDEV_TYPE_DISK) == 0) {
- /*
- * An active spare device has ZPOOL_CONFIG_IS_SPARE set.
- * For a spare vdev, we only want to boot from the active
- * spare device.
- */
- if (is_spare) {
- uint64_t spare = 0;
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
- &spare);
- if (!spare)
- return (EZFS_INVALCONFIG);
- }
-
- if (vdev_is_online(nv)) {
- if ((ret = vdev_get_one_physpath(nv, physpath,
- phypath_size, rsz)) != 0)
- return (ret);
- }
- } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
- strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
- strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
- (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
- nvlist_t **child;
- uint_t count;
- int i, ret;
-
- if (nvlist_lookup_nvlist_array(nv,
- ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
- return (EZFS_INVALCONFIG);
-
- for (i = 0; i < count; i++) {
- ret = vdev_get_physpaths(child[i], physpath,
- phypath_size, rsz, is_spare);
- if (ret == EZFS_NOSPC)
- return (ret);
- }
- }
-
- return (EZFS_POOL_INVALARG);
-}
-
-/*
- * Get phys_path for a root pool config.
- * Return 0 on success; non-zero on failure.
- */
-static int
-zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
-{
- size_t rsz;
- nvlist_t *vdev_root;
- nvlist_t **child;
- uint_t count;
- char *type;
-
- rsz = 0;
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &vdev_root) != 0)
- return (EZFS_INVALCONFIG);
-
- if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
- nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
- &child, &count) != 0)
- return (EZFS_INVALCONFIG);
-
- /*
- * root pool can only have a single top-level vdev.
- */
- if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1)
- return (EZFS_POOL_INVALARG);
-
- (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
- B_FALSE);
-
- /* No online devices */
- if (rsz == 0)
- return (EZFS_NODEVICE);
-
- return (0);
-}
-
-/*
- * Get phys_path for a root pool
- * Return 0 on success; non-zero on failure.
- */
-int
-zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
-{
- return (zpool_get_config_physpath(zhp->zpool_config, physpath,
- phypath_size));
-}
-
-/*
- * If the device has being dynamically expanded then we need to relabel
- * the disk to use the new unallocated space.
- */
-static int
-zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
-{
-#ifdef illumos
- char path[MAXPATHLEN];
- char errbuf[1024];
- int fd, error;
- int (*_efi_use_whole_disk)(int);
-
- if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT,
- "efi_use_whole_disk")) == NULL)
- return (-1);
-
- (void) snprintf(path, sizeof (path), "%s/%s", ZFS_RDISK_ROOT, name);
-
- if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
- "relabel '%s': unable to open device"), name);
- return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
- }
-
- /*
- * It's possible that we might encounter an error if the device
- * does not have any unallocated space left. If so, we simply
- * ignore that error and continue on.
- */
- error = _efi_use_whole_disk(fd);
- (void) close(fd);
- if (error && error != VT_ENOSPC) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
- "relabel '%s': unable to read disk capacity"), name);
- return (zfs_error(hdl, EZFS_NOCAP, errbuf));
- }
-#endif /* illumos */
- return (0);
-}
-
-/*
- * Bring the specified vdev online. The 'flags' parameter is a set of the
- * ZFS_ONLINE_* flags.
- */
-int
-zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
- vdev_state_t *newstate)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- char *pathname;
- nvlist_t *tgt;
- boolean_t avail_spare, l2cache, islog;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- if (flags & ZFS_ONLINE_EXPAND) {
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
- } else {
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot online %s"), path);
- }
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- &islog)) == NULL)
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
-
- if (avail_spare)
- return (zfs_error(hdl, EZFS_ISSPARE, msg));
-
- if ((flags & ZFS_ONLINE_EXPAND ||
- zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) &&
- nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) {
- uint64_t wholedisk = 0;
-
- (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
- &wholedisk);
-
- /*
- * XXX - L2ARC 1.0 devices can't support expansion.
- */
- if (l2cache) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "cannot expand cache devices"));
- return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
- }
-
- if (wholedisk) {
- pathname += strlen(ZFS_DISK_ROOT) + 1;
- (void) zpool_relabel_disk(hdl, pathname);
- }
- }
-
- zc.zc_cookie = VDEV_STATE_ONLINE;
- zc.zc_obj = flags;
-
- if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
- if (errno == EINVAL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
- "from this pool into a new one. Use '%s' "
- "instead"), "zpool detach");
- return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
- }
- return (zpool_standard_error(hdl, errno, msg));
- }
-
- *newstate = zc.zc_cookie;
- return (0);
-}
-
-/*
- * Take the specified vdev offline
- */
-int
-zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- nvlist_t *tgt;
- boolean_t avail_spare, l2cache;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- NULL)) == NULL)
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
-
- if (avail_spare)
- return (zfs_error(hdl, EZFS_ISSPARE, msg));
-
- zc.zc_cookie = VDEV_STATE_OFFLINE;
- zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
-
- if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
- return (0);
-
- switch (errno) {
- case EBUSY:
-
- /*
- * There are no other replicas of this device.
- */
- return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
-
- case EEXIST:
- /*
- * The log device has unplayed logs
- */
- return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
-
- default:
- return (zpool_standard_error(hdl, errno, msg));
- }
-}
-
-/*
- * Mark the given vdev faulted.
- */
-int
-zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_guid = guid;
- zc.zc_cookie = VDEV_STATE_FAULTED;
- zc.zc_obj = aux;
-
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
- return (0);
-
- switch (errno) {
- case EBUSY:
-
- /*
- * There are no other replicas of this device.
- */
- return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
-
- default:
- return (zpool_standard_error(hdl, errno, msg));
- }
-
-}
-
-/*
- * Mark the given vdev degraded.
- */
-int
-zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_guid = guid;
- zc.zc_cookie = VDEV_STATE_DEGRADED;
- zc.zc_obj = aux;
-
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
- return (0);
-
- return (zpool_standard_error(hdl, errno, msg));
-}
-
-/*
- * Returns TRUE if the given nvlist is a vdev that was originally swapped in as
- * a hot spare.
- */
-static boolean_t
-is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
-{
- nvlist_t **child;
- uint_t c, children;
- char *type;
-
- if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child,
- &children) == 0) {
- verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
- &type) == 0);
-
- if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
- children == 2 && child[which] == tgt)
- return (B_TRUE);
-
- for (c = 0; c < children; c++)
- if (is_replacing_spare(child[c], tgt, which))
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * Attach new_disk (fully described by nvroot) to old_disk.
- * If 'replacing' is specified, the new disk will replace the old one.
- */
-int
-zpool_vdev_attach(zpool_handle_t *zhp,
- const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- int ret;
- nvlist_t *tgt;
- boolean_t avail_spare, l2cache, islog;
- uint64_t val;
- char *newname;
- nvlist_t **child;
- uint_t children;
- nvlist_t *config_root;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- boolean_t rootpool = zpool_is_bootable(zhp);
-
- if (replacing)
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot replace %s with %s"), old_disk, new_disk);
- else
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot attach %s to %s"), new_disk, old_disk);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
- &islog)) == NULL)
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- if (avail_spare)
- return (zfs_error(hdl, EZFS_ISSPARE, msg));
-
- if (l2cache)
- return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
-
- verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
- zc.zc_cookie = replacing;
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0 || children != 1) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "new device must be a single disk"));
- return (zfs_error(hdl, EZFS_INVALCONFIG, msg));
- }
-
- verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
- ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
-
- if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL)
- return (-1);
-
- /*
- * If the target is a hot spare that has been swapped in, we can only
- * replace it with another hot spare.
- */
- if (replacing &&
- nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
- (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
- NULL) == NULL || !avail_spare) &&
- is_replacing_spare(config_root, tgt, 1)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "can only be replaced by another hot spare"));
- free(newname);
- return (zfs_error(hdl, EZFS_BADTARGET, msg));
- }
-
- free(newname);
-
- if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
- return (-1);
-
- ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
-
- zcmd_free_nvlists(&zc);
-
- if (ret == 0) {
- if (rootpool) {
- /*
- * XXX need a better way to prevent user from
- * booting up a half-baked vdev.
- */
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
- "sure to wait until resilver is done "
- "before rebooting.\n"));
- (void) fprintf(stderr, "\n");
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "If "
- "you boot from pool '%s', you may need to update\n"
- "boot code on newly attached disk '%s'.\n\n"
- "Assuming you use GPT partitioning and 'da0' is "
- "your new boot disk\n"
- "you may use the following command:\n\n"
- "\tgpart bootcode -b /boot/pmbr -p "
- "/boot/gptzfsboot -i 1 da0\n\n"),
- zhp->zpool_name, new_disk);
- }
- return (0);
- }
-
- switch (errno) {
- case ENOTSUP:
- /*
- * Can't attach to or replace this type of vdev.
- */
- if (replacing) {
- uint64_t version = zpool_get_prop_int(zhp,
- ZPOOL_PROP_VERSION, NULL);
-
- if (islog)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "cannot replace a log with a spare"));
- else if (version >= SPA_VERSION_MULTI_REPLACE)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "already in replacing/spare config; wait "
- "for completion or use 'zpool detach'"));
- else
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "cannot replace a replacing device"));
- } else {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "can only attach to mirrors and top-level "
- "disks"));
- }
- (void) zfs_error(hdl, EZFS_BADTARGET, msg);
- break;
-
- case EINVAL:
- /*
- * The new device must be a single disk.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "new device must be a single disk"));
- (void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
- break;
-
- case EBUSY:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
- "or device removal is in progress"),
- new_disk);
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- break;
-
- case EOVERFLOW:
- /*
- * The new device is too small.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "device is too small"));
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- break;
-
- case EDOM:
- /*
- * The new device has a different alignment requirement.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "devices have different sector alignment"));
- (void) zfs_error(hdl, EZFS_BADDEV, msg);
- break;
-
- case ENAMETOOLONG:
- /*
- * The resulting top-level vdev spec won't fit in the label.
- */
- (void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg);
- break;
-
- default:
- (void) zpool_standard_error(hdl, errno, msg);
- }
-
- return (-1);
-}
-
-/*
- * Detach the specified device.
- */
-int
-zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- nvlist_t *tgt;
- boolean_t avail_spare, l2cache;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- NULL)) == NULL)
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- if (avail_spare)
- return (zfs_error(hdl, EZFS_ISSPARE, msg));
-
- if (l2cache)
- return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
-
- verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
-
- if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
- return (0);
-
- switch (errno) {
-
- case ENOTSUP:
- /*
- * Can't detach from this type of vdev.
- */
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
- "applicable to mirror and replacing vdevs"));
- (void) zfs_error(hdl, EZFS_BADTARGET, msg);
- break;
-
- case EBUSY:
- /*
- * There are no other replicas of this device.
- */
- (void) zfs_error(hdl, EZFS_NOREPLICAS, msg);
- break;
-
- default:
- (void) zpool_standard_error(hdl, errno, msg);
- }
-
- return (-1);
-}
-
-/*
- * Find a mirror vdev in the source nvlist.
- *
- * The mchild array contains a list of disks in one of the top-level mirrors
- * of the source pool. The schild array contains a list of disks that the
- * user specified on the command line. We loop over the mchild array to
- * see if any entry in the schild array matches.
- *
- * If a disk in the mchild array is found in the schild array, we return
- * the index of that entry. Otherwise we return -1.
- */
-static int
-find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
- nvlist_t **schild, uint_t schildren)
-{
- uint_t mc;
-
- for (mc = 0; mc < mchildren; mc++) {
- uint_t sc;
- char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
- mchild[mc], 0);
-
- for (sc = 0; sc < schildren; sc++) {
- char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
- schild[sc], 0);
- boolean_t result = (strcmp(mpath, spath) == 0);
-
- free(spath);
- if (result) {
- free(mpath);
- return (mc);
- }
- }
-
- free(mpath);
- }
-
- return (-1);
-}
-
-/*
- * Split a mirror pool. If newroot points to null, then a new nvlist
- * is generated and it is the responsibility of the caller to free it.
- */
-int
-zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
- nvlist_t *props, splitflags_t flags)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
- nvlist_t **varray = NULL, *zc_props = NULL;
- uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- uint64_t vers;
- boolean_t freelist = B_FALSE, memory_err = B_TRUE;
- int retval = 0;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
-
- if (!zpool_name_valid(hdl, B_FALSE, newname))
- return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
-
- if ((config = zpool_get_config(zhp, NULL)) == NULL) {
- (void) fprintf(stderr, gettext("Internal error: unable to "
- "retrieve pool configuration\n"));
- return (-1);
- }
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
- == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
-
- if (props) {
- prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
- if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
- props, vers, flags, msg)) == NULL)
- return (-1);
- }
-
- if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
- &children) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Source pool is missing vdev tree"));
- nvlist_free(zc_props);
- return (-1);
- }
-
- varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
- vcount = 0;
-
- if (*newroot == NULL ||
- nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
- &newchild, &newchildren) != 0)
- newchildren = 0;
-
- for (c = 0; c < children; c++) {
- uint64_t is_log = B_FALSE, is_hole = B_FALSE;
- char *type;
- nvlist_t **mchild, *vdev;
- uint_t mchildren;
- int entry;
-
- /*
- * Unlike cache & spares, slogs are stored in the
- * ZPOOL_CONFIG_CHILDREN array. We filter them out here.
- */
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
- &is_hole);
- if (is_log || is_hole) {
- /*
- * Create a hole vdev and put it in the config.
- */
- if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
- goto out;
- if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_HOLE) != 0)
- goto out;
- if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
- 1) != 0)
- goto out;
- if (lastlog == 0)
- lastlog = vcount;
- varray[vcount++] = vdev;
- continue;
- }
- lastlog = 0;
- verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
- == 0);
- if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Source pool must be composed only of mirrors\n"));
- retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
- goto out;
- }
-
- verify(nvlist_lookup_nvlist_array(child[c],
- ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
-
- /* find or add an entry for this top-level vdev */
- if (newchildren > 0 &&
- (entry = find_vdev_entry(zhp, mchild, mchildren,
- newchild, newchildren)) >= 0) {
- /* We found a disk that the user specified. */
- vdev = mchild[entry];
- ++found;
- } else {
- /* User didn't specify a disk for this vdev. */
- vdev = mchild[mchildren - 1];
- }
-
- if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
- goto out;
- }
-
- /* did we find every disk the user specified? */
- if (found != newchildren) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
- "include at most one disk from each mirror"));
- retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
- goto out;
- }
-
- /* Prepare the nvlist for populating. */
- if (*newroot == NULL) {
- if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
- goto out;
- freelist = B_TRUE;
- if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_ROOT) != 0)
- goto out;
- } else {
- verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
- }
-
- /* Add all the children we found */
- if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
- lastlog == 0 ? vcount : lastlog) != 0)
- goto out;
-
- /*
- * If we're just doing a dry run, exit now with success.
- */
- if (flags.dryrun) {
- memory_err = B_FALSE;
- freelist = B_FALSE;
- goto out;
- }
-
- /* now build up the config list & call the ioctl */
- if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
- goto out;
-
- if (nvlist_add_nvlist(newconfig,
- ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
- nvlist_add_string(newconfig,
- ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
- nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
- goto out;
-
- /*
- * The new pool is automatically part of the namespace unless we
- * explicitly export it.
- */
- if (!flags.import)
- zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
- if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
- goto out;
- if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
- goto out;
-
- if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
- retval = zpool_standard_error(hdl, errno, msg);
- goto out;
- }
-
- freelist = B_FALSE;
- memory_err = B_FALSE;
-
-out:
- if (varray != NULL) {
- int v;
-
- for (v = 0; v < vcount; v++)
- nvlist_free(varray[v]);
- free(varray);
- }
- zcmd_free_nvlists(&zc);
- nvlist_free(zc_props);
- nvlist_free(newconfig);
- if (freelist) {
- nvlist_free(*newroot);
- *newroot = NULL;
- }
-
- if (retval != 0)
- return (retval);
-
- if (memory_err)
- return (no_memory(hdl));
-
- return (0);
-}
-
-/*
- * Remove the given device.
- */
-int
-zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- nvlist_t *tgt;
- boolean_t avail_spare, l2cache, islog;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- uint64_t version;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- &islog)) == NULL)
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
- if (islog && version < SPA_VERSION_HOLES) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded to support log removal"));
- return (zfs_error(hdl, EZFS_BADVERSION, msg));
- }
-
- zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
-
- if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
- return (0);
-
- switch (errno) {
-
- case EINVAL:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid config; all top-level vdevs must "
- "have the same sector size and not be raidz."));
- (void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
- break;
-
- case EBUSY:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Pool busy; removal may already be in progress"));
- (void) zfs_error(hdl, EZFS_BUSY, msg);
- break;
-
- default:
- (void) zpool_standard_error(hdl, errno, msg);
- }
- return (-1);
-}
-
-int
-zpool_vdev_remove_cancel(zpool_handle_t *zhp)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot cancel removal"));
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_cookie = 1;
-
- if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
- return (0);
-
- return (zpool_standard_error(hdl, errno, msg));
-}
-
-int
-zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path,
- uint64_t *sizep)
-{
- char msg[1024];
- nvlist_t *tgt;
- boolean_t avail_spare, l2cache, islog;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"),
- path);
-
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- &islog)) == NULL)
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- if (avail_spare || l2cache || islog) {
- *sizep = 0;
- return (0);
- }
-
- if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "indirect size not available"));
- return (zfs_error(hdl, EINVAL, msg));
- }
- return (0);
-}
-
-/*
- * Clear the errors for the pool, or the particular device if specified.
- */
-int
-zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- nvlist_t *tgt;
- zpool_load_policy_t policy;
- boolean_t avail_spare, l2cache;
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- nvlist_t *nvi = NULL;
- int error;
-
- if (path)
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
- path);
- else
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
- zhp->zpool_name);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (path) {
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
- &l2cache, NULL)) == NULL)
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- /*
- * Don't allow error clearing for hot spares. Do allow
- * error clearing for l2cache devices.
- */
- if (avail_spare)
- return (zfs_error(hdl, EZFS_ISSPARE, msg));
-
- verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
- &zc.zc_guid) == 0);
- }
-
- zpool_get_load_policy(rewindnvl, &policy);
- zc.zc_cookie = policy.zlp_rewind;
-
- if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
- return (-1);
-
- if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
- return (-1);
-
- while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
- errno == ENOMEM) {
- if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- }
-
- if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) &&
- errno != EPERM && errno != EACCES)) {
- if (policy.zlp_rewind &
- (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
- (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
- zpool_rewind_exclaim(hdl, zc.zc_name,
- ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0),
- nvi);
- nvlist_free(nvi);
- }
- zcmd_free_nvlists(&zc);
- return (0);
- }
-
- zcmd_free_nvlists(&zc);
- return (zpool_standard_error(hdl, errno, msg));
-}
-
-/*
- * Similar to zpool_clear(), but takes a GUID (used by fmd).
- */
-int
-zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
- guid);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_guid = guid;
- zc.zc_cookie = ZPOOL_NO_REWIND;
-
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
- return (0);
-
- return (zpool_standard_error(hdl, errno, msg));
-}
-
-/*
- * Change the GUID for a pool.
- */
-int
-zpool_reguid(zpool_handle_t *zhp)
-{
- char msg[1024];
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- zfs_cmd_t zc = { 0 };
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
- return (0);
-
- return (zpool_standard_error(hdl, errno, msg));
-}
-
-/*
- * Reopen the pool.
- */
-int
-zpool_reopen(zpool_handle_t *zhp)
-{
- zfs_cmd_t zc = { 0 };
- char msg[1024];
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot reopen '%s'"),
- zhp->zpool_name);
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0)
- return (0);
- return (zpool_standard_error(hdl, errno, msg));
-}
-
-/* call into libzfs_core to execute the sync IOCTL per pool */
-int
-zpool_sync_one(zpool_handle_t *zhp, void *data)
-{
- int ret;
- libzfs_handle_t *hdl = zpool_get_handle(zhp);
- const char *pool_name = zpool_get_name(zhp);
- boolean_t *force = data;
- nvlist_t *innvl = fnvlist_alloc();
-
- fnvlist_add_boolean_value(innvl, "force", *force);
- if ((ret = lzc_sync(pool_name, innvl, NULL)) != 0) {
- nvlist_free(innvl);
- return (zpool_standard_error_fmt(hdl, ret,
- dgettext(TEXT_DOMAIN, "sync '%s' failed"), pool_name));
- }
- nvlist_free(innvl);
-
- return (0);
-}
-
-/*
- * Convert from a devid string to a path.
- */
-static char *
-devid_to_path(char *devid_str)
-{
- ddi_devid_t devid;
- char *minor;
- char *path;
- devid_nmlist_t *list = NULL;
- int ret;
-
- if (devid_str_decode(devid_str, &devid, &minor) != 0)
- return (NULL);
-
- ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list);
-
- devid_str_free(minor);
- devid_free(devid);
-
- if (ret != 0)
- return (NULL);
-
- /*
- * In a case the strdup() fails, we will just return NULL below.
- */
- path = strdup(list[0].devname);
-
- devid_free_nmlist(list);
-
- return (path);
-}
-
-/*
- * Convert from a path to a devid string.
- */
-static char *
-path_to_devid(const char *path)
-{
-#ifdef have_devid
- int fd;
- ddi_devid_t devid;
- char *minor, *ret;
-
- if ((fd = open(path, O_RDONLY)) < 0)
- return (NULL);
-
- minor = NULL;
- ret = NULL;
- if (devid_get(fd, &devid) == 0) {
- if (devid_get_minor_name(fd, &minor) == 0)
- ret = devid_str_encode(devid, minor);
- if (minor != NULL)
- devid_str_free(minor);
- devid_free(devid);
- }
- (void) close(fd);
-
- return (ret);
-#else
- return (NULL);
-#endif
-}
-
-/*
- * Issue the necessary ioctl() to update the stored path value for the vdev. We
- * ignore any failure here, since a common case is for an unprivileged user to
- * type 'zpool status', and we'll display the correct information anyway.
- */
-static void
-set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
-{
- zfs_cmd_t zc = { 0 };
-
- (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- (void) strncpy(zc.zc_value, path, sizeof (zc.zc_value));
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
- &zc.zc_guid) == 0);
-
- (void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc);
-}
-
-/*
- * Given a vdev, return the name to display in iostat. If the vdev has a path,
- * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type.
- * We also check if this is a whole disk, in which case we strip off the
- * trailing 's0' slice name.
- *
- * This routine is also responsible for identifying when disks have been
- * reconfigured in a new location. The kernel will have opened the device by
- * devid, but the path will still refer to the old location. To catch this, we
- * first do a path -> devid translation (which is fast for the common case). If
- * the devid matches, we're done. If not, we do a reverse devid -> path
- * translation and issue the appropriate ioctl() to update the path of the vdev.
- * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any
- * of these checks.
- */
-char *
-zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
- int name_flags)
-{
- char *path, *devid, *env;
- uint64_t value;
- char buf[64];
- vdev_stat_t *vs;
- uint_t vsc;
- int have_stats;
- int have_path;
-
- env = getenv("ZPOOL_VDEV_NAME_PATH");
- if (env && (strtoul(env, NULL, 0) > 0 ||
- !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
- name_flags |= VDEV_NAME_PATH;
-
- env = getenv("ZPOOL_VDEV_NAME_GUID");
- if (env && (strtoul(env, NULL, 0) > 0 ||
- !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
- name_flags |= VDEV_NAME_GUID;
-
- env = getenv("ZPOOL_VDEV_NAME_FOLLOW_LINKS");
- if (env && (strtoul(env, NULL, 0) > 0 ||
- !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
- name_flags |= VDEV_NAME_FOLLOW_LINKS;
-
- have_stats = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &vsc) == 0;
- have_path = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0;
-
- /*
- * If the device is not currently present, assume it will not
- * come back at the same device path. Display the device by GUID.
- */
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 ||
- (name_flags & VDEV_NAME_GUID) != 0 ||
- have_path && have_stats && vs->vs_state <= VDEV_STATE_CANT_OPEN) {
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value);
- (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value);
- path = buf;
- } else if (have_path) {
-
- /*
- * If the device is dead (faulted, offline, etc) then don't
- * bother opening it. Otherwise we may be forcing the user to
- * open a misbehaving device, which can have undesirable
- * effects.
- */
- if ((have_stats == 0 ||
- vs->vs_state >= VDEV_STATE_DEGRADED) &&
- zhp != NULL &&
- nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
- /*
- * Determine if the current path is correct.
- */
- char *newdevid = path_to_devid(path);
-
- if (newdevid == NULL ||
- strcmp(devid, newdevid) != 0) {
- char *newpath;
-
- if ((newpath = devid_to_path(devid)) != NULL) {
- /*
- * Update the path appropriately.
- */
- set_path(zhp, nv, newpath);
- if (nvlist_add_string(nv,
- ZPOOL_CONFIG_PATH, newpath) == 0)
- verify(nvlist_lookup_string(nv,
- ZPOOL_CONFIG_PATH,
- &path) == 0);
- free(newpath);
- }
- }
-
- if (newdevid)
- devid_str_free(newdevid);
- }
-
-#ifdef illumos
- if (name_flags & VDEV_NAME_FOLLOW_LINKS) {
- char *rp = realpath(path, NULL);
- if (rp) {
- strlcpy(buf, rp, sizeof (buf));
- path = buf;
- free(rp);
- }
- }
-
- if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0)
- path += strlen(ZFS_DISK_ROOTD);
-
- /*
- * Remove the partition from the path it this is a whole disk.
- */
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
- == 0 && value && !(name_flags & VDEV_NAME_PATH)) {
- int pathlen = strlen(path);
- char *tmp = zfs_strdup(hdl, path);
-
- /*
- * If it starts with c#, and ends with "s0" or "s1",
- * chop the slice off, or if it ends with "s0/old" or
- * "s1/old", remove the slice from the middle.
- */
- if (CTD_CHECK(tmp)) {
- if (strcmp(&tmp[pathlen - 2], "s0") == 0 ||
- strcmp(&tmp[pathlen - 2], "s1") == 0) {
- tmp[pathlen - 2] = '\0';
- } else if (pathlen > 6 &&
- (strcmp(&tmp[pathlen - 6], "s0/old") == 0 ||
- strcmp(&tmp[pathlen - 6], "s1/old") == 0)) {
- (void) strcpy(&tmp[pathlen - 6],
- "/old");
- }
- }
- return (tmp);
- }
-#else /* !illumos */
- if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
- path += sizeof(_PATH_DEV) - 1;
-#endif /* illumos */
- } else {
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
-
- /*
- * If it's a raidz device, we need to stick in the parity level.
- */
- if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &value) == 0);
- (void) snprintf(buf, sizeof (buf), "%s%llu", path,
- (u_longlong_t)value);
- path = buf;
- }
-
- /*
- * We identify each top-level vdev by using a <type-id>
- * naming convention.
- */
- if (name_flags & VDEV_NAME_TYPE_ID) {
- uint64_t id;
-
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
- &id) == 0);
- (void) snprintf(buf, sizeof (buf), "%s-%llu", path,
- (u_longlong_t)id);
- path = buf;
- }
- }
-
- return (zfs_strdup(hdl, path));
-}
-
-static int
-zbookmark_mem_compare(const void *a, const void *b)
-{
- return (memcmp(a, b, sizeof (zbookmark_phys_t)));
-}
-
-/*
- * Retrieve the persistent error log, uniquify the members, and return to the
- * caller.
- */
-int
-zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
-{
- zfs_cmd_t zc = { 0 };
- uint64_t count;
- zbookmark_phys_t *zb = NULL;
- int i;
-
- /*
- * Retrieve the raw error list from the kernel. If the number of errors
- * has increased, allocate more space and continue until we get the
- * entire list.
- */
- verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
- &count) == 0);
- if (count == 0)
- return (0);
- if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
- count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL)
- return (-1);
- zc.zc_nvlist_dst_size = count;
- (void) strcpy(zc.zc_name, zhp->zpool_name);
- for (;;) {
- if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG,
- &zc) != 0) {
- free((void *)(uintptr_t)zc.zc_nvlist_dst);
- if (errno == ENOMEM) {
- void *dst;
-
- count = zc.zc_nvlist_dst_size;
- dst = zfs_alloc(zhp->zpool_hdl, count *
- sizeof (zbookmark_phys_t));
- if (dst == NULL)
- return (-1);
- zc.zc_nvlist_dst = (uintptr_t)dst;
- } else {
- return (-1);
- }
- } else {
- break;
- }
- }
-
- /*
- * Sort the resulting bookmarks. This is a little confusing due to the
- * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last
- * to first, and 'zc_nvlist_dst_size' indicates the number of boomarks
- * _not_ copied as part of the process. So we point the start of our
- * array appropriate and decrement the total number of elements.
- */
- zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
- zc.zc_nvlist_dst_size;
- count -= zc.zc_nvlist_dst_size;
-
- qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
-
- verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
-
- /*
- * Fill in the nverrlistp with nvlist's of dataset and object numbers.
- */
- for (i = 0; i < count; i++) {
- nvlist_t *nv;
-
- /* ignoring zb_blkid and zb_level for now */
- if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset &&
- zb[i-1].zb_object == zb[i].zb_object)
- continue;
-
- if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0)
- goto nomem;
- if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET,
- zb[i].zb_objset) != 0) {
- nvlist_free(nv);
- goto nomem;
- }
- if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT,
- zb[i].zb_object) != 0) {
- nvlist_free(nv);
- goto nomem;
- }
- if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) {
- nvlist_free(nv);
- goto nomem;
- }
- nvlist_free(nv);
- }
-
- free((void *)(uintptr_t)zc.zc_nvlist_dst);
- return (0);
-
-nomem:
- free((void *)(uintptr_t)zc.zc_nvlist_dst);
- return (no_memory(zhp->zpool_hdl));
-}
-
-/*
- * Upgrade a ZFS pool to the latest on-disk version.
- */
-int
-zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
-{
- zfs_cmd_t zc = { 0 };
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) strcpy(zc.zc_name, zhp->zpool_name);
- zc.zc_cookie = new_version;
-
- if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0)
- return (zpool_standard_error_fmt(hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"),
- zhp->zpool_name));
- return (0);
-}
-
-void
-zfs_save_arguments(int argc, char **argv, char *string, int len)
-{
- (void) strlcpy(string, basename(argv[0]), len);
- for (int i = 1; i < argc; i++) {
- (void) strlcat(string, " ", len);
- (void) strlcat(string, argv[i], len);
- }
-}
-
-int
-zpool_log_history(libzfs_handle_t *hdl, const char *message)
-{
- zfs_cmd_t zc = { 0 };
- nvlist_t *args;
- int err;
-
- args = fnvlist_alloc();
- fnvlist_add_string(args, "message", message);
- err = zcmd_write_src_nvlist(hdl, &zc, args);
- if (err == 0)
- err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc);
- nvlist_free(args);
- zcmd_free_nvlists(&zc);
- return (err);
-}
-
-/*
- * Perform ioctl to get some command history of a pool.
- *
- * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the
- * logical offset of the history buffer to start reading from.
- *
- * Upon return, 'off' is the next logical offset to read from and
- * 'len' is the actual amount of bytes read into 'buf'.
- */
-static int
-get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
-{
- zfs_cmd_t zc = { 0 };
- libzfs_handle_t *hdl = zhp->zpool_hdl;
-
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-
- zc.zc_history = (uint64_t)(uintptr_t)buf;
- zc.zc_history_len = *len;
- zc.zc_history_offset = *off;
-
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) {
- switch (errno) {
- case EPERM:
- return (zfs_error_fmt(hdl, EZFS_PERM,
- dgettext(TEXT_DOMAIN,
- "cannot show history for pool '%s'"),
- zhp->zpool_name));
- case ENOENT:
- return (zfs_error_fmt(hdl, EZFS_NOHISTORY,
- dgettext(TEXT_DOMAIN, "cannot get history for pool "
- "'%s'"), zhp->zpool_name));
- case ENOTSUP:
- return (zfs_error_fmt(hdl, EZFS_BADVERSION,
- dgettext(TEXT_DOMAIN, "cannot get history for pool "
- "'%s', pool must be upgraded"), zhp->zpool_name));
- default:
- return (zpool_standard_error_fmt(hdl, errno,
- dgettext(TEXT_DOMAIN,
- "cannot get history for '%s'"), zhp->zpool_name));
- }
- }
-
- *len = zc.zc_history_len;
- *off = zc.zc_history_offset;
-
- return (0);
-}
-
-/*
- * Process the buffer of nvlists, unpacking and storing each nvlist record
- * into 'records'. 'leftover' is set to the number of bytes that weren't
- * processed as there wasn't a complete record.
- */
-int
-zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
- nvlist_t ***records, uint_t *numrecords)
-{
- uint64_t reclen;
- nvlist_t *nv;
- int i;
-
- while (bytes_read > sizeof (reclen)) {
-
- /* get length of packed record (stored as little endian) */
- for (i = 0, reclen = 0; i < sizeof (reclen); i++)
- reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
-
- if (bytes_read < sizeof (reclen) + reclen)
- break;
-
- /* unpack record */
- if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
- return (ENOMEM);
- bytes_read -= sizeof (reclen) + reclen;
- buf += sizeof (reclen) + reclen;
-
- /* add record to nvlist array */
- (*numrecords)++;
- if (ISP2(*numrecords + 1)) {
- *records = realloc(*records,
- *numrecords * 2 * sizeof (nvlist_t *));
- }
- (*records)[*numrecords - 1] = nv;
- }
-
- *leftover = bytes_read;
- return (0);
-}
-
-/* from spa_history.c: spa_history_create_obj() */
-#define HIS_BUF_LEN_DEF (128 << 10)
-#define HIS_BUF_LEN_MAX (1 << 30)
-
-/*
- * Retrieve the command history of a pool.
- */
-int
-zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off,
- boolean_t *eof)
-{
- char *buf;
- uint64_t buflen = HIS_BUF_LEN_DEF;
- nvlist_t **records = NULL;
- uint_t numrecords = 0;
- int err, i;
- uint64_t start = *off;
-
- buf = malloc(buflen);
- if (buf == NULL)
- return (ENOMEM);
- /* process about 1MB at a time */
- while (*off - start < 1024 * 1024) {
- uint64_t bytes_read = buflen;
- uint64_t leftover;
-
- if ((err = get_history(zhp, buf, off, &bytes_read)) != 0)
- break;
-
- /* if nothing else was read in, we're at EOF, just return */
- if (bytes_read == 0) {
- *eof = B_TRUE;
- break;
- }
-
- if ((err = zpool_history_unpack(buf, bytes_read,
- &leftover, &records, &numrecords)) != 0)
- break;
- *off -= leftover;
- if (leftover == bytes_read) {
- /*
- * no progress made, because buffer is not big enough
- * to hold this record; resize and retry.
- */
- buflen *= 2;
- free(buf);
- buf = NULL;
- if ((buflen >= HIS_BUF_LEN_MAX) ||
- ((buf = malloc(buflen)) == NULL)) {
- err = ENOMEM;
- break;
- }
- }
- }
-
- free(buf);
-
- if (!err) {
- verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
- verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
- records, numrecords) == 0);
- }
- for (i = 0; i < numrecords; i++)
- nvlist_free(records[i]);
- free(records);
-
- return (err);
-}
-
-void
-zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
- char *pathname, size_t len)
-{
- zfs_cmd_t zc = { 0 };
- boolean_t mounted = B_FALSE;
- char *mntpnt = NULL;
- char dsname[ZFS_MAX_DATASET_NAME_LEN];
-
- if (dsobj == 0) {
- /* special case for the MOS */
- (void) snprintf(pathname, len, "<metadata>:<0x%llx>", obj);
- return;
- }
-
- /* get the dataset's name */
- (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_obj = dsobj;
- if (ioctl(zhp->zpool_hdl->libzfs_fd,
- ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
- /* just write out a path of two object numbers */
- (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
- dsobj, obj);
- return;
- }
- (void) strlcpy(dsname, zc.zc_value, sizeof (dsname));
-
- /* find out if the dataset is mounted */
- mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt);
-
- /* get the corrupted object's path */
- (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
- zc.zc_obj = obj;
- if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH,
- &zc) == 0) {
- if (mounted) {
- (void) snprintf(pathname, len, "%s%s", mntpnt,
- zc.zc_value);
- } else {
- (void) snprintf(pathname, len, "%s:%s",
- dsname, zc.zc_value);
- }
- } else {
- (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, obj);
- }
- free(mntpnt);
-}
-
-int
-zpool_set_bootenv(zpool_handle_t *zhp, const char *envmap)
-{
- int error = lzc_set_bootenv(zhp->zpool_name, envmap);
- if (error != 0) {
- (void) zpool_standard_error_fmt(zhp->zpool_hdl, error,
- dgettext(TEXT_DOMAIN,
- "error setting bootenv in pool '%s'"), zhp->zpool_name);
- }
-
- return (error);
-}
-
-int
-zpool_get_bootenv(zpool_handle_t *zhp, char *outbuf, size_t size, off_t offset)
-{
- nvlist_t *nvl;
- int error = lzc_get_bootenv(zhp->zpool_name, &nvl);;
- if (error != 0) {
- (void) zpool_standard_error_fmt(zhp->zpool_hdl, error,
- dgettext(TEXT_DOMAIN,
- "error getting bootenv in pool '%s'"), zhp->zpool_name);
- return (-1);
- }
- char *envmap = fnvlist_lookup_string(nvl, "envmap");
- if (offset >= strlen(envmap)) {
- fnvlist_free(nvl);
- return (0);
- }
-
- strlcpy(outbuf, envmap + offset, size);
- int bytes = MIN(strlen(envmap + offset), size);
- fnvlist_free(nvl);
- return (bytes);
-}
-
-#ifdef illumos
-/*
- * Read the EFI label from the config, if a label does not exist then
- * pass back the error to the caller. If the caller has passed a non-NULL
- * diskaddr argument then we set it to the starting address of the EFI
- * partition. If the caller has passed a non-NULL boolean argument, then
- * we set it to indicate if the disk does have efi system partition.
- */
-static int
-read_efi_label(nvlist_t *config, diskaddr_t *sb, boolean_t *system)
-{
- char *path;
- int fd;
- char diskname[MAXPATHLEN];
- boolean_t boot = B_FALSE;
- int err = -1;
- int slice;
-
- if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
- return (err);
-
- (void) snprintf(diskname, sizeof (diskname), "%s%s", ZFS_RDISK_ROOT,
- strrchr(path, '/'));
- if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) {
- struct dk_gpt *vtoc;
-
- if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
- for (slice = 0; slice < vtoc->efi_nparts; slice++) {
- if (vtoc->efi_parts[slice].p_tag == V_SYSTEM)
- boot = B_TRUE;
- if (vtoc->efi_parts[slice].p_tag == V_USR)
- break;
- }
- if (sb != NULL && vtoc->efi_parts[slice].p_tag == V_USR)
- *sb = vtoc->efi_parts[slice].p_start;
- if (system != NULL)
- *system = boot;
- efi_free(vtoc);
- }
- (void) close(fd);
- }
- return (err);
-}
-
-/*
- * determine where a partition starts on a disk in the current
- * configuration
- */
-static diskaddr_t
-find_start_block(nvlist_t *config)
-{
- nvlist_t **child;
- uint_t c, children;
- diskaddr_t sb = MAXOFFSET_T;
- uint64_t wholedisk;
-
- if (nvlist_lookup_nvlist_array(config,
- ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) {
- if (nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_WHOLE_DISK,
- &wholedisk) != 0 || !wholedisk) {
- return (MAXOFFSET_T);
- }
- if (read_efi_label(config, &sb, NULL) < 0)
- sb = MAXOFFSET_T;
- return (sb);
- }
-
- for (c = 0; c < children; c++) {
- sb = find_start_block(child[c]);
- if (sb != MAXOFFSET_T) {
- return (sb);
- }
- }
- return (MAXOFFSET_T);
-}
-#endif /* illumos */
-
-/*
- * Label an individual disk. The name provided is the short name,
- * stripped of any leading /dev path.
- */
-int
-zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name,
- zpool_boot_label_t boot_type, uint64_t boot_size, int *slice)
-{
-#ifdef illumos
- char path[MAXPATHLEN];
- struct dk_gpt *vtoc;
- int fd;
- size_t resv = EFI_MIN_RESV_SIZE;
- uint64_t slice_size;
- diskaddr_t start_block;
- char errbuf[1024];
-
- /* prepare an error message just in case */
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
-
- if (zhp) {
- nvlist_t *nvroot;
-
- verify(nvlist_lookup_nvlist(zhp->zpool_config,
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
-
- if (zhp->zpool_start_block == 0)
- start_block = find_start_block(nvroot);
- else
- start_block = zhp->zpool_start_block;
- zhp->zpool_start_block = start_block;
- } else {
- /* new pool */
- start_block = NEW_START_BLOCK;
- }
-
- (void) snprintf(path, sizeof (path), "%s/%s%s", ZFS_RDISK_ROOT, name,
- BACKUP_SLICE);
-
- if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
- /*
- * This shouldn't happen. We've long since verified that this
- * is a valid device.
- */
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "unable to open device"));
- return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
- }
-
- if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
- /*
- * The only way this can fail is if we run out of memory, or we
- * were unable to read the disk's capacity
- */
- if (errno == ENOMEM)
- (void) no_memory(hdl);
-
- (void) close(fd);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "unable to read disk capacity"), name);
-
- return (zfs_error(hdl, EZFS_NOCAP, errbuf));
- }
-
- /*
- * Why we use V_USR: V_BACKUP confuses users, and is considered
- * disposable by some EFI utilities (since EFI doesn't have a backup
- * slice). V_UNASSIGNED is supposed to be used only for zero size
- * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT,
- * etc. were all pretty specific. V_USR is as close to reality as we
- * can get, in the absence of V_OTHER.
- */
- /* first fix the partition start block */
- if (start_block == MAXOFFSET_T)
- start_block = NEW_START_BLOCK;
-
- /*
- * EFI System partition is using slice 0.
- * ZFS is on slice 1 and slice 8 is reserved.
- * We assume the GPT partition table without system
- * partition has zfs p_start == NEW_START_BLOCK.
- * If start_block != NEW_START_BLOCK, it means we have
- * system partition. Correct solution would be to query/cache vtoc
- * from existing vdev member.
- */
- if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
- if (boot_size % vtoc->efi_lbasize != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "boot partition size must be a multiple of %d"),
- vtoc->efi_lbasize);
- (void) close(fd);
- efi_free(vtoc);
- return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
- }
- /*
- * System partition size checks.
- * Note the 1MB is quite arbitrary value, since we
- * are creating dedicated pool, it should be enough
- * to hold fat + efi bootloader. May need to be
- * adjusted if the bootloader size will grow.
- */
- if (boot_size < 1024 * 1024) {
- char buf[64];
- zfs_nicenum(boot_size, buf, sizeof (buf));
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Specified size %s for EFI System partition is too "
- "small, the minimum size is 1MB."), buf);
- (void) close(fd);
- efi_free(vtoc);
- return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
- }
- /* 33MB is tested with mkfs -F pcfs */
- if (hdl->libzfs_printerr &&
- ((vtoc->efi_lbasize == 512 &&
- boot_size < 33 * 1024 * 1024) ||
- (vtoc->efi_lbasize == 4096 &&
- boot_size < 256 * 1024 * 1024))) {
- char buf[64];
- zfs_nicenum(boot_size, buf, sizeof (buf));
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "Warning: EFI System partition size %s is "
- "not allowing to create FAT32 file\nsystem, which "
- "may result in unbootable system.\n"), buf);
- }
- /* Adjust zfs partition start by size of system partition. */
- start_block += boot_size / vtoc->efi_lbasize;
- }
-
- if (start_block == NEW_START_BLOCK) {
- /*
- * Use default layout.
- * ZFS is on slice 0 and slice 8 is reserved.
- */
- slice_size = vtoc->efi_last_u_lba + 1;
- slice_size -= EFI_MIN_RESV_SIZE;
- slice_size -= start_block;
- if (slice != NULL)
- *slice = 0;
-
- vtoc->efi_parts[0].p_start = start_block;
- vtoc->efi_parts[0].p_size = slice_size;
-
- vtoc->efi_parts[0].p_tag = V_USR;
- (void) strcpy(vtoc->efi_parts[0].p_name, "zfs");
-
- vtoc->efi_parts[8].p_start = slice_size + start_block;
- vtoc->efi_parts[8].p_size = resv;
- vtoc->efi_parts[8].p_tag = V_RESERVED;
- } else {
- slice_size = start_block - NEW_START_BLOCK;
- vtoc->efi_parts[0].p_start = NEW_START_BLOCK;
- vtoc->efi_parts[0].p_size = slice_size;
- vtoc->efi_parts[0].p_tag = V_SYSTEM;
- (void) strcpy(vtoc->efi_parts[0].p_name, "loader");
- if (slice != NULL)
- *slice = 1;
- /* prepare slice 1 */
- slice_size = vtoc->efi_last_u_lba + 1 - slice_size;
- slice_size -= resv;
- slice_size -= NEW_START_BLOCK;
- vtoc->efi_parts[1].p_start = start_block;
- vtoc->efi_parts[1].p_size = slice_size;
- vtoc->efi_parts[1].p_tag = V_USR;
- (void) strcpy(vtoc->efi_parts[1].p_name, "zfs");
-
- vtoc->efi_parts[8].p_start = slice_size + start_block;
- vtoc->efi_parts[8].p_size = resv;
- vtoc->efi_parts[8].p_tag = V_RESERVED;
- }
-
- if (efi_write(fd, vtoc) != 0) {
- /*
- * Some block drivers (like pcata) may not support EFI
- * GPT labels. Print out a helpful error message dir-
- * ecting the user to manually label the disk and give
- * a specific slice.
- */
- (void) close(fd);
- efi_free(vtoc);
-
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "try using fdisk(1M) and then provide a specific slice"));
- return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
- }
-
- (void) close(fd);
- efi_free(vtoc);
-#endif /* illumos */
- return (0);
-}
-
-static boolean_t
-supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
-{
- char *type;
- nvlist_t **child;
- uint_t children, c;
-
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
- if (strcmp(type, VDEV_TYPE_FILE) == 0 ||
- strcmp(type, VDEV_TYPE_HOLE) == 0 ||
- strcmp(type, VDEV_TYPE_MISSING) == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "vdev type '%s' is not supported"), type);
- (void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf);
- return (B_FALSE);
- }
- if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++) {
- if (!supported_dump_vdev_type(hdl, child[c], errbuf))
- return (B_FALSE);
- }
- }
- return (B_TRUE);
-}
-
-/*
- * Check if this zvol is allowable for use as a dump device; zero if
- * it is, > 0 if it isn't, < 0 if it isn't a zvol.
- *
- * Allowable storage configurations include mirrors, all raidz variants, and
- * pools with log, cache, and spare devices. Pools which are backed by files or
- * have missing/hole vdevs are not suitable.
- */
-int
-zvol_check_dump_config(char *arg)
-{
- zpool_handle_t *zhp = NULL;
- nvlist_t *config, *nvroot;
- char *p, *volname;
- nvlist_t **top;
- uint_t toplevels;
- libzfs_handle_t *hdl;
- char errbuf[1024];
- char poolname[ZFS_MAX_DATASET_NAME_LEN];
- int pathlen = strlen(ZVOL_FULL_DEV_DIR);
- int ret = 1;
-
- if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) {
- return (-1);
- }
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "dump is not supported on device '%s'"), arg);
-
- if ((hdl = libzfs_init()) == NULL)
- return (1);
- libzfs_print_on_error(hdl, B_TRUE);
-
- volname = arg + pathlen;
-
- /* check the configuration of the pool */
- if ((p = strchr(volname, '/')) == NULL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "malformed dataset name"));
- (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
- return (1);
- } else if (p - volname >= ZFS_MAX_DATASET_NAME_LEN) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "dataset name is too long"));
- (void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf);
- return (1);
- } else {
- (void) strncpy(poolname, volname, p - volname);
- poolname[p - volname] = '\0';
- }
-
- if ((zhp = zpool_open(hdl, poolname)) == NULL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "could not open pool '%s'"), poolname);
- (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
- goto out;
- }
- config = zpool_get_config(zhp, NULL);
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "could not obtain vdev configuration for '%s'"), poolname);
- (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf);
- goto out;
- }
-
- verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &top, &toplevels) == 0);
-
- if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
- goto out;
- }
- ret = 0;
-
-out:
- if (zhp)
- zpool_close(zhp);
- libzfs_fini(hdl);
- return (ret);
-}
-
-int
-zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid,
- const char *command)
-{
- zfs_cmd_t zc = { 0 };
- nvlist_t *args;
- char *packed;
- size_t size;
- int error;
-
- args = fnvlist_alloc();
- fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid);
- fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid);
- fnvlist_add_string(args, "command", command);
- error = zcmd_write_src_nvlist(hdl, &zc, args);
- if (error == 0)
- error = ioctl(hdl->libzfs_fd, ZFS_IOC_NEXTBOOT, &zc);
- zcmd_free_nvlists(&zc);
- nvlist_free(args);
- return (error);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
@@ -1,3924 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
- * Copyright (c) 2019 Datto Inc.
- */
-
-#include <assert.h>
-#include <ctype.h>
-#include <errno.h>
-#include <libintl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <stddef.h>
-#include <fcntl.h>
-#include <sys/param.h>
-#include <sys/mount.h>
-#include <pthread.h>
-#include <umem.h>
-#include <time.h>
-
-#include <libzfs.h>
-#include <libzfs_core.h>
-
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-#include "zfs_fletcher.h"
-#include "libzfs_impl.h"
-#include <zlib.h>
-#include <sha2.h>
-#include <sys/zio_checksum.h>
-#include <sys/ddt.h>
-
-#ifdef __FreeBSD__
-extern int zfs_ioctl_version;
-#endif
-
-/* in libzfs_dataset.c */
-extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
-/* We need to use something for ENODATA. */
-#define ENODATA EIDRM
-
-static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
- recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
- uint64_t *, const char *);
-static int guid_to_name(libzfs_handle_t *, const char *,
- uint64_t, boolean_t, char *);
-
-static const zio_cksum_t zero_cksum = { 0 };
-
-typedef struct dedup_arg {
- int inputfd;
- int outputfd;
- libzfs_handle_t *dedup_hdl;
-} dedup_arg_t;
-
-typedef struct progress_arg {
- zfs_handle_t *pa_zhp;
- int pa_fd;
- boolean_t pa_parsable;
- boolean_t pa_astitle;
- uint64_t pa_size;
-} progress_arg_t;
-
-typedef struct dataref {
- uint64_t ref_guid;
- uint64_t ref_object;
- uint64_t ref_offset;
-} dataref_t;
-
-typedef struct dedup_entry {
- struct dedup_entry *dde_next;
- zio_cksum_t dde_chksum;
- uint64_t dde_prop;
- dataref_t dde_ref;
-} dedup_entry_t;
-
-#define MAX_DDT_PHYSMEM_PERCENT 20
-#define SMALLEST_POSSIBLE_MAX_DDT_MB 128
-
-typedef struct dedup_table {
- dedup_entry_t **dedup_hash_array;
- umem_cache_t *ddecache;
- uint64_t max_ddt_size; /* max dedup table size in bytes */
- uint64_t cur_ddt_size; /* current dedup table size in bytes */
- uint64_t ddt_count;
- int numhashbits;
- boolean_t ddt_full;
-} dedup_table_t;
-
-static int
-high_order_bit(uint64_t n)
-{
- int count;
-
- for (count = 0; n != 0; count++)
- n >>= 1;
- return (count);
-}
-
-static size_t
-ssread(void *buf, size_t len, FILE *stream)
-{
- size_t outlen;
-
- if ((outlen = fread(buf, len, 1, stream)) == 0)
- return (0);
-
- return (outlen);
-}
-
-static void
-ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp,
- zio_cksum_t *cs, uint64_t prop, dataref_t *dr)
-{
- dedup_entry_t *dde;
-
- if (ddt->cur_ddt_size >= ddt->max_ddt_size) {
- if (ddt->ddt_full == B_FALSE) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Dedup table full. Deduplication will continue "
- "with existing table entries"));
- ddt->ddt_full = B_TRUE;
- }
- return;
- }
-
- if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT))
- != NULL) {
- assert(*ddepp == NULL);
- dde->dde_next = NULL;
- dde->dde_chksum = *cs;
- dde->dde_prop = prop;
- dde->dde_ref = *dr;
- *ddepp = dde;
- ddt->cur_ddt_size += sizeof (dedup_entry_t);
- ddt->ddt_count++;
- }
-}
-
-/*
- * Using the specified dedup table, do a lookup for an entry with
- * the checksum cs. If found, return the block's reference info
- * in *dr. Otherwise, insert a new entry in the dedup table, using
- * the reference information specified by *dr.
- *
- * return value: true - entry was found
- * false - entry was not found
- */
-static boolean_t
-ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
- uint64_t prop, dataref_t *dr)
-{
- uint32_t hashcode;
- dedup_entry_t **ddepp;
-
- hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits);
-
- for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL;
- ddepp = &((*ddepp)->dde_next)) {
- if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) &&
- (*ddepp)->dde_prop == prop) {
- *dr = (*ddepp)->dde_ref;
- return (B_TRUE);
- }
- }
- ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr);
- return (B_FALSE);
-}
-
-static int
-dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
- zio_cksum_t *zc, int outfd)
-{
- ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- (void) fletcher_4_incremental_native(drr,
- offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
- if (drr->drr_type != DRR_BEGIN) {
- ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
- drr_checksum.drr_checksum));
- drr->drr_u.drr_checksum.drr_checksum = *zc;
- }
- (void) fletcher_4_incremental_native(
- &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc);
- if (write(outfd, drr, sizeof (*drr)) == -1)
- return (errno);
- if (payload_len != 0) {
- (void) fletcher_4_incremental_native(payload, payload_len, zc);
- if (write(outfd, payload, payload_len) == -1)
- return (errno);
- }
- return (0);
-}
-
-/*
- * This function is started in a separate thread when the dedup option
- * has been requested. The main send thread determines the list of
- * snapshots to be included in the send stream and makes the ioctl calls
- * for each one. But instead of having the ioctl send the output to the
- * the output fd specified by the caller of zfs_send()), the
- * ioctl is told to direct the output to a pipe, which is read by the
- * alternate thread running THIS function. This function does the
- * dedup'ing by:
- * 1. building a dedup table (the DDT)
- * 2. doing checksums on each data block and inserting a record in the DDT
- * 3. looking for matching checksums, and
- * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever
- * a duplicate block is found.
- * The output of this function then goes to the output fd requested
- * by the caller of zfs_send().
- */
-static void *
-cksummer(void *arg)
-{
- dedup_arg_t *dda = arg;
- char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
- dmu_replay_record_t thedrr;
- dmu_replay_record_t *drr = &thedrr;
- FILE *ofp;
- int outfd;
- dedup_table_t ddt;
- zio_cksum_t stream_cksum;
- uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
- uint64_t numbuckets;
-
- ddt.max_ddt_size =
- MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100,
- SMALLEST_POSSIBLE_MAX_DDT_MB << 20);
-
- numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t));
-
- /*
- * numbuckets must be a power of 2. Increase number to
- * a power of 2 if necessary.
- */
- if (!ISP2(numbuckets))
- numbuckets = 1 << high_order_bit(numbuckets);
-
- ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *));
- ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0,
- NULL, NULL, NULL, NULL, NULL, 0);
- ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *);
- ddt.numhashbits = high_order_bit(numbuckets) - 1;
- ddt.ddt_full = B_FALSE;
-
- outfd = dda->outputfd;
- ofp = fdopen(dda->inputfd, "r");
- while (ssread(drr, sizeof (*drr), ofp) != 0) {
-
- /*
- * kernel filled in checksum, we are going to write same
- * record, but need to regenerate checksum.
- */
- if (drr->drr_type != DRR_BEGIN) {
- bzero(&drr->drr_u.drr_checksum.drr_checksum,
- sizeof (drr->drr_u.drr_checksum.drr_checksum));
- }
-
- switch (drr->drr_type) {
- case DRR_BEGIN:
- {
- struct drr_begin *drrb = &drr->drr_u.drr_begin;
- int fflags;
- int sz = 0;
- ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
-
- ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
-
- /* set the DEDUP feature flag for this stream */
- fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
- fflags |= (DMU_BACKUP_FEATURE_DEDUP |
- DMU_BACKUP_FEATURE_DEDUPPROPS);
- DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
-
- if (drr->drr_payloadlen != 0) {
- sz = drr->drr_payloadlen;
-
- if (sz > SPA_MAXBLOCKSIZE) {
- buf = zfs_realloc(dda->dedup_hdl, buf,
- SPA_MAXBLOCKSIZE, sz);
- }
- (void) ssread(buf, sz, ofp);
- if (ferror(stdin))
- perror("fread");
- }
- if (dump_record(drr, buf, sz, &stream_cksum,
- outfd) != 0)
- goto out;
- break;
- }
-
- case DRR_END:
- {
- struct drr_end *drre = &drr->drr_u.drr_end;
- /* use the recalculated checksum */
- drre->drr_checksum = stream_cksum;
- if (dump_record(drr, NULL, 0, &stream_cksum,
- outfd) != 0)
- goto out;
- break;
- }
-
- case DRR_OBJECT:
- {
- struct drr_object *drro = &drr->drr_u.drr_object;
- if (drro->drr_bonuslen > 0) {
- (void) ssread(buf,
- P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
- ofp);
- }
- if (dump_record(drr, buf,
- P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
- &stream_cksum, outfd) != 0)
- goto out;
- break;
- }
-
- case DRR_SPILL:
- {
- struct drr_spill *drrs = &drr->drr_u.drr_spill;
- (void) ssread(buf, drrs->drr_length, ofp);
- if (dump_record(drr, buf, drrs->drr_length,
- &stream_cksum, outfd) != 0)
- goto out;
- break;
- }
-
- case DRR_FREEOBJECTS:
- {
- if (dump_record(drr, NULL, 0, &stream_cksum,
- outfd) != 0)
- goto out;
- break;
- }
-
- case DRR_WRITE:
- {
- struct drr_write *drrw = &drr->drr_u.drr_write;
- dataref_t dataref;
- uint64_t payload_size;
-
- payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
- (void) ssread(buf, payload_size, ofp);
-
- /*
- * Use the existing checksum if it's dedup-capable,
- * else calculate a SHA256 checksum for it.
- */
-
- if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
- zero_cksum) ||
- !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
- SHA256_CTX ctx;
- zio_cksum_t tmpsha256;
-
- SHA256Init(&ctx);
- SHA256Update(&ctx, buf, payload_size);
- SHA256Final(&tmpsha256, &ctx);
- drrw->drr_key.ddk_cksum.zc_word[0] =
- BE_64(tmpsha256.zc_word[0]);
- drrw->drr_key.ddk_cksum.zc_word[1] =
- BE_64(tmpsha256.zc_word[1]);
- drrw->drr_key.ddk_cksum.zc_word[2] =
- BE_64(tmpsha256.zc_word[2]);
- drrw->drr_key.ddk_cksum.zc_word[3] =
- BE_64(tmpsha256.zc_word[3]);
- drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256;
- drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP;
- }
-
- dataref.ref_guid = drrw->drr_toguid;
- dataref.ref_object = drrw->drr_object;
- dataref.ref_offset = drrw->drr_offset;
-
- if (ddt_update(dda->dedup_hdl, &ddt,
- &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
- &dataref)) {
- dmu_replay_record_t wbr_drr = {0};
- struct drr_write_byref *wbr_drrr =
- &wbr_drr.drr_u.drr_write_byref;
-
- /* block already present in stream */
- wbr_drr.drr_type = DRR_WRITE_BYREF;
-
- wbr_drrr->drr_object = drrw->drr_object;
- wbr_drrr->drr_offset = drrw->drr_offset;
- wbr_drrr->drr_length = drrw->drr_logical_size;
- wbr_drrr->drr_toguid = drrw->drr_toguid;
- wbr_drrr->drr_refguid = dataref.ref_guid;
- wbr_drrr->drr_refobject =
- dataref.ref_object;
- wbr_drrr->drr_refoffset =
- dataref.ref_offset;
-
- wbr_drrr->drr_checksumtype =
- drrw->drr_checksumtype;
- wbr_drrr->drr_checksumflags =
- drrw->drr_checksumtype;
- wbr_drrr->drr_key.ddk_cksum =
- drrw->drr_key.ddk_cksum;
- wbr_drrr->drr_key.ddk_prop =
- drrw->drr_key.ddk_prop;
-
- if (dump_record(&wbr_drr, NULL, 0,
- &stream_cksum, outfd) != 0)
- goto out;
- } else {
- /* block not previously seen */
- if (dump_record(drr, buf, payload_size,
- &stream_cksum, outfd) != 0)
- goto out;
- }
- break;
- }
-
- case DRR_WRITE_EMBEDDED:
- {
- struct drr_write_embedded *drrwe =
- &drr->drr_u.drr_write_embedded;
- (void) ssread(buf,
- P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
- if (dump_record(drr, buf,
- P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
- &stream_cksum, outfd) != 0)
- goto out;
- break;
- }
-
- case DRR_FREE:
- {
- if (dump_record(drr, NULL, 0, &stream_cksum,
- outfd) != 0)
- goto out;
- break;
- }
-
- default:
- (void) fprintf(stderr, "INVALID record type 0x%x\n",
- drr->drr_type);
- /* should never happen, so assert */
- assert(B_FALSE);
- }
- }
-out:
- umem_cache_destroy(ddt.ddecache);
- free(ddt.dedup_hash_array);
- free(buf);
- (void) fclose(ofp);
-
- return (NULL);
-}
-
-/*
- * Routines for dealing with the AVL tree of fs-nvlists
- */
-typedef struct fsavl_node {
- avl_node_t fn_node;
- nvlist_t *fn_nvfs;
- char *fn_snapname;
- uint64_t fn_guid;
-} fsavl_node_t;
-
-static int
-fsavl_compare(const void *arg1, const void *arg2)
-{
- const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1;
- const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2;
-
- return (AVL_CMP(fn1->fn_guid, fn2->fn_guid));
-}
-
-/*
- * Given the GUID of a snapshot, find its containing filesystem and
- * (optionally) name.
- */
-static nvlist_t *
-fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname)
-{
- fsavl_node_t fn_find;
- fsavl_node_t *fn;
-
- fn_find.fn_guid = snapguid;
-
- fn = avl_find(avl, &fn_find, NULL);
- if (fn) {
- if (snapname)
- *snapname = fn->fn_snapname;
- return (fn->fn_nvfs);
- }
- return (NULL);
-}
-
-static void
-fsavl_destroy(avl_tree_t *avl)
-{
- fsavl_node_t *fn;
- void *cookie;
-
- if (avl == NULL)
- return;
-
- cookie = NULL;
- while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL)
- free(fn);
- avl_destroy(avl);
- free(avl);
-}
-
-/*
- * Given an nvlist, produce an avl tree of snapshots, ordered by guid
- */
-static avl_tree_t *
-fsavl_create(nvlist_t *fss)
-{
- avl_tree_t *fsavl;
- nvpair_t *fselem = NULL;
-
- if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL)
- return (NULL);
-
- avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t),
- offsetof(fsavl_node_t, fn_node));
-
- while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) {
- nvlist_t *nvfs, *snaps;
- nvpair_t *snapelem = NULL;
-
- VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs));
- VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps));
-
- while ((snapelem =
- nvlist_next_nvpair(snaps, snapelem)) != NULL) {
- fsavl_node_t *fn;
- uint64_t guid;
-
- VERIFY(0 == nvpair_value_uint64(snapelem, &guid));
- if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) {
- fsavl_destroy(fsavl);
- return (NULL);
- }
- fn->fn_nvfs = nvfs;
- fn->fn_snapname = nvpair_name(snapelem);
- fn->fn_guid = guid;
-
- /*
- * Note: if there are multiple snaps with the
- * same GUID, we ignore all but one.
- */
- if (avl_find(fsavl, fn, NULL) == NULL)
- avl_add(fsavl, fn);
- else
- free(fn);
- }
- }
-
- return (fsavl);
-}
-
-/*
- * Routines for dealing with the giant nvlist of fs-nvlists, etc.
- */
-typedef struct send_data {
- /*
- * assigned inside every recursive call,
- * restored from *_save on return:
- *
- * guid of fromsnap snapshot in parent dataset
- * txg of fromsnap snapshot in current dataset
- * txg of tosnap snapshot in current dataset
- */
-
- uint64_t parent_fromsnap_guid;
- uint64_t fromsnap_txg;
- uint64_t tosnap_txg;
-
- /* the nvlists get accumulated during depth-first traversal */
- nvlist_t *parent_snaps;
- nvlist_t *fss;
- nvlist_t *snapprops;
-
- /* send-receive configuration, does not change during traversal */
- const char *fsname;
- const char *fromsnap;
- const char *tosnap;
- boolean_t recursive;
- boolean_t verbose;
- boolean_t replicate;
-
- /*
- * The header nvlist is of the following format:
- * {
- * "tosnap" -> string
- * "fromsnap" -> string (if incremental)
- * "fss" -> {
- * id -> {
- *
- * "name" -> string (full name; for debugging)
- * "parentfromsnap" -> number (guid of fromsnap in parent)
- *
- * "props" -> { name -> value (only if set here) }
- * "snaps" -> { name (lastname) -> number (guid) }
- * "snapprops" -> { name (lastname) -> { name -> value } }
- *
- * "origin" -> number (guid) (if clone)
- * "sent" -> boolean (not on-disk)
- * }
- * }
- * }
- *
- */
-} send_data_t;
-
-static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv);
-
-static int
-send_iterate_snap(zfs_handle_t *zhp, void *arg)
-{
- send_data_t *sd = arg;
- uint64_t guid = zhp->zfs_dmustats.dds_guid;
- uint64_t txg = zhp->zfs_dmustats.dds_creation_txg;
- char *snapname;
- nvlist_t *nv;
-
- snapname = strrchr(zhp->zfs_name, '@')+1;
-
- if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) {
- if (sd->verbose) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "skipping snapshot %s because it was created "
- "after the destination snapshot (%s)\n"),
- zhp->zfs_name, sd->tosnap);
- }
- zfs_close(zhp);
- return (0);
- }
-
- VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid));
- /*
- * NB: if there is no fromsnap here (it's a newly created fs in
- * an incremental replication), we will substitute the tosnap.
- */
- if ((sd->fromsnap && strcmp(snapname, sd->fromsnap) == 0) ||
- (sd->parent_fromsnap_guid == 0 && sd->tosnap &&
- strcmp(snapname, sd->tosnap) == 0)) {
- sd->parent_fromsnap_guid = guid;
- }
-
- VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0));
- send_iterate_prop(zhp, nv);
- VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv));
- nvlist_free(nv);
-
- zfs_close(zhp);
- return (0);
-}
-
-static void
-send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
-{
- nvpair_t *elem = NULL;
-
- while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) {
- char *propname = nvpair_name(elem);
- zfs_prop_t prop = zfs_name_to_prop(propname);
- nvlist_t *propnv;
-
- if (!zfs_prop_user(propname)) {
- /*
- * Realistically, this should never happen. However,
- * we want the ability to add DSL properties without
- * needing to make incompatible version changes. We
- * need to ignore unknown properties to allow older
- * software to still send datasets containing these
- * properties, with the unknown properties elided.
- */
- if (prop == ZPROP_INVAL)
- continue;
-
- if (zfs_prop_readonly(prop))
- continue;
- }
-
- verify(nvpair_value_nvlist(elem, &propnv) == 0);
- if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION ||
- prop == ZFS_PROP_REFQUOTA ||
- prop == ZFS_PROP_REFRESERVATION) {
- char *source;
- uint64_t value;
- verify(nvlist_lookup_uint64(propnv,
- ZPROP_VALUE, &value) == 0);
- if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
- continue;
- /*
- * May have no source before SPA_VERSION_RECVD_PROPS,
- * but is still modifiable.
- */
- if (nvlist_lookup_string(propnv,
- ZPROP_SOURCE, &source) == 0) {
- if ((strcmp(source, zhp->zfs_name) != 0) &&
- (strcmp(source,
- ZPROP_SOURCE_VAL_RECVD) != 0))
- continue;
- }
- } else {
- char *source;
- if (nvlist_lookup_string(propnv,
- ZPROP_SOURCE, &source) != 0)
- continue;
- if ((strcmp(source, zhp->zfs_name) != 0) &&
- (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0))
- continue;
- }
-
- if (zfs_prop_user(propname) ||
- zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
- char *value;
- verify(nvlist_lookup_string(propnv,
- ZPROP_VALUE, &value) == 0);
- VERIFY(0 == nvlist_add_string(nv, propname, value));
- } else {
- uint64_t value;
- verify(nvlist_lookup_uint64(propnv,
- ZPROP_VALUE, &value) == 0);
- VERIFY(0 == nvlist_add_uint64(nv, propname, value));
- }
- }
-}
-
-/*
- * returns snapshot creation txg
- * and returns 0 if the snapshot does not exist
- */
-static uint64_t
-get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap)
-{
- char name[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t txg = 0;
-
- if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0')
- return (txg);
-
- (void) snprintf(name, sizeof (name), "%s@%s", fs, snap);
- if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) {
- zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT);
- if (zhp != NULL) {
- txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG);
- zfs_close(zhp);
- }
- }
-
- return (txg);
-}
-
-/*
- * recursively generate nvlists describing datasets. See comment
- * for the data structure send_data_t above for description of contents
- * of the nvlist.
- */
-static int
-send_iterate_fs(zfs_handle_t *zhp, void *arg)
-{
- send_data_t *sd = arg;
- nvlist_t *nvfs, *nv;
- int rv = 0;
- uint64_t min_txg = 0, max_txg = 0;
- uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid;
- uint64_t fromsnap_txg_save = sd->fromsnap_txg;
- uint64_t tosnap_txg_save = sd->tosnap_txg;
- uint64_t txg = zhp->zfs_dmustats.dds_creation_txg;
- uint64_t guid = zhp->zfs_dmustats.dds_guid;
- uint64_t fromsnap_txg, tosnap_txg;
- char guidstring[64];
-
- fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap);
- if (fromsnap_txg != 0)
- sd->fromsnap_txg = fromsnap_txg;
-
- tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap);
- if (tosnap_txg != 0)
- sd->tosnap_txg = tosnap_txg;
-
- /*
- * on the send side, if the current dataset does not have tosnap,
- * perform two additional checks:
- *
- * - skip sending the current dataset if it was created later than
- * the parent tosnap
- * - return error if the current dataset was created earlier than
- * the parent tosnap
- */
- if (sd->tosnap != NULL && tosnap_txg == 0) {
- if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) {
- if (sd->verbose) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "skipping dataset %s: snapshot %s does "
- "not exist\n"), zhp->zfs_name, sd->tosnap);
- }
- } else {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "cannot send %s@%s%s: snapshot %s@%s does not "
- "exist\n"), sd->fsname, sd->tosnap, sd->recursive ?
- dgettext(TEXT_DOMAIN, " recursively") : "",
- zhp->zfs_name, sd->tosnap);
- rv = -1;
- }
- goto out;
- }
-
- nvfs = fnvlist_alloc();
- fnvlist_add_string(nvfs, "name", zhp->zfs_name);
- fnvlist_add_uint64(nvfs, "parentfromsnap",
- sd->parent_fromsnap_guid);
-
- if (zhp->zfs_dmustats.dds_origin[0]) {
- zfs_handle_t *origin = zfs_open(zhp->zfs_hdl,
- zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
- if (origin == NULL) {
- rv = -1;
- goto out;
- }
- VERIFY(0 == nvlist_add_uint64(nvfs, "origin",
- origin->zfs_dmustats.dds_guid));
- }
-
- /* iterate over props */
- VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0));
- send_iterate_prop(zhp, nv);
- VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv));
- nvlist_free(nv);
-
- /* iterate over snaps, and set sd->parent_fromsnap_guid */
- if (!sd->replicate && fromsnap_txg != 0)
- min_txg = fromsnap_txg;
- if (!sd->replicate && tosnap_txg != 0)
- max_txg = tosnap_txg;
- sd->parent_fromsnap_guid = 0;
- VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0));
- VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0));
- (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd,
- min_txg, max_txg);
- VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps));
- VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops));
- fnvlist_free(sd->parent_snaps);
- fnvlist_free(sd->snapprops);
-
- /* add this fs to nvlist */
- (void) snprintf(guidstring, sizeof (guidstring),
- "0x%llx", (longlong_t)guid);
- VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs));
- nvlist_free(nvfs);
-
- /* iterate over children */
- if (sd->recursive)
- rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
-
-out:
- sd->parent_fromsnap_guid = parent_fromsnap_guid_save;
- sd->fromsnap_txg = fromsnap_txg_save;
- sd->tosnap_txg = tosnap_txg_save;
-
- zfs_close(zhp);
- return (rv);
-}
-
-static int
-gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
- const char *tosnap, boolean_t recursive, boolean_t verbose,
- boolean_t replicate, nvlist_t **nvlp, avl_tree_t **avlp)
-{
- zfs_handle_t *zhp;
- int error;
- uint64_t min_txg = 0, max_txg = 0;
- send_data_t sd = { 0 };
-
- zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- return (EZFS_BADTYPE);
-
- VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0));
- sd.fsname = fsname;
- sd.fromsnap = fromsnap;
- sd.tosnap = tosnap;
- sd.recursive = recursive;
- sd.verbose = verbose;
- sd.replicate = replicate;
-
- if ((error = send_iterate_fs(zhp, &sd)) != 0) {
- nvlist_free(sd.fss);
- if (avlp != NULL)
- *avlp = NULL;
- *nvlp = NULL;
- return (error);
- }
-
- if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) {
- nvlist_free(sd.fss);
- *nvlp = NULL;
- return (EZFS_NOMEM);
- }
-
- *nvlp = sd.fss;
- return (0);
-}
-
-/*
- * Routines specific to "zfs send"
- */
-typedef struct send_dump_data {
- /* these are all just the short snapname (the part after the @) */
- const char *fromsnap;
- const char *tosnap;
- char prevsnap[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t prevsnap_obj;
- boolean_t seenfrom, seento, replicate, doall, fromorigin;
- boolean_t verbose, dryrun, parsable, progress, embed_data, std_out;
- boolean_t progressastitle;
- boolean_t large_block, compress;
- int outfd;
- boolean_t err;
- nvlist_t *fss;
- nvlist_t *snapholds;
- avl_tree_t *fsavl;
- snapfilter_cb_t *filter_cb;
- void *filter_cb_arg;
- nvlist_t *debugnv;
- char holdtag[ZFS_MAX_DATASET_NAME_LEN];
- int cleanup_fd;
- uint64_t size;
-} send_dump_data_t;
-
-static int
-zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from,
- enum lzc_send_flags flags, uint64_t *spacep)
-{
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- int error;
-
- assert(snapname != NULL);
- error = lzc_send_space(snapname, from, flags, spacep);
-
- if (error != 0) {
- char errbuf[1024];
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "warning: cannot estimate space for '%s'"), snapname);
-
- switch (error) {
- case EXDEV:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "not an earlier snapshot from the same fs"));
- return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
-
- case ENOENT:
- if (zfs_dataset_exists(hdl, snapname,
- ZFS_TYPE_SNAPSHOT)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "incremental source (%s) does not exist"),
- snapname);
- }
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
-
- case EDQUOT:
- case EFBIG:
- case EIO:
- case ENOLINK:
- case ENOSPC:
- case ENXIO:
- case EPIPE:
- case ERANGE:
- case EFAULT:
- case EROFS:
- case EINVAL:
- zfs_error_aux(hdl, strerror(error));
- return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
-
- default:
- return (zfs_standard_error(hdl, error, errbuf));
- }
- }
-
- return (0);
-}
-
-/*
- * Dumps a backup of the given snapshot (incremental from fromsnap if it's not
- * NULL) to the file descriptor specified by outfd.
- */
-static int
-dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
- boolean_t fromorigin, int outfd, enum lzc_send_flags flags,
- nvlist_t *debugnv)
-{
- zfs_cmd_t zc = { 0 };
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- nvlist_t *thisdbg;
-
- assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
- assert(fromsnap_obj == 0 || !fromorigin);
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- zc.zc_cookie = outfd;
- zc.zc_obj = fromorigin;
- zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
- zc.zc_fromobj = fromsnap_obj;
- zc.zc_flags = flags;
-
- VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
- if (fromsnap && fromsnap[0] != '\0') {
- VERIFY(0 == nvlist_add_string(thisdbg,
- "fromsnap", fromsnap));
- }
-
- if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) {
- char errbuf[1024];
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "warning: cannot send '%s'"), zhp->zfs_name);
-
- VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno));
- if (debugnv) {
- VERIFY(0 == nvlist_add_nvlist(debugnv,
- zhp->zfs_name, thisdbg));
- }
- nvlist_free(thisdbg);
-
- switch (errno) {
- case EXDEV:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "not an earlier snapshot from the same fs"));
- return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
-
- case ENOENT:
- if (zfs_dataset_exists(hdl, zc.zc_name,
- ZFS_TYPE_SNAPSHOT)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "incremental source (@%s) does not exist"),
- zc.zc_value);
- }
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
-
- case EDQUOT:
- case EFBIG:
- case EIO:
- case ENOLINK:
- case ENOSPC:
-#ifdef illumos
- case ENOSTR:
-#endif
- case ENXIO:
- case EPIPE:
- case ERANGE:
- case EFAULT:
- case EROFS:
- zfs_error_aux(hdl, strerror(errno));
- return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
-
- default:
- return (zfs_standard_error(hdl, errno, errbuf));
- }
- }
-
- if (debugnv)
- VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
- nvlist_free(thisdbg);
-
- return (0);
-}
-
-static void
-gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd)
-{
- assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
-
- /*
- * zfs_send() only sets snapholds for sends that need them,
- * e.g. replication and doall.
- */
- if (sdd->snapholds == NULL)
- return;
-
- fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag);
-}
-
-static void *
-send_progress_thread(void *arg)
-{
- progress_arg_t *pa = arg;
- zfs_cmd_t zc = { 0 };
- zfs_handle_t *zhp = pa->pa_zhp;
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- unsigned long long bytes, total;
- char buf[16];
- time_t t;
- struct tm *tm;
-
- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
- if (!pa->pa_parsable && !pa->pa_astitle)
- (void) fprintf(stderr, "TIME SENT SNAPSHOT\n");
-
- /*
- * Print the progress from ZFS_IOC_SEND_PROGRESS every second.
- */
- for (;;) {
- (void) sleep(1);
-
- zc.zc_cookie = pa->pa_fd;
- if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0)
- return ((void *)-1);
-
- (void) time(&t);
- tm = localtime(&t);
- bytes = zc.zc_cookie;
-
- if (pa->pa_astitle) {
- int pct;
- if (pa->pa_size > bytes)
- pct = 100 * bytes / pa->pa_size;
- else
- pct = 100;
-
- setproctitle("sending %s (%d%%: %llu/%llu)",
- zhp->zfs_name, pct, bytes, pa->pa_size);
- } else if (pa->pa_parsable) {
- (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n",
- tm->tm_hour, tm->tm_min, tm->tm_sec,
- bytes, zhp->zfs_name);
- } else {
- zfs_nicenum(bytes, buf, sizeof (buf));
- (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n",
- tm->tm_hour, tm->tm_min, tm->tm_sec,
- buf, zhp->zfs_name);
- }
- }
-}
-
-static void
-send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap,
- uint64_t size, boolean_t parsable)
-{
- if (parsable) {
- if (fromsnap != NULL) {
- (void) fprintf(fout, "incremental\t%s\t%s",
- fromsnap, tosnap);
- } else {
- (void) fprintf(fout, "full\t%s",
- tosnap);
- }
- } else {
- if (fromsnap != NULL) {
- if (strchr(fromsnap, '@') == NULL &&
- strchr(fromsnap, '#') == NULL) {
- (void) fprintf(fout, dgettext(TEXT_DOMAIN,
- "send from @%s to %s"),
- fromsnap, tosnap);
- } else {
- (void) fprintf(fout, dgettext(TEXT_DOMAIN,
- "send from %s to %s"),
- fromsnap, tosnap);
- }
- } else {
- (void) fprintf(fout, dgettext(TEXT_DOMAIN,
- "full send of %s"),
- tosnap);
- }
- }
-
- if (parsable) {
- (void) fprintf(fout, "\t%llu",
- (longlong_t)size);
- } else if (size != 0) {
- char buf[16];
- zfs_nicenum(size, buf, sizeof (buf));
- (void) fprintf(fout, dgettext(TEXT_DOMAIN,
- " estimated size is %s"), buf);
- }
- (void) fprintf(fout, "\n");
-}
-
-static int
-dump_snapshot(zfs_handle_t *zhp, void *arg)
-{
- send_dump_data_t *sdd = arg;
- progress_arg_t pa = { 0 };
- pthread_t tid;
- char *thissnap;
- enum lzc_send_flags flags = 0;
- int err;
- boolean_t isfromsnap, istosnap, fromorigin;
- boolean_t exclude = B_FALSE;
- FILE *fout = sdd->std_out ? stdout : stderr;
-
- err = 0;
- thissnap = strchr(zhp->zfs_name, '@') + 1;
- isfromsnap = (sdd->fromsnap != NULL &&
- strcmp(sdd->fromsnap, thissnap) == 0);
-
- if (!sdd->seenfrom && isfromsnap) {
- gather_holds(zhp, sdd);
- sdd->seenfrom = B_TRUE;
- (void) strcpy(sdd->prevsnap, thissnap);
- sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
- zfs_close(zhp);
- return (0);
- }
-
- if (sdd->seento || !sdd->seenfrom) {
- zfs_close(zhp);
- return (0);
- }
-
- istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
- if (istosnap)
- sdd->seento = B_TRUE;
-
- if (sdd->large_block)
- flags |= LZC_SEND_FLAG_LARGE_BLOCK;
- if (sdd->embed_data)
- flags |= LZC_SEND_FLAG_EMBED_DATA;
- if (sdd->compress)
- flags |= LZC_SEND_FLAG_COMPRESS;
-
- if (!sdd->doall && !isfromsnap && !istosnap) {
- if (sdd->replicate) {
- char *snapname;
- nvlist_t *snapprops;
- /*
- * Filter out all intermediate snapshots except origin
- * snapshots needed to replicate clones.
- */
- nvlist_t *nvfs = fsavl_find(sdd->fsavl,
- zhp->zfs_dmustats.dds_guid, &snapname);
-
- VERIFY(0 == nvlist_lookup_nvlist(nvfs,
- "snapprops", &snapprops));
- VERIFY(0 == nvlist_lookup_nvlist(snapprops,
- thissnap, &snapprops));
- exclude = !nvlist_exists(snapprops, "is_clone_origin");
- } else {
- exclude = B_TRUE;
- }
- }
-
- /*
- * If a filter function exists, call it to determine whether
- * this snapshot will be sent.
- */
- if (exclude || (sdd->filter_cb != NULL &&
- sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
- /*
- * This snapshot is filtered out. Don't send it, and don't
- * set prevsnap_obj, so it will be as if this snapshot didn't
- * exist, and the next accepted snapshot will be sent as
- * an incremental from the last accepted one, or as the
- * first (and full) snapshot in the case of a replication,
- * non-incremental send.
- */
- zfs_close(zhp);
- return (0);
- }
-
- gather_holds(zhp, sdd);
- fromorigin = sdd->prevsnap[0] == '\0' &&
- (sdd->fromorigin || sdd->replicate);
-
- if (sdd->verbose || sdd->progress) {
- uint64_t size = 0;
- char fromds[ZFS_MAX_DATASET_NAME_LEN];
-
- if (sdd->prevsnap[0] != '\0') {
- (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds));
- *(strchr(fromds, '@') + 1) = '\0';
- (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds));
- }
- if (zfs_send_space(zhp, zhp->zfs_name,
- sdd->prevsnap[0] ? fromds : NULL, flags, &size) != 0) {
- size = 0; /* cannot estimate send space */
- } else {
- send_print_verbose(fout, zhp->zfs_name,
- sdd->prevsnap[0] ? sdd->prevsnap : NULL,
- size, sdd->parsable);
- }
- sdd->size += size;
- }
-
- if (!sdd->dryrun) {
- /*
- * If progress reporting is requested, spawn a new thread to
- * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
- */
- if (sdd->progress) {
- pa.pa_zhp = zhp;
- pa.pa_fd = sdd->outfd;
- pa.pa_parsable = sdd->parsable;
- pa.pa_size = sdd->size;
- pa.pa_astitle = sdd->progressastitle;
-
- if ((err = pthread_create(&tid, NULL,
- send_progress_thread, &pa)) != 0) {
- zfs_close(zhp);
- return (err);
- }
- }
-
- err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
- fromorigin, sdd->outfd, flags, sdd->debugnv);
-
- if (sdd->progress) {
- (void) pthread_cancel(tid);
- (void) pthread_join(tid, NULL);
- }
- }
-
- (void) strcpy(sdd->prevsnap, thissnap);
- sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
- zfs_close(zhp);
- return (err);
-}
-
-static int
-dump_filesystem(zfs_handle_t *zhp, void *arg)
-{
- int rv = 0;
- uint64_t min_txg = 0, max_txg = 0;
- send_dump_data_t *sdd = arg;
- boolean_t missingfrom = B_FALSE;
- zfs_cmd_t zc = { 0 };
-
- (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
- zhp->zfs_name, sdd->tosnap);
- if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "WARNING: could not send %s@%s: does not exist\n"),
- zhp->zfs_name, sdd->tosnap);
- sdd->err = B_TRUE;
- return (0);
- }
-
- if (sdd->replicate && sdd->fromsnap) {
- /*
- * If this fs does not have fromsnap, and we're doing
- * recursive, we need to send a full stream from the
- * beginning (or an incremental from the origin if this
- * is a clone). If we're doing non-recursive, then let
- * them get the error.
- */
- (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
- zhp->zfs_name, sdd->fromsnap);
- if (ioctl(zhp->zfs_hdl->libzfs_fd,
- ZFS_IOC_OBJSET_STATS, &zc) != 0) {
- missingfrom = B_TRUE;
- }
- }
-
- sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0;
- sdd->prevsnap_obj = 0;
- if (sdd->fromsnap == NULL || missingfrom)
- sdd->seenfrom = B_TRUE;
-
- if (!sdd->replicate && sdd->fromsnap != NULL)
- min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name,
- sdd->fromsnap);
- if (!sdd->replicate && sdd->tosnap != NULL)
- max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name,
- sdd->tosnap);
-
- rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg,
- min_txg, max_txg);
- if (!sdd->seenfrom) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "WARNING: could not send %s@%s:\n"
- "incremental source (%s@%s) does not exist\n"),
- zhp->zfs_name, sdd->tosnap,
- zhp->zfs_name, sdd->fromsnap);
- sdd->err = B_TRUE;
- } else if (!sdd->seento) {
- if (sdd->fromsnap) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "WARNING: could not send %s@%s:\n"
- "incremental source (%s@%s) "
- "is not earlier than it\n"),
- zhp->zfs_name, sdd->tosnap,
- zhp->zfs_name, sdd->fromsnap);
- } else {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "WARNING: "
- "could not send %s@%s: does not exist\n"),
- zhp->zfs_name, sdd->tosnap);
- }
- sdd->err = B_TRUE;
- }
-
- return (rv);
-}
-
-static int
-dump_filesystems(zfs_handle_t *rzhp, void *arg)
-{
- send_dump_data_t *sdd = arg;
- nvpair_t *fspair;
- boolean_t needagain, progress;
-
- if (!sdd->replicate)
- return (dump_filesystem(rzhp, sdd));
-
- /* Mark the clone origin snapshots. */
- for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
- fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
- nvlist_t *nvfs;
- uint64_t origin_guid = 0;
-
- VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs));
- (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid);
- if (origin_guid != 0) {
- char *snapname;
- nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
- origin_guid, &snapname);
- if (origin_nv != NULL) {
- nvlist_t *snapprops;
- VERIFY(0 == nvlist_lookup_nvlist(origin_nv,
- "snapprops", &snapprops));
- VERIFY(0 == nvlist_lookup_nvlist(snapprops,
- snapname, &snapprops));
- VERIFY(0 == nvlist_add_boolean(
- snapprops, "is_clone_origin"));
- }
- }
- }
-again:
- needagain = progress = B_FALSE;
- for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
- fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
- nvlist_t *fslist, *parent_nv;
- char *fsname;
- zfs_handle_t *zhp;
- int err;
- uint64_t origin_guid = 0;
- uint64_t parent_guid = 0;
-
- VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
- if (nvlist_lookup_boolean(fslist, "sent") == 0)
- continue;
-
- VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0);
- (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid);
- (void) nvlist_lookup_uint64(fslist, "parentfromsnap",
- &parent_guid);
-
- if (parent_guid != 0) {
- parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL);
- if (!nvlist_exists(parent_nv, "sent")) {
- /* parent has not been sent; skip this one */
- needagain = B_TRUE;
- continue;
- }
- }
-
- if (origin_guid != 0) {
- nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
- origin_guid, NULL);
- if (origin_nv != NULL &&
- !nvlist_exists(origin_nv, "sent")) {
- /*
- * origin has not been sent yet;
- * skip this clone.
- */
- needagain = B_TRUE;
- continue;
- }
- }
-
- zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET);
- if (zhp == NULL)
- return (-1);
- err = dump_filesystem(zhp, sdd);
- VERIFY(nvlist_add_boolean(fslist, "sent") == 0);
- progress = B_TRUE;
- zfs_close(zhp);
- if (err)
- return (err);
- }
- if (needagain) {
- assert(progress);
- goto again;
- }
-
- /* clean out the sent flags in case we reuse this fss */
- for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
- fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
- nvlist_t *fslist;
-
- VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
- (void) nvlist_remove_all(fslist, "sent");
- }
-
- return (0);
-}
-
-nvlist_t *
-zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token)
-{
- unsigned int version;
- int nread;
- unsigned long long checksum, packed_len;
-
- /*
- * Decode token header, which is:
- * <token version>-<checksum of payload>-<uncompressed payload length>
- * Note that the only supported token version is 1.
- */
- nread = sscanf(token, "%u-%llx-%llx-",
- &version, &checksum, &packed_len);
- if (nread != 3) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "resume token is corrupt (invalid format)"));
- return (NULL);
- }
-
- if (version != ZFS_SEND_RESUME_TOKEN_VERSION) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "resume token is corrupt (invalid version %u)"),
- version);
- return (NULL);
- }
-
- /* convert hexadecimal representation to binary */
- token = strrchr(token, '-') + 1;
- int len = strlen(token) / 2;
- unsigned char *compressed = zfs_alloc(hdl, len);
- for (int i = 0; i < len; i++) {
- nread = sscanf(token + i * 2, "%2hhx", compressed + i);
- if (nread != 1) {
- free(compressed);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "resume token is corrupt "
- "(payload is not hex-encoded)"));
- return (NULL);
- }
- }
-
- /* verify checksum */
- zio_cksum_t cksum;
- fletcher_4_native(compressed, len, NULL, &cksum);
- if (cksum.zc_word[0] != checksum) {
- free(compressed);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "resume token is corrupt (incorrect checksum)"));
- return (NULL);
- }
-
- /* uncompress */
- void *packed = zfs_alloc(hdl, packed_len);
- uLongf packed_len_long = packed_len;
- if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK ||
- packed_len_long != packed_len) {
- free(packed);
- free(compressed);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "resume token is corrupt (decompression failed)"));
- return (NULL);
- }
-
- /* unpack nvlist */
- nvlist_t *nv;
- int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP);
- free(packed);
- free(compressed);
- if (error != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "resume token is corrupt (nvlist_unpack failed)"));
- return (NULL);
- }
- return (nv);
-}
-
-int
-zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
- const char *resume_token)
-{
- char errbuf[1024];
- char *toname;
- char *fromname = NULL;
- uint64_t resumeobj, resumeoff, toguid, fromguid, bytes;
- zfs_handle_t *zhp;
- int error = 0;
- char name[ZFS_MAX_DATASET_NAME_LEN];
- enum lzc_send_flags lzc_flags = 0;
- uint64_t size = 0;
- FILE *fout = (flags->verbose && flags->dryrun) ? stdout : stderr;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot resume send"));
-
- nvlist_t *resume_nvl =
- zfs_send_resume_token_to_nvlist(hdl, resume_token);
- if (resume_nvl == NULL) {
- /*
- * zfs_error_aux has already been set by
- * zfs_send_resume_token_to_nvlist
- */
- return (zfs_error(hdl, EZFS_FAULT, errbuf));
- }
- if (flags->verbose) {
- (void) fprintf(fout, dgettext(TEXT_DOMAIN,
- "resume token contents:\n"));
- nvlist_print(fout, resume_nvl);
- }
-
- if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 ||
- nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 ||
- nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 ||
- nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 ||
- nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "resume token is corrupt"));
- return (zfs_error(hdl, EZFS_FAULT, errbuf));
- }
- fromguid = 0;
- (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
-
- if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok"))
- lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
- if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
- lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
- if (flags->compress || nvlist_exists(resume_nvl, "compressok"))
- lzc_flags |= LZC_SEND_FLAG_COMPRESS;
-
- if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
- if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' is no longer the same snapshot used in "
- "the initial send"), toname);
- } else {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' used in the initial send no longer exists"),
- toname);
- }
- return (zfs_error(hdl, EZFS_BADPATH, errbuf));
- }
- zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
- if (zhp == NULL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "unable to access '%s'"), name);
- return (zfs_error(hdl, EZFS_BADPATH, errbuf));
- }
-
- if (fromguid != 0) {
- if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "incremental source %#llx no longer exists"),
- (longlong_t)fromguid);
- return (zfs_error(hdl, EZFS_BADPATH, errbuf));
- }
- fromname = name;
- }
-
- if (flags->progress || flags->verbose) {
- error = lzc_send_space(zhp->zfs_name, fromname,
- lzc_flags, &size);
- if (error == 0)
- size = MAX(0, (int64_t)(size - bytes));
- }
- if (flags->verbose) {
- send_print_verbose(fout, zhp->zfs_name, fromname,
- size, flags->parsable);
- }
-
- if (!flags->dryrun) {
- progress_arg_t pa = { 0 };
- pthread_t tid;
- /*
- * If progress reporting is requested, spawn a new thread to
- * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
- */
- if (flags->progress) {
- pa.pa_zhp = zhp;
- pa.pa_fd = outfd;
- pa.pa_parsable = flags->parsable;
- pa.pa_size = size;
- pa.pa_astitle = flags->progressastitle;
-
- error = pthread_create(&tid, NULL,
- send_progress_thread, &pa);
- if (error != 0) {
- zfs_close(zhp);
- return (error);
- }
- }
-
- error = lzc_send_resume(zhp->zfs_name, fromname, outfd,
- lzc_flags, resumeobj, resumeoff);
-
- if (flags->progress) {
- (void) pthread_cancel(tid);
- (void) pthread_join(tid, NULL);
- }
-
- char errbuf[1024];
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "warning: cannot send '%s'"), zhp->zfs_name);
-
- zfs_close(zhp);
-
- switch (error) {
- case 0:
- return (0);
- case EXDEV:
- case ENOENT:
- case EDQUOT:
- case EFBIG:
- case EIO:
- case ENOLINK:
- case ENOSPC:
-#ifdef illumos
- case ENOSTR:
-#endif
- case ENXIO:
- case EPIPE:
- case ERANGE:
- case EFAULT:
- case EROFS:
- zfs_error_aux(hdl, strerror(errno));
- return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
-
- default:
- return (zfs_standard_error(hdl, errno, errbuf));
- }
- }
-
-
- zfs_close(zhp);
-
- return (error);
-}
-
-/*
- * Generate a send stream for the dataset identified by the argument zhp.
- *
- * The content of the send stream is the snapshot identified by
- * 'tosnap'. Incremental streams are requested in two ways:
- * - from the snapshot identified by "fromsnap" (if non-null) or
- * - from the origin of the dataset identified by zhp, which must
- * be a clone. In this case, "fromsnap" is null and "fromorigin"
- * is TRUE.
- *
- * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and
- * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM)
- * if "replicate" is set. If "doall" is set, dump all the intermediate
- * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall"
- * case too. If "props" is set, send properties.
- */
-int
-zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
- sendflags_t *flags, int outfd, snapfilter_cb_t filter_func,
- void *cb_arg, nvlist_t **debugnvp)
-{
- char errbuf[1024];
- send_dump_data_t sdd = { 0 };
- int err = 0;
- nvlist_t *fss = NULL;
- avl_tree_t *fsavl = NULL;
- static uint64_t holdseq;
- int spa_version;
- pthread_t tid = 0;
- int pipefd[2];
- dedup_arg_t dda = { 0 };
- int featureflags = 0;
- FILE *fout;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot send '%s'"), zhp->zfs_name);
-
- if (fromsnap && fromsnap[0] == '\0') {
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "zero-length incremental source"));
- return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
- }
-
- if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
- uint64_t version;
- version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
- if (version >= ZPL_VERSION_SA) {
- featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
- }
- }
-
- if (flags->dedup && !flags->dryrun) {
- featureflags |= (DMU_BACKUP_FEATURE_DEDUP |
- DMU_BACKUP_FEATURE_DEDUPPROPS);
- if ((err = pipe(pipefd)) != 0) {
- zfs_error_aux(zhp->zfs_hdl, strerror(errno));
- return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED,
- errbuf));
- }
- dda.outputfd = outfd;
- dda.inputfd = pipefd[1];
- dda.dedup_hdl = zhp->zfs_hdl;
- if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) {
- (void) close(pipefd[0]);
- (void) close(pipefd[1]);
- zfs_error_aux(zhp->zfs_hdl, strerror(errno));
- return (zfs_error(zhp->zfs_hdl,
- EZFS_THREADCREATEFAILED, errbuf));
- }
- }
-
- if (flags->replicate || flags->doall || flags->props) {
- dmu_replay_record_t drr = { 0 };
- char *packbuf = NULL;
- size_t buflen = 0;
- zio_cksum_t zc = { 0 };
-
- if (flags->replicate || flags->props) {
- nvlist_t *hdrnv;
-
- VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0));
- if (fromsnap) {
- VERIFY(0 == nvlist_add_string(hdrnv,
- "fromsnap", fromsnap));
- }
- VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap));
- if (!flags->replicate) {
- VERIFY(0 == nvlist_add_boolean(hdrnv,
- "not_recursive"));
- }
-
- err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name,
- fromsnap, tosnap, flags->replicate, flags->verbose,
- flags->replicate, &fss, &fsavl);
- if (err)
- goto err_out;
- VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
- err = nvlist_pack(hdrnv, &packbuf, &buflen,
- NV_ENCODE_XDR, 0);
- if (debugnvp)
- *debugnvp = hdrnv;
- else
- nvlist_free(hdrnv);
- if (err)
- goto stderr_out;
- }
-
- if (!flags->dryrun) {
- /* write first begin record */
- drr.drr_type = DRR_BEGIN;
- drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.
- drr_versioninfo, DMU_COMPOUNDSTREAM);
- DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.
- drr_versioninfo, featureflags);
- (void) snprintf(drr.drr_u.drr_begin.drr_toname,
- sizeof (drr.drr_u.drr_begin.drr_toname),
- "%s@%s", zhp->zfs_name, tosnap);
- drr.drr_payloadlen = buflen;
-
- err = dump_record(&drr, packbuf, buflen, &zc, outfd);
- free(packbuf);
- if (err != 0)
- goto stderr_out;
-
- /* write end record */
- bzero(&drr, sizeof (drr));
- drr.drr_type = DRR_END;
- drr.drr_u.drr_end.drr_checksum = zc;
- err = write(outfd, &drr, sizeof (drr));
- if (err == -1) {
- err = errno;
- goto stderr_out;
- }
-
- err = 0;
- }
- }
-
- /* dump each stream */
- sdd.fromsnap = fromsnap;
- sdd.tosnap = tosnap;
- if (tid != 0)
- sdd.outfd = pipefd[0];
- else
- sdd.outfd = outfd;
- sdd.replicate = flags->replicate;
- sdd.doall = flags->doall;
- sdd.fromorigin = flags->fromorigin;
- sdd.fss = fss;
- sdd.fsavl = fsavl;
- sdd.verbose = flags->verbose;
- sdd.parsable = flags->parsable;
- sdd.progress = flags->progress;
- sdd.progressastitle = flags->progressastitle;
- sdd.dryrun = flags->dryrun;
- sdd.large_block = flags->largeblock;
- sdd.embed_data = flags->embed_data;
- sdd.compress = flags->compress;
- sdd.filter_cb = filter_func;
- sdd.filter_cb_arg = cb_arg;
- if (debugnvp)
- sdd.debugnv = *debugnvp;
- if (sdd.verbose && sdd.dryrun)
- sdd.std_out = B_TRUE;
- fout = sdd.std_out ? stdout : stderr;
-
- /*
- * Some flags require that we place user holds on the datasets that are
- * being sent so they don't get destroyed during the send. We can skip
- * this step if the pool is imported read-only since the datasets cannot
- * be destroyed.
- */
- if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp),
- ZPOOL_PROP_READONLY, NULL) &&
- zfs_spa_version(zhp, &spa_version) == 0 &&
- spa_version >= SPA_VERSION_USERREFS &&
- (flags->doall || flags->replicate)) {
- ++holdseq;
- (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
- ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
- sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
- if (sdd.cleanup_fd < 0) {
- err = errno;
- goto stderr_out;
- }
- sdd.snapholds = fnvlist_alloc();
- } else {
- sdd.cleanup_fd = -1;
- sdd.snapholds = NULL;
- }
- if (flags->progress || flags->verbose || sdd.snapholds != NULL) {
- /*
- * Do a verbose no-op dry run to get all the verbose output
- * or to gather snapshot hold's before generating any data,
- * then do a non-verbose real run to generate the streams.
- */
- sdd.dryrun = B_TRUE;
- err = dump_filesystems(zhp, &sdd);
-
- if (err != 0)
- goto stderr_out;
-
- if (flags->verbose) {
- if (flags->parsable) {
- (void) fprintf(fout, "size\t%llu\n",
- (longlong_t)sdd.size);
- } else {
- char buf[16];
- zfs_nicenum(sdd.size, buf, sizeof (buf));
- (void) fprintf(fout, dgettext(TEXT_DOMAIN,
- "total estimated size is %s\n"), buf);
- }
- }
-
- /* Ensure no snaps found is treated as an error. */
- if (!sdd.seento) {
- err = ENOENT;
- goto err_out;
- }
-
- /* Skip the second run if dryrun was requested. */
- if (flags->dryrun)
- goto err_out;
-
- if (sdd.snapholds != NULL) {
- err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds);
- if (err != 0)
- goto stderr_out;
-
- fnvlist_free(sdd.snapholds);
- sdd.snapholds = NULL;
- }
-
- sdd.dryrun = B_FALSE;
- sdd.verbose = B_FALSE;
- }
-
- err = dump_filesystems(zhp, &sdd);
- fsavl_destroy(fsavl);
- nvlist_free(fss);
-
- /* Ensure no snaps found is treated as an error. */
- if (err == 0 && !sdd.seento)
- err = ENOENT;
-
- if (tid != 0) {
- if (err != 0)
- (void) pthread_cancel(tid);
- (void) close(pipefd[0]);
- (void) pthread_join(tid, NULL);
- }
-
- if (sdd.cleanup_fd != -1) {
- VERIFY(0 == close(sdd.cleanup_fd));
- sdd.cleanup_fd = -1;
- }
-
- if (!flags->dryrun && (flags->replicate || flags->doall ||
- flags->props)) {
- /*
- * write final end record. NB: want to do this even if
- * there was some error, because it might not be totally
- * failed.
- */
- dmu_replay_record_t drr = { 0 };
- drr.drr_type = DRR_END;
- if (write(outfd, &drr, sizeof (drr)) == -1) {
- return (zfs_standard_error(zhp->zfs_hdl,
- errno, errbuf));
- }
- }
-
- return (err || sdd.err);
-
-stderr_out:
- err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
-err_out:
- fsavl_destroy(fsavl);
- nvlist_free(fss);
- fnvlist_free(sdd.snapholds);
-
- if (sdd.cleanup_fd != -1)
- VERIFY(0 == close(sdd.cleanup_fd));
- if (tid != 0) {
- (void) pthread_cancel(tid);
- (void) close(pipefd[0]);
- (void) pthread_join(tid, NULL);
- }
- return (err);
-}
-
-int
-zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags)
-{
- int err = 0;
- libzfs_handle_t *hdl = zhp->zfs_hdl;
- enum lzc_send_flags lzc_flags = 0;
- FILE *fout = (flags.verbose && flags.dryrun) ? stdout : stderr;
- char errbuf[1024];
-
- if (flags.largeblock)
- lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
- if (flags.embed_data)
- lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
- if (flags.compress)
- lzc_flags |= LZC_SEND_FLAG_COMPRESS;
-
- if (flags.verbose) {
- uint64_t size = 0;
- err = lzc_send_space(zhp->zfs_name, from, lzc_flags, &size);
- if (err == 0) {
- send_print_verbose(fout, zhp->zfs_name, from, size,
- flags.parsable);
- if (flags.parsable) {
- (void) fprintf(fout, "size\t%llu\n",
- (longlong_t)size);
- } else {
- char buf[16];
- zfs_nicenum(size, buf, sizeof (buf));
- (void) fprintf(fout, dgettext(TEXT_DOMAIN,
- "total estimated size is %s\n"), buf);
- }
- } else {
- (void) fprintf(stderr, "Cannot estimate send size: "
- "%s\n", strerror(errno));
- }
- }
-
- if (flags.dryrun)
- return (err);
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "warning: cannot send '%s'"), zhp->zfs_name);
-
- err = lzc_send(zhp->zfs_name, from, fd, lzc_flags);
- if (err != 0) {
- switch (errno) {
- case EXDEV:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "not an earlier snapshot from the same fs"));
- return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
-
- case ENOENT:
- case ESRCH:
- if (lzc_exists(zhp->zfs_name)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "incremental source (%s) does not exist"),
- from);
- }
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
-
- case EBUSY:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "target is busy; if a filesystem, "
- "it must not be mounted"));
- return (zfs_error(hdl, EZFS_BUSY, errbuf));
-
- case EDQUOT:
- case EFBIG:
- case EIO:
- case ENOLINK:
- case ENOSPC:
-#ifdef illumos
- case ENOSTR:
-#endif
- case ENXIO:
- case EPIPE:
- case ERANGE:
- case EFAULT:
- case EROFS:
- zfs_error_aux(hdl, strerror(errno));
- return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
-
- default:
- return (zfs_standard_error(hdl, errno, errbuf));
- }
- }
- return (err != 0);
-}
-
-/*
- * Routines specific to "zfs recv"
- */
-
-static int
-recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
- boolean_t byteswap, zio_cksum_t *zc)
-{
- char *cp = buf;
- int rv;
- int len = ilen;
-
- assert(ilen <= SPA_MAXBLOCKSIZE);
-
- do {
- rv = read(fd, cp, len);
- cp += rv;
- len -= rv;
- } while (rv > 0);
-
- if (rv < 0 || len != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "failed to read from stream"));
- return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN,
- "cannot receive")));
- }
-
- if (zc) {
- if (byteswap)
- (void) fletcher_4_incremental_byteswap(buf, ilen, zc);
- else
- (void) fletcher_4_incremental_native(buf, ilen, zc);
- }
- return (0);
-}
-
-static int
-recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp,
- boolean_t byteswap, zio_cksum_t *zc)
-{
- char *buf;
- int err;
-
- buf = zfs_alloc(hdl, len);
- if (buf == NULL)
- return (ENOMEM);
-
- err = recv_read(hdl, fd, buf, len, byteswap, zc);
- if (err != 0) {
- free(buf);
- return (err);
- }
-
- err = nvlist_unpack(buf, len, nvp, 0);
- free(buf);
- if (err != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
- "stream (malformed nvlist)"));
- return (EINVAL);
- }
- return (0);
-}
-
-static int
-recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
- int baselen, char *newname, recvflags_t *flags)
-{
- static int seq;
- int err;
- prop_changelist_t *clp;
- zfs_handle_t *zhp;
-
- zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
- if (zhp == NULL)
- return (-1);
- clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
- flags->force ? MS_FORCE : 0);
- zfs_close(zhp);
- if (clp == NULL)
- return (-1);
- err = changelist_prefix(clp);
- if (err)
- return (err);
-
- if (tryname) {
- (void) strcpy(newname, tryname);
- if (flags->verbose) {
- (void) printf("attempting rename %s to %s\n",
- name, newname);
- }
- err = lzc_rename(name, newname);
- if (err == 0)
- changelist_rename(clp, name, tryname);
- } else {
- err = ENOENT;
- }
-
- if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) {
- seq++;
-
- (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN,
- "%.*srecv-%u-%u", baselen, name, getpid(), seq);
- if (flags->verbose) {
- (void) printf("failed - trying rename %s to %s\n",
- name, newname);
- }
- err = lzc_rename(name, newname);
- if (err == 0)
- changelist_rename(clp, name, newname);
- if (err && flags->verbose) {
- (void) printf("failed (%u) - "
- "will try again on next pass\n", errno);
- }
- err = EAGAIN;
- } else if (flags->verbose) {
- if (err == 0)
- (void) printf("success\n");
- else
- (void) printf("failed (%u)\n", errno);
- }
-
- (void) changelist_postfix(clp);
- changelist_free(clp);
-
- return (err);
-}
-
-static int
-recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
- char *newname, recvflags_t *flags)
-{
- int err = 0;
- prop_changelist_t *clp;
- zfs_handle_t *zhp;
- boolean_t defer = B_FALSE;
- int spa_version;
-
- zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
- if (zhp == NULL)
- return (-1);
- clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
- flags->force ? MS_FORCE : 0);
- if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
- zfs_spa_version(zhp, &spa_version) == 0 &&
- spa_version >= SPA_VERSION_USERREFS)
- defer = B_TRUE;
- zfs_close(zhp);
- if (clp == NULL)
- return (-1);
- err = changelist_prefix(clp);
- if (err)
- return (err);
-
- if (flags->verbose)
- (void) printf("attempting destroy %s\n", name);
- if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
- nvlist_t *nv = fnvlist_alloc();
- fnvlist_add_boolean(nv, name);
- err = lzc_destroy_snaps(nv, defer, NULL);
- fnvlist_free(nv);
- } else {
- err = lzc_destroy(name);
- }
- if (err == 0) {
- if (flags->verbose)
- (void) printf("success\n");
- changelist_remove(clp, name);
- }
-
- (void) changelist_postfix(clp);
- changelist_free(clp);
-
- /*
- * Deferred destroy might destroy the snapshot or only mark it to be
- * destroyed later, and it returns success in either case.
- */
- if (err != 0 || (defer && zfs_dataset_exists(hdl, name,
- ZFS_TYPE_SNAPSHOT))) {
- err = recv_rename(hdl, name, NULL, baselen, newname, flags);
- }
-
- return (err);
-}
-
-typedef struct guid_to_name_data {
- uint64_t guid;
- boolean_t bookmark_ok;
- char *name;
- char *skip;
-} guid_to_name_data_t;
-
-static int
-guid_to_name_cb(zfs_handle_t *zhp, void *arg)
-{
- guid_to_name_data_t *gtnd = arg;
- const char *slash;
- int err;
-
- if (gtnd->skip != NULL &&
- (slash = strrchr(zhp->zfs_name, '/')) != NULL &&
- strcmp(slash + 1, gtnd->skip) == 0) {
- zfs_close(zhp);
- return (0);
- }
-
- if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) {
- (void) strcpy(gtnd->name, zhp->zfs_name);
- zfs_close(zhp);
- return (EEXIST);
- }
-
- err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
- if (err != EEXIST && gtnd->bookmark_ok)
- err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd);
- zfs_close(zhp);
- return (err);
-}
-
-/*
- * Attempt to find the local dataset associated with this guid. In the case of
- * multiple matches, we attempt to find the "best" match by searching
- * progressively larger portions of the hierarchy. This allows one to send a
- * tree of datasets individually and guarantee that we will find the source
- * guid within that hierarchy, even if there are multiple matches elsewhere.
- */
-static int
-guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
- boolean_t bookmark_ok, char *name)
-{
- char pname[ZFS_MAX_DATASET_NAME_LEN];
- guid_to_name_data_t gtnd;
-
- gtnd.guid = guid;
- gtnd.bookmark_ok = bookmark_ok;
- gtnd.name = name;
- gtnd.skip = NULL;
-
- /*
- * Search progressively larger portions of the hierarchy, starting
- * with the filesystem specified by 'parent'. This will
- * select the "most local" version of the origin snapshot in the case
- * that there are multiple matching snapshots in the system.
- */
- (void) strlcpy(pname, parent, sizeof (pname));
- char *cp = strrchr(pname, '@');
- if (cp == NULL)
- cp = strchr(pname, '\0');
- for (; cp != NULL; cp = strrchr(pname, '/')) {
- /* Chop off the last component and open the parent */
- *cp = '\0';
- zfs_handle_t *zhp = make_dataset_handle(hdl, pname);
-
- if (zhp == NULL)
- continue;
- int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd);
- if (err != EEXIST)
- err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
- if (err != EEXIST && bookmark_ok)
- err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd);
- zfs_close(zhp);
- if (err == EEXIST)
- return (0);
-
- /*
- * Remember the last portion of the dataset so we skip it next
- * time through (as we've already searched that portion of the
- * hierarchy).
- */
- gtnd.skip = strrchr(pname, '/') + 1;
- }
-
- return (ENOENT);
-}
-
-/*
- * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if
- * guid1 is after guid2.
- */
-static int
-created_before(libzfs_handle_t *hdl, avl_tree_t *avl,
- uint64_t guid1, uint64_t guid2)
-{
- nvlist_t *nvfs;
- char *fsname, *snapname;
- char buf[ZFS_MAX_DATASET_NAME_LEN];
- int rv;
- zfs_handle_t *guid1hdl, *guid2hdl;
- uint64_t create1, create2;
-
- if (guid2 == 0)
- return (0);
- if (guid1 == 0)
- return (1);
-
- nvfs = fsavl_find(avl, guid1, &snapname);
- VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
- (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
- guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
- if (guid1hdl == NULL)
- return (-1);
-
- nvfs = fsavl_find(avl, guid2, &snapname);
- VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
- (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
- guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
- if (guid2hdl == NULL) {
- zfs_close(guid1hdl);
- return (-1);
- }
-
- create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG);
- create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG);
-
- if (create1 < create2)
- rv = -1;
- else if (create1 > create2)
- rv = +1;
- else
- rv = 0;
-
- zfs_close(guid1hdl);
- zfs_close(guid2hdl);
-
- return (rv);
-}
-
-static int
-recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs,
- recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl,
- nvlist_t *renamed)
-{
- nvlist_t *local_nv, *deleted = NULL;
- avl_tree_t *local_avl;
- nvpair_t *fselem, *nextfselem;
- char *fromsnap;
- char newname[ZFS_MAX_DATASET_NAME_LEN];
- char guidname[32];
- int error;
- boolean_t needagain, progress, recursive;
- char *s1, *s2;
-
- VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap));
-
- recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
- ENOENT);
-
- if (flags->dryrun)
- return (0);
-
-again:
- needagain = progress = B_FALSE;
-
- VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0));
-
- if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL,
- recursive, B_FALSE, B_FALSE, &local_nv, &local_avl)) != 0)
- return (error);
-
- /*
- * Process deletes and renames
- */
- for (fselem = nvlist_next_nvpair(local_nv, NULL);
- fselem; fselem = nextfselem) {
- nvlist_t *nvfs, *snaps;
- nvlist_t *stream_nvfs = NULL;
- nvpair_t *snapelem, *nextsnapelem;
- uint64_t fromguid = 0;
- uint64_t originguid = 0;
- uint64_t stream_originguid = 0;
- uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid;
- char *fsname, *stream_fsname;
-
- nextfselem = nvlist_next_nvpair(local_nv, fselem);
-
- VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs));
- VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps));
- VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
- VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap",
- &parent_fromsnap_guid));
- (void) nvlist_lookup_uint64(nvfs, "origin", &originguid);
-
- /*
- * First find the stream's fs, so we can check for
- * a different origin (due to "zfs promote")
- */
- for (snapelem = nvlist_next_nvpair(snaps, NULL);
- snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) {
- uint64_t thisguid;
-
- VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid));
- stream_nvfs = fsavl_find(stream_avl, thisguid, NULL);
-
- if (stream_nvfs != NULL)
- break;
- }
-
- /* check for promote */
- (void) nvlist_lookup_uint64(stream_nvfs, "origin",
- &stream_originguid);
- if (stream_nvfs && originguid != stream_originguid) {
- switch (created_before(hdl, local_avl,
- stream_originguid, originguid)) {
- case 1: {
- /* promote it! */
- zfs_cmd_t zc = { 0 };
- nvlist_t *origin_nvfs;
- char *origin_fsname;
-
- if (flags->verbose)
- (void) printf("promoting %s\n", fsname);
-
- origin_nvfs = fsavl_find(local_avl, originguid,
- NULL);
- VERIFY(0 == nvlist_lookup_string(origin_nvfs,
- "name", &origin_fsname));
- (void) strlcpy(zc.zc_value, origin_fsname,
- sizeof (zc.zc_value));
- (void) strlcpy(zc.zc_name, fsname,
- sizeof (zc.zc_name));
- error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc);
- if (error == 0)
- progress = B_TRUE;
- break;
- }
- default:
- break;
- case -1:
- fsavl_destroy(local_avl);
- nvlist_free(local_nv);
- return (-1);
- }
- /*
- * We had/have the wrong origin, therefore our
- * list of snapshots is wrong. Need to handle
- * them on the next pass.
- */
- needagain = B_TRUE;
- continue;
- }
-
- for (snapelem = nvlist_next_nvpair(snaps, NULL);
- snapelem; snapelem = nextsnapelem) {
- uint64_t thisguid;
- char *stream_snapname;
- nvlist_t *found, *props;
-
- nextsnapelem = nvlist_next_nvpair(snaps, snapelem);
-
- VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid));
- found = fsavl_find(stream_avl, thisguid,
- &stream_snapname);
-
- /* check for delete */
- if (found == NULL) {
- char name[ZFS_MAX_DATASET_NAME_LEN];
-
- if (!flags->force)
- continue;
-
- (void) snprintf(name, sizeof (name), "%s@%s",
- fsname, nvpair_name(snapelem));
-
- error = recv_destroy(hdl, name,
- strlen(fsname)+1, newname, flags);
- if (error)
- needagain = B_TRUE;
- else
- progress = B_TRUE;
- sprintf(guidname, "%" PRIu64, thisguid);
- nvlist_add_boolean(deleted, guidname);
- continue;
- }
-
- stream_nvfs = found;
-
- if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops",
- &props) && 0 == nvlist_lookup_nvlist(props,
- stream_snapname, &props)) {
- zfs_cmd_t zc = { 0 };
-
- zc.zc_cookie = B_TRUE; /* received */
- (void) snprintf(zc.zc_name, sizeof (zc.zc_name),
- "%s@%s", fsname, nvpair_name(snapelem));
- if (zcmd_write_src_nvlist(hdl, &zc,
- props) == 0) {
- (void) zfs_ioctl(hdl,
- ZFS_IOC_SET_PROP, &zc);
- zcmd_free_nvlists(&zc);
- }
- }
-
- /* check for different snapname */
- if (strcmp(nvpair_name(snapelem),
- stream_snapname) != 0) {
- char name[ZFS_MAX_DATASET_NAME_LEN];
- char tryname[ZFS_MAX_DATASET_NAME_LEN];
-
- (void) snprintf(name, sizeof (name), "%s@%s",
- fsname, nvpair_name(snapelem));
- (void) snprintf(tryname, sizeof (name), "%s@%s",
- fsname, stream_snapname);
-
- error = recv_rename(hdl, name, tryname,
- strlen(fsname)+1, newname, flags);
- if (error)
- needagain = B_TRUE;
- else
- progress = B_TRUE;
- }
-
- if (strcmp(stream_snapname, fromsnap) == 0)
- fromguid = thisguid;
- }
-
- /* check for delete */
- if (stream_nvfs == NULL) {
- if (!flags->force)
- continue;
-
- error = recv_destroy(hdl, fsname, strlen(tofs)+1,
- newname, flags);
- if (error)
- needagain = B_TRUE;
- else
- progress = B_TRUE;
- sprintf(guidname, "%" PRIu64, parent_fromsnap_guid);
- nvlist_add_boolean(deleted, guidname);
- continue;
- }
-
- if (fromguid == 0) {
- if (flags->verbose) {
- (void) printf("local fs %s does not have "
- "fromsnap (%s in stream); must have "
- "been deleted locally; ignoring\n",
- fsname, fromsnap);
- }
- continue;
- }
-
- VERIFY(0 == nvlist_lookup_string(stream_nvfs,
- "name", &stream_fsname));
- VERIFY(0 == nvlist_lookup_uint64(stream_nvfs,
- "parentfromsnap", &stream_parent_fromsnap_guid));
-
- s1 = strrchr(fsname, '/');
- s2 = strrchr(stream_fsname, '/');
-
- /*
- * Check if we're going to rename based on parent guid change
- * and the current parent guid was also deleted. If it was then
- * rename will fail and is likely unneeded, so avoid this and
- * force an early retry to determine the new
- * parent_fromsnap_guid.
- */
- if (stream_parent_fromsnap_guid != 0 &&
- parent_fromsnap_guid != 0 &&
- stream_parent_fromsnap_guid != parent_fromsnap_guid) {
- sprintf(guidname, "%" PRIu64, parent_fromsnap_guid);
- if (nvlist_exists(deleted, guidname)) {
- progress = B_TRUE;
- needagain = B_TRUE;
- goto doagain;
- }
- }
-
- /*
- * Check for rename. If the exact receive path is specified, it
- * does not count as a rename, but we still need to check the
- * datasets beneath it.
- */
- if ((stream_parent_fromsnap_guid != 0 &&
- parent_fromsnap_guid != 0 &&
- stream_parent_fromsnap_guid != parent_fromsnap_guid) ||
- ((flags->isprefix || strcmp(tofs, fsname) != 0) &&
- (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
- nvlist_t *parent;
- char tryname[ZFS_MAX_DATASET_NAME_LEN];
-
- parent = fsavl_find(local_avl,
- stream_parent_fromsnap_guid, NULL);
- /*
- * NB: parent might not be found if we used the
- * tosnap for stream_parent_fromsnap_guid,
- * because the parent is a newly-created fs;
- * we'll be able to rename it after we recv the
- * new fs.
- */
- if (parent != NULL) {
- char *pname;
-
- VERIFY(0 == nvlist_lookup_string(parent, "name",
- &pname));
- (void) snprintf(tryname, sizeof (tryname),
- "%s%s", pname, strrchr(stream_fsname, '/'));
- } else {
- tryname[0] = '\0';
- if (flags->verbose) {
- (void) printf("local fs %s new parent "
- "not found\n", fsname);
- }
- }
-
- newname[0] = '\0';
-
- error = recv_rename(hdl, fsname, tryname,
- strlen(tofs)+1, newname, flags);
-
- if (renamed != NULL && newname[0] != '\0') {
- VERIFY(0 == nvlist_add_boolean(renamed,
- newname));
- }
-
- if (error)
- needagain = B_TRUE;
- else
- progress = B_TRUE;
- }
- }
-
-doagain:
- fsavl_destroy(local_avl);
- nvlist_free(local_nv);
- nvlist_free(deleted);
-
- if (needagain && progress) {
- /* do another pass to fix up temporary names */
- if (flags->verbose)
- (void) printf("another pass:\n");
- goto again;
- }
-
- return (needagain);
-}
-
-static int
-zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
- recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc,
- char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
-{
- nvlist_t *stream_nv = NULL;
- avl_tree_t *stream_avl = NULL;
- char *fromsnap = NULL;
- char *sendsnap = NULL;
- char *cp;
- char tofs[ZFS_MAX_DATASET_NAME_LEN];
- char sendfs[ZFS_MAX_DATASET_NAME_LEN];
- char errbuf[1024];
- dmu_replay_record_t drre;
- int error;
- boolean_t anyerr = B_FALSE;
- boolean_t softerr = B_FALSE;
- boolean_t recursive;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot receive"));
-
- assert(drr->drr_type == DRR_BEGIN);
- assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC);
- assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) ==
- DMU_COMPOUNDSTREAM);
-
- /*
- * Read in the nvlist from the stream.
- */
- if (drr->drr_payloadlen != 0) {
- error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen,
- &stream_nv, flags->byteswap, zc);
- if (error) {
- error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
- goto out;
- }
- }
-
- recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
- ENOENT);
-
- if (recursive && strchr(destname, '@')) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "cannot specify snapshot name for multi-snapshot stream"));
- error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
- goto out;
- }
-
- /*
- * Read in the end record and verify checksum.
- */
- if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre),
- flags->byteswap, NULL)))
- goto out;
- if (flags->byteswap) {
- drre.drr_type = BSWAP_32(drre.drr_type);
- drre.drr_u.drr_end.drr_checksum.zc_word[0] =
- BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]);
- drre.drr_u.drr_end.drr_checksum.zc_word[1] =
- BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]);
- drre.drr_u.drr_end.drr_checksum.zc_word[2] =
- BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]);
- drre.drr_u.drr_end.drr_checksum.zc_word[3] =
- BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]);
- }
- if (drre.drr_type != DRR_END) {
- error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
- goto out;
- }
- if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "incorrect header checksum"));
- error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
- goto out;
- }
-
- (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap);
-
- if (drr->drr_payloadlen != 0) {
- nvlist_t *stream_fss;
-
- VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss",
- &stream_fss));
- if ((stream_avl = fsavl_create(stream_fss)) == NULL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "couldn't allocate avl tree"));
- error = zfs_error(hdl, EZFS_NOMEM, errbuf);
- goto out;
- }
-
- if (fromsnap != NULL && recursive) {
- nvlist_t *renamed = NULL;
- nvpair_t *pair = NULL;
-
- (void) strlcpy(tofs, destname, sizeof (tofs));
- if (flags->isprefix) {
- struct drr_begin *drrb = &drr->drr_u.drr_begin;
- int i;
-
- if (flags->istail) {
- cp = strrchr(drrb->drr_toname, '/');
- if (cp == NULL) {
- (void) strlcat(tofs, "/",
- sizeof (tofs));
- i = 0;
- } else {
- i = (cp - drrb->drr_toname);
- }
- } else {
- i = strcspn(drrb->drr_toname, "/@");
- }
- /* zfs_receive_one() will create_parents() */
- (void) strlcat(tofs, &drrb->drr_toname[i],
- sizeof (tofs));
- *strchr(tofs, '@') = '\0';
- }
-
- if (!flags->dryrun && !flags->nomount) {
- VERIFY(0 == nvlist_alloc(&renamed,
- NV_UNIQUE_NAME, 0));
- }
-
- softerr = recv_incremental_replication(hdl, tofs, flags,
- stream_nv, stream_avl, renamed);
-
- /* Unmount renamed filesystems before receiving. */
- while ((pair = nvlist_next_nvpair(renamed,
- pair)) != NULL) {
- zfs_handle_t *zhp;
- prop_changelist_t *clp = NULL;
-
- zhp = zfs_open(hdl, nvpair_name(pair),
- ZFS_TYPE_FILESYSTEM);
- if (zhp != NULL) {
- clp = changelist_gather(zhp,
- ZFS_PROP_MOUNTPOINT, 0,
- flags->forceunmount ? MS_FORCE : 0);
- zfs_close(zhp);
- if (clp != NULL) {
- softerr |=
- changelist_prefix(clp);
- changelist_free(clp);
- }
- }
- }
-
- nvlist_free(renamed);
- }
- }
-
- /*
- * Get the fs specified by the first path in the stream (the top level
- * specified by 'zfs send') and pass it to each invocation of
- * zfs_receive_one().
- */
- (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname,
- sizeof (sendfs));
- if ((cp = strchr(sendfs, '@')) != NULL) {
- *cp = '\0';
- /*
- * Find the "sendsnap", the final snapshot in a replication
- * stream. zfs_receive_one() handles certain errors
- * differently, depending on if the contained stream is the
- * last one or not.
- */
- sendsnap = (cp + 1);
- }
-
- /* Finally, receive each contained stream */
- do {
- /*
- * we should figure out if it has a recoverable
- * error, in which case do a recv_skip() and drive on.
- * Note, if we fail due to already having this guid,
- * zfs_receive_one() will take care of it (ie,
- * recv_skip() and return 0).
- */
- error = zfs_receive_impl(hdl, destname, NULL, flags, fd,
- sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
- action_handlep, sendsnap);
- if (error == ENODATA) {
- error = 0;
- break;
- }
- anyerr |= error;
- } while (error == 0);
-
- if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) {
- /*
- * Now that we have the fs's they sent us, try the
- * renames again.
- */
- softerr = recv_incremental_replication(hdl, tofs, flags,
- stream_nv, stream_avl, NULL);
- }
-
-out:
- fsavl_destroy(stream_avl);
- nvlist_free(stream_nv);
- if (softerr)
- error = -2;
- if (anyerr)
- error = -1;
- return (error);
-}
-
-static void
-trunc_prop_errs(int truncated)
-{
- ASSERT(truncated != 0);
-
- if (truncated == 1)
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "1 more property could not be set\n"));
- else
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
- "%d more properties could not be set\n"), truncated);
-}
-
-static int
-recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
-{
- dmu_replay_record_t *drr;
- void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
- char errbuf[1024];
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot receive:"));
-
- /* XXX would be great to use lseek if possible... */
- drr = buf;
-
- while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t),
- byteswap, NULL) == 0) {
- if (byteswap)
- drr->drr_type = BSWAP_32(drr->drr_type);
-
- switch (drr->drr_type) {
- case DRR_BEGIN:
- if (drr->drr_payloadlen != 0) {
- (void) recv_read(hdl, fd, buf,
- drr->drr_payloadlen, B_FALSE, NULL);
- }
- break;
-
- case DRR_END:
- free(buf);
- return (0);
-
- case DRR_OBJECT:
- if (byteswap) {
- drr->drr_u.drr_object.drr_bonuslen =
- BSWAP_32(drr->drr_u.drr_object.
- drr_bonuslen);
- }
- (void) recv_read(hdl, fd, buf,
- P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8),
- B_FALSE, NULL);
- break;
-
- case DRR_WRITE:
- if (byteswap) {
- drr->drr_u.drr_write.drr_logical_size =
- BSWAP_64(
- drr->drr_u.drr_write.drr_logical_size);
- drr->drr_u.drr_write.drr_compressed_size =
- BSWAP_64(
- drr->drr_u.drr_write.drr_compressed_size);
- }
- uint64_t payload_size =
- DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write);
- (void) recv_read(hdl, fd, buf,
- payload_size, B_FALSE, NULL);
- break;
- case DRR_SPILL:
- if (byteswap) {
- drr->drr_u.drr_spill.drr_length =
- BSWAP_64(drr->drr_u.drr_spill.drr_length);
- }
- (void) recv_read(hdl, fd, buf,
- drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
- break;
- case DRR_WRITE_EMBEDDED:
- if (byteswap) {
- drr->drr_u.drr_write_embedded.drr_psize =
- BSWAP_32(drr->drr_u.drr_write_embedded.
- drr_psize);
- }
- (void) recv_read(hdl, fd, buf,
- P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize,
- 8), B_FALSE, NULL);
- break;
- case DRR_WRITE_BYREF:
- case DRR_FREEOBJECTS:
- case DRR_FREE:
- break;
-
- default:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid record type"));
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
- }
- }
-
- free(buf);
- return (-1);
-}
-
-static void
-recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap,
- boolean_t resumable)
-{
- char target_fs[ZFS_MAX_DATASET_NAME_LEN];
-
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "checksum mismatch or incomplete stream"));
-
- if (!resumable)
- return;
- (void) strlcpy(target_fs, target_snap, sizeof (target_fs));
- *strchr(target_fs, '@') = '\0';
- zfs_handle_t *zhp = zfs_open(hdl, target_fs,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (zhp == NULL)
- return;
-
- char token_buf[ZFS_MAXPROPLEN];
- int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
- token_buf, sizeof (token_buf),
- NULL, NULL, 0, B_TRUE);
- if (error == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "checksum mismatch or incomplete stream.\n"
- "Partially received snapshot is saved.\n"
- "A resuming stream can be generated on the sending "
- "system by running:\n"
- " zfs send -t %s"),
- token_buf);
- }
- zfs_close(zhp);
-}
-
-/*
- * Restores a backup of tosnap from the file descriptor specified by infd.
- */
-static int
-zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
- const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
- dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
- avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
- uint64_t *action_handlep, const char *finalsnap)
-{
- zfs_cmd_t zc = { 0 };
- time_t begin_time;
- int ioctl_err, ioctl_errno, err;
- char *cp;
- struct drr_begin *drrb = &drr->drr_u.drr_begin;
- char errbuf[1024];
- char prop_errbuf[1024];
- const char *chopprefix;
- boolean_t newfs = B_FALSE;
- boolean_t stream_wantsnewfs;
- uint64_t parent_snapguid = 0;
- prop_changelist_t *clp = NULL;
- nvlist_t *snapprops_nvlist = NULL;
- zprop_errflags_t prop_errflags;
- boolean_t recursive;
- char *snapname = NULL;
-
- begin_time = time(NULL);
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot receive"));
-
- recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
- ENOENT);
-
- if (stream_avl != NULL) {
- nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid,
- &snapname);
- nvlist_t *props;
- int ret;
-
- (void) nvlist_lookup_uint64(fs, "parentfromsnap",
- &parent_snapguid);
- err = nvlist_lookup_nvlist(fs, "props", &props);
- if (err)
- VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0));
-
- if (flags->canmountoff) {
- VERIFY(0 == nvlist_add_uint64(props,
- zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0));
- }
- ret = zcmd_write_src_nvlist(hdl, &zc, props);
- if (err)
- nvlist_free(props);
-
- if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) {
- VERIFY(0 == nvlist_lookup_nvlist(props,
- snapname, &snapprops_nvlist));
- }
-
- if (ret != 0)
- return (-1);
- }
-
- cp = NULL;
-
- /*
- * Determine how much of the snapshot name stored in the stream
- * we are going to tack on to the name they specified on the
- * command line, and how much we are going to chop off.
- *
- * If they specified a snapshot, chop the entire name stored in
- * the stream.
- */
- if (flags->istail) {
- /*
- * A filesystem was specified with -e. We want to tack on only
- * the tail of the sent snapshot path.
- */
- if (strchr(tosnap, '@')) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
- "argument - snapshot not allowed with -e"));
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
-
- chopprefix = strrchr(sendfs, '/');
-
- if (chopprefix == NULL) {
- /*
- * The tail is the poolname, so we need to
- * prepend a path separator.
- */
- int len = strlen(drrb->drr_toname);
- cp = malloc(len + 2);
- cp[0] = '/';
- (void) strcpy(&cp[1], drrb->drr_toname);
- chopprefix = cp;
- } else {
- chopprefix = drrb->drr_toname + (chopprefix - sendfs);
- }
- } else if (flags->isprefix) {
- /*
- * A filesystem was specified with -d. We want to tack on
- * everything but the first element of the sent snapshot path
- * (all but the pool name).
- */
- if (strchr(tosnap, '@')) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
- "argument - snapshot not allowed with -d"));
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
-
- chopprefix = strchr(drrb->drr_toname, '/');
- if (chopprefix == NULL)
- chopprefix = strchr(drrb->drr_toname, '@');
- } else if (strchr(tosnap, '@') == NULL) {
- /*
- * If a filesystem was specified without -d or -e, we want to
- * tack on everything after the fs specified by 'zfs send'.
- */
- chopprefix = drrb->drr_toname + strlen(sendfs);
- } else {
- /* A snapshot was specified as an exact path (no -d or -e). */
- if (recursive) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "cannot specify snapshot name for multi-snapshot "
- "stream"));
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
- }
- chopprefix = drrb->drr_toname + strlen(drrb->drr_toname);
- }
-
- ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname);
- ASSERT(chopprefix > drrb->drr_toname);
- ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname));
- ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' ||
- chopprefix[0] == '\0');
-
- /*
- * Determine name of destination snapshot, store in zc_value.
- */
- (void) strcpy(zc.zc_value, tosnap);
- (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value));
-#ifdef __FreeBSD__
- if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
- zfs_ioctl_version = get_zfs_ioctl_version();
- /*
- * For forward compatibility hide tosnap in zc_value
- */
- if (zfs_ioctl_version < ZFS_IOCVER_LZC)
- (void) strcpy(zc.zc_value + strlen(zc.zc_value) + 1, tosnap);
-#endif
- free(cp);
- if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) {
- zcmd_free_nvlists(&zc);
- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- }
-
- /*
- * Determine the name of the origin snapshot, store in zc_string.
- */
- if (originsnap) {
- (void) strncpy(zc.zc_string, originsnap, sizeof (zc.zc_string));
- if (flags->verbose)
- (void) printf("using provided clone origin %s\n",
- zc.zc_string);
- } else if (drrb->drr_flags & DRR_FLAG_CLONE) {
- if (guid_to_name(hdl, zc.zc_value,
- drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) {
- zcmd_free_nvlists(&zc);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "local origin for clone %s does not exist"),
- zc.zc_value);
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
- }
- if (flags->verbose)
- (void) printf("found clone origin %s\n", zc.zc_string);
- }
-
- boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_RESUMING;
- stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
- (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming;
-
- if (stream_wantsnewfs) {
- /*
- * if the parent fs does not exist, look for it based on
- * the parent snap GUID
- */
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot receive new filesystem stream"));
-
- (void) strcpy(zc.zc_name, zc.zc_value);
- cp = strrchr(zc.zc_name, '/');
- if (cp)
- *cp = '\0';
- if (cp &&
- !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
- char suffix[ZFS_MAX_DATASET_NAME_LEN];
- (void) strcpy(suffix, strrchr(zc.zc_value, '/'));
- if (guid_to_name(hdl, zc.zc_name, parent_snapguid,
- B_FALSE, zc.zc_value) == 0) {
- *strchr(zc.zc_value, '@') = '\0';
- (void) strcat(zc.zc_value, suffix);
- }
- }
- } else {
- /*
- * If the fs does not exist, look for it based on the
- * fromsnap GUID.
- */
- if (resuming) {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot receive resume stream"));
- } else {
- (void) snprintf(errbuf, sizeof (errbuf),
- dgettext(TEXT_DOMAIN,
- "cannot receive incremental stream"));
- }
-
- (void) strcpy(zc.zc_name, zc.zc_value);
- *strchr(zc.zc_name, '@') = '\0';
-
- /*
- * If the exact receive path was specified and this is the
- * topmost path in the stream, then if the fs does not exist we
- * should look no further.
- */
- if ((flags->isprefix || (*(chopprefix = drrb->drr_toname +
- strlen(sendfs)) != '\0' && *chopprefix != '@')) &&
- !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
- char snap[ZFS_MAX_DATASET_NAME_LEN];
- (void) strcpy(snap, strchr(zc.zc_value, '@'));
- if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid,
- B_FALSE, zc.zc_value) == 0) {
- *strchr(zc.zc_value, '@') = '\0';
- (void) strcat(zc.zc_value, snap);
- }
- }
- }
-
- (void) strcpy(zc.zc_name, zc.zc_value);
- *strchr(zc.zc_name, '@') = '\0';
-
- if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
- zfs_handle_t *zhp;
-
- /*
- * Destination fs exists. It must be one of these cases:
- * - an incremental send stream
- * - the stream specifies a new fs (full stream or clone)
- * and they want us to blow away the existing fs (and
- * have therefore specified -F and removed any snapshots)
- * - we are resuming a failed receive.
- */
- if (stream_wantsnewfs) {
- boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL;
- if (!flags->force) {
- zcmd_free_nvlists(&zc);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination '%s' exists\n"
- "must specify -F to overwrite it"),
- zc.zc_name);
- return (zfs_error(hdl, EZFS_EXISTS, errbuf));
- }
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT,
- &zc) == 0) {
- zcmd_free_nvlists(&zc);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination has snapshots (eg. %s)\n"
- "must destroy them to overwrite it"),
- zc.zc_name);
- return (zfs_error(hdl, EZFS_EXISTS, errbuf));
- }
- if (is_volume && strrchr(zc.zc_name, '/') == NULL) {
- zcmd_free_nvlists(&zc);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination '%s' is the root dataset\n"
- "cannot overwrite with a ZVOL"),
- zc.zc_name);
- return (zfs_error(hdl, EZFS_EXISTS, errbuf));
- }
- if (is_volume &&
- ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT,
- &zc) == 0) {
- zcmd_free_nvlists(&zc);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination has children (eg. %s)\n"
- "cannot overwrite with a ZVOL"),
- zc.zc_name);
- return (zfs_error(hdl, EZFS_WRONG_PARENT,
- errbuf));
- }
- }
-
- if ((zhp = zfs_open(hdl, zc.zc_name,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) {
- zcmd_free_nvlists(&zc);
- return (-1);
- }
-
- if (stream_wantsnewfs &&
- zhp->zfs_dmustats.dds_origin[0]) {
- zcmd_free_nvlists(&zc);
- zfs_close(zhp);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination '%s' is a clone\n"
- "must destroy it to overwrite it"),
- zc.zc_name);
- return (zfs_error(hdl, EZFS_EXISTS, errbuf));
- }
-
- if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
- (stream_wantsnewfs || resuming)) {
- /* We can't do online recv in this case */
- clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
- flags->forceunmount ? MS_FORCE : 0);
- if (clp == NULL) {
- zfs_close(zhp);
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- if (changelist_prefix(clp) != 0) {
- changelist_free(clp);
- zfs_close(zhp);
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- }
-
- /*
- * If we are resuming a newfs, set newfs here so that we will
- * mount it if the recv succeeds this time. We can tell
- * that it was a newfs on the first recv because the fs
- * itself will be inconsistent (if the fs existed when we
- * did the first recv, we would have received it into
- * .../%recv).
- */
- if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT))
- newfs = B_TRUE;
-
- zfs_close(zhp);
- } else {
- zfs_handle_t *zhp;
-
- /*
- * Destination filesystem does not exist. Therefore we better
- * be creating a new filesystem (either from a full backup, or
- * a clone). It would therefore be invalid if the user
- * specified only the pool name (i.e. if the destination name
- * contained no slash character).
- */
- if (!stream_wantsnewfs ||
- (cp = strrchr(zc.zc_name, '/')) == NULL) {
- zcmd_free_nvlists(&zc);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination '%s' does not exist"), zc.zc_name);
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
- }
-
- /*
- * Trim off the final dataset component so we perform the
- * recvbackup ioctl to the filesystems's parent.
- */
- *cp = '\0';
-
- if (flags->isprefix && !flags->istail && !flags->dryrun &&
- create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) {
- zcmd_free_nvlists(&zc);
- return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
- }
-
- /* validate parent */
- zhp = zfs_open(hdl, zc.zc_name, ZFS_TYPE_DATASET);
- if (zhp == NULL) {
- zcmd_free_nvlists(&zc);
- return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
- }
- if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
- zcmd_free_nvlists(&zc);
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "parent '%s' is not a filesystem"), zc.zc_name);
- zfs_close(zhp);
- return (zfs_error(hdl, EZFS_WRONG_PARENT, errbuf));
- }
- zfs_close(zhp);
-
- newfs = B_TRUE;
- }
-
- zc.zc_begin_record = *drr_noswap;
- zc.zc_cookie = infd;
- zc.zc_guid = flags->force;
- zc.zc_resumable = flags->resumable;
- if (flags->verbose) {
- (void) printf("%s %s stream of %s into %s\n",
- flags->dryrun ? "would receive" : "receiving",
- drrb->drr_fromguid ? "incremental" : "full",
- drrb->drr_toname, zc.zc_value);
- (void) fflush(stdout);
- }
-
- if (flags->dryrun) {
- zcmd_free_nvlists(&zc);
- return (recv_skip(hdl, infd, flags->byteswap));
- }
-
- zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf;
- zc.zc_nvlist_dst_size = sizeof (prop_errbuf);
- zc.zc_cleanup_fd = cleanup_fd;
- zc.zc_action_handle = *action_handlep;
-
- err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc);
- ioctl_errno = errno;
- prop_errflags = (zprop_errflags_t)zc.zc_obj;
-
- if (err == 0) {
- nvlist_t *prop_errors;
- VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
- zc.zc_nvlist_dst_size, &prop_errors, 0));
-
- nvpair_t *prop_err = NULL;
-
- while ((prop_err = nvlist_next_nvpair(prop_errors,
- prop_err)) != NULL) {
- char tbuf[1024];
- zfs_prop_t prop;
- int intval;
-
- prop = zfs_name_to_prop(nvpair_name(prop_err));
- (void) nvpair_value_int32(prop_err, &intval);
- if (strcmp(nvpair_name(prop_err),
- ZPROP_N_MORE_ERRORS) == 0) {
- trunc_prop_errs(intval);
- break;
- } else if (snapname == NULL || finalsnap == NULL ||
- strcmp(finalsnap, snapname) == 0 ||
- strcmp(nvpair_name(prop_err),
- zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) {
- /*
- * Skip the special case of, for example,
- * "refquota", errors on intermediate
- * snapshots leading up to a final one.
- * That's why we have all of the checks above.
- *
- * See zfs_ioctl.c's extract_delay_props() for
- * a list of props which can fail on
- * intermediate snapshots, but shouldn't
- * affect the overall receive.
- */
- (void) snprintf(tbuf, sizeof (tbuf),
- dgettext(TEXT_DOMAIN,
- "cannot receive %s property on %s"),
- nvpair_name(prop_err), zc.zc_name);
- zfs_setprop_error(hdl, prop, intval, tbuf);
- }
- }
- nvlist_free(prop_errors);
- }
-
- zc.zc_nvlist_dst = 0;
- zc.zc_nvlist_dst_size = 0;
- zcmd_free_nvlists(&zc);
-
- if (err == 0 && snapprops_nvlist) {
- zfs_cmd_t zc2 = { 0 };
-
- (void) strcpy(zc2.zc_name, zc.zc_value);
- zc2.zc_cookie = B_TRUE; /* received */
- if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) {
- (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2);
- zcmd_free_nvlists(&zc2);
- }
- }
-
- if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) {
- /*
- * It may be that this snapshot already exists,
- * in which case we want to consume & ignore it
- * rather than failing.
- */
- avl_tree_t *local_avl;
- nvlist_t *local_nv, *fs;
- cp = strchr(zc.zc_value, '@');
-
- /*
- * XXX Do this faster by just iterating over snaps in
- * this fs. Also if zc_value does not exist, we will
- * get a strange "does not exist" error message.
- */
- *cp = '\0';
- if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE,
- B_FALSE, B_FALSE, &local_nv, &local_avl) == 0) {
- *cp = '@';
- fs = fsavl_find(local_avl, drrb->drr_toguid, NULL);
- fsavl_destroy(local_avl);
- nvlist_free(local_nv);
-
- if (fs != NULL) {
- if (flags->verbose) {
- (void) printf("snap %s already exists; "
- "ignoring\n", zc.zc_value);
- }
- err = ioctl_err = recv_skip(hdl, infd,
- flags->byteswap);
- }
- }
- *cp = '@';
- }
-
- if (ioctl_err != 0) {
- switch (ioctl_errno) {
- case ENODEV:
- cp = strchr(zc.zc_value, '@');
- *cp = '\0';
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "most recent snapshot of %s does not\n"
- "match incremental source"), zc.zc_value);
- (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
- *cp = '@';
- break;
- case ETXTBSY:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination %s has been modified\n"
- "since most recent snapshot"), zc.zc_name);
- (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
- break;
- case EEXIST:
- cp = strchr(zc.zc_value, '@');
- if (newfs) {
- /* it's the containing fs that exists */
- *cp = '\0';
- }
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination already exists"));
- (void) zfs_error_fmt(hdl, EZFS_EXISTS,
- dgettext(TEXT_DOMAIN, "cannot restore to %s"),
- zc.zc_value);
- *cp = '@';
- break;
- case EINVAL:
- (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
- break;
- case ECKSUM:
- recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable);
- (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
- break;
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded to receive this stream."));
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
- case EDQUOT:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "destination %s space quota exceeded"), zc.zc_name);
- (void) zfs_error(hdl, EZFS_NOSPC, errbuf);
- break;
- default:
- (void) zfs_standard_error(hdl, ioctl_errno, errbuf);
- }
- }
-
- /*
- * Mount the target filesystem (if created). Also mount any
- * children of the target filesystem if we did a replication
- * receive (indicated by stream_avl being non-NULL).
- */
- cp = strchr(zc.zc_value, '@');
- if (cp && (ioctl_err == 0 || !newfs)) {
- zfs_handle_t *h;
-
- *cp = '\0';
- h = zfs_open(hdl, zc.zc_value,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
- if (h != NULL) {
- if (h->zfs_type == ZFS_TYPE_VOLUME) {
- *cp = '@';
- } else if (newfs || stream_avl) {
- /*
- * Track the first/top of hierarchy fs,
- * for mounting and sharing later.
- */
- if (top_zfs && *top_zfs == NULL)
- *top_zfs = zfs_strdup(hdl, zc.zc_value);
- }
- zfs_close(h);
- }
- *cp = '@';
- }
-
- if (clp) {
- if (!flags->nomount)
- err |= changelist_postfix(clp);
- changelist_free(clp);
- }
-
- if (prop_errflags & ZPROP_ERR_NOCLEAR) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
- "failed to clear unreceived properties on %s"),
- zc.zc_name);
- (void) fprintf(stderr, "\n");
- }
- if (prop_errflags & ZPROP_ERR_NORESTORE) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
- "failed to restore original properties on %s"),
- zc.zc_name);
- (void) fprintf(stderr, "\n");
- }
-
- if (err || ioctl_err)
- return (-1);
-
- *action_handlep = zc.zc_action_handle;
-
- if (flags->verbose) {
- char buf1[64];
- char buf2[64];
- uint64_t bytes = zc.zc_cookie;
- time_t delta = time(NULL) - begin_time;
- if (delta == 0)
- delta = 1;
- zfs_nicenum(bytes, buf1, sizeof (buf1));
- zfs_nicenum(bytes/delta, buf2, sizeof (buf1));
-
- (void) printf("received %sB stream in %lu seconds (%sB/sec)\n",
- buf1, delta, buf2);
- }
-
- return (0);
-}
-
-static int
-zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
- const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
- nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
- uint64_t *action_handlep, const char *finalsnap)
-{
- int err;
- dmu_replay_record_t drr, drr_noswap;
- struct drr_begin *drrb = &drr.drr_u.drr_begin;
- char errbuf[1024];
- zio_cksum_t zcksum = { 0 };
- uint64_t featureflags;
- int hdrtype;
-
- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
- "cannot receive"));
-
- if (flags->isprefix &&
- !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs "
- "(%s) does not exist"), tosnap);
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
- }
- if (originsnap &&
- !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
- "(%s) does not exist"), originsnap);
- return (zfs_error(hdl, EZFS_NOENT, errbuf));
- }
-
- /* read in the BEGIN record */
- if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
- &zcksum)))
- return (err);
-
- if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) {
- /* It's the double end record at the end of a package */
- return (ENODATA);
- }
-
- /* the kernel needs the non-byteswapped begin record */
- drr_noswap = drr;
-
- flags->byteswap = B_FALSE;
- if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
- /*
- * We computed the checksum in the wrong byteorder in
- * recv_read() above; do it again correctly.
- */
- bzero(&zcksum, sizeof (zio_cksum_t));
- (void) fletcher_4_incremental_byteswap(&drr,
- sizeof (drr), &zcksum);
- flags->byteswap = B_TRUE;
-
- drr.drr_type = BSWAP_32(drr.drr_type);
- drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen);
- drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
- drrb->drr_type = BSWAP_32(drrb->drr_type);
- drrb->drr_flags = BSWAP_32(drrb->drr_flags);
- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
- }
-
- if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
- "stream (bad magic number)"));
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
- }
-
- featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
- hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo);
-
- if (!DMU_STREAM_SUPPORTED(featureflags) ||
- (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "stream has unsupported feature, feature flags = %lx"),
- featureflags);
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
- }
-
- if (strchr(drrb->drr_toname, '@') == NULL) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
- "stream (bad snapshot name)"));
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
- }
-
- if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) {
- char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN];
- if (sendfs == NULL) {
- /*
- * We were not called from zfs_receive_package(). Get
- * the fs specified by 'zfs send'.
- */
- char *cp;
- (void) strlcpy(nonpackage_sendfs,
- drr.drr_u.drr_begin.drr_toname,
- sizeof (nonpackage_sendfs));
- if ((cp = strchr(nonpackage_sendfs, '@')) != NULL)
- *cp = '\0';
- sendfs = nonpackage_sendfs;
- VERIFY(finalsnap == NULL);
- }
- return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
- &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
- cleanup_fd, action_handlep, finalsnap));
- } else {
- assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
- DMU_COMPOUNDSTREAM);
- return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
- &zcksum, top_zfs, cleanup_fd, action_handlep));
- }
-}
-
-/*
- * Restores a backup of tosnap from the file descriptor specified by infd.
- * Return 0 on total success, -2 if some things couldn't be
- * destroyed/renamed/promoted, -1 if some things couldn't be received.
- * (-1 will override -2, if -1 and the resumable flag was specified the
- * transfer can be resumed if the sending side supports it).
- */
-int
-zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
- recvflags_t *flags, int infd, avl_tree_t *stream_avl)
-{
- char *top_zfs = NULL;
- int err;
- int cleanup_fd;
- uint64_t action_handle = 0;
- char *originsnap = NULL;
-
- if (props) {
- err = nvlist_lookup_string(props, "origin", &originsnap);
- if (err && err != ENOENT)
- return (err);
- }
-
- cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
- VERIFY(cleanup_fd >= 0);
-
- err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL,
- stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL);
-
- VERIFY(0 == close(cleanup_fd));
-
- if (err == 0 && !flags->nomount && top_zfs) {
- zfs_handle_t *zhp;
- prop_changelist_t *clp;
-
- zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM);
- if (zhp != NULL) {
- clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT,
- CL_GATHER_MOUNT_ALWAYS,
- flags->forceunmount ? MS_FORCE : 0);
- zfs_close(zhp);
- if (clp != NULL) {
- /* mount and share received datasets */
- err = changelist_postfix(clp);
- changelist_free(clp);
- }
- }
- if (zhp == NULL || clp == NULL || err)
- err = -1;
- }
- if (top_zfs)
- free(top_zfs);
-
- return (err);
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
@@ -1,511 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- */
-
-/*
- * This file contains the functions which analyze the status of a pool. This
- * include both the status of an active pool, as well as the status exported
- * pools. Returns one of the ZPOOL_STATUS_* defines describing the status of
- * the pool. This status is independent (to a certain degree) from the state of
- * the pool. A pool's state describes only whether or not it is capable of
- * providing the necessary fault tolerance for data. The status describes the
- * overall status of devices. A pool that is online can still have a device
- * that is experiencing errors.
- *
- * Only a subset of the possible faults can be detected using 'zpool status',
- * and not all possible errors correspond to a FMA message ID. The explanation
- * is left up to the caller, depending on whether it is a live pool or an
- * import.
- */
-
-#include <libzfs.h>
-#include <string.h>
-#include <unistd.h>
-#include "libzfs_impl.h"
-#include "zfeature_common.h"
-
-/*
- * Message ID table. This must be kept in sync with the ZPOOL_STATUS_* defines
- * in libzfs.h. Note that there are some status results which go past the end
- * of this table, and hence have no associated message ID.
- */
-static char *zfs_msgid_table[] = {
- "ZFS-8000-14", /* ZPOOL_STATUS_CORRUPT_CACHE */
- "ZFS-8000-2Q", /* ZPOOL_STATUS_MISSING_DEV_R */
- "ZFS-8000-3C", /* ZPOOL_STATUS_MISSING_DEV_NR */
- "ZFS-8000-4J", /* ZPOOL_STATUS_CORRUPT_LABEL_R */
- "ZFS-8000-5E", /* ZPOOL_STATUS_CORRUPT_LABEL_NR */
- "ZFS-8000-6X", /* ZPOOL_STATUS_BAD_GUID_SUM */
- "ZFS-8000-72", /* ZPOOL_STATUS_CORRUPT_POOL */
- "ZFS-8000-8A", /* ZPOOL_STATUS_CORRUPT_DATA */
- "ZFS-8000-9P", /* ZPOOL_STATUS_FAILING_DEV */
- "ZFS-8000-A5", /* ZPOOL_STATUS_VERSION_NEWER */
- "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_MISMATCH */
- "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_ACTIVE */
- "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_REQUIRED */
- "ZFS-8000-HC", /* ZPOOL_STATUS_IO_FAILURE_WAIT */
- "ZFS-8000-JQ", /* ZPOOL_STATUS_IO_FAILURE_CONTINUE */
- "ZFS-8000-MM", /* ZPOOL_STATUS_IO_FAILURE_MMP */
- "ZFS-8000-K4", /* ZPOOL_STATUS_BAD_LOG */
- /*
- * The following results have no message ID.
- * ZPOOL_STATUS_UNSUP_FEAT_READ
- * ZPOOL_STATUS_UNSUP_FEAT_WRITE
- * ZPOOL_STATUS_FAULTED_DEV_R
- * ZPOOL_STATUS_FAULTED_DEV_NR
- * ZPOOL_STATUS_VERSION_OLDER
- * ZPOOL_STATUS_FEAT_DISABLED
- * ZPOOL_STATUS_RESILVERING
- * ZPOOL_STATUS_OFFLINE_DEV
- * ZPOOL_STATUS_REMOVED_DEV
- * ZPOOL_STATUS_OK
- */
-};
-
-#define NMSGID (sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0]))
-
-/* ARGSUSED */
-static int
-vdev_missing(vdev_stat_t *vs, uint_t vsc)
-{
- return (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_OPEN_FAILED);
-}
-
-/* ARGSUSED */
-static int
-vdev_faulted(vdev_stat_t *vs, uint_t vsc)
-{
- return (vs->vs_state == VDEV_STATE_FAULTED);
-}
-
-/* ARGSUSED */
-static int
-vdev_errors(vdev_stat_t *vs, uint_t vsc)
-{
- return (vs->vs_state == VDEV_STATE_DEGRADED ||
- vs->vs_read_errors != 0 || vs->vs_write_errors != 0 ||
- vs->vs_checksum_errors != 0);
-}
-
-/* ARGSUSED */
-static int
-vdev_broken(vdev_stat_t *vs, uint_t vsc)
-{
- return (vs->vs_state == VDEV_STATE_CANT_OPEN);
-}
-
-/* ARGSUSED */
-static int
-vdev_offlined(vdev_stat_t *vs, uint_t vsc)
-{
- return (vs->vs_state == VDEV_STATE_OFFLINE);
-}
-
-/* ARGSUSED */
-static int
-vdev_removed(vdev_stat_t *vs, uint_t vsc)
-{
- return (vs->vs_state == VDEV_STATE_REMOVED);
-}
-
-static int
-vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc)
-{
- return (VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
- vs->vs_configured_ashift < vs->vs_physical_ashift);
-}
-
-/*
- * Detect if any leaf devices that have seen errors or could not be opened.
- */
-static boolean_t
-find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
- boolean_t ignore_replacing)
-{
- nvlist_t **child;
- vdev_stat_t *vs;
- uint_t c, vsc, children;
-
- /*
- * Ignore problems within a 'replacing' vdev, since we're presumably in
- * the process of repairing any such errors, and don't want to call them
- * out again. We'll pick up the fact that a resilver is happening
- * later.
- */
- if (ignore_replacing == B_TRUE) {
- char *type;
-
- verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE,
- &type) == 0);
- if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
- return (B_FALSE);
- }
-
- if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
- &children) == 0) {
- for (c = 0; c < children; c++)
- if (find_vdev_problem(child[c], func, ignore_replacing))
- return (B_TRUE);
- } else {
- verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &vsc) == 0);
-
- if (func(vs, vsc) != 0)
- return (B_TRUE);
- }
-
- /*
- * Check any L2 cache devs
- */
- if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child,
- &children) == 0) {
- for (c = 0; c < children; c++)
- if (find_vdev_problem(child[c], func, ignore_replacing))
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * Active pool health status.
- *
- * To determine the status for a pool, we make several passes over the config,
- * picking the most egregious error we find. In order of importance, we do the
- * following:
- *
- * - Check for a complete and valid configuration
- * - Look for any faulted or missing devices in a non-replicated config
- * - Check for any data errors
- * - Check for any faulted or missing devices in a replicated config
- * - Look for any devices showing errors
- * - Check for any resilvering devices
- *
- * There can obviously be multiple errors within a single pool, so this routine
- * only picks the most damaging of all the current errors to report.
- */
-static zpool_status_t
-check_status(nvlist_t *config, boolean_t isimport)
-{
- nvlist_t *nvroot;
- vdev_stat_t *vs;
- pool_scan_stat_t *ps = NULL;
- uint_t vsc, psc;
- uint64_t nerr;
- uint64_t version;
- uint64_t stateval;
- uint64_t suspended;
- uint64_t hostid = 0;
- unsigned long system_hostid = get_system_hostid();
-
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) == 0);
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &vsc) == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &stateval) == 0);
-
- /*
- * Currently resilvering a vdev
- */
- (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
- (uint64_t **)&ps, &psc);
- if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER &&
- ps->pss_state == DSS_SCANNING)
- return (ZPOOL_STATUS_RESILVERING);
-
- /*
- * The multihost property is set and the pool may be active.
- */
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_ACTIVE) {
- mmp_state_t mmp_state;
- nvlist_t *nvinfo;
-
- nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
- mmp_state = fnvlist_lookup_uint64(nvinfo,
- ZPOOL_CONFIG_MMP_STATE);
-
- if (mmp_state == MMP_STATE_ACTIVE)
- return (ZPOOL_STATUS_HOSTID_ACTIVE);
- else if (mmp_state == MMP_STATE_NO_HOSTID)
- return (ZPOOL_STATUS_HOSTID_REQUIRED);
- else
- return (ZPOOL_STATUS_HOSTID_MISMATCH);
- }
-
- /*
- * Pool last accessed by another system.
- */
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
- if (hostid != 0 && (unsigned long)hostid != system_hostid &&
- stateval == POOL_STATE_ACTIVE)
- return (ZPOOL_STATUS_HOSTID_MISMATCH);
-
- /*
- * Newer on-disk version.
- */
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_VERSION_NEWER)
- return (ZPOOL_STATUS_VERSION_NEWER);
-
- /*
- * Unsupported feature(s).
- */
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_UNSUP_FEAT) {
- nvlist_t *nvinfo;
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
- &nvinfo) == 0);
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY))
- return (ZPOOL_STATUS_UNSUP_FEAT_WRITE);
- return (ZPOOL_STATUS_UNSUP_FEAT_READ);
- }
-
- /*
- * Check that the config is complete.
- */
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
- return (ZPOOL_STATUS_BAD_GUID_SUM);
-
- /*
- * Check whether the pool has suspended.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED,
- &suspended) == 0) {
- uint64_t reason;
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED_REASON,
- &reason) == 0 && reason == ZIO_SUSPEND_MMP)
- return (ZPOOL_STATUS_IO_FAILURE_MMP);
-
- if (suspended == ZIO_FAILURE_MODE_CONTINUE)
- return (ZPOOL_STATUS_IO_FAILURE_CONTINUE);
- return (ZPOOL_STATUS_IO_FAILURE_WAIT);
- }
-
- /*
- * Could not read a log.
- */
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_BAD_LOG) {
- return (ZPOOL_STATUS_BAD_LOG);
- }
-
- /*
- * Bad devices in non-replicated config.
- */
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
- return (ZPOOL_STATUS_FAULTED_DEV_NR);
-
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_missing, B_TRUE))
- return (ZPOOL_STATUS_MISSING_DEV_NR);
-
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_broken, B_TRUE))
- return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
-
- /*
- * Corrupted pool metadata
- */
- if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
- return (ZPOOL_STATUS_CORRUPT_POOL);
-
- /*
- * Persistent data errors.
- */
- if (!isimport) {
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
- &nerr) == 0 && nerr != 0)
- return (ZPOOL_STATUS_CORRUPT_DATA);
- }
-
- /*
- * Missing devices in a replicated config.
- */
- if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
- return (ZPOOL_STATUS_FAULTED_DEV_R);
- if (find_vdev_problem(nvroot, vdev_missing, B_TRUE))
- return (ZPOOL_STATUS_MISSING_DEV_R);
- if (find_vdev_problem(nvroot, vdev_broken, B_TRUE))
- return (ZPOOL_STATUS_CORRUPT_LABEL_R);
-
- /*
- * Devices with errors
- */
- if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE))
- return (ZPOOL_STATUS_FAILING_DEV);
-
- /*
- * Offlined devices
- */
- if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE))
- return (ZPOOL_STATUS_OFFLINE_DEV);
-
- /*
- * Removed device
- */
- if (find_vdev_problem(nvroot, vdev_removed, B_TRUE))
- return (ZPOOL_STATUS_REMOVED_DEV);
-
- /*
- * Suboptimal, but usable, ashift configuration.
- */
- if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE))
- return (ZPOOL_STATUS_NON_NATIVE_ASHIFT);
-
- /*
- * Outdated, but usable, version
- */
- if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION)
- return (ZPOOL_STATUS_VERSION_OLDER);
-
- /*
- * Usable pool with disabled features
- */
- if (version >= SPA_VERSION_FEATURES) {
- int i;
- nvlist_t *feat;
-
- if (isimport) {
- feat = fnvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_LOAD_INFO);
- if (nvlist_exists(feat, ZPOOL_CONFIG_ENABLED_FEAT))
- feat = fnvlist_lookup_nvlist(feat,
- ZPOOL_CONFIG_ENABLED_FEAT);
- } else {
- feat = fnvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_FEATURE_STATS);
- }
-
- for (i = 0; i < SPA_FEATURES; i++) {
- zfeature_info_t *fi = &spa_feature_table[i];
- if (!nvlist_exists(feat, fi->fi_guid))
- return (ZPOOL_STATUS_FEAT_DISABLED);
- }
- }
-
- return (ZPOOL_STATUS_OK);
-}
-
-zpool_status_t
-zpool_get_status(zpool_handle_t *zhp, char **msgid)
-{
- zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE);
-
- if (ret >= NMSGID)
- *msgid = NULL;
- else
- *msgid = zfs_msgid_table[ret];
-
- return (ret);
-}
-
-zpool_status_t
-zpool_import_status(nvlist_t *config, char **msgid)
-{
- zpool_status_t ret = check_status(config, B_TRUE);
-
- if (ret >= NMSGID)
- *msgid = NULL;
- else
- *msgid = zfs_msgid_table[ret];
-
- return (ret);
-}
-
-static void
-dump_ddt_stat(const ddt_stat_t *dds, int h)
-{
- char refcnt[6];
- char blocks[6], lsize[6], psize[6], dsize[6];
- char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];
-
- if (dds == NULL || dds->dds_blocks == 0)
- return;
-
- if (h == -1)
- (void) strcpy(refcnt, "Total");
- else
- zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt));
-
- zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks));
- zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize));
- zfs_nicenum(dds->dds_psize, psize, sizeof (psize));
- zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize));
- zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks));
- zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize));
- zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize));
- zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize));
-
- (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
- refcnt,
- blocks, lsize, psize, dsize,
- ref_blocks, ref_lsize, ref_psize, ref_dsize);
-}
-
-/*
- * Print the DDT histogram and the column totals.
- */
-void
-zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh)
-{
- int h;
-
- (void) printf("\n");
-
- (void) printf("bucket "
- " allocated "
- " referenced \n");
- (void) printf("______ "
- "______________________________ "
- "______________________________\n");
-
- (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
- "refcnt",
- "blocks", "LSIZE", "PSIZE", "DSIZE",
- "blocks", "LSIZE", "PSIZE", "DSIZE");
-
- (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
- "------",
- "------", "-----", "-----", "-----",
- "------", "-----", "-----", "-----");
-
- for (h = 0; h < 64; h++)
- dump_ddt_stat(&ddh->ddh_stat[h], h);
-
- dump_ddt_stat(dds_total, -1);
-
- (void) printf("\n");
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
@@ -1,1661 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2019 Joyent, Inc.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- * Copyright (c) 2017 Datto Inc.
- */
-
-/*
- * Internal utility routines for the ZFS library.
- */
-
-#include <sys/param.h>
-#include <sys/linker.h>
-#include <sys/module.h>
-#include <sys/stat.h>
-
-#include <errno.h>
-#include <fcntl.h>
-#include <libintl.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <math.h>
-#include <sys/mnttab.h>
-#include <sys/mntent.h>
-#include <sys/types.h>
-#include <libcmdutils.h>
-
-#include <libzfs.h>
-#include <libzfs_core.h>
-
-#include "libzfs_impl.h"
-#include "zfs_prop.h"
-#include "zfs_comutil.h"
-#include "zfeature_common.h"
-
-
-int
-libzfs_errno(libzfs_handle_t *hdl)
-{
- return (hdl->libzfs_error);
-}
-
-const char *
-libzfs_error_action(libzfs_handle_t *hdl)
-{
- return (hdl->libzfs_action);
-}
-
-const char *
-libzfs_error_description(libzfs_handle_t *hdl)
-{
- if (hdl->libzfs_desc[0] != '\0')
- return (hdl->libzfs_desc);
-
- switch (hdl->libzfs_error) {
- case EZFS_NOMEM:
- return (dgettext(TEXT_DOMAIN, "out of memory"));
- case EZFS_BADPROP:
- return (dgettext(TEXT_DOMAIN, "invalid property value"));
- case EZFS_PROPREADONLY:
- return (dgettext(TEXT_DOMAIN, "read-only property"));
- case EZFS_PROPTYPE:
- return (dgettext(TEXT_DOMAIN, "property doesn't apply to "
- "datasets of this type"));
- case EZFS_PROPNONINHERIT:
- return (dgettext(TEXT_DOMAIN, "property cannot be inherited"));
- case EZFS_PROPSPACE:
- return (dgettext(TEXT_DOMAIN, "invalid quota or reservation"));
- case EZFS_BADTYPE:
- return (dgettext(TEXT_DOMAIN, "operation not applicable to "
- "datasets of this type"));
- case EZFS_BUSY:
- return (dgettext(TEXT_DOMAIN, "pool or dataset is busy"));
- case EZFS_EXISTS:
- return (dgettext(TEXT_DOMAIN, "pool or dataset exists"));
- case EZFS_NOENT:
- return (dgettext(TEXT_DOMAIN, "no such pool or dataset"));
- case EZFS_BADSTREAM:
- return (dgettext(TEXT_DOMAIN, "invalid backup stream"));
- case EZFS_DSREADONLY:
- return (dgettext(TEXT_DOMAIN, "dataset is read-only"));
- case EZFS_VOLTOOBIG:
- return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
- "this system"));
- case EZFS_INVALIDNAME:
- return (dgettext(TEXT_DOMAIN, "invalid name"));
- case EZFS_BADRESTORE:
- return (dgettext(TEXT_DOMAIN, "unable to restore to "
- "destination"));
- case EZFS_BADBACKUP:
- return (dgettext(TEXT_DOMAIN, "backup failed"));
- case EZFS_BADTARGET:
- return (dgettext(TEXT_DOMAIN, "invalid target vdev"));
- case EZFS_NODEVICE:
- return (dgettext(TEXT_DOMAIN, "no such device in pool"));
- case EZFS_BADDEV:
- return (dgettext(TEXT_DOMAIN, "invalid device"));
- case EZFS_NOREPLICAS:
- return (dgettext(TEXT_DOMAIN, "no valid replicas"));
- case EZFS_RESILVERING:
- return (dgettext(TEXT_DOMAIN, "currently resilvering"));
- case EZFS_BADVERSION:
- return (dgettext(TEXT_DOMAIN, "unsupported version or "
- "feature"));
- case EZFS_POOLUNAVAIL:
- return (dgettext(TEXT_DOMAIN, "pool is unavailable"));
- case EZFS_DEVOVERFLOW:
- return (dgettext(TEXT_DOMAIN, "too many devices in one vdev"));
- case EZFS_BADPATH:
- return (dgettext(TEXT_DOMAIN, "must be an absolute path"));
- case EZFS_CROSSTARGET:
- return (dgettext(TEXT_DOMAIN, "operation crosses datasets or "
- "pools"));
- case EZFS_ZONED:
- return (dgettext(TEXT_DOMAIN, "dataset in use by local zone"));
- case EZFS_MOUNTFAILED:
- return (dgettext(TEXT_DOMAIN, "mount failed"));
- case EZFS_UMOUNTFAILED:
- return (dgettext(TEXT_DOMAIN, "umount failed"));
- case EZFS_UNSHARENFSFAILED:
- return (dgettext(TEXT_DOMAIN, "unshare(1M) failed"));
- case EZFS_SHARENFSFAILED:
- return (dgettext(TEXT_DOMAIN, "share(1M) failed"));
- case EZFS_UNSHARESMBFAILED:
- return (dgettext(TEXT_DOMAIN, "smb remove share failed"));
- case EZFS_SHARESMBFAILED:
- return (dgettext(TEXT_DOMAIN, "smb add share failed"));
- case EZFS_PERM:
- return (dgettext(TEXT_DOMAIN, "permission denied"));
- case EZFS_NOSPC:
- return (dgettext(TEXT_DOMAIN, "out of space"));
- case EZFS_FAULT:
- return (dgettext(TEXT_DOMAIN, "bad address"));
- case EZFS_IO:
- return (dgettext(TEXT_DOMAIN, "I/O error"));
- case EZFS_INTR:
- return (dgettext(TEXT_DOMAIN, "signal received"));
- case EZFS_ISSPARE:
- return (dgettext(TEXT_DOMAIN, "device is reserved as a hot "
- "spare"));
- case EZFS_INVALCONFIG:
- return (dgettext(TEXT_DOMAIN, "invalid vdev configuration"));
- case EZFS_RECURSIVE:
- return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
- case EZFS_NOHISTORY:
- return (dgettext(TEXT_DOMAIN, "no history available"));
- case EZFS_POOLPROPS:
- return (dgettext(TEXT_DOMAIN, "failed to retrieve "
- "pool properties"));
- case EZFS_POOL_NOTSUP:
- return (dgettext(TEXT_DOMAIN, "operation not supported "
- "on this type of pool"));
- case EZFS_POOL_INVALARG:
- return (dgettext(TEXT_DOMAIN, "invalid argument for "
- "this pool operation"));
- case EZFS_NAMETOOLONG:
- return (dgettext(TEXT_DOMAIN, "dataset name is too long"));
- case EZFS_OPENFAILED:
- return (dgettext(TEXT_DOMAIN, "open failed"));
- case EZFS_NOCAP:
- return (dgettext(TEXT_DOMAIN,
- "disk capacity information could not be retrieved"));
- case EZFS_LABELFAILED:
- return (dgettext(TEXT_DOMAIN, "write of label failed"));
- case EZFS_BADWHO:
- return (dgettext(TEXT_DOMAIN, "invalid user/group"));
- case EZFS_BADPERM:
- return (dgettext(TEXT_DOMAIN, "invalid permission"));
- case EZFS_BADPERMSET:
- return (dgettext(TEXT_DOMAIN, "invalid permission set name"));
- case EZFS_NODELEGATION:
- return (dgettext(TEXT_DOMAIN, "delegated administration is "
- "disabled on pool"));
- case EZFS_BADCACHE:
- return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
- case EZFS_ISL2CACHE:
- return (dgettext(TEXT_DOMAIN, "device is in use as a cache"));
- case EZFS_VDEVNOTSUP:
- return (dgettext(TEXT_DOMAIN, "vdev specification is not "
- "supported"));
- case EZFS_NOTSUP:
- return (dgettext(TEXT_DOMAIN, "operation not supported "
- "on this dataset"));
- case EZFS_IOC_NOTSUPPORTED:
- return (dgettext(TEXT_DOMAIN, "operation not supported by "
- "zfs kernel module"));
- case EZFS_ACTIVE_SPARE:
- return (dgettext(TEXT_DOMAIN, "pool has active shared spare "
- "device"));
- case EZFS_UNPLAYED_LOGS:
- return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
- "logs"));
- case EZFS_REFTAG_RELE:
- return (dgettext(TEXT_DOMAIN, "no such tag on this dataset"));
- case EZFS_REFTAG_HOLD:
- return (dgettext(TEXT_DOMAIN, "tag already exists on this "
- "dataset"));
- case EZFS_TAGTOOLONG:
- return (dgettext(TEXT_DOMAIN, "tag too long"));
- case EZFS_PIPEFAILED:
- return (dgettext(TEXT_DOMAIN, "pipe create failed"));
- case EZFS_THREADCREATEFAILED:
- return (dgettext(TEXT_DOMAIN, "thread create failed"));
- case EZFS_POSTSPLIT_ONLINE:
- return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
- "into a new one"));
- case EZFS_SCRUB_PAUSED:
- return (dgettext(TEXT_DOMAIN, "scrub is paused; "
- "use 'zpool scrub' to resume"));
- case EZFS_SCRUBBING:
- return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
- "use 'zpool scrub -s' to cancel current scrub"));
- case EZFS_NO_SCRUB:
- return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
- case EZFS_DIFF:
- return (dgettext(TEXT_DOMAIN, "unable to generate diffs"));
- case EZFS_DIFFDATA:
- return (dgettext(TEXT_DOMAIN, "invalid diff data"));
- case EZFS_POOLREADONLY:
- return (dgettext(TEXT_DOMAIN, "pool is read-only"));
- case EZFS_NO_PENDING:
- return (dgettext(TEXT_DOMAIN, "operation is not "
- "in progress"));
- case EZFS_CHECKPOINT_EXISTS:
- return (dgettext(TEXT_DOMAIN, "checkpoint exists"));
- case EZFS_DISCARDING_CHECKPOINT:
- return (dgettext(TEXT_DOMAIN, "currently discarding "
- "checkpoint"));
- case EZFS_NO_CHECKPOINT:
- return (dgettext(TEXT_DOMAIN, "checkpoint does not exist"));
- case EZFS_DEVRM_IN_PROGRESS:
- return (dgettext(TEXT_DOMAIN, "device removal in progress"));
- case EZFS_VDEV_TOO_BIG:
- return (dgettext(TEXT_DOMAIN, "device exceeds supported size"));
- case EZFS_ACTIVE_POOL:
- return (dgettext(TEXT_DOMAIN, "pool is imported on a "
- "different host"));
- case EZFS_TOOMANY:
- return (dgettext(TEXT_DOMAIN, "argument list too long"));
- case EZFS_INITIALIZING:
- return (dgettext(TEXT_DOMAIN, "currently initializing"));
- case EZFS_NO_INITIALIZE:
- return (dgettext(TEXT_DOMAIN, "there is no active "
- "initialization"));
- case EZFS_WRONG_PARENT:
- return (dgettext(TEXT_DOMAIN, "invalid parent dataset"));
- case EZFS_UNKNOWN:
- return (dgettext(TEXT_DOMAIN, "unknown error"));
- default:
- assert(hdl->libzfs_error == 0);
- return (dgettext(TEXT_DOMAIN, "no error"));
- }
-}
-
-/*PRINTFLIKE2*/
-void
-zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
-
- (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc),
- fmt, ap);
- hdl->libzfs_desc_active = 1;
-
- va_end(ap);
-}
-
-static void
-zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap)
-{
- (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action),
- fmt, ap);
- hdl->libzfs_error = error;
-
- if (hdl->libzfs_desc_active)
- hdl->libzfs_desc_active = 0;
- else
- hdl->libzfs_desc[0] = '\0';
-
- if (hdl->libzfs_printerr) {
- if (error == EZFS_UNKNOWN) {
- (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal "
- "error: %s\n"), libzfs_error_description(hdl));
- abort();
- }
-
- (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action,
- libzfs_error_description(hdl));
- if (error == EZFS_NOMEM)
- exit(1);
- }
-}
-
-int
-zfs_error(libzfs_handle_t *hdl, int error, const char *msg)
-{
- return (zfs_error_fmt(hdl, error, "%s", msg));
-}
-
-/*PRINTFLIKE3*/
-int
-zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
-
- zfs_verror(hdl, error, fmt, ap);
-
- va_end(ap);
-
- return (-1);
-}
-
-static int
-zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt,
- va_list ap)
-{
- switch (error) {
- case EPERM:
- case EACCES:
- zfs_verror(hdl, EZFS_PERM, fmt, ap);
- return (-1);
-
- case ECANCELED:
- zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap);
- return (-1);
-
- case EIO:
- zfs_verror(hdl, EZFS_IO, fmt, ap);
- return (-1);
-
- case EFAULT:
- zfs_verror(hdl, EZFS_FAULT, fmt, ap);
- return (-1);
-
- case EINTR:
- zfs_verror(hdl, EZFS_INTR, fmt, ap);
- return (-1);
- }
-
- return (0);
-}
-
-int
-zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg)
-{
- return (zfs_standard_error_fmt(hdl, error, "%s", msg));
-}
-
-/*PRINTFLIKE3*/
-int
-zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
-
- if (zfs_common_error(hdl, error, fmt, ap) != 0) {
- va_end(ap);
- return (-1);
- }
-
- switch (error) {
- case ENXIO:
- case ENODEV:
- case EPIPE:
- zfs_verror(hdl, EZFS_IO, fmt, ap);
- break;
-
- case ENOENT:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "dataset does not exist"));
- zfs_verror(hdl, EZFS_NOENT, fmt, ap);
- break;
-
- case ENOSPC:
- case EDQUOT:
- zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
- va_end(ap);
- return (-1);
-
- case EEXIST:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "dataset already exists"));
- zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
- break;
-
- case EBUSY:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "dataset is busy"));
- zfs_verror(hdl, EZFS_BUSY, fmt, ap);
- break;
- case EROFS:
- zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
- break;
- case ENAMETOOLONG:
- zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap);
- break;
- case ENOTSUP:
- zfs_verror(hdl, EZFS_BADVERSION, fmt, ap);
- break;
- case EAGAIN:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool I/O is currently suspended"));
- zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
- break;
- case EREMOTEIO:
- zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap);
- break;
- case ZFS_ERR_IOC_CMD_UNAVAIL:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
- "module does not support this operation. A reboot may "
- "be required to enable this operation."));
- zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
- break;
- case ZFS_ERR_IOC_ARG_UNAVAIL:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
- "module does not support an option for this operation. "
- "A reboot may be required to enable this option."));
- zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
- break;
- case ZFS_ERR_IOC_ARG_REQUIRED:
- case ZFS_ERR_IOC_ARG_BADTYPE:
- zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
- break;
- default:
- zfs_error_aux(hdl, strerror(error));
- zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
- break;
- }
-
- va_end(ap);
- return (-1);
-}
-
-int
-zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg)
-{
- return (zpool_standard_error_fmt(hdl, error, "%s", msg));
-}
-
-/*PRINTFLIKE3*/
-int
-zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
-
- if (zfs_common_error(hdl, error, fmt, ap) != 0) {
- va_end(ap);
- return (-1);
- }
-
- switch (error) {
- case ENODEV:
- zfs_verror(hdl, EZFS_NODEVICE, fmt, ap);
- break;
-
- case ENOENT:
- zfs_error_aux(hdl,
- dgettext(TEXT_DOMAIN, "no such pool or dataset"));
- zfs_verror(hdl, EZFS_NOENT, fmt, ap);
- break;
-
- case EEXIST:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool already exists"));
- zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
- break;
-
- case EBUSY:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy"));
- zfs_verror(hdl, EZFS_BUSY, fmt, ap);
- break;
-
- case ENXIO:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "one or more devices is currently unavailable"));
- zfs_verror(hdl, EZFS_BADDEV, fmt, ap);
- break;
-
- case ENAMETOOLONG:
- zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap);
- break;
-
- case ENOTSUP:
- zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap);
- break;
-
- case EINVAL:
- zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap);
- break;
-
- case ENOSPC:
- case EDQUOT:
- zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
- va_end(ap);
- return (-1);
-
- case EAGAIN:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool I/O is currently suspended"));
- zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
- break;
-
- case EROFS:
- zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
- break;
- /* There is no pending operation to cancel */
- case ESRCH:
- zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap);
- break;
- case EREMOTEIO:
- zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap);
- break;
- case ZFS_ERR_CHECKPOINT_EXISTS:
- zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap);
- break;
- case ZFS_ERR_DISCARDING_CHECKPOINT:
- zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap);
- break;
- case ZFS_ERR_NO_CHECKPOINT:
- zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap);
- break;
- case ZFS_ERR_DEVRM_IN_PROGRESS:
- zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap);
- break;
- case ZFS_ERR_VDEV_TOO_BIG:
- zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap);
- break;
- case ZFS_ERR_WRONG_PARENT:
- zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap);
- break;
- case ZFS_ERR_IOC_CMD_UNAVAIL:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
- "module does not support this operation. A reboot may "
- "be required to enable this operation."));
- zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
- break;
- case ZFS_ERR_IOC_ARG_UNAVAIL:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
- "module does not support an option for this operation. "
- "A reboot may be required to enable this option."));
- zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
- break;
- case ZFS_ERR_IOC_ARG_REQUIRED:
- case ZFS_ERR_IOC_ARG_BADTYPE:
- zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
- break;
- default:
- zfs_error_aux(hdl, strerror(error));
- zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
- }
-
- va_end(ap);
- return (-1);
-}
-
-/*
- * Display an out of memory error message and abort the current program.
- */
-int
-no_memory(libzfs_handle_t *hdl)
-{
- return (zfs_error(hdl, EZFS_NOMEM, "internal error"));
-}
-
-/*
- * A safe form of malloc() which will die if the allocation fails.
- */
-void *
-zfs_alloc(libzfs_handle_t *hdl, size_t size)
-{
- void *data;
-
- if ((data = calloc(1, size)) == NULL)
- (void) no_memory(hdl);
-
- return (data);
-}
-
-/*
- * A safe form of asprintf() which will die if the allocation fails.
- */
-/*PRINTFLIKE2*/
-char *
-zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...)
-{
- va_list ap;
- char *ret;
- int err;
-
- va_start(ap, fmt);
-
- err = vasprintf(&ret, fmt, ap);
-
- va_end(ap);
-
- if (err < 0)
- (void) no_memory(hdl);
-
- return (ret);
-}
-
-/*
- * A safe form of realloc(), which also zeroes newly allocated space.
- */
-void *
-zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize)
-{
- void *ret;
-
- if ((ret = realloc(ptr, newsize)) == NULL) {
- (void) no_memory(hdl);
- return (NULL);
- }
-
- bzero((char *)ret + oldsize, (newsize - oldsize));
- return (ret);
-}
-
-/*
- * A safe form of strdup() which will die if the allocation fails.
- */
-char *
-zfs_strdup(libzfs_handle_t *hdl, const char *str)
-{
- char *ret;
-
- if ((ret = strdup(str)) == NULL)
- (void) no_memory(hdl);
-
- return (ret);
-}
-
-/*
- * Convert a number to an appropriately human-readable output.
- */
-void
-zfs_nicenum(uint64_t num, char *buf, size_t buflen)
-{
- nicenum(num, buf, buflen);
-}
-
-void
-libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr)
-{
- hdl->libzfs_printerr = printerr;
-}
-
-static int
-libzfs_load(void)
-{
- int error;
-
- if (modfind("zfs") < 0) {
- /* Not present in kernel, try loading it. */
- if (kldload("zfs") < 0 || modfind("zfs") < 0) {
- if (errno != EEXIST)
- return (-1);
- }
- }
- return (0);
-}
-
-libzfs_handle_t *
-libzfs_init(void)
-{
- libzfs_handle_t *hdl;
-
- if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) {
- return (NULL);
- }
-
- if (libzfs_load() < 0) {
- free(hdl);
- return (NULL);
- }
-
- if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
- free(hdl);
- return (NULL);
- }
-
- if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) {
- (void) close(hdl->libzfs_fd);
- free(hdl);
- return (NULL);
- }
-
- hdl->libzfs_sharetab = fopen(ZFS_EXPORTS_PATH, "r");
-
- if (libzfs_core_init() != 0) {
- (void) close(hdl->libzfs_fd);
- (void) fclose(hdl->libzfs_mnttab);
- (void) fclose(hdl->libzfs_sharetab);
- free(hdl);
- return (NULL);
- }
-
- zfs_prop_init();
- zpool_prop_init();
- zpool_feature_init();
- libzfs_mnttab_init(hdl);
-
- if (getenv("ZFS_PROP_DEBUG") != NULL) {
- hdl->libzfs_prop_debug = B_TRUE;
- }
-
- return (hdl);
-}
-
-void
-libzfs_fini(libzfs_handle_t *hdl)
-{
- (void) close(hdl->libzfs_fd);
- if (hdl->libzfs_mnttab)
- (void) fclose(hdl->libzfs_mnttab);
- if (hdl->libzfs_sharetab)
- (void) fclose(hdl->libzfs_sharetab);
- zfs_uninit_libshare(hdl);
- zpool_free_handles(hdl);
-#ifdef illumos
- libzfs_fru_clear(hdl, B_TRUE);
-#endif
- namespace_clear(hdl);
- libzfs_mnttab_fini(hdl);
- libzfs_core_fini();
- free(hdl);
-}
-
-libzfs_handle_t *
-zpool_get_handle(zpool_handle_t *zhp)
-{
- return (zhp->zpool_hdl);
-}
-
-libzfs_handle_t *
-zfs_get_handle(zfs_handle_t *zhp)
-{
- return (zhp->zfs_hdl);
-}
-
-zpool_handle_t *
-zfs_get_pool_handle(const zfs_handle_t *zhp)
-{
- return (zhp->zpool_hdl);
-}
-
-/*
- * Given a name, determine whether or not it's a valid path
- * (starts with '/' or "./"). If so, walk the mnttab trying
- * to match the device number. If not, treat the path as an
- * fs/vol/snap/bkmark name.
- */
-zfs_handle_t *
-zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype)
-{
- struct stat64 statbuf;
- struct extmnttab entry;
- int ret;
-
- if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) {
- /*
- * It's not a valid path, assume it's a name of type 'argtype'.
- */
- return (zfs_open(hdl, path, argtype));
- }
-
- if (stat64(path, &statbuf) != 0) {
- (void) fprintf(stderr, "%s: %s\n", path, strerror(errno));
- return (NULL);
- }
-
-#ifdef illumos
- rewind(hdl->libzfs_mnttab);
- while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) {
- if (makedevice(entry.mnt_major, entry.mnt_minor) ==
- statbuf.st_dev) {
- break;
- }
- }
-#else
- {
- struct statfs sfs;
-
- ret = statfs(path, &sfs);
- if (ret == 0)
- statfs2mnttab(&sfs, &entry);
- else {
- (void) fprintf(stderr, "%s: %s\n", path,
- strerror(errno));
- }
- }
-#endif /* illumos */
- if (ret != 0) {
- return (NULL);
- }
-
- if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
- (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"),
- path);
- return (NULL);
- }
-
- return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM));
-}
-
-/*
- * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from
- * an ioctl().
- */
-int
-zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
-{
- if (len == 0)
- len = 16 * 1024;
- zc->zc_nvlist_dst_size = len;
- zc->zc_nvlist_dst =
- (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
- if (zc->zc_nvlist_dst == 0)
- return (-1);
-
- return (0);
-}
-
-/*
- * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will
- * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was
- * filled in by the kernel to indicate the actual required size.
- */
-int
-zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc)
-{
- free((void *)(uintptr_t)zc->zc_nvlist_dst);
- zc->zc_nvlist_dst =
- (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
- if (zc->zc_nvlist_dst == 0)
- return (-1);
-
- return (0);
-}
-
-/*
- * Called to free the src and dst nvlists stored in the command structure.
- */
-void
-zcmd_free_nvlists(zfs_cmd_t *zc)
-{
- free((void *)(uintptr_t)zc->zc_nvlist_conf);
- free((void *)(uintptr_t)zc->zc_nvlist_src);
- free((void *)(uintptr_t)zc->zc_nvlist_dst);
- zc->zc_nvlist_conf = NULL;
- zc->zc_nvlist_src = NULL;
- zc->zc_nvlist_dst = NULL;
-}
-
-static int
-zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen,
- nvlist_t *nvl)
-{
- char *packed;
- size_t len;
-
- verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0);
-
- if ((packed = zfs_alloc(hdl, len)) == NULL)
- return (-1);
-
- verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0);
-
- *outnv = (uint64_t)(uintptr_t)packed;
- *outlen = len;
-
- return (0);
-}
-
-int
-zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
-{
- return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf,
- &zc->zc_nvlist_conf_size, nvl));
-}
-
-int
-zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
-{
- return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src,
- &zc->zc_nvlist_src_size, nvl));
-}
-
-/*
- * Unpacks an nvlist from the ZFS ioctl command structure.
- */
-int
-zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp)
-{
- if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst,
- zc->zc_nvlist_dst_size, nvlp, 0) != 0)
- return (no_memory(hdl));
-
- return (0);
-}
-
-int
-zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc)
-{
- return (ioctl(hdl->libzfs_fd, request, zc));
-}
-
-/*
- * ================================================================
- * API shared by zfs and zpool property management
- * ================================================================
- */
-
-static void
-zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
-{
- zprop_list_t *pl = cbp->cb_proplist;
- int i;
- char *title;
- size_t len;
-
- cbp->cb_first = B_FALSE;
- if (cbp->cb_scripted)
- return;
-
- /*
- * Start with the length of the column headers.
- */
- cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME"));
- cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN,
- "PROPERTY"));
- cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN,
- "VALUE"));
- cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN,
- "RECEIVED"));
- cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
- "SOURCE"));
-
- /* first property is always NAME */
- assert(cbp->cb_proplist->pl_prop ==
- ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME));
-
- /*
- * Go through and calculate the widths for each column. For the
- * 'source' column, we kludge it up by taking the worst-case scenario of
- * inheriting from the longest name. This is acceptable because in the
- * majority of cases 'SOURCE' is the last column displayed, and we don't
- * use the width anyway. Note that the 'VALUE' column can be oversized,
- * if the name of the property is much longer than any values we find.
- */
- for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
- /*
- * 'PROPERTY' column
- */
- if (pl->pl_prop != ZPROP_INVAL) {
- const char *propname = (type == ZFS_TYPE_POOL) ?
- zpool_prop_to_name(pl->pl_prop) :
- zfs_prop_to_name(pl->pl_prop);
-
- len = strlen(propname);
- if (len > cbp->cb_colwidths[GET_COL_PROPERTY])
- cbp->cb_colwidths[GET_COL_PROPERTY] = len;
- } else {
- len = strlen(pl->pl_user_prop);
- if (len > cbp->cb_colwidths[GET_COL_PROPERTY])
- cbp->cb_colwidths[GET_COL_PROPERTY] = len;
- }
-
- /*
- * 'VALUE' column. The first property is always the 'name'
- * property that was tacked on either by /sbin/zfs's
- * zfs_do_get() or when calling zprop_expand_list(), so we
- * ignore its width. If the user specified the name property
- * to display, then it will be later in the list in any case.
- */
- if (pl != cbp->cb_proplist &&
- pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
- cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;
-
- /* 'RECEIVED' column. */
- if (pl != cbp->cb_proplist &&
- pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD])
- cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width;
-
- /*
- * 'NAME' and 'SOURCE' columns
- */
- if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME :
- ZFS_PROP_NAME) &&
- pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) {
- cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width;
- cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width +
- strlen(dgettext(TEXT_DOMAIN, "inherited from"));
- }
- }
-
- /*
- * Now go through and print the headers.
- */
- for (i = 0; i < ZFS_GET_NCOLS; i++) {
- switch (cbp->cb_columns[i]) {
- case GET_COL_NAME:
- title = dgettext(TEXT_DOMAIN, "NAME");
- break;
- case GET_COL_PROPERTY:
- title = dgettext(TEXT_DOMAIN, "PROPERTY");
- break;
- case GET_COL_VALUE:
- title = dgettext(TEXT_DOMAIN, "VALUE");
- break;
- case GET_COL_RECVD:
- title = dgettext(TEXT_DOMAIN, "RECEIVED");
- break;
- case GET_COL_SOURCE:
- title = dgettext(TEXT_DOMAIN, "SOURCE");
- break;
- default:
- title = NULL;
- }
-
- if (title != NULL) {
- if (i == (ZFS_GET_NCOLS - 1) ||
- cbp->cb_columns[i + 1] == GET_COL_NONE)
- (void) printf("%s", title);
- else
- (void) printf("%-*s ",
- cbp->cb_colwidths[cbp->cb_columns[i]],
- title);
- }
- }
- (void) printf("\n");
-}
-
-/*
- * Display a single line of output, according to the settings in the callback
- * structure.
- */
-void
-zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
- const char *propname, const char *value, zprop_source_t sourcetype,
- const char *source, const char *recvd_value)
-{
- int i;
- const char *str = NULL;
- char buf[128];
-
- /*
- * Ignore those source types that the user has chosen to ignore.
- */
- if ((sourcetype & cbp->cb_sources) == 0)
- return;
-
- if (cbp->cb_first)
- zprop_print_headers(cbp, cbp->cb_type);
-
- for (i = 0; i < ZFS_GET_NCOLS; i++) {
- switch (cbp->cb_columns[i]) {
- case GET_COL_NAME:
- str = name;
- break;
-
- case GET_COL_PROPERTY:
- str = propname;
- break;
-
- case GET_COL_VALUE:
- str = value;
- break;
-
- case GET_COL_SOURCE:
- switch (sourcetype) {
- case ZPROP_SRC_NONE:
- str = "-";
- break;
-
- case ZPROP_SRC_DEFAULT:
- str = "default";
- break;
-
- case ZPROP_SRC_LOCAL:
- str = "local";
- break;
-
- case ZPROP_SRC_TEMPORARY:
- str = "temporary";
- break;
-
- case ZPROP_SRC_INHERITED:
- (void) snprintf(buf, sizeof (buf),
- "inherited from %s", source);
- str = buf;
- break;
- case ZPROP_SRC_RECEIVED:
- str = "received";
- break;
-
- default:
- str = NULL;
- assert(!"unhandled zprop_source_t");
- }
- break;
-
- case GET_COL_RECVD:
- str = (recvd_value == NULL ? "-" : recvd_value);
- break;
-
- default:
- continue;
- }
-
- if (cbp->cb_columns[i + 1] == GET_COL_NONE)
- (void) printf("%s", str);
- else if (cbp->cb_scripted)
- (void) printf("%s\t", str);
- else
- (void) printf("%-*s ",
- cbp->cb_colwidths[cbp->cb_columns[i]],
- str);
- }
-
- (void) printf("\n");
-}
-
-/*
- * Given a numeric suffix, convert the value into a number of bits that the
- * resulting value must be shifted.
- */
-static int
-str2shift(libzfs_handle_t *hdl, const char *buf)
-{
- const char *ends = "BKMGTPEZ";
- int i;
-
- if (buf[0] == '\0')
- return (0);
- for (i = 0; i < strlen(ends); i++) {
- if (toupper(buf[0]) == ends[i])
- break;
- }
- if (i == strlen(ends)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid numeric suffix '%s'"), buf);
- return (-1);
- }
-
- /*
- * We want to allow trailing 'b' characters for 'GB' or 'Mb'. But don't
- * allow 'BB' - that's just weird.
- */
- if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0' &&
- toupper(buf[0]) != 'B'))
- return (10*i);
-
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid numeric suffix '%s'"), buf);
- return (-1);
-}
-
-/*
- * Convert a string of the form '100G' into a real number. Used when setting
- * properties or creating a volume. 'buf' is used to place an extended error
- * message for the caller to use.
- */
-int
-zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
-{
- char *end;
- int shift;
-
- *num = 0;
-
- /* Check to see if this looks like a number. */
- if ((value[0] < '0' || value[0] > '9') && value[0] != '.') {
- if (hdl)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "bad numeric value '%s'"), value);
- return (-1);
- }
-
- /* Rely on strtoull() to process the numeric portion. */
- errno = 0;
- *num = strtoull(value, &end, 10);
-
- /*
- * Check for ERANGE, which indicates that the value is too large to fit
- * in a 64-bit value.
- */
- if (errno == ERANGE) {
- if (hdl)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "numeric value is too large"));
- return (-1);
- }
-
- /*
- * If we have a decimal value, then do the computation with floating
- * point arithmetic. Otherwise, use standard arithmetic.
- */
- if (*end == '.') {
- double fval = strtod(value, &end);
-
- if ((shift = str2shift(hdl, end)) == -1)
- return (-1);
-
- fval *= pow(2, shift);
-
- if (fval > UINT64_MAX) {
- if (hdl)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "numeric value is too large"));
- return (-1);
- }
-
- *num = (uint64_t)fval;
- } else {
- if ((shift = str2shift(hdl, end)) == -1)
- return (-1);
-
- /* Check for overflow */
- if (shift >= 64 || (*num << shift) >> shift != *num) {
- if (hdl)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "numeric value is too large"));
- return (-1);
- }
-
- *num <<= shift;
- }
-
- return (0);
-}
-
-/*
- * Given a propname=value nvpair to set, parse any numeric properties
- * (index, boolean, etc) if they are specified as strings and add the
- * resulting nvpair to the returned nvlist.
- *
- * At the DSL layer, all properties are either 64-bit numbers or strings.
- * We want the user to be able to ignore this fact and specify properties
- * as native values (numbers, for example) or as strings (to simplify
- * command line utilities). This also handles converting index types
- * (compression, checksum, etc) from strings to their on-disk index.
- */
-int
-zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
- zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp,
- const char *errbuf)
-{
- data_type_t datatype = nvpair_type(elem);
- zprop_type_t proptype;
- const char *propname;
- char *value;
- boolean_t isnone = B_FALSE;
- boolean_t isauto = B_FALSE;
-
- if (type == ZFS_TYPE_POOL) {
- proptype = zpool_prop_get_type(prop);
- propname = zpool_prop_to_name(prop);
- } else {
- proptype = zfs_prop_get_type(prop);
- propname = zfs_prop_to_name(prop);
- }
-
- /*
- * Convert any properties to the internal DSL value types.
- */
- *svalp = NULL;
- *ivalp = 0;
-
- switch (proptype) {
- case PROP_TYPE_STRING:
- if (datatype != DATA_TYPE_STRING) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be a string"), nvpair_name(elem));
- goto error;
- }
- (void) nvpair_value_string(elem, svalp);
- if (strlen(*svalp) >= ZFS_MAXPROPLEN) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' is too long"), nvpair_name(elem));
- goto error;
- }
- break;
-
- case PROP_TYPE_NUMBER:
- if (datatype == DATA_TYPE_STRING) {
- (void) nvpair_value_string(elem, &value);
- if (strcmp(value, "none") == 0) {
- isnone = B_TRUE;
- } else if (strcmp(value, "auto") == 0) {
- isauto = B_TRUE;
- } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) {
- goto error;
- }
- } else if (datatype == DATA_TYPE_UINT64) {
- (void) nvpair_value_uint64(elem, ivalp);
- } else {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be a number"), nvpair_name(elem));
- goto error;
- }
-
- /*
- * Quota special: force 'none' and don't allow 0.
- */
- if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone &&
- (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "use 'none' to disable quota/refquota"));
- goto error;
- }
-
- /*
- * Special handling for "*_limit=none". In this case it's not
- * 0 but UINT64_MAX.
- */
- if ((type & ZFS_TYPE_DATASET) && isnone &&
- (prop == ZFS_PROP_FILESYSTEM_LIMIT ||
- prop == ZFS_PROP_SNAPSHOT_LIMIT)) {
- *ivalp = UINT64_MAX;
- }
-
- /*
- * Special handling for setting 'refreservation' to 'auto'. Use
- * UINT64_MAX to tell the caller to use zfs_fix_auto_resv().
- * 'auto' is only allowed on volumes.
- */
- if (isauto) {
- switch (prop) {
- case ZFS_PROP_REFRESERVATION:
- if ((type & ZFS_TYPE_VOLUME) == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s=auto' only allowed on "
- "volumes"), nvpair_name(elem));
- goto error;
- }
- *ivalp = UINT64_MAX;
- break;
- default:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'auto' is invalid value for '%s'"),
- nvpair_name(elem));
- goto error;
- }
- }
-
- break;
-
- case PROP_TYPE_INDEX:
- if (datatype != DATA_TYPE_STRING) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be a string"), nvpair_name(elem));
- goto error;
- }
-
- (void) nvpair_value_string(elem, &value);
-
- if (zprop_string_to_index(prop, value, ivalp, type) != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be one of '%s'"), propname,
- zprop_values(prop, type));
- goto error;
- }
- break;
-
- default:
- abort();
- }
-
- /*
- * Add the result to our return set of properties.
- */
- if (*svalp != NULL) {
- if (nvlist_add_string(ret, propname, *svalp) != 0) {
- (void) no_memory(hdl);
- return (-1);
- }
- } else {
- if (nvlist_add_uint64(ret, propname, *ivalp) != 0) {
- (void) no_memory(hdl);
- return (-1);
- }
- }
-
- return (0);
-error:
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- return (-1);
-}
-
-static int
-addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
- zfs_type_t type)
-{
- int prop;
- zprop_list_t *entry;
-
- prop = zprop_name_to_prop(propname, type);
-
- if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type))
- prop = ZPROP_INVAL;
-
- /*
- * When no property table entry can be found, return failure if
- * this is a pool property or if this isn't a user-defined
- * dataset property,
- */
- if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL &&
- !zpool_prop_feature(propname) &&
- !zpool_prop_unsupported(propname)) ||
- (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) &&
- !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "invalid property '%s'"), propname);
- return (zfs_error(hdl, EZFS_BADPROP,
- dgettext(TEXT_DOMAIN, "bad property list")));
- }
-
- if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
- return (-1);
-
- entry->pl_prop = prop;
- if (prop == ZPROP_INVAL) {
- if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) ==
- NULL) {
- free(entry);
- return (-1);
- }
- entry->pl_width = strlen(propname);
- } else {
- entry->pl_width = zprop_width(prop, &entry->pl_fixed,
- type);
- }
-
- *listp = entry;
-
- return (0);
-}
-
-/*
- * Given a comma-separated list of properties, construct a property list
- * containing both user-defined and native properties. This function will
- * return a NULL list if 'all' is specified, which can later be expanded
- * by zprop_expand_list().
- */
-int
-zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp,
- zfs_type_t type)
-{
- *listp = NULL;
-
- /*
- * If 'all' is specified, return a NULL list.
- */
- if (strcmp(props, "all") == 0)
- return (0);
-
- /*
- * If no props were specified, return an error.
- */
- if (props[0] == '\0') {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "no properties specified"));
- return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN,
- "bad property list")));
- }
-
- /*
- * It would be nice to use getsubopt() here, but the inclusion of column
- * aliases makes this more effort than it's worth.
- */
- while (*props != '\0') {
- size_t len;
- char *p;
- char c;
-
- if ((p = strchr(props, ',')) == NULL) {
- len = strlen(props);
- p = props + len;
- } else {
- len = p - props;
- }
-
- /*
- * Check for empty options.
- */
- if (len == 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "empty property name"));
- return (zfs_error(hdl, EZFS_BADPROP,
- dgettext(TEXT_DOMAIN, "bad property list")));
- }
-
- /*
- * Check all regular property names.
- */
- c = props[len];
- props[len] = '\0';
-
- if (strcmp(props, "space") == 0) {
- static char *spaceprops[] = {
- "name", "avail", "used", "usedbysnapshots",
- "usedbydataset", "usedbyrefreservation",
- "usedbychildren", NULL
- };
- int i;
-
- for (i = 0; spaceprops[i]; i++) {
- if (addlist(hdl, spaceprops[i], listp, type))
- return (-1);
- listp = &(*listp)->pl_next;
- }
- } else {
- if (addlist(hdl, props, listp, type))
- return (-1);
- listp = &(*listp)->pl_next;
- }
-
- props = p;
- if (c == ',')
- props++;
- }
-
- return (0);
-}
-
-void
-zprop_free_list(zprop_list_t *pl)
-{
- zprop_list_t *next;
-
- while (pl != NULL) {
- next = pl->pl_next;
- free(pl->pl_user_prop);
- free(pl);
- pl = next;
- }
-}
-
-typedef struct expand_data {
- zprop_list_t **last;
- libzfs_handle_t *hdl;
- zfs_type_t type;
-} expand_data_t;
-
-int
-zprop_expand_list_cb(int prop, void *cb)
-{
- zprop_list_t *entry;
- expand_data_t *edp = cb;
-
- if ((entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t))) == NULL)
- return (ZPROP_INVAL);
-
- entry->pl_prop = prop;
- entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type);
- entry->pl_all = B_TRUE;
-
- *(edp->last) = entry;
- edp->last = &entry->pl_next;
-
- return (ZPROP_CONT);
-}
-
-int
-zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type)
-{
- zprop_list_t *entry;
- zprop_list_t **last;
- expand_data_t exp;
-
- if (*plp == NULL) {
- /*
- * If this is the very first time we've been called for an 'all'
- * specification, expand the list to include all native
- * properties.
- */
- last = plp;
-
- exp.last = last;
- exp.hdl = hdl;
- exp.type = type;
-
- if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE,
- B_FALSE, type) == ZPROP_INVAL)
- return (-1);
-
- /*
- * Add 'name' to the beginning of the list, which is handled
- * specially.
- */
- if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
- return (-1);
-
- entry->pl_prop = (type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME :
- ZFS_PROP_NAME;
- entry->pl_width = zprop_width(entry->pl_prop,
- &entry->pl_fixed, type);
- entry->pl_all = B_TRUE;
- entry->pl_next = *plp;
- *plp = entry;
- }
- return (0);
-}
-
-int
-zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered,
- zfs_type_t type)
-{
- return (zprop_iter_common(func, cb, show_all, ordered, type));
-}
-
-ulong_t
-get_system_hostid(void)
-{
- char *env;
-
- /*
- * Allow the hostid to be subverted for testing.
- */
- env = getenv("ZFS_HOSTID");
- if (env) {
- ulong_t hostid = strtoull(env, NULL, 16);
- return (hostid & 0xFFFFFFFF);
- }
-
- return (gethostid());
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
@@ -1,114 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright 2017 RackTop Systems.
- * Copyright (c) 2017 Datto Inc.
- */
-
-#ifndef _LIBZFS_CORE_H
-#define _LIBZFS_CORE_H
-
-#include <libnvpair.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/fs/zfs.h>
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int libzfs_core_init(void);
-void libzfs_core_fini(void);
-
-/*
- * NB: this type should be kept binary compatible with dmu_objset_type_t.
- */
-enum lzc_dataset_type {
- LZC_DATSET_TYPE_ZFS = 2,
- LZC_DATSET_TYPE_ZVOL
-};
-
-int lzc_remap(const char *fsname);
-int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **);
-int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *);
-int lzc_clone(const char *, const char *, nvlist_t *);
-int lzc_promote(const char *, char *, int);
-int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **);
-int lzc_bookmark(nvlist_t *, nvlist_t **);
-int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **);
-int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **);
-int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *,
- nvlist_t **);
-
-int lzc_snaprange_space(const char *, const char *, uint64_t *);
-
-int lzc_hold(nvlist_t *, int, nvlist_t **);
-int lzc_release(nvlist_t *, nvlist_t **);
-int lzc_get_holds(const char *, nvlist_t **);
-
-enum lzc_send_flags {
- LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
- LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1,
- LZC_SEND_FLAG_COMPRESS = 1 << 2
-};
-
-int lzc_send(const char *, const char *, int, enum lzc_send_flags);
-int lzc_send_resume(const char *, const char *, int,
- enum lzc_send_flags, uint64_t, uint64_t);
-int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *);
-
-struct dmu_replay_record;
-
-int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
-int lzc_receive_resumable(const char *, nvlist_t *, const char *,
- boolean_t, int);
-int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t,
- boolean_t, int, const struct dmu_replay_record *);
-
-boolean_t lzc_exists(const char *);
-
-int lzc_rollback(const char *, char *, int);
-int lzc_rollback_to(const char *, const char *);
-
-int lzc_sync(const char *, nvlist_t *, nvlist_t **);
-
-int lzc_rename(const char *, const char *);
-int lzc_destroy(const char *);
-
-int lzc_channel_program(const char *, const char *, uint64_t,
- uint64_t, nvlist_t *, nvlist_t **);
-int lzc_channel_program_nosync(const char *, const char *, uint64_t,
- uint64_t, nvlist_t *, nvlist_t **);
-
-int lzc_pool_checkpoint(const char *);
-int lzc_pool_checkpoint_discard(const char *);
-
-int lzc_set_bootenv(const char *, const char *);
-int lzc_get_bootenv(const char *, nvlist_t **);
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBZFS_CORE_H */
Index: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
@@ -1,1234 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 RackTop Systems.
- * Copyright (c) 2017 Datto Inc.
- */
-
-/*
- * LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
- * It has the following characteristics:
- *
- * - Thread Safe. libzfs_core is accessible concurrently from multiple
- * threads. This is accomplished primarily by avoiding global data
- * (e.g. caching). Since it's thread-safe, there is no reason for a
- * process to have multiple libzfs "instances". Therefore, we store
- * our few pieces of data (e.g. the file descriptor) in global
- * variables. The fd is reference-counted so that the libzfs_core
- * library can be "initialized" multiple times (e.g. by different
- * consumers within the same process).
- *
- * - Committed Interface. The libzfs_core interface will be committed,
- * therefore consumers can compile against it and be confident that
- * their code will continue to work on future releases of this code.
- * Currently, the interface is Evolving (not Committed), but we intend
- * to commit to it once it is more complete and we determine that it
- * meets the needs of all consumers.
- *
- * - Programatic Error Handling. libzfs_core communicates errors with
- * defined error numbers, and doesn't print anything to stdout/stderr.
- *
- * - Thin Layer. libzfs_core is a thin layer, marshaling arguments
- * to/from the kernel ioctls. There is generally a 1:1 correspondence
- * between libzfs_core functions and ioctls to /dev/zfs.
- *
- * - Clear Atomicity. Because libzfs_core functions are generally 1:1
- * with kernel ioctls, and kernel ioctls are general atomic, each
- * libzfs_core function is atomic. For example, creating multiple
- * snapshots with a single call to lzc_snapshot() is atomic -- it
- * can't fail with only some of the requested snapshots created, even
- * in the event of power loss or system crash.
- *
- * - Continued libzfs Support. Some higher-level operations (e.g.
- * support for "zfs send -R") are too complicated to fit the scope of
- * libzfs_core. This functionality will continue to live in libzfs.
- * Where appropriate, libzfs will use the underlying atomic operations
- * of libzfs_core. For example, libzfs may implement "zfs send -R |
- * zfs receive" by using individual "send one snapshot", rename,
- * destroy, and "receive one snapshot" operations in libzfs_core.
- * /sbin/zfs and /zbin/zpool will link with both libzfs and
- * libzfs_core. Other consumers should aim to use only libzfs_core,
- * since that will be the supported, stable interface going forwards.
- */
-
-#define _IN_LIBZFS_CORE_
-
-#include <libzfs_core.h>
-#include <ctype.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#ifdef ZFS_DEBUG
-#include <stdio.h>
-#endif
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <sys/nvpair.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/zfs_ioctl.h>
-#include "libzfs_core_compat.h"
-#include "libzfs_compat.h"
-
-#ifdef __FreeBSD__
-extern int zfs_ioctl_version;
-#endif
-
-static int g_fd = -1;
-static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
-static int g_refcount;
-
-#ifdef ZFS_DEBUG
-static zfs_ioc_t fail_ioc_cmd;
-static zfs_errno_t fail_ioc_err;
-
-static void
-libzfs_core_debug_ioc(void)
-{
- /*
- * To test running newer user space binaries with kernel's
- * that don't yet support an ioctl or a new ioctl arg we
- * provide an override to intentionally fail an ioctl.
- *
- * USAGE:
- * The override variable, ZFS_IOC_TEST, is of the form "cmd:err"
- *
- * For example, to fail a ZFS_IOC_POOL_CHECKPOINT with a
- * ZFS_ERR_IOC_CMD_UNAVAIL, the string would be "0x5a4d:1029"
- *
- * $ sudo sh -c "ZFS_IOC_TEST=0x5a4d:1029 zpool checkpoint tank"
- * cannot checkpoint 'tank': the loaded zfs module does not support
- * this operation. A reboot may be required to enable this operation.
- */
- if (fail_ioc_cmd == 0) {
- char *ioc_test = getenv("ZFS_IOC_TEST");
- unsigned int ioc_num = 0, ioc_err = 0;
-
- if (ioc_test != NULL &&
- sscanf(ioc_test, "%i:%i", &ioc_num, &ioc_err) == 2 &&
- ioc_num < ZFS_IOC_LAST) {
- fail_ioc_cmd = ioc_num;
- fail_ioc_err = ioc_err;
- }
- }
-}
-#endif
-
-int
-libzfs_core_init(void)
-{
- (void) pthread_mutex_lock(&g_lock);
- if (g_refcount == 0) {
- g_fd = open("/dev/zfs", O_RDWR);
- if (g_fd < 0) {
- (void) pthread_mutex_unlock(&g_lock);
- return (errno);
- }
- }
- g_refcount++;
-
-#ifdef ZFS_DEBUG
- libzfs_core_debug_ioc();
-#endif
- (void) pthread_mutex_unlock(&g_lock);
-
- return (0);
-}
-
-void
-libzfs_core_fini(void)
-{
- (void) pthread_mutex_lock(&g_lock);
- ASSERT3S(g_refcount, >, 0);
-
- if (g_refcount > 0)
- g_refcount--;
-
- if (g_refcount == 0 && g_fd != -1) {
- (void) close(g_fd);
- g_fd = -1;
- }
- (void) pthread_mutex_unlock(&g_lock);
-}
-
-static int
-lzc_ioctl(zfs_ioc_t ioc, const char *name,
- nvlist_t *source, nvlist_t **resultp)
-{
- zfs_cmd_t zc = { 0 };
- int error = 0;
- char *packed = NULL;
-#ifdef __FreeBSD__
- nvlist_t *oldsource;
-#endif
- size_t size = 0;
-
- ASSERT3S(g_refcount, >, 0);
- VERIFY3S(g_fd, !=, -1);
-
-#ifdef ZFS_DEBUG
- if (ioc == fail_ioc_cmd)
- return (fail_ioc_err);
-#endif
-
- if (name != NULL)
- (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
-
-#ifdef __FreeBSD__
- if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
- zfs_ioctl_version = get_zfs_ioctl_version();
-
- if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
- oldsource = source;
- error = lzc_compat_pre(&zc, &ioc, &source);
- if (error)
- return (error);
- }
-#endif
-
- if (source != NULL) {
- packed = fnvlist_pack(source, &size);
- zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
- zc.zc_nvlist_src_size = size;
- }
-
- if (resultp != NULL) {
- *resultp = NULL;
- if (ioc == ZFS_IOC_CHANNEL_PROGRAM) {
- zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source,
- ZCP_ARG_MEMLIMIT);
- } else {
- zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
- }
- zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
- malloc(zc.zc_nvlist_dst_size);
-#ifdef illumos
- if (zc.zc_nvlist_dst == NULL) {
-#else
- if (zc.zc_nvlist_dst == 0) {
-#endif
- error = ENOMEM;
- goto out;
- }
- }
-
- while (ioctl(g_fd, ioc, &zc) != 0) {
- /*
- * If ioctl exited with ENOMEM, we retry the ioctl after
- * increasing the size of the destination nvlist.
- *
- * Channel programs that exit with ENOMEM ran over the
- * lua memory sandbox; they should not be retried.
- */
- if (errno == ENOMEM && resultp != NULL &&
- ioc != ZFS_IOC_CHANNEL_PROGRAM) {
- free((void *)(uintptr_t)zc.zc_nvlist_dst);
- zc.zc_nvlist_dst_size *= 2;
- zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
- malloc(zc.zc_nvlist_dst_size);
-#ifdef illumos
- if (zc.zc_nvlist_dst == NULL) {
-#else
- if (zc.zc_nvlist_dst == 0) {
-#endif
- error = ENOMEM;
- goto out;
- }
- } else {
- error = errno;
- break;
- }
- }
-
-#ifdef __FreeBSD__
- if (zfs_ioctl_version < ZFS_IOCVER_LZC)
- lzc_compat_post(&zc, ioc);
-#endif
- if (zc.zc_nvlist_dst_filled) {
- *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
- zc.zc_nvlist_dst_size);
- }
-#ifdef __FreeBSD__
- if (zfs_ioctl_version < ZFS_IOCVER_LZC)
- lzc_compat_outnvl(&zc, ioc, resultp);
-#endif
-out:
-#ifdef __FreeBSD__
- if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
- if (source != oldsource)
- nvlist_free(source);
- source = oldsource;
- }
-#endif
- fnvlist_pack_free(packed, size);
- free((void *)(uintptr_t)zc.zc_nvlist_dst);
- return (error);
-}
-
-int
-lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props)
-{
- int error;
- nvlist_t *args = fnvlist_alloc();
- fnvlist_add_int32(args, "type", (dmu_objset_type_t)type);
- if (props != NULL)
- fnvlist_add_nvlist(args, "props", props);
- error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL);
- nvlist_free(args);
- return (error);
-}
-
-int
-lzc_clone(const char *fsname, const char *origin,
- nvlist_t *props)
-{
- int error;
- nvlist_t *args = fnvlist_alloc();
- fnvlist_add_string(args, "origin", origin);
- if (props != NULL)
- fnvlist_add_nvlist(args, "props", props);
- error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL);
- nvlist_free(args);
- return (error);
-}
-
-int
-lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen)
-{
- /*
- * The promote ioctl is still legacy, so we need to construct our
- * own zfs_cmd_t rather than using lzc_ioctl().
- */
- zfs_cmd_t zc = { 0 };
-
- ASSERT3S(g_refcount, >, 0);
- VERIFY3S(g_fd, !=, -1);
-
- (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name));
- if (ioctl(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) {
- int error = errno;
- if (error == EEXIST && snapnamebuf != NULL)
- (void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen);
- return (error);
- }
- return (0);
-}
-
-int
-lzc_remap(const char *fsname)
-{
- int error;
- nvlist_t *args = fnvlist_alloc();
- error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL);
- nvlist_free(args);
- return (error);
-}
-
-int
-lzc_rename(const char *source, const char *target)
-{
- zfs_cmd_t zc = { 0 };
- int error;
-
- ASSERT3S(g_refcount, >, 0);
- VERIFY3S(g_fd, !=, -1);
-
- (void) strlcpy(zc.zc_name, source, sizeof (zc.zc_name));
- (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
- error = ioctl(g_fd, ZFS_IOC_RENAME, &zc);
- if (error != 0)
- error = errno;
- return (error);
-}
-
-int
-lzc_destroy(const char *fsname)
-{
- int error;
-
- nvlist_t *args = fnvlist_alloc();
- error = lzc_ioctl(ZFS_IOC_DESTROY, fsname, args, NULL);
- nvlist_free(args);
- return (error);
-}
-
-/*
- * Creates snapshots.
- *
- * The keys in the snaps nvlist are the snapshots to be created.
- * They must all be in the same pool.
- *
- * The props nvlist is properties to set. Currently only user properties
- * are supported. { user:prop_name -> string value }
- *
- * The returned results nvlist will have an entry for each snapshot that failed.
- * The value will be the (int32) error code.
- *
- * The return value will be 0 if all snapshots were created, otherwise it will
- * be the errno of a (unspecified) snapshot that failed.
- */
-int
-lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
-{
- nvpair_t *elem;
- nvlist_t *args;
- int error;
- char pool[ZFS_MAX_DATASET_NAME_LEN];
-
- *errlist = NULL;
-
- /* determine the pool name */
- elem = nvlist_next_nvpair(snaps, NULL);
- if (elem == NULL)
- return (0);
- (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
- pool[strcspn(pool, "/@")] = '\0';
-
- args = fnvlist_alloc();
- fnvlist_add_nvlist(args, "snaps", snaps);
- if (props != NULL)
- fnvlist_add_nvlist(args, "props", props);
-
- error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist);
- nvlist_free(args);
-
- return (error);
-}
-
-/*
- * Destroys snapshots.
- *
- * The keys in the snaps nvlist are the snapshots to be destroyed.
- * They must all be in the same pool.
- *
- * Snapshots that do not exist will be silently ignored.
- *
- * If 'defer' is not set, and a snapshot has user holds or clones, the
- * destroy operation will fail and none of the snapshots will be
- * destroyed.
- *
- * If 'defer' is set, and a snapshot has user holds or clones, it will be
- * marked for deferred destruction, and will be destroyed when the last hold
- * or clone is removed/destroyed.
- *
- * The return value will be 0 if all snapshots were destroyed (or marked for
- * later destruction if 'defer' is set) or didn't exist to begin with.
- *
- * Otherwise the return value will be the errno of a (unspecified) snapshot
- * that failed, no snapshots will be destroyed, and the errlist will have an
- * entry for each snapshot that failed. The value in the errlist will be
- * the (int32) error code.
- */
-int
-lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist)
-{
- nvpair_t *elem;
- nvlist_t *args;
- int error;
- char pool[ZFS_MAX_DATASET_NAME_LEN];
-
- /* determine the pool name */
- elem = nvlist_next_nvpair(snaps, NULL);
- if (elem == NULL)
- return (0);
- (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
- pool[strcspn(pool, "/@")] = '\0';
-
- args = fnvlist_alloc();
- fnvlist_add_nvlist(args, "snaps", snaps);
- if (defer)
- fnvlist_add_boolean(args, "defer");
-
- error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist);
- nvlist_free(args);
-
- return (error);
-}
-
-int
-lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
- uint64_t *usedp)
-{
- nvlist_t *args;
- nvlist_t *result;
- int err;
- char fs[ZFS_MAX_DATASET_NAME_LEN];
- char *atp;
-
- /* determine the fs name */
- (void) strlcpy(fs, firstsnap, sizeof (fs));
- atp = strchr(fs, '@');
- if (atp == NULL)
- return (EINVAL);
- *atp = '\0';
-
- args = fnvlist_alloc();
- fnvlist_add_string(args, "firstsnap", firstsnap);
-
- err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result);
- nvlist_free(args);
- if (err == 0)
- *usedp = fnvlist_lookup_uint64(result, "used");
- fnvlist_free(result);
-
- return (err);
-}
-
-boolean_t
-lzc_exists(const char *dataset)
-{
- /*
- * The objset_stats ioctl is still legacy, so we need to construct our
- * own zfs_cmd_t rather than using lzc_ioctl().
- */
- zfs_cmd_t zc = { 0 };
-
- ASSERT3S(g_refcount, >, 0);
- VERIFY3S(g_fd, !=, -1);
-
- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
- return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0);
-}
-
-/*
- * outnvl is unused.
- * It was added to preserve the function signature in case it is
- * needed in the future.
- */
-/*ARGSUSED*/
-int
-lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl)
-{
- return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL));
-}
-
-/*
- * Create "user holds" on snapshots. If there is a hold on a snapshot,
- * the snapshot can not be destroyed. (However, it can be marked for deletion
- * by lzc_destroy_snaps(defer=B_TRUE).)
- *
- * The keys in the nvlist are snapshot names.
- * The snapshots must all be in the same pool.
- * The value is the name of the hold (string type).
- *
- * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
- * In this case, when the cleanup_fd is closed (including on process
- * termination), the holds will be released. If the system is shut down
- * uncleanly, the holds will be released when the pool is next opened
- * or imported.
- *
- * Holds for snapshots which don't exist will be skipped and have an entry
- * added to errlist, but will not cause an overall failure.
- *
- * The return value will be 0 if all holds, for snapshots that existed,
- * were succesfully created.
- *
- * Otherwise the return value will be the errno of a (unspecified) hold that
- * failed and no holds will be created.
- *
- * In all cases the errlist will have an entry for each hold that failed
- * (name = snapshot), with its value being the error code (int32).
- */
-int
-lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
-{
- char pool[ZFS_MAX_DATASET_NAME_LEN];
- nvlist_t *args;
- nvpair_t *elem;
- int error;
-
- /* determine the pool name */
- elem = nvlist_next_nvpair(holds, NULL);
- if (elem == NULL)
- return (0);
- (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
- pool[strcspn(pool, "/@")] = '\0';
-
- args = fnvlist_alloc();
- fnvlist_add_nvlist(args, "holds", holds);
- if (cleanup_fd != -1)
- fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
-
- error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
- nvlist_free(args);
- return (error);
-}
-
-/*
- * Release "user holds" on snapshots. If the snapshot has been marked for
- * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
- * any clones, and all the user holds are removed, then the snapshot will be
- * destroyed.
- *
- * The keys in the nvlist are snapshot names.
- * The snapshots must all be in the same pool.
- * The value is a nvlist whose keys are the holds to remove.
- *
- * Holds which failed to release because they didn't exist will have an entry
- * added to errlist, but will not cause an overall failure.
- *
- * The return value will be 0 if the nvl holds was empty or all holds that
- * existed, were successfully removed.
- *
- * Otherwise the return value will be the errno of a (unspecified) hold that
- * failed to release and no holds will be released.
- *
- * In all cases the errlist will have an entry for each hold that failed to
- * to release.
- */
-int
-lzc_release(nvlist_t *holds, nvlist_t **errlist)
-{
- char pool[ZFS_MAX_DATASET_NAME_LEN];
- nvpair_t *elem;
-
- /* determine the pool name */
- elem = nvlist_next_nvpair(holds, NULL);
- if (elem == NULL)
- return (0);
- (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
- pool[strcspn(pool, "/@")] = '\0';
-
- return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
-}
-
-/*
- * Retrieve list of user holds on the specified snapshot.
- *
- * On success, *holdsp will be set to a nvlist which the caller must free.
- * The keys are the names of the holds, and the value is the creation time
- * of the hold (uint64) in seconds since the epoch.
- */
-int
-lzc_get_holds(const char *snapname, nvlist_t **holdsp)
-{
- return (lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, NULL, holdsp));
-}
-
-/*
- * Generate a zfs send stream for the specified snapshot and write it to
- * the specified file descriptor.
- *
- * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
- *
- * If "from" is NULL, a full (non-incremental) stream will be sent.
- * If "from" is non-NULL, it must be the full name of a snapshot or
- * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
- * "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or
- * bookmark must represent an earlier point in the history of "snapname").
- * It can be an earlier snapshot in the same filesystem or zvol as "snapname",
- * or it can be the origin of "snapname"'s filesystem, or an earlier
- * snapshot in the origin, etc.
- *
- * "fd" is the file descriptor to write the send stream to.
- *
- * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
- * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
- * records with drr_blksz > 128K.
- *
- * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
- * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
- * which the receiving system must support (as indicated by support
- * for the "embedded_data" feature).
- */
-int
-lzc_send(const char *snapname, const char *from, int fd,
- enum lzc_send_flags flags)
-{
- return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
-}
-
-int
-lzc_send_resume(const char *snapname, const char *from, int fd,
- enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
-{
- nvlist_t *args;
- int err;
-
- args = fnvlist_alloc();
- fnvlist_add_int32(args, "fd", fd);
- if (from != NULL)
- fnvlist_add_string(args, "fromsnap", from);
- if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
- fnvlist_add_boolean(args, "largeblockok");
- if (flags & LZC_SEND_FLAG_EMBED_DATA)
- fnvlist_add_boolean(args, "embedok");
- if (flags & LZC_SEND_FLAG_COMPRESS)
- fnvlist_add_boolean(args, "compressok");
- if (resumeobj != 0 || resumeoff != 0) {
- fnvlist_add_uint64(args, "resume_object", resumeobj);
- fnvlist_add_uint64(args, "resume_offset", resumeoff);
- }
- err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
- nvlist_free(args);
- return (err);
-}
-
-/*
- * "from" can be NULL, a snapshot, or a bookmark.
- *
- * If from is NULL, a full (non-incremental) stream will be estimated. This
- * is calculated very efficiently.
- *
- * If from is a snapshot, lzc_send_space uses the deadlists attached to
- * each snapshot to efficiently estimate the stream size.
- *
- * If from is a bookmark, the indirect blocks in the destination snapshot
- * are traversed, looking for blocks with a birth time since the creation TXG of
- * the snapshot this bookmark was created from. This will result in
- * significantly more I/O and be less efficient than a send space estimation on
- * an equivalent snapshot.
- */
-int
-lzc_send_space(const char *snapname, const char *from,
- enum lzc_send_flags flags, uint64_t *spacep)
-{
- nvlist_t *args;
- nvlist_t *result;
- int err;
-
- args = fnvlist_alloc();
- if (from != NULL)
- fnvlist_add_string(args, "from", from);
- if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
- fnvlist_add_boolean(args, "largeblockok");
- if (flags & LZC_SEND_FLAG_EMBED_DATA)
- fnvlist_add_boolean(args, "embedok");
- if (flags & LZC_SEND_FLAG_COMPRESS)
- fnvlist_add_boolean(args, "compressok");
- err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
- nvlist_free(args);
- if (err == 0)
- *spacep = fnvlist_lookup_uint64(result, "space");
- nvlist_free(result);
- return (err);
-}
-
-static int
-recv_read(int fd, void *buf, int ilen)
-{
- char *cp = buf;
- int rv;
- int len = ilen;
-
- do {
- rv = read(fd, cp, len);
- cp += rv;
- len -= rv;
- } while (rv > 0);
-
- if (rv < 0 || len != 0)
- return (EIO);
-
- return (0);
-}
-
-static int
-recv_impl(const char *snapname, nvlist_t *props, const char *origin,
- boolean_t force, boolean_t resumable, int fd,
- const dmu_replay_record_t *begin_record)
-{
- /*
- * The receive ioctl is still legacy, so we need to construct our own
- * zfs_cmd_t rather than using zfsc_ioctl().
- */
- zfs_cmd_t zc = { 0 };
- char *atp;
- char *packed = NULL;
- size_t size;
- int error;
-
- ASSERT3S(g_refcount, >, 0);
- VERIFY3S(g_fd, !=, -1);
-
- /* zc_name is name of containing filesystem */
- (void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name));
- atp = strchr(zc.zc_name, '@');
- if (atp == NULL)
- return (EINVAL);
- *atp = '\0';
-
- /* if the fs does not exist, try its parent. */
- if (!lzc_exists(zc.zc_name)) {
- char *slashp = strrchr(zc.zc_name, '/');
- if (slashp == NULL)
- return (ENOENT);
- *slashp = '\0';
-
- }
-
- /* zc_value is full name of the snapshot to create */
- (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-
- if (props != NULL) {
- /* zc_nvlist_src is props to set */
- packed = fnvlist_pack(props, &size);
- zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
- zc.zc_nvlist_src_size = size;
- }
-
- /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */
- if (origin != NULL)
- (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
-
- /* zc_begin_record is non-byteswapped BEGIN record */
- if (begin_record == NULL) {
- error = recv_read(fd, &zc.zc_begin_record,
- sizeof (zc.zc_begin_record));
- if (error != 0)
- goto out;
- } else {
- zc.zc_begin_record = *begin_record;
- }
-
- /* zc_cookie is fd to read from */
- zc.zc_cookie = fd;
-
- /* zc guid is force flag */
- zc.zc_guid = force;
-
- zc.zc_resumable = resumable;
-
- /* zc_cleanup_fd is unused */
- zc.zc_cleanup_fd = -1;
-
- error = ioctl(g_fd, ZFS_IOC_RECV, &zc);
- if (error != 0)
- error = errno;
-
-out:
- if (packed != NULL)
- fnvlist_pack_free(packed, size);
- free((void*)(uintptr_t)zc.zc_nvlist_dst);
- return (error);
-}
-
-/*
- * The simplest receive case: receive from the specified fd, creating the
- * specified snapshot. Apply the specified properties as "received" properties
- * (which can be overridden by locally-set properties). If the stream is a
- * clone, its origin snapshot must be specified by 'origin'. The 'force'
- * flag will cause the target filesystem to be rolled back or destroyed if
- * necessary to receive.
- *
- * Return 0 on success or an errno on failure.
- *
- * Note: this interface does not work on dedup'd streams
- * (those with DMU_BACKUP_FEATURE_DEDUP).
- */
-int
-lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
- boolean_t force, int fd)
-{
- return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL));
-}
-
-/*
- * Like lzc_receive, but if the receive fails due to premature stream
- * termination, the intermediate state will be preserved on disk. In this
- * case, ECKSUM will be returned. The receive may subsequently be resumed
- * with a resuming send stream generated by lzc_send_resume().
- */
-int
-lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
- boolean_t force, int fd)
-{
- return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL));
-}
-
-/*
- * Like lzc_receive, but allows the caller to read the begin record and then to
- * pass it in. That could be useful if the caller wants to derive, for example,
- * the snapname or the origin parameters based on the information contained in
- * the begin record.
- * The begin record must be in its original form as read from the stream,
- * in other words, it should not be byteswapped.
- *
- * The 'resumable' parameter allows to obtain the same behavior as with
- * lzc_receive_resumable.
- */
-int
-lzc_receive_with_header(const char *snapname, nvlist_t *props,
- const char *origin, boolean_t force, boolean_t resumable, int fd,
- const dmu_replay_record_t *begin_record)
-{
- if (begin_record == NULL)
- return (EINVAL);
- return (recv_impl(snapname, props, origin, force, resumable, fd,
- begin_record));
-}
-
-/*
- * Roll back this filesystem or volume to its most recent snapshot.
- * If snapnamebuf is not NULL, it will be filled in with the name
- * of the most recent snapshot.
- * Note that the latest snapshot may change if a new one is concurrently
- * created or the current one is destroyed. lzc_rollback_to can be used
- * to roll back to a specific latest snapshot.
- *
- * Return 0 on success or an errno on failure.
- */
-int
-lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen)
-{
- nvlist_t *args;
- nvlist_t *result;
- int err;
-
- args = fnvlist_alloc();
- err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
- nvlist_free(args);
- if (err == 0 && snapnamebuf != NULL) {
- const char *snapname = fnvlist_lookup_string(result, "target");
- (void) strlcpy(snapnamebuf, snapname, snapnamelen);
- }
- nvlist_free(result);
-
- return (err);
-}
-
-/*
- * Roll back this filesystem or volume to the specified snapshot,
- * if possible.
- *
- * Return 0 on success or an errno on failure.
- */
-int
-lzc_rollback_to(const char *fsname, const char *snapname)
-{
- nvlist_t *args;
- nvlist_t *result;
- int err;
-
- args = fnvlist_alloc();
- fnvlist_add_string(args, "target", snapname);
- err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
- nvlist_free(args);
- nvlist_free(result);
- return (err);
-}
-
-/*
- * Creates bookmarks.
- *
- * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to
- * the name of the snapshot (e.g. "pool/fs@snap"). All the bookmarks and
- * snapshots must be in the same pool.
- *
- * The returned results nvlist will have an entry for each bookmark that failed.
- * The value will be the (int32) error code.
- *
- * The return value will be 0 if all bookmarks were created, otherwise it will
- * be the errno of a (undetermined) bookmarks that failed.
- */
-int
-lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist)
-{
- nvpair_t *elem;
- int error;
- char pool[ZFS_MAX_DATASET_NAME_LEN];
-
- /* determine the pool name */
- elem = nvlist_next_nvpair(bookmarks, NULL);
- if (elem == NULL)
- return (0);
- (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
- pool[strcspn(pool, "/#")] = '\0';
-
- error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist);
-
- return (error);
-}
-
-/*
- * Retrieve bookmarks.
- *
- * Retrieve the list of bookmarks for the given file system. The props
- * parameter is an nvlist of property names (with no values) that will be
- * returned for each bookmark.
- *
- * The following are valid properties on bookmarks, all of which are numbers
- * (represented as uint64 in the nvlist)
- *
- * "guid" - globally unique identifier of the snapshot it refers to
- * "createtxg" - txg when the snapshot it refers to was created
- * "creation" - timestamp when the snapshot it refers to was created
- *
- * The format of the returned nvlist as follows:
- * <short name of bookmark> -> {
- * <name of property> -> {
- * "value" -> uint64
- * }
- * }
- */
-int
-lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks)
-{
- return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks));
-}
-
-/*
- * Destroys bookmarks.
- *
- * The keys in the bmarks nvlist are the bookmarks to be destroyed.
- * They must all be in the same pool. Bookmarks are specified as
- * <fs>#<bmark>.
- *
- * Bookmarks that do not exist will be silently ignored.
- *
- * The return value will be 0 if all bookmarks that existed were destroyed.
- *
- * Otherwise the return value will be the errno of a (undetermined) bookmark
- * that failed, no bookmarks will be destroyed, and the errlist will have an
- * entry for each bookmarks that failed. The value in the errlist will be
- * the (int32) error code.
- */
-int
-lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
-{
- nvpair_t *elem;
- int error;
- char pool[ZFS_MAX_DATASET_NAME_LEN];
-
- /* determine the pool name */
- elem = nvlist_next_nvpair(bmarks, NULL);
- if (elem == NULL)
- return (0);
- (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
- pool[strcspn(pool, "/#")] = '\0';
-
- error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist);
-
- return (error);
-}
-
-static int
-lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync,
- uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
-{
- int error;
- nvlist_t *args;
-
- args = fnvlist_alloc();
- fnvlist_add_string(args, ZCP_ARG_PROGRAM, program);
- fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl);
- fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync);
- fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit);
- fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit);
- error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl);
- fnvlist_free(args);
-
- return (error);
-}
-
-/*
- * Executes a channel program.
- *
- * If this function returns 0 the channel program was successfully loaded and
- * ran without failing. Note that individual commands the channel program ran
- * may have failed and the channel program is responsible for reporting such
- * errors through outnvl if they are important.
- *
- * This method may also return:
- *
- * EINVAL The program contains syntax errors, or an invalid memory or time
- * limit was given. No part of the channel program was executed.
- * If caused by syntax errors, 'outnvl' contains information about the
- * errors.
- *
- * EDOM The program was executed, but encountered a runtime error, such as
- * calling a function with incorrect arguments, invoking the error()
- * function directly, failing an assert() command, etc. Some portion
- * of the channel program may have executed and committed changes.
- * Information about the failure can be found in 'outnvl'.
- *
- * ENOMEM The program fully executed, but the output buffer was not large
- * enough to store the returned value. No output is returned through
- * 'outnvl'.
- *
- * ENOSPC The program was terminated because it exceeded its memory usage
- * limit. Some portion of the channel program may have executed and
- * committed changes to disk. No output is returned through 'outnvl'.
- *
- * ETIMEDOUT The program was terminated because it exceeded its Lua instruction
- * limit. Some portion of the channel program may have executed and
- * committed changes to disk. No output is returned through 'outnvl'.
- */
-int
-lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
- uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
-{
- return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit,
- memlimit, argnvl, outnvl));
-}
-
-/*
- * Creates a checkpoint for the specified pool.
- *
- * If this function returns 0 the pool was successfully checkpointed.
- *
- * This method may also return:
- *
- * ZFS_ERR_CHECKPOINT_EXISTS
- * The pool already has a checkpoint. A pools can only have one
- * checkpoint at most, at any given time.
- *
- * ZFS_ERR_DISCARDING_CHECKPOINT
- * ZFS is in the middle of discarding a checkpoint for this pool.
- * The pool can be checkpointed again once the discard is done.
- *
- * ZFS_DEVRM_IN_PROGRESS
- * A vdev is currently being removed. The pool cannot be
- * checkpointed until the device removal is done.
- *
- * ZFS_VDEV_TOO_BIG
- * One or more top-level vdevs exceed the maximum vdev size
- * supported for this feature.
- */
-int
-lzc_pool_checkpoint(const char *pool)
-{
- int error;
-
- nvlist_t *result = NULL;
- nvlist_t *args = fnvlist_alloc();
-
- error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result);
-
- fnvlist_free(args);
- fnvlist_free(result);
-
- return (error);
-}
-
-/*
- * Discard the checkpoint from the specified pool.
- *
- * If this function returns 0 the checkpoint was successfully discarded.
- *
- * This method may also return:
- *
- * ZFS_ERR_NO_CHECKPOINT
- * The pool does not have a checkpoint.
- *
- * ZFS_ERR_DISCARDING_CHECKPOINT
- * ZFS is already in the middle of discarding the checkpoint.
- */
-int
-lzc_pool_checkpoint_discard(const char *pool)
-{
- int error;
-
- nvlist_t *result = NULL;
- nvlist_t *args = fnvlist_alloc();
-
- error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result);
-
- fnvlist_free(args);
- fnvlist_free(result);
-
- return (error);
-}
-
-/*
- * Executes a read-only channel program.
- *
- * A read-only channel program works programmatically the same way as a
- * normal channel program executed with lzc_channel_program(). The only
- * difference is it runs exclusively in open-context and therefore can
- * return faster. The downside to that, is that the program cannot change
- * on-disk state by calling functions from the zfs.sync submodule.
- *
- * The return values of this function (and their meaning) are exactly the
- * same as the ones described in lzc_channel_program().
- */
-int
-lzc_channel_program_nosync(const char *pool, const char *program,
- uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
-{
- return (lzc_channel_program_impl(pool, program, B_FALSE, timeout,
- memlimit, argnvl, outnvl));
-}
-
-/*
- * Changes initializing state.
- *
- * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID.
- * The key is ignored.
- *
- * If there are errors related to vdev arguments, per-vdev errors are returned
- * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where
- * guid is stringified with PRIu64, and errno is one of the following as
- * an int64_t:
- * - ENODEV if the device was not found
- * - EINVAL if the devices is not a leaf or is not concrete (e.g. missing)
- * - EROFS if the device is not writeable
- * - EBUSY start requested but the device is already being initialized
- * - ESRCH cancel/suspend requested but device is not being initialized
- *
- * If the errlist is empty, then return value will be:
- * - EINVAL if one or more arguments was invalid
- * - Other spa_open failures
- * - 0 if the operation succeeded
- */
-int
-lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
- nvlist_t *vdevs, nvlist_t **errlist)
-{
- int error;
- nvlist_t *args = fnvlist_alloc();
- fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type);
- fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
-
- error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist);
-
- fnvlist_free(args);
-
- return (error);
-}
-
-/*
- * Set the bootenv contents for the given pool.
- */
-int
-lzc_set_bootenv(const char *pool, const char *env)
-{
- nvlist_t *args = fnvlist_alloc();
- fnvlist_add_string(args, "envmap", env);
- int error = lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, args, NULL);
- fnvlist_free(args);
- return (error);
-}
-
-/*
- * Get the contents of the bootenv of the given pool.
- */
-int
-lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
-{
- return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
-}
Index: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h
@@ -1,47 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013 by Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- */
-
-#ifndef _LIBZFS_CORE_COMPAT_H
-#define _LIBZFS_CORE_COMPAT_H
-
-#include <libnvpair.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ioctl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int lzc_compat_pre(zfs_cmd_t *, zfs_ioc_t *, nvlist_t **);
-void lzc_compat_post(zfs_cmd_t *, const zfs_ioc_t);
-int lzc_compat_outnvl(zfs_cmd_t *, const zfs_ioc_t, nvlist_t **);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LIBZFS_CORE_COMPAT_H */
Index: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c
@@ -1,189 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- */
-
-#include <sys/zfs_ioctl.h>
-#include <zfs_ioctl_compat.h>
-#include "libzfs_core_compat.h"
-
-extern int zfs_ioctl_version;
-
-int
-lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source)
-{
- nvlist_t *nvl = NULL;
- nvpair_t *pair, *hpair;
- char *buf, *val;
- zfs_ioc_t vecnum;
- uint32_t type32;
- int32_t cleanup_fd;
- int error = 0;
- int pos;
-
- if (zfs_ioctl_version >= ZFS_IOCVER_LZC)
- return (0);
-
- vecnum = *ioc;
-
- switch (vecnum) {
- case ZFS_IOC_CREATE:
- type32 = fnvlist_lookup_int32(*source, "type");
- zc->zc_objset_type = (uint64_t)type32;
- nvlist_lookup_nvlist(*source, "props", &nvl);
- *source = nvl;
- break;
- case ZFS_IOC_CLONE:
- buf = fnvlist_lookup_string(*source, "origin");
- strlcpy(zc->zc_value, buf, MAXPATHLEN);
- nvlist_lookup_nvlist(*source, "props", &nvl);
- *ioc = ZFS_IOC_CREATE;
- *source = nvl;
- break;
- case ZFS_IOC_SNAPSHOT:
- nvl = fnvlist_lookup_nvlist(*source, "snaps");
- pair = nvlist_next_nvpair(nvl, NULL);
- if (pair != NULL) {
- buf = nvpair_name(pair);
- pos = strcspn(buf, "@");
- strlcpy(zc->zc_name, buf, pos + 1);
- strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN);
- } else
- error = EINVAL;
- /* old kernel cannot create multiple snapshots */
- if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
- error = EOPNOTSUPP;
- nvlist_free(nvl);
- nvl = NULL;
- nvlist_lookup_nvlist(*source, "props", &nvl);
- *source = nvl;
- break;
- case ZFS_IOC_SPACE_SNAPS:
- buf = fnvlist_lookup_string(*source, "firstsnap");
- strlcpy(zc->zc_value, buf, MAXPATHLEN);
- break;
- case ZFS_IOC_DESTROY_SNAPS:
- nvl = fnvlist_lookup_nvlist(*source, "snaps");
- pair = nvlist_next_nvpair(nvl, NULL);
- if (pair != NULL) {
- buf = nvpair_name(pair);
- pos = strcspn(buf, "@");
- strlcpy(zc->zc_name, buf, pos + 1);
- } else
- error = EINVAL;
- /* old kernel cannot atomically destroy multiple snaps */
- if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
- error = EOPNOTSUPP;
- *source = nvl;
- break;
- case ZFS_IOC_HOLD:
- nvl = fnvlist_lookup_nvlist(*source, "holds");
- pair = nvlist_next_nvpair(nvl, NULL);
- if (pair != NULL) {
- buf = nvpair_name(pair);
- pos = strcspn(buf, "@");
- strlcpy(zc->zc_name, buf, pos + 1);
- strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN);
- if (nvpair_value_string(pair, &val) == 0)
- strlcpy(zc->zc_string, val, MAXNAMELEN);
- else
- error = EINVAL;
- } else
- error = EINVAL;
- /* old kernel cannot atomically create multiple holds */
- if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
- error = EOPNOTSUPP;
- nvlist_free(nvl);
- if (nvlist_lookup_int32(*source, "cleanup_fd",
- &cleanup_fd) == 0)
- zc->zc_cleanup_fd = cleanup_fd;
- else
- zc->zc_cleanup_fd = -1;
- break;
- case ZFS_IOC_RELEASE:
- pair = nvlist_next_nvpair(*source, NULL);
- if (pair != NULL) {
- buf = nvpair_name(pair);
- pos = strcspn(buf, "@");
- strlcpy(zc->zc_name, buf, pos + 1);
- strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN);
- if (nvpair_value_nvlist(pair, &nvl) == 0) {
- hpair = nvlist_next_nvpair(nvl, NULL);
- if (hpair != NULL)
- strlcpy(zc->zc_string,
- nvpair_name(hpair), MAXNAMELEN);
- else
- error = EINVAL;
- if (!error && nvlist_next_nvpair(nvl,
- hpair) != NULL)
- error = EOPNOTSUPP;
- } else
- error = EINVAL;
- } else
- error = EINVAL;
- /* old kernel cannot atomically release multiple holds */
- if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
- error = EOPNOTSUPP;
- break;
- }
-
- return (error);
-}
-
-void
-lzc_compat_post(zfs_cmd_t *zc, const zfs_ioc_t ioc)
-{
- if (zfs_ioctl_version >= ZFS_IOCVER_LZC)
- return;
-
- switch (ioc) {
- case ZFS_IOC_CREATE:
- case ZFS_IOC_CLONE:
- case ZFS_IOC_SNAPSHOT:
- case ZFS_IOC_SPACE_SNAPS:
- case ZFS_IOC_DESTROY_SNAPS:
- zc->zc_nvlist_dst_filled = B_FALSE;
- break;
- }
-}
-
-int
-lzc_compat_outnvl(zfs_cmd_t *zc, const zfs_ioc_t ioc, nvlist_t **outnvl)
-{
- nvlist_t *nvl;
-
- if (zfs_ioctl_version >= ZFS_IOCVER_LZC)
- return (0);
-
- switch (ioc) {
- case ZFS_IOC_SPACE_SNAPS:
- nvl = fnvlist_alloc();
- fnvlist_add_uint64(nvl, "used", zc->zc_cookie);
- fnvlist_add_uint64(nvl, "compressed", zc->zc_objset_type);
- fnvlist_add_uint64(nvl, "uncompressed", zc->zc_perm_action);
- *outnvl = nvl;
- break;
- }
-
- return (0);
-}
Index: head/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
@@ -1,1238 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- */
-
-#include <assert.h>
-#include <fcntl.h>
-#include <poll.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
-#include <libgen.h>
-#include <sys/assfail.h>
-#include <sys/spa.h>
-#include <sys/stat.h>
-#include <sys/processor.h>
-#include <sys/zfs_context.h>
-#include <sys/rrwlock.h>
-#include <sys/zmod.h>
-#include <sys/utsname.h>
-#include <sys/systeminfo.h>
-#include <libzfs.h>
-
-/*
- * Emulation of kernel services in userland.
- */
-
-#ifndef __FreeBSD__
-int aok;
-#endif
-uint64_t physmem;
-vnode_t *rootdir = (vnode_t *)0xabcd1234;
-char hw_serial[HW_HOSTID_LEN];
-#ifdef illumos
-kmutex_t cpu_lock;
-#endif
-
-/* If set, all blocks read will be copied to the specified directory. */
-char *vn_dumpdir = NULL;
-
-struct utsname utsname = {
- "userland", "libzpool", "1", "1", "na"
-};
-
-/* this only exists to have its address taken */
-struct proc p0;
-
-/*
- * =========================================================================
- * threads
- * =========================================================================
- */
-/*ARGSUSED*/
-kthread_t *
-zk_thread_create(void (*func)(), void *arg)
-{
- thread_t tid;
-
- VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
- &tid) == 0);
-
- return ((void *)(uintptr_t)tid);
-}
-
-/*
- * =========================================================================
- * kstats
- * =========================================================================
- */
-/*ARGSUSED*/
-kstat_t *
-kstat_create(char *module, int instance, char *name, char *class,
- uchar_t type, ulong_t ndata, uchar_t ks_flag)
-{
- return (NULL);
-}
-
-/*ARGSUSED*/
-void
-kstat_named_init(kstat_named_t *knp, const char *name, uchar_t type)
-{}
-
-/*ARGSUSED*/
-void
-kstat_install(kstat_t *ksp)
-{}
-
-/*ARGSUSED*/
-void
-kstat_delete(kstat_t *ksp)
-{}
-
-/*
- * =========================================================================
- * mutexes
- * =========================================================================
- */
-void
-zmutex_init(kmutex_t *mp)
-{
- mp->m_owner = NULL;
- mp->initialized = B_TRUE;
- (void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
-}
-
-void
-zmutex_destroy(kmutex_t *mp)
-{
- ASSERT(mp->initialized == B_TRUE);
- ASSERT(mp->m_owner == NULL);
- (void) _mutex_destroy(&(mp)->m_lock);
- mp->m_owner = (void *)-1UL;
- mp->initialized = B_FALSE;
-}
-
-int
-zmutex_owned(kmutex_t *mp)
-{
- ASSERT(mp->initialized == B_TRUE);
-
- return (mp->m_owner == curthread);
-}
-
-void
-mutex_enter(kmutex_t *mp)
-{
- ASSERT(mp->initialized == B_TRUE);
- ASSERT(mp->m_owner != (void *)-1UL);
- ASSERT(mp->m_owner != curthread);
- VERIFY(mutex_lock(&mp->m_lock) == 0);
- ASSERT(mp->m_owner == NULL);
- mp->m_owner = curthread;
-}
-
-int
-mutex_tryenter(kmutex_t *mp)
-{
- ASSERT(mp->initialized == B_TRUE);
- ASSERT(mp->m_owner != (void *)-1UL);
- if (0 == mutex_trylock(&mp->m_lock)) {
- ASSERT(mp->m_owner == NULL);
- mp->m_owner = curthread;
- return (1);
- } else {
- return (0);
- }
-}
-
-void
-mutex_exit(kmutex_t *mp)
-{
- ASSERT(mp->initialized == B_TRUE);
- ASSERT(mutex_owner(mp) == curthread);
- mp->m_owner = NULL;
- VERIFY(mutex_unlock(&mp->m_lock) == 0);
-}
-
-void *
-mutex_owner(kmutex_t *mp)
-{
- ASSERT(mp->initialized == B_TRUE);
- return (mp->m_owner);
-}
-
-/*
- * =========================================================================
- * rwlocks
- * =========================================================================
- */
-/*ARGSUSED*/
-void
-rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
-{
- rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
- rwlp->rw_owner = NULL;
- rwlp->initialized = B_TRUE;
- rwlp->rw_count = 0;
-}
-
-void
-rw_destroy(krwlock_t *rwlp)
-{
- ASSERT(rwlp->rw_count == 0);
- rwlock_destroy(&rwlp->rw_lock);
- rwlp->rw_owner = (void *)-1UL;
- rwlp->initialized = B_FALSE;
-}
-
-void
-rw_enter(krwlock_t *rwlp, krw_t rw)
-{
- //ASSERT(!RW_LOCK_HELD(rwlp));
- ASSERT(rwlp->initialized == B_TRUE);
- ASSERT(rwlp->rw_owner != (void *)-1UL);
- ASSERT(rwlp->rw_owner != curthread);
-
- if (rw == RW_READER) {
- VERIFY(rw_rdlock(&rwlp->rw_lock) == 0);
- ASSERT(rwlp->rw_count >= 0);
- atomic_add_int(&rwlp->rw_count, 1);
- } else {
- VERIFY(rw_wrlock(&rwlp->rw_lock) == 0);
- ASSERT(rwlp->rw_count == 0);
- rwlp->rw_count = -1;
- rwlp->rw_owner = curthread;
- }
-}
-
-void
-rw_exit(krwlock_t *rwlp)
-{
- ASSERT(rwlp->initialized == B_TRUE);
- ASSERT(rwlp->rw_owner != (void *)-1UL);
-
- if (rwlp->rw_owner == curthread) {
- /* Write locked. */
- ASSERT(rwlp->rw_count == -1);
- rwlp->rw_count = 0;
- rwlp->rw_owner = NULL;
- } else {
- /* Read locked. */
- ASSERT(rwlp->rw_count > 0);
- atomic_add_int(&rwlp->rw_count, -1);
- }
- VERIFY(rw_unlock(&rwlp->rw_lock) == 0);
-}
-
-int
-rw_tryenter(krwlock_t *rwlp, krw_t rw)
-{
- int rv;
-
- ASSERT(rwlp->initialized == B_TRUE);
- ASSERT(rwlp->rw_owner != (void *)-1UL);
- ASSERT(rwlp->rw_owner != curthread);
-
- if (rw == RW_READER)
- rv = rw_tryrdlock(&rwlp->rw_lock);
- else
- rv = rw_trywrlock(&rwlp->rw_lock);
-
- if (rv == 0) {
- ASSERT(rwlp->rw_owner == NULL);
- if (rw == RW_READER) {
- ASSERT(rwlp->rw_count >= 0);
- atomic_add_int(&rwlp->rw_count, 1);
- } else {
- ASSERT(rwlp->rw_count == 0);
- rwlp->rw_count = -1;
- rwlp->rw_owner = curthread;
- }
- return (1);
- }
-
- return (0);
-}
-
-/*ARGSUSED*/
-int
-rw_tryupgrade(krwlock_t *rwlp)
-{
- ASSERT(rwlp->initialized == B_TRUE);
- ASSERT(rwlp->rw_owner != (void *)-1UL);
-
- return (0);
-}
-
-int
-rw_lock_held(krwlock_t *rwlp)
-{
-
- return (rwlp->rw_count != 0);
-}
-
-/*
- * =========================================================================
- * condition variables
- * =========================================================================
- */
-/*ARGSUSED*/
-void
-cv_init(kcondvar_t *cv, char *name, int type, void *arg)
-{
- VERIFY(cond_init(cv, name, NULL) == 0);
-}
-
-void
-cv_destroy(kcondvar_t *cv)
-{
- VERIFY(cond_destroy(cv) == 0);
-}
-
-void
-cv_wait(kcondvar_t *cv, kmutex_t *mp)
-{
- ASSERT(mutex_owner(mp) == curthread);
- mp->m_owner = NULL;
- int ret = cond_wait(cv, &mp->m_lock);
- VERIFY(ret == 0 || ret == EINTR);
- mp->m_owner = curthread;
-}
-
-/*
- * NB: this emulates FreeBSD cv_wait_sig(9), not the illumos one.
- * Meanings of the return code are different.
- * NB: this does not actually catch any signals.
- */
-int
-cv_wait_sig(kcondvar_t *cv, kmutex_t *mp)
-{
- cv_wait(cv, mp);
- return (0);
-}
-
-clock_t
-cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
-{
- int error;
- struct timespec ts;
- struct timeval tv;
- clock_t delta;
-
- abstime += ddi_get_lbolt();
-top:
- delta = abstime - ddi_get_lbolt();
- if (delta <= 0)
- return (-1);
-
- if (gettimeofday(&tv, NULL) != 0)
- assert(!"gettimeofday() failed");
-
- ts.tv_sec = tv.tv_sec + delta / hz;
- ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz);
- ASSERT(ts.tv_nsec >= 0);
-
- if (ts.tv_nsec >= NANOSEC) {
- ts.tv_sec++;
- ts.tv_nsec -= NANOSEC;
- }
-
- ASSERT(mutex_owner(mp) == curthread);
- mp->m_owner = NULL;
- error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
- mp->m_owner = curthread;
-
- if (error == EINTR)
- goto top;
-
- if (error == ETIMEDOUT)
- return (-1);
-
- ASSERT(error == 0);
-
- return (1);
-}
-
-/*ARGSUSED*/
-clock_t
-cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
- int flag)
-{
- int error;
- timespec_t ts;
- hrtime_t delta;
-
- ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
-
-top:
- delta = tim;
- if (flag & CALLOUT_FLAG_ABSOLUTE)
- delta -= gethrtime();
-
- if (delta <= 0)
- return (-1);
-
- clock_gettime(CLOCK_REALTIME, &ts);
- ts.tv_sec += delta / NANOSEC;
- ts.tv_nsec += delta % NANOSEC;
- if (ts.tv_nsec >= NANOSEC) {
- ts.tv_sec++;
- ts.tv_nsec -= NANOSEC;
- }
-
- ASSERT(mutex_owner(mp) == curthread);
- mp->m_owner = NULL;
- error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
- mp->m_owner = curthread;
-
- if (error == ETIMEDOUT)
- return (-1);
-
- if (error == EINTR)
- goto top;
-
- ASSERT(error == 0);
-
- return (1);
-}
-
-void
-cv_signal(kcondvar_t *cv)
-{
- VERIFY(cond_signal(cv) == 0);
-}
-
-void
-cv_broadcast(kcondvar_t *cv)
-{
- VERIFY(cond_broadcast(cv) == 0);
-}
-
-/*
- * =========================================================================
- * vnode operations
- * =========================================================================
- */
-/*
- * Note: for the xxxat() versions of these functions, we assume that the
- * starting vp is always rootdir (which is true for spa_directory.c, the only
- * ZFS consumer of these interfaces). We assert this is true, and then emulate
- * them by adding '/' in front of the path.
- */
-
-/*ARGSUSED*/
-int
-vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
-{
- int fd;
- int dump_fd;
- vnode_t *vp;
- int old_umask;
- char realpath[MAXPATHLEN];
- struct stat64 st;
-
- /*
- * If we're accessing a real disk from userland, we need to use
- * the character interface to avoid caching. This is particularly
- * important if we're trying to look at a real in-kernel storage
- * pool from userland, e.g. via zdb, because otherwise we won't
- * see the changes occurring under the segmap cache.
- * On the other hand, the stupid character device returns zero
- * for its size. So -- gag -- we open the block device to get
- * its size, and remember it for subsequent VOP_GETATTR().
- */
- if (strncmp(path, "/dev/", 5) == 0) {
- char *dsk;
- fd = open64(path, O_RDONLY);
- if (fd == -1)
- return (errno);
- if (fstat64(fd, &st) == -1) {
- close(fd);
- return (errno);
- }
- close(fd);
- (void) sprintf(realpath, "%s", path);
- dsk = strstr(path, "/dsk/");
- if (dsk != NULL)
- (void) sprintf(realpath + (dsk - path) + 1, "r%s",
- dsk + 1);
- } else {
- (void) sprintf(realpath, "%s", path);
- if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
- return (errno);
- }
-
- if (flags & FCREAT)
- old_umask = umask(0);
-
- /*
- * The construct 'flags - FREAD' conveniently maps combinations of
- * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
- */
- fd = open64(realpath, flags - FREAD, mode);
-
- if (flags & FCREAT)
- (void) umask(old_umask);
-
- if (vn_dumpdir != NULL) {
- char dumppath[MAXPATHLEN];
- (void) snprintf(dumppath, sizeof (dumppath),
- "%s/%s", vn_dumpdir, basename(realpath));
- dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
- if (dump_fd == -1)
- return (errno);
- } else {
- dump_fd = -1;
- }
-
- if (fd == -1)
- return (errno);
-
- if (fstat64(fd, &st) == -1) {
- close(fd);
- return (errno);
- }
-
- (void) fcntl(fd, F_SETFD, FD_CLOEXEC);
-
- *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
-
- vp->v_fd = fd;
- vp->v_size = st.st_size;
- vp->v_path = spa_strdup(path);
- vp->v_dump_fd = dump_fd;
-
- return (0);
-}
-
-/*ARGSUSED*/
-int
-vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
- int x3, vnode_t *startvp, int fd)
-{
- char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
- int ret;
-
- ASSERT(startvp == rootdir);
- (void) sprintf(realpath, "/%s", path);
-
- /* fd ignored for now, need if want to simulate nbmand support */
- ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
-
- umem_free(realpath, strlen(path) + 2);
-
- return (ret);
-}
-
-/*ARGSUSED*/
-int
-vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
- int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
-{
- ssize_t iolen, split;
-
- if (uio == UIO_READ) {
- iolen = pread64(vp->v_fd, addr, len, offset);
- if (vp->v_dump_fd != -1) {
- int status =
- pwrite64(vp->v_dump_fd, addr, iolen, offset);
- ASSERT(status != -1);
- }
- } else {
- /*
- * To simulate partial disk writes, we split writes into two
- * system calls so that the process can be killed in between.
- */
- int sectors = len >> SPA_MINBLOCKSHIFT;
- split = (sectors > 0 ? rand() % sectors : 0) <<
- SPA_MINBLOCKSHIFT;
- iolen = pwrite64(vp->v_fd, addr, split, offset);
- iolen += pwrite64(vp->v_fd, (char *)addr + split,
- len - split, offset + split);
- }
-
- if (iolen == -1)
- return (errno);
- if (residp)
- *residp = len - iolen;
- else if (iolen != len)
- return (EIO);
- return (0);
-}
-
-void
-vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td)
-{
- close(vp->v_fd);
- if (vp->v_dump_fd != -1)
- close(vp->v_dump_fd);
- spa_strfree(vp->v_path);
- umem_free(vp, sizeof (vnode_t));
-}
-
-/*
- * At a minimum we need to update the size since vdev_reopen()
- * will no longer call vn_openat().
- */
-int
-fop_getattr(vnode_t *vp, vattr_t *vap)
-{
- struct stat64 st;
-
- if (fstat64(vp->v_fd, &st) == -1) {
- close(vp->v_fd);
- return (errno);
- }
-
- vap->va_size = st.st_size;
- return (0);
-}
-
-#ifdef ZFS_DEBUG
-
-/*
- * =========================================================================
- * Figure out which debugging statements to print
- * =========================================================================
- */
-
-static char *dprintf_string;
-static int dprintf_print_all;
-
-int
-dprintf_find_string(const char *string)
-{
- char *tmp_str = dprintf_string;
- int len = strlen(string);
-
- /*
- * Find out if this is a string we want to print.
- * String format: file1.c,function_name1,file2.c,file3.c
- */
-
- while (tmp_str != NULL) {
- if (strncmp(tmp_str, string, len) == 0 &&
- (tmp_str[len] == ',' || tmp_str[len] == '\0'))
- return (1);
- tmp_str = strchr(tmp_str, ',');
- if (tmp_str != NULL)
- tmp_str++; /* Get rid of , */
- }
- return (0);
-}
-
-void
-dprintf_setup(int *argc, char **argv)
-{
- int i, j;
-
- /*
- * Debugging can be specified two ways: by setting the
- * environment variable ZFS_DEBUG, or by including a
- * "debug=..." argument on the command line. The command
- * line setting overrides the environment variable.
- */
-
- for (i = 1; i < *argc; i++) {
- int len = strlen("debug=");
- /* First look for a command line argument */
- if (strncmp("debug=", argv[i], len) == 0) {
- dprintf_string = argv[i] + len;
- /* Remove from args */
- for (j = i; j < *argc; j++)
- argv[j] = argv[j+1];
- argv[j] = NULL;
- (*argc)--;
- }
- }
-
- if (dprintf_string == NULL) {
- /* Look for ZFS_DEBUG environment variable */
- dprintf_string = getenv("ZFS_DEBUG");
- }
-
- /*
- * Are we just turning on all debugging?
- */
- if (dprintf_find_string("on"))
- dprintf_print_all = 1;
-
- if (dprintf_string != NULL)
- zfs_flags |= ZFS_DEBUG_DPRINTF;
-}
-
-int
-sysctl_handle_64(SYSCTL_HANDLER_ARGS)
-{
- return (0);
-}
-
-/*
- * =========================================================================
- * debug printfs
- * =========================================================================
- */
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
- const char *newfile;
- va_list adx;
-
- /*
- * Get rid of annoying "../common/" prefix to filename.
- */
- newfile = strrchr(file, '/');
- if (newfile != NULL) {
- newfile = newfile + 1; /* Get rid of leading / */
- } else {
- newfile = file;
- }
-
- if (dprintf_print_all ||
- dprintf_find_string(newfile) ||
- dprintf_find_string(func)) {
- /* Print out just the function name if requested */
- flockfile(stdout);
- if (dprintf_find_string("pid"))
- (void) printf("%d ", getpid());
- if (dprintf_find_string("tid"))
- (void) printf("%lu ", thr_self());
-#if 0
- if (dprintf_find_string("cpu"))
- (void) printf("%u ", getcpuid());
-#endif
- if (dprintf_find_string("time"))
- (void) printf("%llu ", gethrtime());
- if (dprintf_find_string("long"))
- (void) printf("%s, line %d: ", newfile, line);
- (void) printf("%s: ", func);
- va_start(adx, fmt);
- (void) vprintf(fmt, adx);
- va_end(adx);
- funlockfile(stdout);
- }
-}
-
-#endif /* ZFS_DEBUG */
-
-/*
- * =========================================================================
- * cmn_err() and panic()
- * =========================================================================
- */
-static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
-static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
-
-void
-vpanic(const char *fmt, va_list adx)
-{
- char buf[512];
- (void) vsnprintf(buf, 512, fmt, adx);
- assfail(buf, NULL, 0);
- abort(); /* necessary to make vpanic meet noreturn requirements */
-}
-
-void
-panic(const char *fmt, ...)
-{
- va_list adx;
-
- va_start(adx, fmt);
- vpanic(fmt, adx);
- va_end(adx);
-}
-
-void
-vcmn_err(int ce, const char *fmt, va_list adx)
-{
- if (ce == CE_PANIC)
- vpanic(fmt, adx);
- if (ce != CE_NOTE) { /* suppress noise in userland stress testing */
- (void) fprintf(stderr, "%s", ce_prefix[ce]);
- (void) vfprintf(stderr, fmt, adx);
- (void) fprintf(stderr, "%s", ce_suffix[ce]);
- }
-}
-
-/*PRINTFLIKE2*/
-void
-cmn_err(int ce, const char *fmt, ...)
-{
- va_list adx;
-
- va_start(adx, fmt);
- vcmn_err(ce, fmt, adx);
- va_end(adx);
-}
-
-/*
- * =========================================================================
- * kobj interfaces
- * =========================================================================
- */
-struct _buf *
-kobj_open_file(char *name)
-{
- struct _buf *file;
- vnode_t *vp;
-
- /* set vp as the _fd field of the file */
- if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
- -1) != 0)
- return ((void *)-1UL);
-
- file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
- file->_fd = (intptr_t)vp;
- return (file);
-}
-
-int
-kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
-{
- ssize_t resid;
-
- vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
- UIO_SYSSPACE, 0, 0, 0, &resid);
-
- return (size - resid);
-}
-
-void
-kobj_close_file(struct _buf *file)
-{
- vn_close((vnode_t *)file->_fd, 0, NULL, NULL);
- umem_free(file, sizeof (struct _buf));
-}
-
-int
-kobj_get_filesize(struct _buf *file, uint64_t *size)
-{
- struct stat64 st;
- vnode_t *vp = (vnode_t *)file->_fd;
-
- if (fstat64(vp->v_fd, &st) == -1) {
- vn_close(vp, 0, NULL, NULL);
- return (errno);
- }
- *size = st.st_size;
- return (0);
-}
-
-/*
- * =========================================================================
- * misc routines
- * =========================================================================
- */
-
-void
-delay(clock_t ticks)
-{
- poll(0, 0, ticks * (1000 / hz));
-}
-
-#if 0
-/*
- * Find highest one bit set.
- * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
- */
-int
-highbit64(uint64_t i)
-{
- int h = 1;
-
- if (i == 0)
- return (0);
- if (i & 0xffffffff00000000ULL) {
- h += 32; i >>= 32;
- }
- if (i & 0xffff0000) {
- h += 16; i >>= 16;
- }
- if (i & 0xff00) {
- h += 8; i >>= 8;
- }
- if (i & 0xf0) {
- h += 4; i >>= 4;
- }
- if (i & 0xc) {
- h += 2; i >>= 2;
- }
- if (i & 0x2) {
- h += 1;
- }
- return (h);
-}
-#endif
-
-static int random_fd = -1, urandom_fd = -1;
-
-static int
-random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
-{
- size_t resid = len;
- ssize_t bytes;
-
- ASSERT(fd != -1);
-
- while (resid != 0) {
- bytes = read(fd, ptr, resid);
- ASSERT3S(bytes, >=, 0);
- ptr += bytes;
- resid -= bytes;
- }
-
- return (0);
-}
-
-int
-random_get_bytes(uint8_t *ptr, size_t len)
-{
- return (random_get_bytes_common(ptr, len, random_fd));
-}
-
-int
-random_get_pseudo_bytes(uint8_t *ptr, size_t len)
-{
- return (random_get_bytes_common(ptr, len, urandom_fd));
-}
-
-int
-ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
-{
- char *end;
-
- *result = strtoul(hw_serial, &end, base);
- if (*result == 0)
- return (errno);
- return (0);
-}
-
-int
-ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
-{
- char *end;
-
- *result = strtoull(str, &end, base);
- if (*result == 0)
- return (errno);
- return (0);
-}
-
-#ifdef illumos
-/* ARGSUSED */
-cyclic_id_t
-cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when)
-{
- return (1);
-}
-
-/* ARGSUSED */
-void
-cyclic_remove(cyclic_id_t id)
-{
-}
-
-/* ARGSUSED */
-int
-cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
-{
- return (1);
-}
-#endif
-
-/*
- * =========================================================================
- * kernel emulation setup & teardown
- * =========================================================================
- */
-static int
-umem_out_of_memory(void)
-{
- char errmsg[] = "out of memory -- generating core dump\n";
-
- write(fileno(stderr), errmsg, sizeof (errmsg));
- abort();
- return (0);
-}
-
-void
-kernel_init(int mode)
-{
- extern uint_t rrw_tsd_key;
-
- umem_nofail_callback(umem_out_of_memory);
-
- physmem = sysconf(_SC_PHYS_PAGES);
-
- dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
- (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
-
- (void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
- (mode & FWRITE) ? get_system_hostid() : 0);
-
- VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
- VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
-
- system_taskq_init();
-
-#ifdef illumos
- mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
-#endif
-
- spa_init(mode);
-
- tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
-}
-
-void
-kernel_fini(void)
-{
- spa_fini();
-
- system_taskq_fini();
-
- close(random_fd);
- close(urandom_fd);
-
- random_fd = -1;
- urandom_fd = -1;
-}
-
-/* ARGSUSED */
-uint32_t
-zone_get_hostid(void *zonep)
-{
- /*
- * We're emulating the system's hostid in userland.
- */
- return (strtoul(hw_serial, NULL, 10));
-}
-
-int
-z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
-{
- int ret;
- uLongf len = *dstlen;
-
- if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
- *dstlen = (size_t)len;
-
- return (ret);
-}
-
-int
-z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
- int level)
-{
- int ret;
- uLongf len = *dstlen;
-
- if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
- *dstlen = (size_t)len;
-
- return (ret);
-}
-
-uid_t
-crgetuid(cred_t *cr)
-{
- return (0);
-}
-
-uid_t
-crgetruid(cred_t *cr)
-{
- return (0);
-}
-
-gid_t
-crgetgid(cred_t *cr)
-{
- return (0);
-}
-
-int
-crgetngroups(cred_t *cr)
-{
- return (0);
-}
-
-gid_t *
-crgetgroups(cred_t *cr)
-{
- return (NULL);
-}
-
-int
-zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
-{
- return (0);
-}
-
-int
-zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
-{
- return (0);
-}
-
-int
-zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
-{
- return (0);
-}
-
-ksiddomain_t *
-ksid_lookupdomain(const char *dom)
-{
- ksiddomain_t *kd;
-
- kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
- kd->kd_name = spa_strdup(dom);
- return (kd);
-}
-
-void
-ksiddomain_rele(ksiddomain_t *ksid)
-{
- spa_strfree(ksid->kd_name);
- umem_free(ksid, sizeof (ksiddomain_t));
-}
-
-/*
- * Do not change the length of the returned string; it must be freed
- * with strfree().
- */
-char *
-kmem_asprintf(const char *fmt, ...)
-{
- int size;
- va_list adx;
- char *buf;
-
- va_start(adx, fmt);
- size = vsnprintf(NULL, 0, fmt, adx) + 1;
- va_end(adx);
-
- buf = kmem_alloc(size, KM_SLEEP);
-
- va_start(adx, fmt);
- size = vsnprintf(buf, size, fmt, adx);
- va_end(adx);
-
- return (buf);
-}
-
-/* ARGSUSED */
-int
-zfs_onexit_fd_hold(int fd, minor_t *minorp)
-{
- *minorp = 0;
- return (0);
-}
-
-/* ARGSUSED */
-void
-zfs_onexit_fd_rele(int fd)
-{
-}
-
-/* ARGSUSED */
-int
-zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
- uint64_t *action_handle)
-{
- return (0);
-}
-
-/* ARGSUSED */
-int
-zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
-{
- return (0);
-}
-
-/* ARGSUSED */
-int
-zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
-{
- return (0);
-}
-
-#ifdef __FreeBSD__
-/* ARGSUSED */
-int
-zvol_create_minors(const char *name)
-{
- return (0);
-}
-#endif
-
-#ifdef illumos
-void
-bioinit(buf_t *bp)
-{
- bzero(bp, sizeof (buf_t));
-}
-
-void
-biodone(buf_t *bp)
-{
- if (bp->b_iodone != NULL) {
- (*(bp->b_iodone))(bp);
- return;
- }
- ASSERT((bp->b_flags & B_DONE) == 0);
- bp->b_flags |= B_DONE;
-}
-
-void
-bioerror(buf_t *bp, int error)
-{
- ASSERT(bp != NULL);
- ASSERT(error >= 0);
-
- if (error != 0) {
- bp->b_flags |= B_ERROR;
- } else {
- bp->b_flags &= ~B_ERROR;
- }
- bp->b_error = error;
-}
-
-
-int
-geterror(struct buf *bp)
-{
- int error = 0;
-
- if (bp->b_flags & B_ERROR) {
- error = bp->b_error;
- if (!error)
- error = EIO;
- }
- return (error);
-}
-#endif
Index: head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
@@ -1,838 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- */
-/*
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- */
-
-#ifndef _SYS_ZFS_CONTEXT_H
-#define _SYS_ZFS_CONTEXT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define _SYS_MUTEX_H
-#define _SYS_RWLOCK_H
-#define _SYS_CONDVAR_H
-#define _SYS_SYSTM_H
-#define _SYS_T_LOCK_H
-#define _SYS_VNODE_H
-#define _SYS_VFS_H
-#define _SYS_SUNDDI_H
-#define _SYS_CALLB_H
-#define _SYS_SCHED_H_
-
-#include <solaris.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <stdarg.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <strings.h>
-#include <thread.h>
-#include <assert.h>
-#include <limits.h>
-#include <dirent.h>
-#include <time.h>
-#include <math.h>
-#include <umem.h>
-#include <inttypes.h>
-#include <fsshare.h>
-#include <pthread.h>
-#include <sched.h>
-#include <setjmp.h>
-#include <sys/debug.h>
-#include <sys/note.h>
-#include <sys/types.h>
-#include <sys/cred.h>
-#include <sys/atomic.h>
-#include <sys/sysmacros.h>
-#include <sys/bitmap.h>
-#include <sys/resource.h>
-#include <sys/byteorder.h>
-#include <sys/list.h>
-#include <sys/time.h>
-#include <sys/uio.h>
-#include <sys/mntent.h>
-#include <sys/mnttab.h>
-#include <sys/zfs_debug.h>
-#include <sys/sdt.h>
-#include <sys/kstat.h>
-#include <sys/u8_textprep.h>
-#include <sys/kernel.h>
-#include <sys/disk.h>
-#include <sys/sysevent.h>
-#include <sys/sysevent/eventdefs.h>
-#include <sys/sysevent/dev.h>
-#include <machine/atomic.h>
-#include <sys/debug.h>
-#ifdef illumos
-#include "zfs.h"
-#endif
-
-#define ZFS_EXPORTS_PATH "/etc/zfs/exports"
-
-/*
- * Debugging
- */
-
-/*
- * Note that we are not using the debugging levels.
- */
-
-#define CE_CONT 0 /* continuation */
-#define CE_NOTE 1 /* notice */
-#define CE_WARN 2 /* warning */
-#define CE_PANIC 3 /* panic */
-#define CE_IGNORE 4 /* print nothing */
-
-/*
- * ZFS debugging
- */
-
-#define ZFS_LOG(...) do { } while (0)
-
-typedef u_longlong_t rlim64_t;
-#define RLIM64_INFINITY ((rlim64_t)-3)
-
-#ifdef ZFS_DEBUG
-extern void dprintf_setup(int *argc, char **argv);
-#endif /* ZFS_DEBUG */
-
-extern void cmn_err(int, const char *, ...);
-extern void vcmn_err(int, const char *, __va_list);
-extern void panic(const char *, ...) __NORETURN;
-extern void vpanic(const char *, __va_list) __NORETURN;
-
-#define fm_panic panic
-
-extern int aok;
-
-/*
- * DTrace SDT probes have different signatures in userland than they do in
- * the kernel. If they're being used in kernel code, re-define them out of
- * existence for their counterparts in libzpool.
- *
- * Here's an example of how to use the set-error probes in userland:
- * zfs$target:::set-error /arg0 == EBUSY/ {stack();}
- *
- * Here's an example of how to use DTRACE_PROBE probes in userland:
- * If there is a probe declared as follows:
- * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn);
- * Then you can use it as follows:
- * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/
- * {printf("%u %p\n", arg1, arg2);}
- */
-
-#ifdef DTRACE_PROBE
-#undef DTRACE_PROBE
-#endif /* DTRACE_PROBE */
-#ifdef illumos
-#define DTRACE_PROBE(a) \
- ZFS_PROBE0(#a)
-#endif
-
-#ifdef DTRACE_PROBE1
-#undef DTRACE_PROBE1
-#endif /* DTRACE_PROBE1 */
-#ifdef illumos
-#define DTRACE_PROBE1(a, b, c) \
- ZFS_PROBE1(#a, (unsigned long)c)
-#endif
-
-#ifdef DTRACE_PROBE2
-#undef DTRACE_PROBE2
-#endif /* DTRACE_PROBE2 */
-#ifdef illumos
-#define DTRACE_PROBE2(a, b, c, d, e) \
- ZFS_PROBE2(#a, (unsigned long)c, (unsigned long)e)
-#endif
-
-#ifdef DTRACE_PROBE3
-#undef DTRACE_PROBE3
-#endif /* DTRACE_PROBE3 */
-#ifdef illumos
-#define DTRACE_PROBE3(a, b, c, d, e, f, g) \
- ZFS_PROBE3(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g)
-#endif
-
-#ifdef DTRACE_PROBE4
-#undef DTRACE_PROBE4
-#endif /* DTRACE_PROBE4 */
-#ifdef illumos
-#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) \
- ZFS_PROBE4(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g, \
- (unsigned long)i)
-#endif
-
-#ifdef illumos
-/*
- * We use the comma operator so that this macro can be used without much
- * additional code. For example, "return (EINVAL);" becomes
- * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated
- * twice, so it should not have side effects (e.g. something like:
- * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice).
- */
-#define SET_ERROR(err) (ZFS_SET_ERROR(err), err)
-#else /* !illumos */
-
-#define DTRACE_PROBE(a) ((void)0)
-#define DTRACE_PROBE1(a, b, c) ((void)0)
-#define DTRACE_PROBE2(a, b, c, d, e) ((void)0)
-#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0)
-#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0)
-
-#define SET_ERROR(err) (err)
-#endif /* !illumos */
-
-/*
- * Threads
- */
-#define curthread ((void *)(uintptr_t)thr_self())
-
-#define kpreempt(x) sched_yield()
-
-typedef struct kthread kthread_t;
-
-#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
- zk_thread_create(func, arg)
-#define thread_exit() thr_exit(NULL)
-#define thread_join(t) panic("libzpool cannot join threads")
-
-#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS)
-
-/* in libzpool, p0 exists only to have its address taken */
-struct proc {
- uintptr_t this_is_never_used_dont_dereference_it;
-};
-
-extern struct proc p0;
-#define curproc (&p0)
-
-#define PS_NONE -1
-
-extern kthread_t *zk_thread_create(void (*func)(void*), void *arg);
-
-#define issig(why) (FALSE)
-#define ISSIG(thr, why) (FALSE)
-
-/*
- * Mutexes
- */
-typedef struct kmutex {
- void *m_owner;
- boolean_t initialized;
- mutex_t m_lock;
-} kmutex_t;
-
-#define MUTEX_DEFAULT USYNC_THREAD
-#undef MUTEX_HELD
-#undef MUTEX_NOT_HELD
-#define MUTEX_HELD(m) ((m)->m_owner == curthread)
-#define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
-#define _mutex_held(m) pthread_mutex_isowned_np(m)
-
-/*
- * Argh -- we have to get cheesy here because the kernel and userland
- * have different signatures for the same routine.
- */
-//extern int _mutex_init(mutex_t *mp, int type, void *arg);
-//extern int _mutex_destroy(mutex_t *mp);
-//extern int _mutex_owned(mutex_t *mp);
-
-#define mutex_init(mp, b, c, d) zmutex_init((kmutex_t *)(mp))
-#define mutex_destroy(mp) zmutex_destroy((kmutex_t *)(mp))
-#define mutex_owned(mp) zmutex_owned((kmutex_t *)(mp))
-
-extern void zmutex_init(kmutex_t *mp);
-extern void zmutex_destroy(kmutex_t *mp);
-extern int zmutex_owned(kmutex_t *mp);
-extern void mutex_enter(kmutex_t *mp);
-extern void mutex_exit(kmutex_t *mp);
-extern int mutex_tryenter(kmutex_t *mp);
-extern void *mutex_owner(kmutex_t *mp);
-
-/*
- * RW locks
- */
-typedef struct krwlock {
- int rw_count;
- void *rw_owner;
- boolean_t initialized;
- rwlock_t rw_lock;
-} krwlock_t;
-
-typedef int krw_t;
-
-#define RW_READER 0
-#define RW_WRITER 1
-#define RW_DEFAULT USYNC_THREAD
-
-#undef RW_READ_HELD
-#define RW_READ_HELD(x) ((x)->rw_owner == NULL && (x)->rw_count > 0)
-
-#undef RW_WRITE_HELD
-#define RW_WRITE_HELD(x) ((x)->rw_owner == curthread)
-#define RW_LOCK_HELD(x) rw_lock_held(x)
-
-#undef RW_LOCK_HELD
-#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x))
-
-extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
-extern void rw_destroy(krwlock_t *rwlp);
-extern void rw_enter(krwlock_t *rwlp, krw_t rw);
-extern int rw_tryenter(krwlock_t *rwlp, krw_t rw);
-extern int rw_tryupgrade(krwlock_t *rwlp);
-extern void rw_exit(krwlock_t *rwlp);
-extern int rw_lock_held(krwlock_t *rwlp);
-#define rw_downgrade(rwlp) do { } while (0)
-
-extern uid_t crgetuid(cred_t *cr);
-extern uid_t crgetruid(cred_t *cr);
-extern gid_t crgetgid(cred_t *cr);
-extern int crgetngroups(cred_t *cr);
-extern gid_t *crgetgroups(cred_t *cr);
-
-/*
- * Condition variables
- */
-typedef cond_t kcondvar_t;
-
-#define CV_DEFAULT USYNC_THREAD
-#define CALLOUT_FLAG_ABSOLUTE 0x2
-
-extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
-extern void cv_destroy(kcondvar_t *cv);
-extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
-extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp);
-extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
-#define cv_timedwait_sig(cvp, mp, t) cv_timedwait(cvp, mp, t)
-extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
- hrtime_t res, int flag);
-#define cv_timedwait_sig_hires(cvp, mp, t, r, f) \
- cv_timedwait_hires(cvp, mp, t, r, f)
-extern void cv_signal(kcondvar_t *cv);
-extern void cv_broadcast(kcondvar_t *cv);
-
-/*
- * Thread-specific data
- */
-#define tsd_get(k) pthread_getspecific(k)
-#define tsd_set(k, v) pthread_setspecific(k, v)
-#define tsd_create(kp, d) pthread_key_create(kp, d)
-#define tsd_destroy(kp) /* nothing */
-
-/*
- * Kernel memory
- */
-#define KM_SLEEP UMEM_NOFAIL
-#define KM_PUSHPAGE KM_SLEEP
-#define KM_NOSLEEP UMEM_DEFAULT
-#define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */
-#define KMC_NODEBUG UMC_NODEBUG
-#define KMC_NOTOUCH 0 /* not needed for userland caches */
-#define KM_NODEBUG 0
-#define kmem_alloc(_s, _f) umem_alloc(_s, _f)
-#define kmem_zalloc(_s, _f) umem_zalloc(_s, _f)
-#define kmem_free(_b, _s) umem_free(_b, _s)
-#define kmem_size() (physmem * PAGESIZE)
-#define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
- umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
-#define kmem_cache_destroy(_c) umem_cache_destroy(_c)
-#define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
-#define kmem_cache_free(_c, _b) umem_cache_free(_c, _b)
-#define kmem_debugging() 0
-#define kmem_cache_reap_active() (B_FALSE)
-#define kmem_cache_reap_soon(_c) /* nothing */
-#define kmem_cache_set_move(_c, _cb) /* nothing */
-#define POINTER_INVALIDATE(_pp) /* nothing */
-#define POINTER_IS_VALID(_p) 0
-
-typedef umem_cache_t kmem_cache_t;
-
-typedef enum kmem_cbrc {
- KMEM_CBRC_YES,
- KMEM_CBRC_NO,
- KMEM_CBRC_LATER,
- KMEM_CBRC_DONT_NEED,
- KMEM_CBRC_DONT_KNOW
-} kmem_cbrc_t;
-
-/*
- * Task queues
- */
-typedef struct taskq taskq_t;
-typedef uintptr_t taskqid_t;
-typedef void (task_func_t)(void *);
-
-typedef struct taskq_ent {
- struct taskq_ent *tqent_next;
- struct taskq_ent *tqent_prev;
- task_func_t *tqent_func;
- void *tqent_arg;
- uintptr_t tqent_flags;
-} taskq_ent_t;
-
-#define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */
-
-#define TASKQ_PREPOPULATE 0x0001
-#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
-#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
-#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */
-#define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */
-
-#define TQ_SLEEP KM_SLEEP /* Can block for memory */
-#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */
-#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
-#define TQ_FRONT 0x08 /* Queue in front */
-
-#define TASKQID_INVALID ((taskqid_t)0)
-
-extern taskq_t *system_taskq;
-
-extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
-#define taskq_create_proc(a, b, c, d, e, p, f) \
- (taskq_create(a, b, c, d, e, f))
-#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
- (taskq_create(a, b, maxclsyspri, d, e, f))
-extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
-extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
- taskq_ent_t *);
-extern void taskq_destroy(taskq_t *);
-extern void taskq_wait(taskq_t *);
-extern void taskq_wait_id(taskq_t *, taskqid_t);
-extern int taskq_member(taskq_t *, void *);
-extern void system_taskq_init(void);
-extern void system_taskq_fini(void);
-
-#define taskq_dispatch_safe(tq, func, arg, flags, task) \
- taskq_dispatch((tq), (func), (arg), (flags))
-
-#define XVA_MAPSIZE 3
-#define XVA_MAGIC 0x78766174
-
-/*
- * vnodes
- */
-typedef struct vnode {
- uint64_t v_size;
- int v_fd;
- char *v_path;
- int v_dump_fd;
-} vnode_t;
-
-extern char *vn_dumpdir;
-#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */
-
-typedef struct xoptattr {
- timestruc_t xoa_createtime; /* Create time of file */
- uint8_t xoa_archive;
- uint8_t xoa_system;
- uint8_t xoa_readonly;
- uint8_t xoa_hidden;
- uint8_t xoa_nounlink;
- uint8_t xoa_immutable;
- uint8_t xoa_appendonly;
- uint8_t xoa_nodump;
- uint8_t xoa_settable;
- uint8_t xoa_opaque;
- uint8_t xoa_av_quarantined;
- uint8_t xoa_av_modified;
- uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ];
- uint8_t xoa_reparse;
- uint8_t xoa_offline;
- uint8_t xoa_sparse;
-} xoptattr_t;
-
-typedef struct vattr {
- uint_t va_mask; /* bit-mask of attributes */
- u_offset_t va_size; /* file size in bytes */
-} vattr_t;
-
-
-typedef struct xvattr {
- vattr_t xva_vattr; /* Embedded vattr structure */
- uint32_t xva_magic; /* Magic Number */
- uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */
- uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */
- uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */
- uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */
- xoptattr_t xva_xoptattrs; /* Optional attributes */
-} xvattr_t;
-
-typedef struct vsecattr {
- uint_t vsa_mask; /* See below */
- int vsa_aclcnt; /* ACL entry count */
- void *vsa_aclentp; /* pointer to ACL entries */
- int vsa_dfaclcnt; /* default ACL entry count */
- void *vsa_dfaclentp; /* pointer to default ACL entries */
- size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */
-} vsecattr_t;
-
-#define AT_TYPE 0x00001
-#define AT_MODE 0x00002
-#define AT_UID 0x00004
-#define AT_GID 0x00008
-#define AT_FSID 0x00010
-#define AT_NODEID 0x00020
-#define AT_NLINK 0x00040
-#define AT_SIZE 0x00080
-#define AT_ATIME 0x00100
-#define AT_MTIME 0x00200
-#define AT_CTIME 0x00400
-#define AT_RDEV 0x00800
-#define AT_BLKSIZE 0x01000
-#define AT_NBLOCKS 0x02000
-#define AT_SEQ 0x08000
-#define AT_XVATTR 0x10000
-
-#define CRCREAT 0
-
-extern int fop_getattr(vnode_t *vp, vattr_t *vap);
-
-#define VOP_CLOSE(vp, f, c, o, cr, ct) 0
-#define VOP_PUTPAGE(vp, of, sz, fl, cr, ct) 0
-#define VOP_GETATTR(vp, vap, cr) fop_getattr((vp), (vap));
-
-#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd)
-
-#define VN_RELE(vp) vn_close(vp, 0, NULL, NULL)
-#define VN_RELE_ASYNC(vp, taskq) vn_close(vp, 0, NULL, NULL)
-
-#define vn_lock(vp, type)
-#define VOP_UNLOCK(vp)
-
-extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
- int x2, int x3);
-extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp,
- int x2, int x3, vnode_t *vp, int fd);
-extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len,
- offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp);
-extern void vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td);
-
-#define vn_remove(path, x1, x2) remove(path)
-#define vn_rename(from, to, seg) rename((from), (to))
-#define vn_is_readonly(vp) B_FALSE
-
-extern vnode_t *rootdir;
-
-#include <sys/file.h> /* for FREAD, FWRITE, etc */
-#define FTRUNC O_TRUNC
-
-/*
- * Random stuff
- */
-#define ddi_get_lbolt() (gethrtime() >> 23)
-#define ddi_get_lbolt64() (gethrtime() >> 23)
-#define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */
-
-extern void delay(clock_t ticks);
-
-#define SEC_TO_TICK(sec) ((sec) * hz)
-#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
-
-#define gethrestime_sec() time(NULL)
-#define gethrestime(t) \
- do {\
- (t)->tv_sec = gethrestime_sec();\
- (t)->tv_nsec = 0;\
- } while (0);
-
-#define max_ncpus 64
-#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN))
-
-#define minclsyspri 60
-#define maxclsyspri 99
-
-#define CPU_SEQID (thr_self() & (max_ncpus - 1))
-
-#define kcred NULL
-#define CRED() NULL
-
-#ifndef ptob
-#define ptob(x) ((x) * PAGESIZE)
-#endif
-
-extern uint64_t physmem;
-
-extern int highbit64(uint64_t i);
-extern int random_get_bytes(uint8_t *ptr, size_t len);
-extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
-
-extern void kernel_init(int);
-extern void kernel_fini(void);
-
-struct spa;
-extern void nicenum(uint64_t num, char *buf, size_t);
-extern void show_pool_stats(struct spa *);
-extern int set_global_var(char *arg);
-
-typedef struct callb_cpr {
- kmutex_t *cc_lockp;
-} callb_cpr_t;
-
-#define CALLB_CPR_INIT(cp, lockp, func, name) { \
- (cp)->cc_lockp = lockp; \
-}
-
-#define CALLB_CPR_SAFE_BEGIN(cp) { \
- ASSERT(MUTEX_HELD((cp)->cc_lockp)); \
-}
-
-#define CALLB_CPR_SAFE_END(cp, lockp) { \
- ASSERT(MUTEX_HELD((cp)->cc_lockp)); \
-}
-
-#define CALLB_CPR_EXIT(cp) { \
- ASSERT(MUTEX_HELD((cp)->cc_lockp)); \
- mutex_exit((cp)->cc_lockp); \
-}
-
-#define zone_dataset_visible(x, y) (1)
-#define INGLOBALZONE(z) (1)
-extern uint32_t zone_get_hostid(void *zonep);
-
-extern char *kmem_asprintf(const char *fmt, ...);
-#define strfree(str) kmem_free((str), strlen(str) + 1)
-
-/*
- * Hostname information
- */
-extern struct utsname utsname;
-extern char hw_serial[]; /* for userland-emulated hostid access */
-extern int ddi_strtoul(const char *str, char **nptr, int base,
- unsigned long *result);
-
-extern int ddi_strtoull(const char *str, char **nptr, int base,
- u_longlong_t *result);
-
-/* ZFS Boot Related stuff. */
-
-struct _buf {
- intptr_t _fd;
-};
-
-struct bootstat {
- uint64_t st_size;
-};
-
-typedef struct ace_object {
- uid_t a_who;
- uint32_t a_access_mask;
- uint16_t a_flags;
- uint16_t a_type;
- uint8_t a_obj_type[16];
- uint8_t a_inherit_obj_type[16];
-} ace_object_t;
-
-
-#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
-#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
-#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
-#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
-
-extern struct _buf *kobj_open_file(char *name);
-extern int kobj_read_file(struct _buf *file, char *buf, unsigned size,
- unsigned off);
-extern void kobj_close_file(struct _buf *file);
-extern int kobj_get_filesize(struct _buf *file, uint64_t *size);
-extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
-extern int zfs_secpolicy_rename_perms(const char *from, const char *to,
- cred_t *cr);
-extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
-extern zoneid_t getzoneid(void);
-/* Random compatibility stuff. */
-#define pwrite64(d, p, n, o) pwrite(d, p, n, o)
-#define readdir64(d) readdir(d)
-#define SIGPENDING(td) (0)
-#define root_mount_wait() do { } while (0)
-#define root_mounted() (1)
-
-#define noinline __attribute__((noinline))
-#define likely(x) __builtin_expect((x), 1)
-
-struct file {
- void *dummy;
-};
-
-#define FCREAT O_CREAT
-#define FOFFMAX 0x0
-
-/* SID stuff */
-typedef struct ksiddomain {
- uint_t kd_ref;
- uint_t kd_len;
- char *kd_name;
-} ksiddomain_t;
-
-ksiddomain_t *ksid_lookupdomain(const char *);
-void ksiddomain_rele(ksiddomain_t *);
-
-typedef uint32_t idmap_rid_t;
-
-#define DDI_SLEEP KM_SLEEP
-#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) (0)
-
-#define SX_SYSINIT(name, lock, desc)
-
-#define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \
- intptr_t arg2, struct sysctl_req *req
-
-/*
- * This describes the access space for a sysctl request. This is needed
- * so that we can use the interface from the kernel or from user-space.
- */
-struct sysctl_req {
- struct thread *td; /* used for access checking */
- int lock; /* wiring state */
- void *oldptr;
- size_t oldlen;
- size_t oldidx;
- int (*oldfunc)(struct sysctl_req *, const void *, size_t);
- void *newptr;
- size_t newlen;
- size_t newidx;
- int (*newfunc)(struct sysctl_req *, void *, size_t);
- size_t validlen;
- int flags;
-};
-
-SLIST_HEAD(sysctl_oid_list, sysctl_oid);
-
-/*
- * This describes one "oid" in the MIB tree. Potentially more nodes can
- * be hidden behind it, expanded by the handler.
- */
-struct sysctl_oid {
- struct sysctl_oid_list *oid_parent;
- SLIST_ENTRY(sysctl_oid) oid_link;
- int oid_number;
- u_int oid_kind;
- void *oid_arg1;
- intptr_t oid_arg2;
- const char *oid_name;
- int (*oid_handler)(SYSCTL_HANDLER_ARGS);
- const char *oid_fmt;
- int oid_refcnt;
- u_int oid_running;
- const char *oid_descr;
-};
-
-#define SYSCTL_DECL(...)
-#define SYSCTL_NODE(...)
-#define SYSCTL_INT(...)
-#define SYSCTL_UINT(...)
-#define SYSCTL_ULONG(...)
-#define SYSCTL_PROC(...)
-#define SYSCTL_QUAD(...)
-#define SYSCTL_UQUAD(...)
-#ifdef TUNABLE_INT
-#undef TUNABLE_INT
-#undef TUNABLE_ULONG
-#undef TUNABLE_QUAD
-#endif
-#define TUNABLE_INT(...)
-#define TUNABLE_ULONG(...)
-#define TUNABLE_QUAD(...)
-
-int sysctl_handle_64(SYSCTL_HANDLER_ARGS);
-
-/* Errors */
-
-#ifndef ERESTART
-#define ERESTART (-1)
-#endif
-
-#ifdef illumos
-/*
- * Cyclic information
- */
-extern kmutex_t cpu_lock;
-
-typedef uintptr_t cyclic_id_t;
-typedef uint16_t cyc_level_t;
-typedef void (*cyc_func_t)(void *);
-
-#define CY_LOW_LEVEL 0
-#define CY_INFINITY INT64_MAX
-#define CYCLIC_NONE ((cyclic_id_t)0)
-
-typedef struct cyc_time {
- hrtime_t cyt_when;
- hrtime_t cyt_interval;
-} cyc_time_t;
-
-typedef struct cyc_handler {
- cyc_func_t cyh_func;
- void *cyh_arg;
- cyc_level_t cyh_level;
-} cyc_handler_t;
-
-extern cyclic_id_t cyclic_add(cyc_handler_t *, cyc_time_t *);
-extern void cyclic_remove(cyclic_id_t);
-extern int cyclic_reprogram(cyclic_id_t, hrtime_t);
-#endif /* illumos */
-
-#ifdef illumos
-/*
- * Buf structure
- */
-#define B_BUSY 0x0001
-#define B_DONE 0x0002
-#define B_ERROR 0x0004
-#define B_READ 0x0040 /* read when I/O occurs */
-#define B_WRITE 0x0100 /* non-read pseudo-flag */
-
-typedef struct buf {
- int b_flags;
- size_t b_bcount;
- union {
- caddr_t b_addr;
- } b_un;
-
- lldaddr_t _b_blkno;
-#define b_lblkno _b_blkno._f
- size_t b_resid;
- size_t b_bufsize;
- int (*b_iodone)(struct buf *);
- int b_error;
- void *b_private;
-} buf_t;
-
-extern void bioinit(buf_t *);
-extern void biodone(buf_t *);
-extern void bioerror(buf_t *, int);
-extern int geterror(buf_t *);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_CONTEXT_H */
Index: head/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
@@ -1,353 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-
-int taskq_now;
-taskq_t *system_taskq;
-
-#define TASKQ_ACTIVE 0x00010000
-#define TASKQ_NAMELEN 31
-
-struct taskq {
- char tq_name[TASKQ_NAMELEN + 1];
- kmutex_t tq_lock;
- krwlock_t tq_threadlock;
- kcondvar_t tq_dispatch_cv;
- kcondvar_t tq_wait_cv;
- thread_t *tq_threadlist;
- int tq_flags;
- int tq_active;
- int tq_nthreads;
- int tq_nalloc;
- int tq_minalloc;
- int tq_maxalloc;
- kcondvar_t tq_maxalloc_cv;
- int tq_maxalloc_wait;
- taskq_ent_t *tq_freelist;
- taskq_ent_t tq_task;
-};
-
-static taskq_ent_t *
-task_alloc(taskq_t *tq, int tqflags)
-{
- taskq_ent_t *t;
- int rv;
-
-again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
- tq->tq_freelist = t->tqent_next;
- } else {
- if (tq->tq_nalloc >= tq->tq_maxalloc) {
- if (!(tqflags & KM_SLEEP))
- return (NULL);
-
- /*
- * We don't want to exceed tq_maxalloc, but we can't
- * wait for other tasks to complete (and thus free up
- * task structures) without risking deadlock with
- * the caller. So, we just delay for one second
- * to throttle the allocation rate. If we have tasks
- * complete before one second timeout expires then
- * taskq_ent_free will signal us and we will
- * immediately retry the allocation.
- */
- tq->tq_maxalloc_wait++;
-#ifdef __FreeBSD__
- rv = cv_timedwait(&tq->tq_maxalloc_cv,
- &tq->tq_lock, hz);
-#else
- rv = cv_timedwait(&tq->tq_maxalloc_cv,
- &tq->tq_lock, ddi_get_lbolt() + hz);
-#endif
- tq->tq_maxalloc_wait--;
- if (rv > 0)
- goto again; /* signaled */
- }
- mutex_exit(&tq->tq_lock);
-
- t = kmem_alloc(sizeof (taskq_ent_t), tqflags & KM_SLEEP);
-
- mutex_enter(&tq->tq_lock);
- if (t != NULL)
- tq->tq_nalloc++;
- }
- return (t);
-}
-
-static void
-task_free(taskq_t *tq, taskq_ent_t *t)
-{
- if (tq->tq_nalloc <= tq->tq_minalloc) {
- t->tqent_next = tq->tq_freelist;
- tq->tq_freelist = t;
- } else {
- tq->tq_nalloc--;
- mutex_exit(&tq->tq_lock);
- kmem_free(t, sizeof (taskq_ent_t));
- mutex_enter(&tq->tq_lock);
- }
-
- if (tq->tq_maxalloc_wait)
- cv_signal(&tq->tq_maxalloc_cv);
-}
-
-taskqid_t
-taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
-{
- taskq_ent_t *t;
-
- if (taskq_now) {
- func(arg);
- return (1);
- }
-
- mutex_enter(&tq->tq_lock);
- ASSERT(tq->tq_flags & TASKQ_ACTIVE);
- if ((t = task_alloc(tq, tqflags)) == NULL) {
- mutex_exit(&tq->tq_lock);
- return (0);
- }
- if (tqflags & TQ_FRONT) {
- t->tqent_next = tq->tq_task.tqent_next;
- t->tqent_prev = &tq->tq_task;
- } else {
- t->tqent_next = &tq->tq_task;
- t->tqent_prev = tq->tq_task.tqent_prev;
- }
- t->tqent_next->tqent_prev = t;
- t->tqent_prev->tqent_next = t;
- t->tqent_func = func;
- t->tqent_arg = arg;
- t->tqent_flags = 0;
- cv_signal(&tq->tq_dispatch_cv);
- mutex_exit(&tq->tq_lock);
- return (1);
-}
-
-void
-taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
- taskq_ent_t *t)
-{
- ASSERT(func != NULL);
- ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
-
- /*
- * Mark it as a prealloc'd task. This is important
- * to ensure that we don't free it later.
- */
- t->tqent_flags |= TQENT_FLAG_PREALLOC;
- /*
- * Enqueue the task to the underlying queue.
- */
- mutex_enter(&tq->tq_lock);
-
- if (flags & TQ_FRONT) {
- t->tqent_next = tq->tq_task.tqent_next;
- t->tqent_prev = &tq->tq_task;
- } else {
- t->tqent_next = &tq->tq_task;
- t->tqent_prev = tq->tq_task.tqent_prev;
- }
- t->tqent_next->tqent_prev = t;
- t->tqent_prev->tqent_next = t;
- t->tqent_func = func;
- t->tqent_arg = arg;
- cv_signal(&tq->tq_dispatch_cv);
- mutex_exit(&tq->tq_lock);
-}
-
-void
-taskq_wait(taskq_t *tq)
-{
- mutex_enter(&tq->tq_lock);
- while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
- cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
- mutex_exit(&tq->tq_lock);
-}
-
-void
-taskq_wait_id(taskq_t *tq, taskqid_t id)
-{
- taskq_wait(tq);
-}
-
-static void *
-taskq_thread(void *arg)
-{
- taskq_t *tq = arg;
- taskq_ent_t *t;
- boolean_t prealloc;
-
- mutex_enter(&tq->tq_lock);
- while (tq->tq_flags & TASKQ_ACTIVE) {
- if ((t = tq->tq_task.tqent_next) == &tq->tq_task) {
- if (--tq->tq_active == 0)
- cv_broadcast(&tq->tq_wait_cv);
- cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
- tq->tq_active++;
- continue;
- }
- t->tqent_prev->tqent_next = t->tqent_next;
- t->tqent_next->tqent_prev = t->tqent_prev;
- t->tqent_next = NULL;
- t->tqent_prev = NULL;
- prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC;
- mutex_exit(&tq->tq_lock);
-
- rw_enter(&tq->tq_threadlock, RW_READER);
- t->tqent_func(t->tqent_arg);
- rw_exit(&tq->tq_threadlock);
-
- mutex_enter(&tq->tq_lock);
- if (!prealloc)
- task_free(tq, t);
- }
- tq->tq_nthreads--;
- cv_broadcast(&tq->tq_wait_cv);
- mutex_exit(&tq->tq_lock);
- return (NULL);
-}
-
-/*ARGSUSED*/
-taskq_t *
-taskq_create(const char *name, int nthreads, pri_t pri,
- int minalloc, int maxalloc, uint_t flags)
-{
- taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP);
- int t;
-
- if (flags & TASKQ_THREADS_CPU_PCT) {
- int pct;
- ASSERT3S(nthreads, >=, 0);
- ASSERT3S(nthreads, <=, 100);
- pct = MIN(nthreads, 100);
- pct = MAX(pct, 0);
-
- nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100;
- nthreads = MAX(nthreads, 1); /* need at least 1 thread */
- } else {
- ASSERT3S(nthreads, >=, 1);
- }
-
- rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
- mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
- (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
- tq->tq_flags = flags | TASKQ_ACTIVE;
- tq->tq_active = nthreads;
- tq->tq_nthreads = nthreads;
- tq->tq_minalloc = minalloc;
- tq->tq_maxalloc = maxalloc;
- tq->tq_task.tqent_next = &tq->tq_task;
- tq->tq_task.tqent_prev = &tq->tq_task;
- tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP);
-
- if (flags & TASKQ_PREPOPULATE) {
- mutex_enter(&tq->tq_lock);
- while (minalloc-- > 0)
- task_free(tq, task_alloc(tq, KM_SLEEP));
- mutex_exit(&tq->tq_lock);
- }
-
- for (t = 0; t < nthreads; t++)
- (void) thr_create(0, 0, taskq_thread,
- tq, THR_BOUND, &tq->tq_threadlist[t]);
-
- return (tq);
-}
-
-void
-taskq_destroy(taskq_t *tq)
-{
- int t;
- int nthreads = tq->tq_nthreads;
-
- taskq_wait(tq);
-
- mutex_enter(&tq->tq_lock);
-
- tq->tq_flags &= ~TASKQ_ACTIVE;
- cv_broadcast(&tq->tq_dispatch_cv);
-
- while (tq->tq_nthreads != 0)
- cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
-
- tq->tq_minalloc = 0;
- while (tq->tq_nalloc != 0) {
- ASSERT(tq->tq_freelist != NULL);
- task_free(tq, task_alloc(tq, KM_SLEEP));
- }
-
- mutex_exit(&tq->tq_lock);
-
- for (t = 0; t < nthreads; t++)
- (void) thr_join(tq->tq_threadlist[t], NULL, NULL);
-
- kmem_free(tq->tq_threadlist, nthreads * sizeof (thread_t));
-
- rw_destroy(&tq->tq_threadlock);
- mutex_destroy(&tq->tq_lock);
- cv_destroy(&tq->tq_dispatch_cv);
- cv_destroy(&tq->tq_wait_cv);
- cv_destroy(&tq->tq_maxalloc_cv);
-
- kmem_free(tq, sizeof (taskq_t));
-}
-
-int
-taskq_member(taskq_t *tq, void *t)
-{
- int i;
-
- if (taskq_now)
- return (1);
-
- for (i = 0; i < tq->tq_nthreads; i++)
- if (tq->tq_threadlist[i] == (thread_t)(uintptr_t)t)
- return (1);
-
- return (0);
-}
-
-void
-system_taskq_init(void)
-{
- system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512,
- TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
-}
-
-void
-system_taskq_fini(void)
-{
- taskq_destroy(system_taskq);
- system_taskq = NULL; /* defensive */
-}
Index: head/cddl/contrib/opensolaris/lib/libzpool/common/util.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/util.c
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/util.c
@@ -1,196 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#include <assert.h>
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/spa.h>
-#include <sys/fs/zfs.h>
-#include <sys/refcount.h>
-#include <dlfcn.h>
-
-/*
- * Routines needed by more than one client of libzpool.
- */
-
-static void
-show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
-{
- vdev_stat_t *vs;
- vdev_stat_t v0 = { 0 };
- uint64_t sec;
- uint64_t is_log = 0;
- nvlist_t **child;
- uint_t c, children;
- char used[6], avail[6];
- char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6];
-
- if (indent == 0 && desc != NULL) {
- (void) printf(" "
- " capacity operations bandwidth ---- errors ----\n");
- (void) printf("description "
- "used avail read write read write read write cksum\n");
- }
-
- if (desc != NULL) {
- char *suffix = "", *bias = NULL;
- char bias_suffix[32];
-
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
- (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
- &bias);
- if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) != 0)
- vs = &v0;
-
- if (bias != NULL) {
- (void) snprintf(bias_suffix, sizeof (bias_suffix),
- " (%s)", bias);
- suffix = bias_suffix;
- } else if (is_log) {
- suffix = " (log)";
- }
-
- sec = MAX(1, vs->vs_timestamp / NANOSEC);
-
- nicenum(vs->vs_alloc, used, sizeof (used));
- nicenum(vs->vs_space - vs->vs_alloc, avail, sizeof (avail));
- nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops, sizeof (rops));
- nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops, sizeof (wops));
- nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes,
- sizeof (rbytes));
- nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes,
- sizeof (wbytes));
- nicenum(vs->vs_read_errors, rerr, sizeof (rerr));
- nicenum(vs->vs_write_errors, werr, sizeof (werr));
- nicenum(vs->vs_checksum_errors, cerr, sizeof (cerr));
-
- (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n",
- indent, "",
- desc,
- (int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)),
- suffix,
- vs->vs_space ? 6 : 0, vs->vs_space ? used : "",
- vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
- rops, wops, rbytes, wbytes, rerr, werr, cerr);
- }
-
- if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0)
- return;
-
- for (c = 0; c < children; c++) {
- nvlist_t *cnv = child[c];
- char *cname, *tname;
- uint64_t np;
- if (nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &cname) &&
- nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &cname))
- cname = "<unknown>";
- tname = calloc(1, strlen(cname) + 2);
- (void) strcpy(tname, cname);
- if (nvlist_lookup_uint64(cnv, ZPOOL_CONFIG_NPARITY, &np) == 0)
- tname[strlen(tname)] = '0' + np;
- show_vdev_stats(tname, ctype, cnv, indent + 2);
- free(tname);
- }
-}
-
-void
-show_pool_stats(spa_t *spa)
-{
- nvlist_t *config, *nvroot;
- char *name;
-
- VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0);
-
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &name) == 0);
-
- show_vdev_stats(name, ZPOOL_CONFIG_CHILDREN, nvroot, 0);
- show_vdev_stats(NULL, ZPOOL_CONFIG_L2CACHE, nvroot, 0);
- show_vdev_stats(NULL, ZPOOL_CONFIG_SPARES, nvroot, 0);
-
- nvlist_free(config);
-}
-
-/*
- * Sets given global variable in libzpool to given unsigned 32-bit value.
- * arg: "<variable>=<value>"
- */
-int
-set_global_var(char *arg)
-{
- void *zpoolhdl;
- char *varname = arg, *varval;
- u_longlong_t val;
-
-#ifndef _LITTLE_ENDIAN
- /*
- * On big endian systems changing a 64-bit variable would set the high
- * 32 bits instead of the low 32 bits, which could cause unexpected
- * results.
- */
- fprintf(stderr, "Setting global variables is only supported on "
- "little-endian systems\n", varname);
- return (ENOTSUP);
-#endif
- if ((varval = strchr(arg, '=')) != NULL) {
- *varval = '\0';
- varval++;
- val = strtoull(varval, NULL, 0);
- if (val > UINT32_MAX) {
- fprintf(stderr, "Value for global variable '%s' must "
- "be a 32-bit unsigned integer\n", varname);
- return (EOVERFLOW);
- }
- } else {
- return (EINVAL);
- }
-
- zpoolhdl = dlopen("libzpool.so", RTLD_LAZY);
- if (zpoolhdl != NULL) {
- uint32_t *var;
- var = dlsym(zpoolhdl, varname);
- if (var == NULL) {
- fprintf(stderr, "Global variable '%s' does not exist "
- "in libzpool.so\n", varname);
- return (EINVAL);
- }
- *var = (uint32_t)val;
-
- dlclose(zpoolhdl);
- } else {
- fprintf(stderr, "Failed to open libzpool.so to set global "
- "variable\n");
- return (EIO);
- }
-
- return (0);
-}
Index: head/cddl/contrib/opensolaris/lib/libzpool/common/zfs.d
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/zfs.d
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/zfs.d
@@ -1,36 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-provider zfs {
- probe probe0(char *probename);
- probe probe1(char *probename, unsigned long arg1);
- probe probe2(char *probename, unsigned long arg1, unsigned long arg2);
- probe probe3(char *probename, unsigned long arg1, unsigned long arg2,
- unsigned long arg3);
- probe probe4(char *probename, unsigned long arg1, unsigned long arg2,
- unsigned long arg3, unsigned long arg4);
-
- probe set__error(int err);
-};
-
-#pragma D attributes Evolving/Evolving/ISA provider zfs provider
-#pragma D attributes Private/Private/Unknown provider zfs module
-#pragma D attributes Private/Private/Unknown provider zfs function
-#pragma D attributes Evolving/Evolving/ISA provider zfs name
-#pragma D attributes Evolving/Evolving/ISA provider zfs args
Index: head/cddl/contrib/opensolaris/tools/ctf/cvt/util.c
===================================================================
--- head/cddl/contrib/opensolaris/tools/ctf/cvt/util.c
+++ head/cddl/contrib/opensolaris/tools/ctf/cvt/util.c
@@ -29,6 +29,7 @@
* Utility functions
*/
+#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Index: head/cddl/lib/Makefile
===================================================================
--- head/cddl/lib/Makefile
+++ head/cddl/lib/Makefile
@@ -6,27 +6,40 @@
libavl \
libctf \
libdtrace \
+ ${_libicp} \
+ ${_libicp_rescue} \
libnvpair \
+ libspl \
+ ${_libtpool} \
libumem \
libuutil \
${_libzfs_core} \
${_libzfs} \
${_libzpool} \
+ ${_libzutil}
SUBDIR.${MK_TESTS}+= tests
.if ${MK_ZFS} != "no"
_libzfs_core= libzfs_core
+_libicp= libicp
+_libicp_rescue= libicp_rescue
_libzfs= libzfs
+_libzutil= libzutil
.if ${MK_LIBTHR} != "no"
_libzpool= libzpool
+_libtpool= libtpool
.endif
.endif
+SUBDIR_DEPEND_libctf= libspl
SUBDIR_DEPEND_libdtrace= libctf
+SUBDIR_DEPEND_libtpool= libspl
+SUBDIR_DEPEND_libuutil= libavl libspl
SUBDIR_DEPEND_libzfs_core= libnvpair
-SUBDIR_DEPEND_libzfs= libavl libnvpair libumem libuutil libzfs_core
-SUBDIR_DEPEND_libzpool= libavl libnvpair libumem
+SUBDIR_DEPEND_libzfs= libavl libnvpair libumem libuutil libzfs_core libzutil
+SUBDIR_DEPEND_libzpool= libavl libnvpair libumem libicp
+SUBDIR_DEPEND_libzutil= libavl libtpool
SUBDIR_PARALLEL=
Index: head/cddl/lib/drti/Makefile
===================================================================
--- head/cddl/lib/drti/Makefile
+++ head/cddl/lib/drti/Makefile
@@ -11,7 +11,14 @@
CLEANFILES= ${FILES}
# These FILES qualify as libraries for the purpose of LIBRARIES_ONLY.
.undef LIBRARIES_ONLY
-
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR}/head \
Index: head/cddl/lib/libavl/Makefile
===================================================================
--- head/cddl/lib/libavl/Makefile
+++ head/cddl/lib/libavl/Makefile
@@ -1,12 +1,15 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/avl
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/avl
PACKAGE= runtime
LIB= avl
SRCS= avl.c
WARNS?= 3
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
.include <bsd.lib.mk>
Index: head/cddl/lib/libctf/Makefile
===================================================================
--- head/cddl/lib/libctf/Makefile
+++ head/cddl/lib/libctf/Makefile
@@ -21,6 +21,14 @@
WARNS?= 2
CFLAGS+= -DCTF_OLD_VERSIONS
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+
CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR}/head \
@@ -28,6 +36,6 @@
-I${OPENSOLARIS_USR_DISTDIR}/lib/libctf/common \
-I${OPENSOLARIS_SYS_DISTDIR}/uts/common
-LIBADD+= z
+LIBADD+= spl z
.include <bsd.lib.mk>
Index: head/cddl/lib/libdtrace/Makefile
===================================================================
--- head/cddl/lib/libdtrace/Makefile
+++ head/cddl/lib/libdtrace/Makefile
@@ -66,6 +66,16 @@
WARNS?= 1
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+
+
CFLAGS+= -I${.OBJDIR} -I${.CURDIR} \
-I${SRCTOP}/sys/cddl/dev/dtrace/${MACHINE_ARCH} \
-I${SRCTOP}/sys/cddl/compat/opensolaris \
Index: head/cddl/lib/libicp/Makefile
===================================================================
--- head/cddl/lib/libicp/Makefile
+++ head/cddl/lib/libicp/Makefile
@@ -0,0 +1,101 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp
+
+PACKAGE= runtime
+LIB= icp
+LIBADD=
+
+
+.if ${MACHINE_ARCH} == "amd64"
+ASM_SOURCES_C = asm-x86_64/aes/aeskey.c
+ASM_SOURCES_AS = \
+ asm-x86_64/aes/aes_amd64.S \
+ asm-x86_64/aes/aes_aesni.S \
+ asm-x86_64/modes/gcm_pclmulqdq.S \
+ asm-x86_64/modes/aesni-gcm-x86_64.S \
+ asm-x86_64/modes/ghash-x86_64.S \
+ asm-x86_64/sha1/sha1-x86_64.S \
+ asm-x86_64/sha2/sha256_impl.S \
+ asm-x86_64/sha2/sha512_impl.S
+
+CFLAGS+= -D__amd64 -D_SYS_STACK_H -UHAVE_AES
+.else
+ASM_SOURCES_C =
+ASM_SOURCES_AS =
+.endif
+
+
+KERNEL_C = \
+ spi/kcf_spi.c \
+ api/kcf_ctxops.c \
+ api/kcf_digest.c \
+ api/kcf_cipher.c \
+ api/kcf_miscapi.c \
+ api/kcf_mac.c \
+ algs/aes/aes_impl_aesni.c \
+ algs/aes/aes_impl_generic.c \
+ algs/aes/aes_impl_x86-64.c \
+ algs/aes/aes_impl.c \
+ algs/aes/aes_modes.c \
+ algs/edonr/edonr.c \
+ algs/modes/modes.c \
+ algs/modes/cbc.c \
+ algs/modes/gcm_generic.c \
+ algs/modes/gcm_pclmulqdq.c \
+ algs/modes/gcm.c \
+ algs/modes/ctr.c \
+ algs/modes/ccm.c \
+ algs/modes/ecb.c \
+ algs/sha1/sha1.c \
+ algs/sha2/sha2.c \
+ algs/skein/skein.c \
+ algs/skein/skein_block.c \
+ algs/skein/skein_iv.c \
+ illumos-crypto.c \
+ io/aes.c \
+ io/edonr_mod.c \
+ io/sha1_mod.c \
+ io/sha2_mod.c \
+ io/skein_mod.c \
+ os/modhash.c \
+ os/modconf.c \
+ core/kcf_sched.c \
+ core/kcf_prov_lib.c \
+ core/kcf_callprov.c \
+ core/kcf_mech_tabs.c \
+ core/kcf_prov_tabs.c \
+ $(ASM_SOURCES_C)
+
+
+
+
+
+
+SRCS= $(ASM_SOURCES_AS) $(KERNEL_C)
+
+WARNS?= 2
+SHLIB_MAJOR= 3
+CSTD= c99
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+
+
+CFLAGS.aes_amd64.S+= -DLOCORE
+CFLAGS.aes_aesni.S+= -DLOCORE
+CFLAGS.gcm_pclmulqdq.S+= -DLOCORE
+CFLAGS.aesni-gcm-x86_64.S+= -DLOCORE
+CFLAGS.ghash-x86_64.S+= -DLOCORE
+CFLAGS.sha1-x86_64.S+= -DLOCORE
+CFLAGS.sha256_impl.S+= -DLOCORE
+CFLAGS.sha512_impl.S+= -DLOCORE
+
+.include <bsd.lib.mk>
Index: head/cddl/lib/libicp_rescue/Makefile
===================================================================
--- head/cddl/lib/libicp_rescue/Makefile
+++ head/cddl/lib/libicp_rescue/Makefile
@@ -0,0 +1,99 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp
+
+PACKAGE= runtime
+LIB= icp_rescue
+LIBADD=
+
+
+.if ${MACHINE_ARCH} == "amd64"
+ASM_SOURCES_C = asm-x86_64/aes/aeskey.c
+ASM_SOURCES_AS = \
+ asm-x86_64/aes/aes_amd64.S \
+ asm-x86_64/aes/aes_aesni.S \
+ asm-x86_64/modes/gcm_pclmulqdq.S \
+ asm-x86_64/modes/aesni-gcm-x86_64.S \
+ asm-x86_64/sha1/sha1-x86_64.S \
+ asm-x86_64/sha2/sha256_impl.S \
+ asm-x86_64/sha2/sha512_impl.S
+
+CFLAGS+= -D__amd64 -D_SYS_STACK_H
+.else
+ASM_SOURCES_C =
+ASM_SOURCES_AS =
+.endif
+
+
+KERNEL_C = \
+ spi/kcf_spi.c \
+ api/kcf_ctxops.c \
+ api/kcf_digest.c \
+ api/kcf_cipher.c \
+ api/kcf_miscapi.c \
+ api/kcf_mac.c \
+ algs/aes/aes_impl_aesni.c \
+ algs/aes/aes_impl_generic.c \
+ algs/aes/aes_impl_x86-64.c \
+ algs/aes/aes_impl.c \
+ algs/aes/aes_modes.c \
+ algs/edonr/edonr.c \
+ algs/modes/modes.c \
+ algs/modes/cbc.c \
+ algs/modes/gcm_generic.c \
+ algs/modes/gcm_pclmulqdq.c \
+ algs/modes/gcm.c \
+ algs/modes/ctr.c \
+ algs/modes/ccm.c \
+ algs/modes/ecb.c \
+ algs/sha1/sha1.c \
+ algs/sha2/sha2.c \
+ algs/skein/skein_block.c \
+ illumos-crypto.c \
+ io/aes.c \
+ io/edonr_mod.c \
+ io/sha1_mod.c \
+ io/sha2_mod.c \
+ io/skein_mod.c \
+ os/modhash.c \
+ os/modconf.c \
+ core/kcf_sched.c \
+ core/kcf_prov_lib.c \
+ core/kcf_callprov.c \
+ core/kcf_mech_tabs.c \
+ core/kcf_prov_tabs.c \
+ $(ASM_SOURCES_C)
+
+
+
+
+
+
+SRCS= $(ASM_SOURCES_AS) $(KERNEL_C)
+
+WARNS?= 2
+SHLIB_MAJOR= 3
+CSTD= c99
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID -UHAVE_AVX -DRESCUE
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+
+
+CFLAGS.aes_amd64.S+= -DLOCORE
+CFLAGS.aes_aesni.S+= -DLOCORE
+CFLAGS.gcm_pclmulqdq.S+= -DLOCORE
+CFLAGS.aesni-gcm-x86_64.S+= -DLOCORE
+CFLAGS.ghash-x86_64.S+= -DLOCORE
+CFLAGS.sha1-x86_64.S+= -DLOCORE
+CFLAGS.sha256_impl.S+= -DLOCORE
+CFLAGS.sha512_impl.S+= -DLOCORE
+CFLAGS.gcm.c+= -UCAN_USE_GCM_ASM
+
+.include <bsd.lib.mk>
Index: head/cddl/lib/libnvpair/Makefile
===================================================================
--- head/cddl/lib/libnvpair/Makefile
+++ head/cddl/lib/libnvpair/Makefile
@@ -1,36 +1,30 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/nvpair
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/nvpair
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libnvpair
LIB= nvpair
PACKAGE= runtime
-INCS= libnvpair.h
+# user
SRCS= libnvpair.c \
- nvpair_alloc_system.c \
- nvpair_json.c \
- opensolaris_fnvpair.c \
- opensolaris_nvpair.c \
- opensolaris_nvpair_alloc_fixed.c
+ libnvpair_json.c \
+ nvpair_alloc_system.c
+# kernel
+SRCS+= nvpair_alloc_fixed.c \
+ nvpair.c \
+ fnvpair.c
-WARNS?= 1
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
+WARNS?= 2
+CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID -DHAVE_CONFIG_H -DHAVE_XDR_BYTESREC
-# This library uses macros to define fprintf behavior for several object types
-# The compiler will see the non-string literal arguments to the fprintf calls and
-# omit warnings for them. Quiesce these warnings in contrib code:
-#
-# cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c:743:12: warning: format
-# string is not a string literal (potentially insecure) [-Wformat-security]
-# ARENDER(pctl, nvlist_array, nvl, name, val, nelem);
-#
-CFLAGS+= -Wno-format-security
+
+CFLAGS.nvpair.c+= -UHAVE_RPC_TYPES
.include <bsd.lib.mk>
Index: head/cddl/lib/libspl/Makefile
===================================================================
--- head/cddl/lib/libspl/Makefile
+++ head/cddl/lib/libspl/Makefile
@@ -0,0 +1,56 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/os/freebsd
+.PATH: ${SRCTOP}/sys/contrib/openzfs/include
+
+
+LIB= spl
+LIBADD=
+PACKAGE= runtime
+
+SRCS = \
+ assert.c \
+ list.c \
+ mkdirp.c \
+ page.c \
+ strlcat.c \
+ strlcpy.c \
+ timestamp.c \
+ zone.c \
+ include/sys/list.h \
+ include/sys/list_impl.h
+
+SRCS += \
+ getexecname.c \
+ gethostid.c \
+ getmntany.c \
+ mnttab.c
+
+
+.if ${MACHINE_ARCH} == "amd64"
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-x86_64
+SRCS += atomic.S
+.elif ${MACHINE_ARCH} == "i386"
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-i386
+SRCS += atomic.S
+.else
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-generic
+SRCS += atomic.c
+.endif
+
+
+WARNS?= 2
+CSTD= c99
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+CFLAGS.atomic.S+= -DLOCORE
+
+.include <bsd.lib.mk>
Index: head/cddl/lib/libtpool/Makefile
===================================================================
--- head/cddl/lib/libtpool/Makefile
+++ head/cddl/lib/libtpool/Makefile
@@ -0,0 +1,27 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libtpool
+.PATH: ${SRCTOP}/sys/contrib/openzfs/include
+
+
+LIB= tpool
+LIBADD= spl
+PACKAGE= runtime
+
+INCS= thread_pool_impl.h
+SRCS= thread_pool.c
+
+WARNS?= 2
+CSTD= c99
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+
+.include <bsd.lib.mk>
Index: head/cddl/lib/libuutil/Makefile
===================================================================
--- head/cddl/lib/libuutil/Makefile
+++ head/cddl/lib/libuutil/Makefile
@@ -1,11 +1,10 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/avl
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libuutil
PACKAGE= runtime
LIB= uutil
-SRCS= avl.c \
+SRCS=\
uu_alloc.c \
uu_avl.c \
uu_dprintf.c \
@@ -14,14 +13,17 @@
uu_misc.c \
uu_open.c \
uu_pname.c \
- uu_strtoint.c
+ uu_string.c
-WARNS?= 1
-CFLAGS+= -DNATIVE_BUILD
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
+WARNS?= 2
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+
+LIBADD= avl spl
.include <bsd.lib.mk>
Index: head/cddl/lib/libzfs/Makefile
===================================================================
--- head/cddl/lib/libzfs/Makefile
+++ head/cddl/lib/libzfs/Makefile
@@ -1,62 +1,102 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/compat/opensolaris/misc
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils/common
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs/os/freebsd
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare/os/freebsd
+.PATH: ${SRCTOP}/sys/contrib/openzfs/include
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd/lib
PACKAGE= runtime
LIB= zfs
-LIBADD= md pthread umem util uutil m avl bsdxml geom nvpair z zfs_core
-SRCS= deviceid.c \
- fsshare.c \
- mkdirp.c \
- mnttab.c \
- thread_pool.c \
- zmount.c \
- zone.c
+LIBADD= md pthread umem util uutil m avl bsdxml geom nvpair z zfs_core zutil
-SRCS+= nicenum.c
+INCS= libzfs.h
+USER_C = \
+ libzfs_changelist.c \
+ libzfs_config.c \
+ libzfs_crypto.c \
+ libzfs_dataset.c \
+ libzfs_diff.c \
+ libzfs_import.c \
+ libzfs_iter.c \
+ libzfs_mount.c \
+ libzfs_pool.c \
+ libzfs_sendrecv.c \
+ libzfs_status.c \
+ libzfs_util.c
-SRCS+= libzfs_changelist.c \
- libzfs_compat.c \
- libzfs_config.c \
- libzfs_dataset.c \
- libzfs_diff.c \
- libzfs_import.c \
- libzfs_iter.c \
- libzfs_mount.c \
- libzfs_pool.c \
- libzfs_sendrecv.c \
- libzfs_status.c \
- libzfs_util.c \
- zfeature_common.c \
- zfs_comutil.c \
- zfs_deleg.c \
- zfs_fletcher.c \
- zfs_namecheck.c \
- zfs_prop.c \
- zpool_prop.c \
- zprop_common.c \
+# FreeBSD
+USER_C += \
+ libzfs_compat.c \
+ libzfs_ioctl_compat.c \
+ libzfs_zmount.c
-WARNS?= 0
-SHLIB_MAJOR= 3
+# libshare
+USER_C += \
+ libshare.c \
+ nfs.c \
+ smb.c
+
+
+KERNEL_C = \
+ algs/sha2/sha2.c \
+ cityhash.c \
+ zfeature_common.c \
+ zfs_comutil.c \
+ zfs_deleg.c \
+ zfs_fletcher.c \
+ zfs_fletcher_superscalar.c \
+ zfs_fletcher_superscalar4.c \
+ zfs_namecheck.c \
+ zfs_prop.c \
+ zfs_uio.c \
+ zpool_prop.c \
+ zprop_common.c
+
+
+KERNEL_C+= zstd.c \
+ zfs_zstd.c
+
+
+ARCH_C =
+.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386"
+ARCH_C += zfs_fletcher_intel.c \
+ zfs_fletcher_sse.c
+CFLAGS += -DHAVE_SSE2
+.endif
+.if ${MACHINE_ARCH} == "amd64"
+ARCH_C += zfs_fletcher_avx512.c
+CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F
+.endif
+.if ${MACHINE_ARCH} == "aarch64"
+ARCH_C += zfs_fletcher_aarch64_neon.c
+.endif
+
+SRCS= $(USER_C) $(KERNEL_C) $(ARCH_C)
+
+WARNS?= 2
+SHLIB_MAJOR= 4
CSTD= c99
-CFLAGS+= -DZFS_NO_ACL
-CFLAGS+= -I${SRCTOP}/sbin/mount
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare
+CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include
+CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/zstd/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zstd.c= -fno-tree-vectorize
+
.include <bsd.lib.mk>
Index: head/cddl/lib/libzfs_core/Makefile
===================================================================
--- head/cddl/lib/libzfs_core/Makefile
+++ head/cddl/lib/libzfs_core/Makefile
@@ -1,37 +1,28 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/compat/opensolaris/misc
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core
+.PATH: ${SRCTOP}/sys/contrib/openzfs/include
+
LIB= zfs_core
LIBADD= nvpair
PACKAGE= runtime
INCS= libzfs_core.h
-SRCS= libzfs_core.c \
- libzfs_core_compat.c \
- zfs_ioctl_compat.c
+SRCS= libzfs_core.c
-SRCS+= libzfs_compat.c
-
-WARNS?= 0
+WARNS?= 2
CSTD= c99
-CFLAGS+= -DZFS_NO_ACL
-CFLAGS+= -I${SRCTOP}/sbin/mount
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core/common
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
.include <bsd.lib.mk>
Index: head/cddl/lib/libzpool/Makefile
===================================================================
--- head/cddl/lib/libzpool/Makefile
+++ head/cddl/lib/libzpool/Makefile
@@ -1,20 +1,17 @@
# $FreeBSD$
-.include "${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/Makefile.files"
# ZFS_COMMON_SRCS
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zfs
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/unicode
# LUA_SRCS
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua
-# ZFS_SHARED_SRCS
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-# LZ4_COMMON_SRCS
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/lz4
-# KERNEL_SRCS
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-# LIST_SRCS
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/os
-# ATOMIC_SRCS
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/lua
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/linux/zfs
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzpool
+
.if exists(${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S)
.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}
ATOMIC_SRCS= opensolaris_atomic.S
@@ -23,40 +20,218 @@
.PATH: ${SRCTOP}/sys/cddl/compat/opensolaris/kern
ATOMIC_SRCS= opensolaris_atomic.c
.endif
-# UNICODE_SRCS
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/unicode
-# LIBCMDUTILS_SRCS
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils/common
+.if ${MACHINE_ARCH} == "powerpc"
+# Don't waste GOT entries on small data.
+PICFLAG= -fPIC
+.endif
+
LIB= zpool
-ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} trim_map.c
-ZFS_SHARED_SRCS= ${ZFS_SHARED_OBJS:C/.o$/.c/}
-LZ4_COMMON_SRCS= lz4.c
-LUA_SRCS= ${LUA_OBJS:C/.o$/.c/}
-KERNEL_SRCS= kernel.c taskq.c util.c
-LIST_SRCS= list.c
-UNICODE_SRCS= u8_textprep.c
-LIBCMDUTILS_SRCS=nicenum.c
-SRCS= ${ZFS_COMMON_SRCS} ${ZFS_SHARED_SRCS} ${LUA_SRCS} \
- ${LZ4_COMMON_SRCS} ${KERNEL_SRCS} ${LIST_SRCS} ${ATOMIC_SRCS} \
- ${UNICODE_SRCS} ${LIBCMDUTILS_SRCS}
-WARNS?= 0
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/lz4
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils
+USER_C = \
+ kernel.c \
+ taskq.c \
+ util.c
+
+KERNEL_C = \
+ zfeature_common.c \
+ zfs_comutil.c \
+ zfs_deleg.c \
+ zfs_fletcher.c \
+ zfs_fletcher_superscalar.c \
+ zfs_fletcher_superscalar4.c \
+ zfs_namecheck.c \
+ zfs_prop.c \
+ zfs_uio.c \
+ zpool_prop.c \
+ zprop_common.c \
+ abd.c \
+ abd_os.c \
+ aggsum.c \
+ arc.c \
+ arc_os.c \
+ blkptr.c \
+ bplist.c \
+ bpobj.c \
+ bptree.c \
+ btree.c \
+ bqueue.c \
+ cityhash.c \
+ dbuf.c \
+ dbuf_stats.c \
+ ddt.c \
+ ddt_zap.c \
+ dmu.c \
+ dmu_diff.c \
+ dmu_object.c \
+ dmu_objset.c \
+ dmu_recv.c \
+ dmu_redact.c \
+ dmu_send.c \
+ dmu_traverse.c \
+ dmu_tx.c \
+ dmu_zfetch.c \
+ dnode.c \
+ dnode_sync.c \
+ dsl_bookmark.c \
+ dsl_dataset.c \
+ dsl_deadlist.c \
+ dsl_deleg.c \
+ dsl_dir.c \
+ dsl_crypt.c \
+ dsl_pool.c \
+ dsl_prop.c \
+ dsl_scan.c \
+ dsl_synctask.c \
+ dsl_destroy.c \
+ dsl_userhold.c \
+ edonr_zfs.c \
+ hkdf.c \
+ fm.c \
+ gzip.c \
+ lzjb.c \
+ lz4.c \
+ metaslab.c \
+ mmp.c \
+ multilist.c \
+ objlist.c \
+ pathname.c \
+ range_tree.c \
+ refcount.c \
+ rrwlock.c \
+ sa.c \
+ sha256.c \
+ skein_zfs.c \
+ spa.c \
+ spa_boot.c \
+ spa_checkpoint.c \
+ spa_config.c \
+ spa_errlog.c \
+ spa_history.c \
+ spa_log_spacemap.c \
+ spa_misc.c \
+ spa_stats.c \
+ space_map.c \
+ space_reftree.c \
+ txg.c \
+ trace.c \
+ uberblock.c \
+ unique.c \
+ vdev.c \
+ vdev_cache.c \
+ vdev_file.c \
+ vdev_indirect_births.c \
+ vdev_indirect.c \
+ vdev_indirect_mapping.c \
+ vdev_initialize.c \
+ vdev_label.c \
+ vdev_mirror.c \
+ vdev_missing.c \
+ vdev_queue.c \
+ vdev_raidz.c \
+ vdev_raidz_math_aarch64_neon.c \
+ vdev_raidz_math_aarch64_neonx2.c \
+ vdev_raidz_math_avx2.c \
+ vdev_raidz_math_avx512bw.c \
+ vdev_raidz_math_avx512f.c \
+ vdev_raidz_math.c \
+ vdev_raidz_math_scalar.c \
+ vdev_rebuild.c \
+ vdev_removal.c \
+ vdev_root.c \
+ vdev_trim.c \
+ zap.c \
+ zap_leaf.c \
+ zap_micro.c \
+ zcp.c \
+ zcp_get.c \
+ zcp_global.c \
+ zcp_iter.c \
+ zcp_set.c \
+ zcp_synctask.c \
+ zfeature.c \
+ zfs_byteswap.c \
+ zfs_debug.c \
+ zfs_fm.c \
+ zfs_fuid.c \
+ zfs_sa.c \
+ zfs_znode.c \
+ zfs_ratelimit.c \
+ zfs_rlock.c \
+ zil.c \
+ zio.c \
+ zio_checksum.c \
+ zio_compress.c \
+ zio_crypt.c \
+ zio_inject.c \
+ zle.c \
+ zrlock.c \
+ zthr.c
+
+ARCH_C =
+.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386"
+ARCH_C += vdev_raidz_math_sse2.c \
+ vdev_raidz_math_ssse3.c \
+ zfs_fletcher_intel.c \
+ zfs_fletcher_sse.c
+CFLAGS += -DHAVE_SSE2 -DHAVE_SSE3
+.endif
+.if ${MACHINE_ARCH} == "amd64"
+ARCH_C += zfs_fletcher_avx512.c
+CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F \
+ -DHAVE_AVX512BW
+.endif
+.if ${MACHINE_ARCH} == "aarch64"
+ARCH_C += zfs_fletcher_aarch64_neon.c
+.endif
+
+LUA_C = \
+ lapi.c \
+ lauxlib.c \
+ lbaselib.c \
+ lcode.c \
+ lcompat.c \
+ lcorolib.c \
+ lctype.c \
+ ldebug.c \
+ ldo.c \
+ lfunc.c \
+ lgc.c \
+ llex.c \
+ lmem.c \
+ lobject.c \
+ lopcodes.c \
+ lparser.c \
+ lstate.c \
+ lstring.c \
+ lstrlib.c \
+ ltable.c \
+ ltablib.c \
+ ltm.c \
+ lvm.c \
+ lzio.c
+
+UNICODE_C = u8_textprep.c uconv.c
+
+SRCS= ${USER_C} ${KERNEL_C} ${LUA_C} ${UNICODE_C} ${ARCH_C}
+
+WARNS?= 2
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+CFLAGS+= -I${SRCTOP}/sys/modules/zfs
+CFLAGS+= -DLIB_ZPOOL_BUILD -DZFS_DEBUG
+
+
# XXX: pthread doesn't have mutex_owned() equivalent, so we need to look
# into libthr private structures. That's sooo evil, but it's only for
# ZFS debugging tools needs.
@@ -64,10 +239,9 @@
CFLAGS+= -I${SRCTOP}/lib/libpthread/thread
CFLAGS+= -I${SRCTOP}/lib/libpthread/sys
CFLAGS+= -I${SRCTOP}/lib/libthr/arch/${MACHINE_CPUARCH}/include
-CFLAGS.lz4.c+= -D_FAKE_KERNEL
CFLAGS.gcc+= -fms-extensions
-LIBADD= md pthread z nvpair avl umem
+LIBADD= md pthread z spl icp nvpair avl umem
# atomic.S doesn't like profiling.
MK_PROFILE= no
Index: head/cddl/lib/libzpool/Makefile.depend
===================================================================
--- head/cddl/lib/libzpool/Makefile.depend
+++ head/cddl/lib/libzpool/Makefile.depend
@@ -3,6 +3,7 @@
DIRDEPS = \
cddl/lib/libavl \
+ cddl/lib/libicp \
cddl/lib/libnvpair \
cddl/lib/libumem \
gnu/lib/csu \
Index: head/cddl/lib/libzutil/Makefile
===================================================================
--- head/cddl/lib/libzutil/Makefile
+++ head/cddl/lib/libzutil/Makefile
@@ -0,0 +1,42 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil
+.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil/os/freebsd
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/freebsd/zfs
+
+LIB= zutil
+LIBADD= avl tpool
+PACKAGE= runtime
+
+INCS = zutil_import.h
+
+SRCS = \
+ zutil_device_path.c \
+ zutil_import.c \
+ zutil_import.h \
+ zutil_nicenum.c \
+ zutil_pool.c
+
+SRCS += \
+ zutil_device_path_os.c \
+ zutil_import_os.c \
+ zutil_compat.c
+
+SRCS += zfs_ioctl_compat.c
+
+
+WARNS?= 2
+CSTD= c99
+
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzutil
+CFLAGS+= -DHAVE_ISSETUGID -DIN_BASE
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+
+.include <bsd.lib.mk>
Index: head/cddl/sbin/zfs/Makefile
===================================================================
--- head/cddl/sbin/zfs/Makefile
+++ head/cddl/sbin/zfs/Makefile
@@ -1,27 +1,77 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zfs
+ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
+.PATH: ${ZFSTOP}/cmd/zfs
+.PATH: ${ZFSTOP}/man/man8
+.PATH: ${ZFSTOP}/module/os/freebsd/spl
+
PACKAGE= runtime
PROG= zfs
-MAN= zfs.8 zfs-program.8
-SRCS= zfs_main.c zfs_iter.c
+MAN= \
+ zfs.8 \
+ zfs-allow.8 \
+ zfs-bookmark.8 \
+ zfs-change-key.8 \
+ zfs-clone.8 \
+ zfs-create.8 \
+ zfs-destroy.8 \
+ zfs-diff.8 \
+ zfs-get.8 \
+ zfs-groupspace.8 \
+ zfs-hold.8 \
+ zfs-inherit.8 \
+ zfs-jail.8 \
+ zfs-list.8 \
+ zfs-load-key.8 \
+ zfs-mount.8 \
+ zfs-program.8 \
+ zfs-project.8 \
+ zfs-projectspace.8 \
+ zfs-promote.8 \
+ zfs-receive.8 \
+ zfs-recv.8 \
+ zfs-redact.8 \
+ zfs-release.8 \
+ zfs-rename.8 \
+ zfs-rollback.8 \
+ zfs-send.8 \
+ zfs-set.8 \
+ zfs-share.8 \
+ zfs-snapshot.8 \
+ zfs-unallow.8 \
+ zfs-unjail.8 \
+ zfs-unload-key.8 \
+ zfs-unmount.8 \
+ zfs-upgrade.8 \
+ zfs-userspace.8 \
+ zfs-wait.8 \
+ zfsconcepts.8 \
+ zfsprops.8
+SRCS= \
+ zfs_iter.c \
+ zfs_iter.h \
+ zfs_main.c \
+ zfs_util.h \
+ zfs_project.c \
+ zfs_projectutil.h
-WARNS?= 0
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
+WARNS?= 2
-LIBADD= jail nvpair uutil zfs_core zfs
+CFLAGS+= \
+ -DIN_BASE \
+ -I${ZFSTOP}/include \
+ -I${ZFSTOP}/include/os/freebsd \
+ -I${ZFSTOP}/lib/libspl/include \
+ -I${ZFSTOP}/lib/libspl/include/os/freebsd \
+ -I${SRCTOP}/sys \
+ -I${SRCTOP}/cddl/compat/opensolaris/include \
+ -I${ZFSTOP}/module/icp/include \
+ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
+ -DHAVE_ISSETUGID \
+ -include ${SRCTOP}/sys/modules/zfs/zfs_config.h \
+ -I${SRCTOP}/sys/modules/zfs
+LIBADD= jail avl nvpair geom uutil zfs_core spl tpool zutil zfs m crypto
+LDADD+= -pthread
.include <bsd.prog.mk>
Index: head/cddl/sbin/zpool/Makefile
===================================================================
--- head/cddl/sbin/zpool/Makefile
+++ head/cddl/sbin/zpool/Makefile
@@ -1,32 +1,76 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zpool
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/stat/common
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
+ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
+.PATH: ${ZFSTOP}/man/man5
+.PATH: ${ZFSTOP}/man/man8
+.PATH: ${ZFSTOP}/cmd/zpool
+.PATH: ${ZFSTOP}/cmd/zpool/os/freebsd
+
+
PACKAGE= runtime
PROG= zpool
-MAN= zpool.8 zpool-features.7
-SRCS= zpool_main.c zpool_vdev.c zpool_iter.c zpool_util.c zfs_comutil.c
-SRCS+= timestamp.c
+MAN= \
+ spl-module-parameters.5 \
+ zfs-module-parameters.5 \
+ zpool.8 \
+ zpool-add.8 \
+ zpool-attach.8 \
+ zpool-checkpoint.8 \
+ zpool-clear.8 \
+ zpool-create.8 \
+ zpool-destroy.8 \
+ zpool-detach.8 \
+ zpool-events.8 \
+ zpool-export.8 \
+ zpool-features.5 \
+ zpool-get.8 \
+ zpool-history.8 \
+ zpool-import.8 \
+ zpool-initialize.8 \
+ zpool-iostat.8 \
+ zpool-labelclear.8 \
+ zpool-list.8 \
+ zpool-offline.8 \
+ zpool-online.8 \
+ zpool-reguid.8 \
+ zpool-remove.8 \
+ zpool-reopen.8 \
+ zpool-replace.8 \
+ zpool-resilver.8 \
+ zpool-scrub.8 \
+ zpool-set.8 \
+ zpool-split.8 \
+ zpool-status.8 \
+ zpool-sync.8 \
+ zpool-trim.8 \
+ zpool-upgrade.8 \
+ zpool-wait.8 \
+ zpoolconcepts.8 \
+ zpoolprops.8
+SRCS= \
+ zpool_iter.c \
+ zpool_main.c \
+ zpool_util.c \
+ zpool_util.h \
+ zpool_vdev.c \
+ zpool_vdev_os.c
-WARNS?= 0
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/cmd/stat/common
+WARNS?= 2
-LIBADD= geom nvpair uutil zfs
+CFLAGS+= \
+ -DIN_BASE \
+ -I${ZFSTOP}/include \
+ -I${ZFSTOP}/lib/libspl/include \
+ -I${ZFSTOP}/lib/libspl/include/os/freebsd \
+ -I${SRCTOP}/sys \
+ -I${SRCTOP}/cddl/compat/opensolaris/include \
+ -I${ZFSTOP}/cmd/zpool \
+ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
+ -DHAVE_ISSETUGID \
+ -include ${SRCTOP}/sys/modules/zfs/zfs_config.h \
+ -DSYSCONFDIR=\"/etc\"
+LIBADD= geom nvpair uutil zfs zutil avl spl tpool zfs_core m
+LDADD+= -pthread
.include <bsd.prog.mk>
Index: head/cddl/usr.bin/Makefile
===================================================================
--- head/cddl/usr.bin/Makefile
+++ head/cddl/usr.bin/Makefile
@@ -7,7 +7,7 @@
ctfdump \
ctfmerge \
${_zinject} \
- ${_zlook} \
+ ${_zstream} \
${_zstreamdump} \
${_ztest}
@@ -15,10 +15,9 @@
.if ${MK_ZFS} != "no"
_zinject= zinject
-#_zlook= zlook
.if ${MK_LIBTHR} != "no"
_ztest= ztest
-_zstreamdump = zstreamdump
+_zstream = zstream
.endif
.endif
Index: head/cddl/usr.bin/ctfconvert/Makefile
===================================================================
--- head/cddl/usr.bin/ctfconvert/Makefile
+++ head/cddl/usr.bin/ctfconvert/Makefile
@@ -27,6 +27,12 @@
traverse.c \
util.c
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR} \
@@ -35,8 +41,9 @@
-I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \
-I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/cvt \
-I${OPENSOLARIS_SYS_DISTDIR}/uts/common
+CFLAGS+= -DHAVE_ISSETUGID
-LIBADD= dwarf elf z pthread
+LIBADD= spl dwarf elf z pthread
HAS_TESTS=
SUBDIR.${MK_TESTS}+= tests
Index: head/cddl/usr.bin/ctfconvert/Makefile.depend
===================================================================
--- head/cddl/usr.bin/ctfconvert/Makefile.depend
+++ head/cddl/usr.bin/ctfconvert/Makefile.depend
@@ -5,6 +5,7 @@
gnu/lib/csu \
include \
include/xlocale \
+ cddl/lib/libspl \
lib/${CSU_DIR} \
lib/libc \
lib/libcompiler_rt \
Index: head/cddl/usr.bin/ctfdump/Makefile
===================================================================
--- head/cddl/usr.bin/ctfdump/Makefile
+++ head/cddl/usr.bin/ctfdump/Makefile
@@ -8,6 +8,13 @@
symbol.c \
utils.c
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${OPENSOLARIS_USR_DISTDIR} \
-I${OPENSOLARIS_SYS_DISTDIR} \
-I${OPENSOLARIS_USR_DISTDIR}/head \
@@ -16,6 +23,7 @@
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \
-I${OPENSOLARIS_SYS_DISTDIR}/uts/common
+CFLAGS+= -DHAVE_ISSETUGID
LIBADD= elf z
Index: head/cddl/usr.bin/ctfmerge/Makefile
===================================================================
--- head/cddl/usr.bin/ctfmerge/Makefile
+++ head/cddl/usr.bin/ctfmerge/Makefile
@@ -24,6 +24,13 @@
WARNS?= 1
+
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR} \
@@ -32,7 +39,8 @@
-I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \
-I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/cvt \
-I${OPENSOLARIS_SYS_DISTDIR}/uts/common
+CFLAGS+= -DHAVE_ISSETUGID
-LIBADD= elf z pthread
+LIBADD= spl elf z pthread
.include <bsd.prog.mk>
Index: head/cddl/usr.bin/zinject/Makefile
===================================================================
--- head/cddl/usr.bin/zinject/Makefile
+++ head/cddl/usr.bin/zinject/Makefile
@@ -1,24 +1,28 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zinject
+ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
+.PATH: ${ZFSTOP}/cmd/zinject
+.PATH: ${ZFSTOP}/man/man8
+
PROG= zinject
+INCS= zinject.h
SRCS= zinject.c translate.c
-MAN=
+MAN= zinject.8
-WARNS?= 0
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs/
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+WARNS?= 2
+CFLAGS+= \
+ -DIN_BASE \
+ -I${ZFSTOP}/include \
+ -I${ZFSTOP}/lib/libspl/include \
+ -I${ZFSTOP}/lib/libspl/include/os/freebsd \
+ -I${SRCTOP}/sys \
+ -I${SRCTOP}/cddl/compat/opensolaris/include \
+ -I${ZFSTOP}/module/icp/include \
+ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
+ -DHAVE_ISSETUGID \
+ -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
-LIBADD= geom m nvpair umem uutil zfs_core zfs zpool
+LIBADD= geom m nvpair umem uutil avl spl zfs_core zfs zutil zpool
.include <bsd.prog.mk>
Index: head/cddl/usr.bin/zlook/Makefile
===================================================================
--- head/cddl/usr.bin/zlook/Makefile
+++ head/cddl/usr.bin/zlook/Makefile
@@ -1,12 +0,0 @@
-# $FreeBSD$
-
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zlook
-
-PROG= zlook
-MAN=
-
-WARNS?= 0
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-
-.include <bsd.prog.mk>
Index: head/cddl/usr.bin/zstream/Makefile
===================================================================
--- head/cddl/usr.bin/zstream/Makefile
+++ head/cddl/usr.bin/zstream/Makefile
@@ -0,0 +1,32 @@
+# $FreeBSD$
+
+ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
+
+.PATH: ${ZFSTOP}/cmd/zstream
+.PATH: ${ZFSTOP}/man/man8
+
+PROG= zstream
+MAN= zstream.8
+INCS= zstream.h
+SRCS= \
+ zstream.c \
+ zstream_dump.c \
+ zstream_redup.c \
+ zstream_token.c
+
+WARNS?= 2
+CFLAGS+= \
+ -DIN_BASE \
+ -I${ZFSTOP}/include \
+ -I${ZFSTOP}/lib/libspl/include \
+ -I${ZFSTOP}/lib/libspl/include/os/freebsd \
+ -I${SRCTOP}/sys \
+ -I${SRCTOP}/cddl/compat/opensolaris/include \
+ -I${ZFSTOP}/module/icp/include \
+ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
+ -DHAVE_ISSETUGID \
+ -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+
+LIBADD= geom m nvpair umem uutil avl spl zfs_core zfs zutil zpool
+
+.include <bsd.prog.mk>
Index: head/cddl/usr.bin/zstreamdump/Makefile
===================================================================
--- head/cddl/usr.bin/zstreamdump/Makefile
+++ head/cddl/usr.bin/zstreamdump/Makefile
@@ -1,23 +1,11 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zstreamdump
+ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
-PROG= zstreamdump
-MAN= zstreamdump.1
+.PATH: ${ZFSTOP}/cmd/zstreamdump
+.PATH: ${ZFSTOP}/man/man8
-WARNS?= 0
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-
-LIBADD= m nvpair umem zpool zfs pthread z avl
-
-CSTD= c99
+SCRIPTS= zstreamdump
+MAN= zstreamdump.8
.include <bsd.prog.mk>
Index: head/cddl/usr.bin/ztest/Makefile
===================================================================
--- head/cddl/usr.bin/ztest/Makefile
+++ head/cddl/usr.bin/ztest/Makefile
@@ -1,30 +1,33 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/ztest
+ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
+.PATH: ${ZFSTOP}/cmd/ztest
+.PATH: ${ZFSTOP}/man/man1
+
PROG= ztest
-MAN=
+MAN= ztest.1
-WARNS?= 0
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+WARNS?= 2
+CFLAGS+= \
+ -DIN_BASE \
+ -I${ZFSTOP}/include \
+ -I${ZFSTOP}/lib/libspl/include \
+ -I${ZFSTOP}/lib/libspl/include/os/freebsd \
+ -I${SRCTOP}/sys \
+ -I${SRCTOP}/cddl/compat/opensolaris/include \
+ -I${ZFSTOP}/module/icp/include \
+ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
+ -DHAVE_ISSETUGID \
+ -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
-LIBADD= geom m nvpair umem zpool pthread avl zfs_core zfs uutil
+LIBADD= geom m nvpair umem zpool pthread avl zfs_core spl zutil zfs uutil icp
CSTD= c99
# Since there are many asserts in this program, it makes no sense to compile
# it without debugging.
-CFLAGS+= -g -DDEBUG=1 -Wno-format
+CFLAGS+= -g -DDEBUG=1 -Wno-format -DZFS_DEBUG=1
CFLAGS.gcc+= -fms-extensions
HAS_TESTS=
Index: head/cddl/usr.sbin/dtrace/Makefile
===================================================================
--- head/cddl/usr.sbin/dtrace/Makefile
+++ head/cddl/usr.sbin/dtrace/Makefile
@@ -10,6 +10,13 @@
WARNS?= 1
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR}/head \
@@ -17,12 +24,13 @@
-I${OPENSOLARIS_USR_DISTDIR}/lib/libproc/common \
-I${OPENSOLARIS_SYS_DISTDIR}/uts/common \
-I${OPENSOLARIS_SYS_DISTDIR}/compat
+CFLAGS+= -DHAVE_ISSETUGID
# Optional debugging stuff...
#CFLAGS+= -DNEED_ERRLOC
#YFLAGS+= -d
-LIBADD= dtrace ctf elf proc
+LIBADD= dtrace ctf elf proc spl
.if ${MK_DTRACE_TESTS} != "no"
SUBDIR+= tests
Index: head/cddl/usr.sbin/lockstat/Makefile
===================================================================
--- head/cddl/usr.sbin/lockstat/Makefile
+++ head/cddl/usr.sbin/lockstat/Makefile
@@ -8,6 +8,14 @@
WARNS?= 1
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR}/head \
@@ -16,6 +24,7 @@
-I${OPENSOLARIS_SYS_DISTDIR}/uts/common \
-I${OPENSOLARIS_SYS_DISTDIR}/compat \
-I${SRCTOP}/sys
+CFLAGS+= -DHAVE_ISSETUGID
CFLAGS+= -DNEED_ERRLOC -g
Index: head/cddl/usr.sbin/plockstat/Makefile
===================================================================
--- head/cddl/usr.sbin/plockstat/Makefile
+++ head/cddl/usr.sbin/plockstat/Makefile
@@ -8,6 +8,13 @@
WARNS?= 1
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${OPENSOLARIS_USR_DISTDIR}/head \
@@ -17,6 +24,7 @@
-I${OPENSOLARIS_SYS_DISTDIR}/compat \
-I${SRCTOP}/cddl/lib/libdtrace \
-I${SRCTOP}/sys
+CFLAGS+= -DHAVE_ISSETUGID
LIBADD= dtrace proc
Index: head/cddl/usr.sbin/zdb/Makefile
===================================================================
--- head/cddl/usr.sbin/zdb/Makefile
+++ head/cddl/usr.sbin/zdb/Makefile
@@ -1,33 +1,33 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zdb
+ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
+.PATH: ${ZFSTOP}/cmd/zdb
+.PATH: ${ZFSTOP}/man/man8
+
PROG= zdb
MAN= zdb.8
+INCS= zdb.h
SRCS= zdb.c zdb_il.c
WARNS?= 2
CSTD= c99
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+CFLAGS+= \
+ -DIN_BASE \
+ -I${ZFSTOP}/include \
+ -I${ZFSTOP}/lib/libspl/include \
+ -I${ZFSTOP}/lib/libspl/include/os/freebsd \
+ -I${ZFSTOP}/lib/libspl/include/os/freebsd/spl \
+ -I${SRCTOP}/sys \
+ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
+ -DHAVE_ISSETUGID
-LIBADD= nvpair umem uutil zfs zpool
+LIBADD= nvpair umem uutil zfs spl avl zutil zpool
CFLAGS.gcc+= -fms-extensions
# Since there are many asserts in this program, it makes no sense to compile
# it without debugging.
-CFLAGS+= -g -DDEBUG=1
+CFLAGS+= -g -DDEBUG=1 -DZFS_DEBUG=1
.include <bsd.prog.mk>
Index: head/cddl/usr.sbin/zfsd/Makefile.common
===================================================================
--- head/cddl/usr.sbin/zfsd/Makefile.common
+++ head/cddl/usr.sbin/zfsd/Makefile.common
@@ -10,29 +10,24 @@
zpool_list.cc \
zfsd_main.cc
-WARNS?= 3
+WARNS?= 2
# Ignore warnings about Solaris specific pragmas.
IGNORE_PRAGMA= YES
-INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-INCFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-INCFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-INCFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common
-INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-INCFLAGS+= -I${SRCTOP}/cddl/usr.sbin
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -I${SRCTOP}/cddl/usr.sbin
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN ${INCFLAGS}
+# use issetugid(2)
+CFLAGS+= -D_MACHINE_FLOAT_H_ -DHAVE_ISSETUGID
-LIBADD+= devdctl zfs zfs_core util geom bsdxml sbuf nvpair uutil
+LIBADD+= devdctl zfs zfs_core util geom bsdxml sbuf nvpair avl uutil zutil
cscope:
find ${.CURDIR} -type f -a \( -name "*.[ch]" -o -name "*.cc" \) \
Index: head/cddl/usr.sbin/zfsd/callout.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/callout.cc
+++ head/cddl/usr.sbin/zfsd/callout.cc
@@ -39,6 +39,7 @@
* timer services built on top of the POSIX interval timer.
*/
+#include <sys/byteorder.h>
#include <sys/time.h>
#include <signal.h>
Index: head/cddl/usr.sbin/zfsd/case_file.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/case_file.cc
+++ head/cddl/usr.sbin/zfsd/case_file.cc
@@ -39,11 +39,13 @@
* accumulate in order to mark a device as degraded.
*/
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <dirent.h>
+#include <fcntl.h>
#include <iomanip>
#include <fstream>
#include <functional>
@@ -75,7 +77,6 @@
__FBSDID("$FreeBSD$");
/*============================ Namespace Control =============================*/
-using std::auto_ptr;
using std::hex;
using std::ifstream;
using std::stringstream;
@@ -239,8 +240,6 @@
{
ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
- zpool_boot_label_t boot_type;
- uint64_t boot_size;
if (pool == NULL || !RefreshVdevState()) {
/*
@@ -333,13 +332,7 @@
}
/* Write a label on the newly inserted disk. */
- if (zpool_is_bootable(pool))
- boot_type = ZPOOL_COPY_BOOT_LABEL;
- else
- boot_type = ZPOOL_NO_BOOT_LABEL;
- boot_size = zpool_get_prop_int(pool, ZPOOL_PROP_BOOTSIZE, NULL);
- if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str(),
- boot_type, boot_size, NULL) != 0) {
+ if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
syslog(LOG_ERR,
"Replace vdev(%s/%s) by physical path (label): %s: %s\n",
zpool_get_name(pool), VdevGUIDString().c_str(),
@@ -1118,7 +1111,7 @@
nvlist_free(newvd);
retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
- /*replace*/B_TRUE) == 0);
+ /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
if (retval)
syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
poolname, oldstr.c_str(), path);
Index: head/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
+++ head/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
@@ -30,6 +30,7 @@
* Authors: Alan Somers (Spectra Logic Corporation)
*/
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <stdarg.h>
#include <syslog.h>
Index: head/cddl/usr.sbin/zfsd/vdev.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/vdev.cc
+++ head/cddl/usr.sbin/zfsd/vdev.cc
@@ -39,6 +39,7 @@
*/
#include <syslog.h>
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <sys/fs/zfs.h>
#include <libzfs.h>
Index: head/cddl/usr.sbin/zfsd/vdev_iterator.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/vdev_iterator.cc
+++ head/cddl/usr.sbin/zfsd/vdev_iterator.cc
@@ -38,6 +38,7 @@
* Implementation of the VdevIterator class.
*/
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <sys/fs/zfs.h>
#include <stdint.h>
Index: head/cddl/usr.sbin/zfsd/zfsd.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/zfsd.cc
+++ head/cddl/usr.sbin/zfsd/zfsd.cc
@@ -42,10 +42,12 @@
*/
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <sys/param.h>
#include <sys/fs/zfs.h>
#include <err.h>
+#include <fcntl.h>
#include <libgeom.h>
#include <libutil.h>
#include <poll.h>
Index: head/cddl/usr.sbin/zfsd/zfsd_event.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/zfsd_event.cc
+++ head/cddl/usr.sbin/zfsd/zfsd_event.cc
@@ -34,6 +34,7 @@
* \file zfsd_event.cc
*/
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/vdev_impl.h>
@@ -41,12 +42,13 @@
#include <syslog.h>
#include <libzfs.h>
+#include <libzutil.h>
/*
* Undefine flush, defined by cpufunc.h on sparc64, because it conflicts with
* C++ flush methods
*/
#undef flush
-
+#undef __init
#include <list>
#include <map>
#include <sstream>
@@ -190,7 +192,8 @@
if (poolName != NULL)
free(poolName);
- nlabels = zpool_read_all_labels(devFd, &devLabel);
+ if (zpool_read_label(devFd, &devLabel, &nlabels) != 0)
+ return (NULL);
/*
* If we find a disk with fewer than the maximum number of
* labels, it might be the whole disk of a partitioned disk
Index: head/cddl/usr.sbin/zfsd/zfsd_exception.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/zfsd_exception.cc
+++ head/cddl/usr.sbin/zfsd/zfsd_exception.cc
@@ -36,6 +36,7 @@
* Implementation of the ZfsdException class.
*/
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <sys/fs/zfs.h>
#include <syslog.h>
Index: head/cddl/usr.sbin/zfsd/zpool_list.cc
===================================================================
--- head/cddl/usr.sbin/zfsd/zpool_list.cc
+++ head/cddl/usr.sbin/zfsd/zpool_list.cc
@@ -38,6 +38,7 @@
* Implementation of the ZpoolList class.
*/
#include <sys/cdefs.h>
+#include <sys/byteorder.h>
#include <sys/fs/zfs.h>
#include <stdint.h>
Index: head/cddl/usr.sbin/zhack/Makefile
===================================================================
--- head/cddl/usr.sbin/zhack/Makefile
+++ head/cddl/usr.sbin/zhack/Makefile
@@ -1,6 +1,6 @@
# $FreeBSD$
-.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zhack
+.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zhack
PROG= zhack
MAN=
@@ -8,20 +8,20 @@
WARNS?= 0
CSTD= c99
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
+WARNS?= 2
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
-LIBADD= nvpair zfs zpool
+
+LIBADD= nvpair zfs spl zutil zpool
CFLAGS+= -DDEBUG=1
#DEBUG_FLAGS+= -g
Index: head/include/Makefile
===================================================================
--- head/include/Makefile
+++ head/include/Makefile
@@ -244,7 +244,7 @@
${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 teken.h \
${SDESTDIR}${INCLUDEDIR}/teken
.if ${MK_CDDL} != "no"
- cd ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/sys; \
+ cd ${SRCTOP}/sys/contrib/openzfs/include/sys; \
${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 nvpair.h \
${SDESTDIR}${INCLUDEDIR}/sys
.endif
@@ -377,7 +377,7 @@
done
.if ${MK_CDDL} != "no"
${INSTALL_SYMLINK} ${TAG_ARGS} \
- ../../../sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h \
+ ../../../sys/contrib/openzfs/include/sys/nvpair.h \
${SDESTDIR}${INCLUDEDIR}/sys
.endif
.if ${MK_MLX5TOOL} != "no"
Index: head/lib/libbe/Makefile
===================================================================
--- head/lib/libbe/Makefile
+++ head/lib/libbe/Makefile
@@ -16,19 +16,18 @@
IGNORE_PRAGMA= yes
LIBADD+= zfs
-LIBADD+= nvpair
+LIBADD+= nvpair spl
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
+CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN
HAS_TESTS= YES
SUBDIR.${MK_TESTS}+= tests
Index: head/lib/libbe/be.c
===================================================================
--- head/lib/libbe/be.c
+++ head/lib/libbe/be.c
@@ -35,10 +35,13 @@
#include <sys/queue.h>
#include <sys/zfs_context.h>
#include <sys/mntent.h>
+#include <sys/zfs_ioctl.h>
+#include <libzutil.h>
#include <ctype.h>
#include <libgen.h>
#include <libzfs_core.h>
+#include <libzfs_impl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
@@ -993,12 +996,8 @@
ZFS_TYPE_FILESYSTEM)) == NULL)
return (set_error(lbh, BE_ERR_ZFSOPEN));
- /* recurse, nounmount, forceunmount */
- struct renameflags flags = {
- .nounmount = 1,
- };
- err = zfs_rename(zfs_hdl, NULL, full_new, flags);
+ err = zfs_rename(zfs_hdl, full_new, B_FALSE, B_FALSE);
zfs_close(zfs_hdl);
if (err != 0)
@@ -1025,7 +1024,7 @@
if ((zfs = zfs_open(lbh->lzh, buf, ZFS_TYPE_DATASET)) == NULL)
return (set_error(lbh, BE_ERR_ZFSOPEN));
- err = zfs_send_one(zfs, NULL, fd, flags);
+ err = zfs_send_one(zfs, NULL, fd, &flags, /* redactbook */ NULL);
zfs_close(zfs);
return (err);
Index: head/lib/libbe/tests/Makefile
===================================================================
--- head/lib/libbe/tests/Makefile
+++ head/lib/libbe/tests/Makefile
@@ -8,14 +8,19 @@
SRCS_target_prog= target_prog.c
BINDIR_target_prog= ${TESTSDIR}
-LIBADD+= zfs
-LIBADD+= nvpair
-LIBADD+= be
+LIBADD+= zfs \
+ spl \
+ nvpair \
+ be
CFLAGS+= -I${SRCTOP}/lib/libbe
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN
+CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
.include <bsd.test.mk>
Index: head/lib/libproc/Makefile
===================================================================
--- head/lib/libproc/Makefile
+++ head/lib/libproc/Makefile
@@ -29,6 +29,13 @@
.if ${MK_CDDL} != "no"
LIBADD+= ctf
IGNORE_PRAGMA= YES
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID -DHAVE_BOOLEAN
CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libctf/common \
-I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common \
-I${SRCTOP}/sys/cddl/compat/opensolaris
Index: head/lib/libproc/proc_bkpt.c
===================================================================
--- head/lib/libproc/proc_bkpt.c
+++ head/lib/libproc/proc_bkpt.c
@@ -112,7 +112,7 @@
return (-1);
}
- DPRINTFX("adding breakpoint at 0x%lx", address);
+ DPRINTFX("adding breakpoint at 0x%lx", (unsigned long)address);
stopped = 0;
if (phdl->status != PS_STOP) {
@@ -173,7 +173,7 @@
return (-1);
}
- DPRINTFX("removing breakpoint at 0x%lx", address);
+ DPRINTFX("removing breakpoint at 0x%lx", (unsigned long)address);
stopped = 0;
if (phdl->status != PS_STOP) {
Index: head/lib/libproc/proc_sym.c
===================================================================
--- head/lib/libproc/proc_sym.c
+++ head/lib/libproc/proc_sym.c
@@ -307,7 +307,7 @@
*/
if (data->d_size < sizeof(crc) + 1) {
DPRINTFX("ERROR: debuglink section is too small (%zd bytes)",
- data->d_size);
+ (ssize_t)data->d_size);
goto internal;
}
if (strnlen(data->d_buf, data->d_size) >= data->d_size - sizeof(crc)) {
@@ -510,7 +510,7 @@
int error;
if ((mapping = _proc_addr2map(p, addr)) == NULL) {
- DPRINTFX("ERROR: proc_addr2map failed to resolve 0x%jx", addr);
+ DPRINTFX("ERROR: proc_addr2map failed to resolve 0x%jx", (uintmax_t)addr);
return (-1);
}
if (open_object(mapping) != 0) {
Index: head/lib/libprocstat/libprocstat.c
===================================================================
--- head/lib/libprocstat/libprocstat.c
+++ head/lib/libprocstat/libprocstat.c
@@ -70,6 +70,7 @@
#include <sys/ptrace.h>
#define _KERNEL
#include <sys/mount.h>
+#include <sys/filedesc.h>
#include <sys/pipe.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
Index: head/lib/libprocstat/zfs/Makefile
===================================================================
--- head/lib/libprocstat/zfs/Makefile
+++ head/lib/libprocstat/zfs/Makefile
@@ -6,15 +6,19 @@
OBJS= zfs_defs.o
WARNS?= 1
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-CFLAGS+= -I${.CURDIR:H}
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN
+
+CFLAGS+= -DIN_BASE -D__KERNEL__ -D_KERNEL -I. -I${.CURDIR}
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs
+CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include
+
+CFLAGS+= -I${SRCTOP}/sys -I. -I..
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID -D_SYS_VMEM_H_ -UKLD_TIED -DKLD_MODULE
+
CFLAGS+= -fno-builtin -nostdlib
all: ${OBJS}
Index: head/lib/libprocstat/zfs_defs.c
===================================================================
--- head/lib/libprocstat/zfs_defs.c
+++ head/lib/libprocstat/zfs_defs.c
@@ -26,14 +26,10 @@
*/
#include <sys/cdefs.h>
+#include <sys/types.h>
__FBSDID("$FreeBSD$");
-/* Pretend we are kernel to get the same binary layout. */
-#define _KERNEL
-/* A hack to deal with kpilite.h. */
-#define KLD_MODULE
-
/*
* Prevent some headers from getting included and fake some types
* in order to allow this file to compile without bringing in
@@ -41,14 +37,40 @@
*/
#define _OPENSOLARIS_SYS_PATHNAME_H_
#define _OPENSOLARIS_SYS_POLICY_H_
-#define _OPENSOLARIS_SYS_VNODE_H_
#define _VNODE_PAGER_
-typedef struct vnode vnode_t;
-typedef struct vattr vattr_t;
-typedef struct xvattr xvattr_t;
-typedef struct vsecattr vsecattr_t;
-typedef enum vtype vtype_t;
+
+enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
+ VMARKER };
+
+/*
+ * Vnode attributes. A field value of VNOVAL represents a field whose value
+ * is unavailable (getattr) or which is not to be changed (setattr).
+ */
+struct vattr {
+ enum vtype va_type; /* vnode type (for create) */
+ u_short va_mode; /* files access mode and type */
+ u_short va_padding0;
+ uid_t va_uid; /* owner user id */
+ gid_t va_gid; /* owner group id */
+ nlink_t va_nlink; /* number of references to file */
+ dev_t va_fsid; /* filesystem id */
+ ino_t va_fileid; /* file id */
+ u_quad_t va_size; /* file size in bytes */
+ long va_blocksize; /* blocksize preferred for i/o */
+ struct timespec va_atime; /* time of last access */
+ struct timespec va_mtime; /* time of last modification */
+ struct timespec va_ctime; /* time file changed */
+ struct timespec va_birthtime; /* time file created */
+ u_long va_gen; /* generation number of file */
+ u_long va_flags; /* flags defined for file */
+ dev_t va_rdev; /* device the special file represents */
+ u_quad_t va_bytes; /* bytes of disk space held by file */
+ u_quad_t va_filerev; /* file modification number */
+ u_int va_vaflags; /* operations flags, see below */
+ long va_spare; /* remain quad aligned */
+};
+
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
Index: head/libexec/rc/rc.d/zfs
===================================================================
--- head/libexec/rc/rc.d/zfs
+++ head/libexec/rc/rc.d/zfs
@@ -25,6 +25,13 @@
zfs_start_main()
{
+ local cachefile
+
+ for cachefile in /boot/zfs/zpool.cache /etc/zfs/zpool.cache; do
+ if [ -r $cachefile ]; then
+ zpool import -c $cachefile -a
+ fi
+ done
zfs mount -va
zfs share -a
if [ ! -r /etc/zfs/exports ]; then
Index: head/rescue/rescue/Makefile
===================================================================
--- head/rescue/rescue/Makefile
+++ head/rescue/rescue/Makefile
@@ -129,7 +129,7 @@
CRUNCH_LIBS+= -l80211 -lalias -lcam -lncursesw -ldevstat -lipsec -llzma
.if ${MK_ZFS} != "no"
CRUNCH_LIBS+= -lavl -lzpool -lzfs_core -lzfs -lnvpair -lpthread -luutil -lumem
-CRUNCH_LIBS+= -lbe
+CRUNCH_LIBS+= -lbe -lzutil -ltpool -lspl -licp_rescue
.else
# liblzma needs pthread
CRUNCH_LIBS+= -lpthread
Index: head/sbin/bectl/Makefile
===================================================================
--- head/sbin/bectl/Makefile
+++ head/sbin/bectl/Makefile
@@ -7,16 +7,22 @@
SRCS= bectl.c bectl_jail.c bectl_list.c
-LIBADD+= be
-LIBADD+= jail
-LIBADD+= nvpair
-LIBADD+= util
+LIBADD+= be \
+ jail \
+ nvpair \
+ spl \
+ util \
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
HAS_TESTS= yes
SUBDIR.${MK_TESTS}+= tests
Index: head/sbin/bectl/Makefile.depend
===================================================================
--- head/sbin/bectl/Makefile.depend
+++ head/sbin/bectl/Makefile.depend
@@ -4,6 +4,7 @@
DIRDEPS = \
cddl/lib/libavl \
cddl/lib/libnvpair \
+ cddl/lib/libspl \
cddl/lib/libumem \
cddl/lib/libuutil \
cddl/lib/libzfs \
Index: head/sbin/bectl/bectl.c
===================================================================
--- head/sbin/bectl/bectl.c
+++ head/sbin/bectl/bectl.c
@@ -60,6 +60,8 @@
libbe_handle_t *be;
+int aok;
+
int
usage(bool explicit)
{
Index: head/sbin/zfsbootcfg/Makefile
===================================================================
--- head/sbin/zfsbootcfg/Makefile
+++ head/sbin/zfsbootcfg/Makefile
@@ -2,7 +2,7 @@
# $FreeBSD$
PROG= zfsbootcfg
-WARNS?= 1
+WARNS?= 2
MAN= zfsbootcfg.8
LIBADD+=zfs
@@ -11,17 +11,16 @@
LIBADD+=uutil
LIBADD+=geom
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
-
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
.include <bsd.prog.mk>
Index: head/share/mk/bsd.libnames.mk
===================================================================
--- head/share/mk/bsd.libnames.mk
+++ head/share/mk/bsd.libnames.mk
@@ -80,6 +80,7 @@
LIBIBNETDISC?= ${LIBDESTDIR}${LIBDIR_BASE}/libibnetdisc.a
LIBIBUMAD?= ${LIBDESTDIR}${LIBDIR_BASE}/libibumad.a
LIBIBVERBS?= ${LIBDESTDIR}${LIBDIR_BASE}/libibverbs.a
+LIBICP?= ${LIBDESTDIR}${LIBDIR_BASE}/libicp.a
LIBIPSEC?= ${LIBDESTDIR}${LIBDIR_BASE}/libipsec.a
LIBIPT?= ${LIBDESTDIR}${LIBDIR_BASE}/libipt.a
LIBJAIL?= ${LIBDESTDIR}${LIBDIR_BASE}/libjail.a
@@ -135,6 +136,7 @@
LIBSBUF?= ${LIBDESTDIR}${LIBDIR_BASE}/libsbuf.a
LIBSDP?= ${LIBDESTDIR}${LIBDIR_BASE}/libsdp.a
LIBSMB?= ${LIBDESTDIR}${LIBDIR_BASE}/libsmb.a
+LIBSPL?= ${LIBDESTDIR}${LIBDIR_BASE}/libspl.a
LIBSSL?= ${LIBDESTDIR}${LIBDIR_BASE}/libssl.a
LIBSSP_NONSHARED?= ${LIBDESTDIR}${LIBDIR_BASE}/libssp_nonshared.a
LIBSTATS?= ${LIBDESTDIR}${LIBDIR_BASE}/libstats.a
@@ -146,6 +148,7 @@
LIBTERMCAPW?= ${LIBDESTDIR}${LIBDIR_BASE}/libtermcapw.a
LIBTERMLIB?= "don't use LIBTERMLIB, use LIBTERMCAP"
LIBTINFO?= "don't use LIBTINFO, use LIBNCURSES"
+LIBTPOOL?= ${LIBDESTDIR}${LIBDIR_BASE}/libtpool.a
LIBUFS?= ${LIBDESTDIR}${LIBDIR_BASE}/libufs.a
LIBUGIDFW?= ${LIBDESTDIR}${LIBDIR_BASE}/libugidfw.a
LIBULOG?= ${LIBDESTDIR}${LIBDIR_BASE}/libulog.a
@@ -166,6 +169,7 @@
LIBZFS?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs.a
LIBZFS_CORE?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs_core.a
LIBZPOOL?= ${LIBDESTDIR}${LIBDIR_BASE}/libzpool.a
+LIBZUTIL?= ${LIBDESTDIR}${LIBDIR_BASE}/libzutil.a
# enforce the 2 -lpthread and -lc to always be the last in that exact order
.if defined(LDADD)
Index: head/share/mk/src.libnames.mk
===================================================================
--- head/share/mk/src.libnames.mk
+++ head/share/mk/src.libnames.mk
@@ -125,6 +125,7 @@
heimntlm \
heimsqlite \
hx509 \
+ icp \
ipsec \
ipt \
jail \
@@ -172,6 +173,7 @@
sdp \
sm \
smb \
+ spl \
ssl \
ssp_nonshared \
stats \
@@ -181,6 +183,7 @@
tacplus \
termcap \
termcapw \
+ tpool \
ufs \
ugidfw \
ulog \
@@ -199,6 +202,7 @@
zfs_core \
zfs \
zpool \
+ zutil
.if ${MK_BLACKLIST} != "no"
_LIBRARIES+= \
@@ -355,9 +359,10 @@
_DP_ucl= m
_DP_vmmapi= util
_DP_opencsd= cxxrt
-_DP_ctf= z
+_DP_ctf= spl z
_DP_dtrace= ctf elf proc pthread rtld_db
_DP_xo= util
+_DP_ztest= geom m nvpair umem zpool pthread avl zfs_core spl zutil zfs uutil icp
# The libc dependencies are not strictly needed but are defined to make the
# assert happy.
_DP_c= compiler_rt
@@ -375,11 +380,14 @@
_DP_ulog= md
_DP_fifolog= z
_DP_ipf= kvm
-_DP_zfs= md pthread umem util uutil m nvpair avl bsdxml geom nvpair z \
- zfs_core
+_DP_tpool= spl
+_DP_uutil= avl spl
+_DP_zfs= md pthread umem util uutil m avl bsdxml geom nvpair \
+ z zfs_core zutil
_DP_zfs_core= nvpair
-_DP_zpool= md pthread z nvpair avl umem
-_DP_be= zfs nvpair
+_DP_zpool= md pthread z icp spl nvpair avl umem
+_DP_zutil= avl tpool
+_DP_be= zfs spl nvpair
# OFED support
.if ${MK_OFED} != "no"
@@ -583,12 +591,15 @@
LIBAVLDIR= ${OBJTOP}/cddl/lib/libavl
LIBCTFDIR= ${OBJTOP}/cddl/lib/libctf
LIBDTRACEDIR= ${OBJTOP}/cddl/lib/libdtrace
+LIBICPDIR= ${OBJTOP}/cddl/lib/libicp
LIBNVPAIRDIR= ${OBJTOP}/cddl/lib/libnvpair
LIBUMEMDIR= ${OBJTOP}/cddl/lib/libumem
LIBUUTILDIR= ${OBJTOP}/cddl/lib/libuutil
LIBZFSDIR= ${OBJTOP}/cddl/lib/libzfs
LIBZFS_COREDIR= ${OBJTOP}/cddl/lib/libzfs_core
LIBZPOOLDIR= ${OBJTOP}/cddl/lib/libzpool
+LIBZUTILDIR= ${OBJTOP}/cddl/lib/libzutil
+LIBTPOOLDIR= ${OBJTOP}/cddl/lib/libtpool
# OFED support
LIBCXGB4DIR= ${OBJTOP}/lib/ofed/libcxgb4
@@ -655,6 +666,7 @@
LIBPANELDIR= ${OBJTOP}/lib/ncurses/panel
LIBPANELWDIR= ${OBJTOP}/lib/ncurses/panelw
LIBCRYPTODIR= ${OBJTOP}/secure/lib/libcrypto
+LIBSPLDIR= ${OBJTOP}/cddl/lib/libspl
LIBSSHDIR= ${OBJTOP}/secure/lib/libssh
LIBSSLDIR= ${OBJTOP}/secure/lib/libssl
LIBTEKENDIR= ${OBJTOP}/sys/teken/libteken
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris.c
@@ -37,6 +37,8 @@
#include <sys/module.h>
#include <sys/mutex.h>
+extern struct opensolaris_utsname utsname;
+
cpu_core_t cpu_core[MAXCPU];
kmutex_t cpu_lock;
solaris_cpu_t solaris_cpu[MAXCPU];
@@ -82,7 +84,6 @@
switch (type) {
case MOD_LOAD:
- utsname.nodename = prison0.pr_hostname;
break;
case MOD_UNLOAD:
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
@@ -1,296 +0,0 @@
-/*-
- * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/kmem.h>
-#include <sys/debug.h>
-#include <sys/mutex.h>
-#include <sys/vmmeter.h>
-
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_map.h>
-
-#ifdef KMEM_DEBUG
-#include <sys/queue.h>
-#include <sys/stack.h>
-#endif
-
-#ifdef _KERNEL
-MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
-#else
-#define malloc(size, type, flags) malloc(size)
-#define free(addr, type) free(addr)
-#endif
-
-#ifdef KMEM_DEBUG
-struct kmem_item {
- struct stack stack;
- LIST_ENTRY(kmem_item) next;
-};
-static LIST_HEAD(, kmem_item) kmem_items;
-static struct mtx kmem_items_mtx;
-MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF);
-#endif /* KMEM_DEBUG */
-
-#include <sys/vmem.h>
-
-void *
-zfs_kmem_alloc(size_t size, int kmflags)
-{
- void *p;
-#ifdef KMEM_DEBUG
- struct kmem_item *i;
-
- size += sizeof(struct kmem_item);
-#endif
- p = malloc(size, M_SOLARIS, kmflags);
-#ifndef _KERNEL
- if (kmflags & KM_SLEEP)
- assert(p != NULL);
-#endif
-#ifdef KMEM_DEBUG
- if (p != NULL) {
- i = p;
- p = (u_char *)p + sizeof(struct kmem_item);
- stack_save(&i->stack);
- mtx_lock(&kmem_items_mtx);
- LIST_INSERT_HEAD(&kmem_items, i, next);
- mtx_unlock(&kmem_items_mtx);
- }
-#endif
- return (p);
-}
-
-void
-zfs_kmem_free(void *buf, size_t size __unused)
-{
-#ifdef KMEM_DEBUG
- if (buf == NULL) {
- printf("%s: attempt to free NULL\n", __func__);
- return;
- }
- struct kmem_item *i;
-
- buf = (u_char *)buf - sizeof(struct kmem_item);
- mtx_lock(&kmem_items_mtx);
- LIST_FOREACH(i, &kmem_items, next) {
- if (i == buf)
- break;
- }
- ASSERT(i != NULL);
- LIST_REMOVE(i, next);
- mtx_unlock(&kmem_items_mtx);
-#endif
- free(buf, M_SOLARIS);
-}
-
-static uint64_t kmem_size_val;
-
-static void
-kmem_size_init(void *unused __unused)
-{
-
- kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE;
- if (kmem_size_val > vm_kmem_size)
- kmem_size_val = vm_kmem_size;
-}
-SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
-
-uint64_t
-kmem_size(void)
-{
-
- return (kmem_size_val);
-}
-
-static int
-kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
-{
- struct kmem_cache *cache = private;
-
- return (cache->kc_constructor(mem, cache->kc_private, flags));
-}
-
-static void
-kmem_std_destructor(void *mem, int size __unused, void *private)
-{
- struct kmem_cache *cache = private;
-
- cache->kc_destructor(mem, cache->kc_private);
-}
-
-kmem_cache_t *
-kmem_cache_create(char *name, size_t bufsize, size_t align,
- int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
- void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags)
-{
- kmem_cache_t *cache;
-
- ASSERT(vmp == NULL);
-
- cache = kmem_alloc(sizeof(*cache), KM_SLEEP);
- strlcpy(cache->kc_name, name, sizeof(cache->kc_name));
- cache->kc_constructor = constructor;
- cache->kc_destructor = destructor;
- cache->kc_private = private;
-#if defined(_KERNEL) && !defined(KMEM_DEBUG)
- cache->kc_zone = uma_zcreate(cache->kc_name, bufsize,
- constructor != NULL ? kmem_std_constructor : NULL,
- destructor != NULL ? kmem_std_destructor : NULL,
- NULL, NULL, align > 0 ? align - 1 : 0, cflags);
-#else
- cache->kc_size = bufsize;
-#endif
-
- return (cache);
-}
-
-void
-kmem_cache_destroy(kmem_cache_t *cache)
-{
-#if defined(_KERNEL) && !defined(KMEM_DEBUG)
- uma_zdestroy(cache->kc_zone);
-#endif
- kmem_free(cache, sizeof(*cache));
-}
-
-void *
-kmem_cache_alloc(kmem_cache_t *cache, int flags)
-{
-#if defined(_KERNEL) && !defined(KMEM_DEBUG)
- return (uma_zalloc_arg(cache->kc_zone, cache, flags));
-#else
- void *p;
-
- p = kmem_alloc(cache->kc_size, flags);
- if (p != NULL && cache->kc_constructor != NULL)
- kmem_std_constructor(p, cache->kc_size, cache, flags);
- return (p);
-#endif
-}
-
-void
-kmem_cache_free(kmem_cache_t *cache, void *buf)
-{
-#if defined(_KERNEL) && !defined(KMEM_DEBUG)
- uma_zfree_arg(cache->kc_zone, buf, cache);
-#else
- if (cache->kc_destructor != NULL)
- kmem_std_destructor(buf, cache->kc_size, cache);
- kmem_free(buf, cache->kc_size);
-#endif
-}
-
-/*
- * Allow our caller to determine if there are running reaps.
- *
- * This call is very conservative and may return B_TRUE even when
- * reaping activity isn't active. If it returns B_FALSE, then reaping
- * activity is definitely inactive.
- */
-boolean_t
-kmem_cache_reap_active(void)
-{
-
- return (B_FALSE);
-}
-
-/*
- * Reap (almost) everything soon.
- *
- * Note: this does not wait for the reap-tasks to complete. Caller
- * should use kmem_cache_reap_active() (above) and/or moderation to
- * avoid scheduling too many reap-tasks.
- */
-#ifdef _KERNEL
-void
-kmem_cache_reap_soon(kmem_cache_t *cache)
-{
-#ifndef KMEM_DEBUG
- uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
-#endif
-}
-
-void
-kmem_reap(void)
-{
- uma_reclaim(UMA_RECLAIM_TRIM);
-}
-#else
-void
-kmem_cache_reap_soon(kmem_cache_t *cache __unused)
-{
-}
-
-void
-kmem_reap(void)
-{
-}
-#endif
-
-int
-kmem_debugging(void)
-{
- return (0);
-}
-
-void *
-calloc(size_t n, size_t s)
-{
- return (kmem_zalloc(n * s, KM_NOSLEEP));
-}
-
-#ifdef KMEM_DEBUG
-void kmem_show(void *);
-void
-kmem_show(void *dummy __unused)
-{
- struct kmem_item *i;
-
- mtx_lock(&kmem_items_mtx);
- if (LIST_EMPTY(&kmem_items))
- printf("KMEM_DEBUG: No leaked elements.\n");
- else {
- printf("KMEM_DEBUG: Leaked elements:\n\n");
- LIST_FOREACH(i, &kmem_items, next) {
- printf("address=%p\n", i);
- stack_print_ddb(&i->stack);
- printf("\n");
- }
- }
- mtx_unlock(&kmem_items_mtx);
-}
-
-SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL);
-#endif /* KMEM_DEBUG */
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c
@@ -1,210 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/types.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/kthread.h>
-#include <sys/namei.h>
-#include <sys/proc.h>
-#include <sys/filedesc.h>
-#include <sys/fcntl.h>
-#include <sys/linker.h>
-#include <sys/kobj.h>
-
-void
-kobj_free(void *address, size_t size)
-{
-
- kmem_free(address, size);
-}
-
-void *
-kobj_alloc(size_t size, int flag)
-{
-
- return (kmem_alloc(size, (flag & KM_NOWAIT) ? KM_NOSLEEP : KM_SLEEP));
-}
-
-void *
-kobj_zalloc(size_t size, int flag)
-{
- void *p;
-
- if ((p = kobj_alloc(size, flag)) != NULL)
- bzero(p, size);
- return (p);
-}
-
-static void *
-kobj_open_file_vnode(const char *file)
-{
- struct thread *td = curthread;
- struct nameidata nd;
- int error, flags;
-
- pwd_ensure_dirs();
-
- flags = FREAD | O_NOFOLLOW;
- NDINIT(&nd, LOOKUP, 0, UIO_SYSSPACE, file, td);
- error = vn_open_cred(&nd, &flags, 0, 0, curthread->td_ucred, NULL);
- if (error != 0)
- return (NULL);
- NDFREE(&nd, NDF_ONLY_PNBUF);
- /* We just unlock so we hold a reference. */
- VOP_UNLOCK(nd.ni_vp);
- return (nd.ni_vp);
-}
-
-static void *
-kobj_open_file_loader(const char *file)
-{
-
- return (preload_search_by_name(file));
-}
-
-struct _buf *
-kobj_open_file(const char *file)
-{
- struct _buf *out;
-
- out = kmem_alloc(sizeof(*out), KM_SLEEP);
- out->mounted = root_mounted();
- /*
- * If root is already mounted we read file using file system,
- * if not, we use loader.
- */
- if (out->mounted)
- out->ptr = kobj_open_file_vnode(file);
- else
- out->ptr = kobj_open_file_loader(file);
- if (out->ptr == NULL) {
- kmem_free(out, sizeof(*out));
- return ((struct _buf *)-1);
- }
- return (out);
-}
-
-static int
-kobj_get_filesize_vnode(struct _buf *file, uint64_t *size)
-{
- struct vnode *vp = file->ptr;
- struct vattr va;
- int error;
-
- vn_lock(vp, LK_SHARED | LK_RETRY);
- error = VOP_GETATTR(vp, &va, curthread->td_ucred);
- VOP_UNLOCK(vp);
- if (error == 0)
- *size = (uint64_t)va.va_size;
- return (error);
-}
-
-static int
-kobj_get_filesize_loader(struct _buf *file, uint64_t *size)
-{
- void *ptr;
-
- ptr = preload_search_info(file->ptr, MODINFO_SIZE);
- if (ptr == NULL)
- return (ENOENT);
- *size = (uint64_t)*(size_t *)ptr;
- return (0);
-}
-
-int
-kobj_get_filesize(struct _buf *file, uint64_t *size)
-{
-
- if (file->mounted)
- return (kobj_get_filesize_vnode(file, size));
- else
- return (kobj_get_filesize_loader(file, size));
-}
-
-int
-kobj_read_file_vnode(struct _buf *file, char *buf, unsigned size, unsigned off)
-{
- struct vnode *vp = file->ptr;
- struct thread *td = curthread;
- struct uio auio;
- struct iovec aiov;
- int error;
-
- bzero(&aiov, sizeof(aiov));
- bzero(&auio, sizeof(auio));
-
- aiov.iov_base = buf;
- aiov.iov_len = size;
-
- auio.uio_iov = &aiov;
- auio.uio_offset = (off_t)off;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_rw = UIO_READ;
- auio.uio_iovcnt = 1;
- auio.uio_resid = size;
- auio.uio_td = td;
-
- vn_lock(vp, LK_SHARED | LK_RETRY);
- error = VOP_READ(vp, &auio, IO_UNIT | IO_SYNC, td->td_ucred);
- VOP_UNLOCK(vp);
- return (error != 0 ? -1 : size - auio.uio_resid);
-}
-
-int
-kobj_read_file_loader(struct _buf *file, char *buf, unsigned size, unsigned off)
-{
- char *ptr;
-
- ptr = preload_fetch_addr(file->ptr);
- if (ptr == NULL)
- return (ENOENT);
- bcopy(ptr + off, buf, size);
- return (0);
-}
-
-int
-kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
-{
-
- if (file->mounted)
- return (kobj_read_file_vnode(file, buf, size, off));
- else
- return (kobj_read_file_loader(file, buf, size, off));
-}
-
-void
-kobj_close_file(struct _buf *file)
-{
-
- if (file->mounted)
- vn_close(file->ptr, FREAD, curthread->td_ucred, curthread);
- kmem_free(file, sizeof(*file));
-}
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c
@@ -1,148 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/sysctl.h>
-#include <sys/kstat.h>
-
-static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics");
-
-SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "Kernel statistics");
-
-kstat_t *
-kstat_create(char *module, int instance, char *name, char *class, uchar_t type,
- ulong_t ndata, uchar_t flags)
-{
- struct sysctl_oid *root;
- kstat_t *ksp;
-
- KASSERT(instance == 0, ("instance=%d", instance));
- KASSERT(type == KSTAT_TYPE_NAMED, ("type=%hhu", type));
- KASSERT(flags == KSTAT_FLAG_VIRTUAL, ("flags=%02hhx", flags));
-
- /*
- * Allocate the main structure. We don't need to copy module/class/name
- * stuff in here, because it is only used for sysctl node creation
- * done in this function.
- */
- ksp = malloc(sizeof(*ksp), M_KSTAT, M_WAITOK);
- ksp->ks_ndata = ndata;
-
- /*
- * Create sysctl tree for those statistics:
- *
- * kstat.<module>.<class>.<name>.
- */
- sysctl_ctx_init(&ksp->ks_sysctl_ctx);
- root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
- SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module,
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "");
- if (root == NULL) {
- printf("%s: Cannot create kstat.%s tree!\n", __func__, module);
- sysctl_ctx_free(&ksp->ks_sysctl_ctx);
- free(ksp, M_KSTAT);
- return (NULL);
- }
- root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
- OID_AUTO, class, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "");
- if (root == NULL) {
- printf("%s: Cannot create kstat.%s.%s tree!\n", __func__,
- module, class);
- sysctl_ctx_free(&ksp->ks_sysctl_ctx);
- free(ksp, M_KSTAT);
- return (NULL);
- }
- root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
- OID_AUTO, name, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "");
- if (root == NULL) {
- printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__,
- module, class, name);
- sysctl_ctx_free(&ksp->ks_sysctl_ctx);
- free(ksp, M_KSTAT);
- return (NULL);
- }
- ksp->ks_sysctl_root = root;
-
- return (ksp);
-}
-
-static int
-kstat_sysctl(SYSCTL_HANDLER_ARGS)
-{
- kstat_named_t *ksent = arg1;
- uint64_t val;
-
- val = ksent->value.ui64;
- return sysctl_handle_64(oidp, &val, 0, req);
-}
-
-void
-kstat_install(kstat_t *ksp)
-{
- kstat_named_t *ksent;
- u_int i;
-
- ksent = ksp->ks_data;
- for (i = 0; i < ksp->ks_ndata; i++, ksent++) {
- KASSERT(ksent->data_type == KSTAT_DATA_UINT64,
- ("data_type=%d", ksent->data_type));
- SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
- SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksent->name,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RD, ksent,
- sizeof(*ksent), kstat_sysctl, "QU", ksent->desc);
- }
-}
-
-void
-kstat_delete(kstat_t *ksp)
-{
-
- sysctl_ctx_free(&ksp->ks_sysctl_ctx);
- free(ksp, M_KSTAT);
-}
-
-void
-kstat_set_string(char *dst, const char *src)
-{
-
- bzero(dst, KSTAT_STRLEN);
- (void) strncpy(dst, src, KSTAT_STRLEN - 1);
-}
-
-void
-kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type)
-{
-
- kstat_set_string(knp->name, name);
- knp->data_type = data_type;
-}
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
@@ -1,64 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/pathname.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-
-int
-lookupname(char *dirname, enum uio_seg seg, enum symfollow follow,
- vnode_t **dirvpp, vnode_t **compvpp)
-{
-
- return (lookupnameat(dirname, seg, follow, dirvpp, compvpp, NULL));
-}
-
-int
-lookupnameat(char *dirname, enum uio_seg seg, enum symfollow follow,
- vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp)
-{
- struct nameidata nd;
- int error, ltype;
-
- ASSERT(dirvpp == NULL);
-
- vref(startvp);
- ltype = VOP_ISLOCKED(startvp);
- VOP_UNLOCK(startvp);
- NDINIT_ATVP(&nd, LOOKUP, LOCKLEAF | follow, seg, dirname,
- startvp, curthread);
- error = namei(&nd);
- *compvpp = nd.ni_vp;
- NDFREE(&nd, NDF_ONLY_PNBUF);
- vn_lock(startvp, ltype | LK_RETRY);
- return (error);
-}
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
@@ -1,54 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/jail.h>
-#include <sys/kernel.h>
-#include <sys/libkern.h>
-#include <sys/limits.h>
-#include <sys/misc.h>
-#include <sys/sysctl.h>
-
-char hw_serial[11] = "0";
-
-struct opensolaris_utsname utsname = {
- .machine = MACHINE
-};
-
-static void
-opensolaris_utsname_init(void *arg)
-{
-
- utsname.sysname = ostype;
- utsname.nodename = prison0.pr_hostname;
- utsname.release = osrelease;
- snprintf(utsname.version, sizeof(utsname.version), "%d", osreldate);
-}
-SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
- opensolaris_utsname_init, NULL);
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
@@ -1,429 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/priv.h>
-#include <sys/vnode.h>
-#include <sys/mntent.h>
-#include <sys/mount.h>
-#include <sys/stat.h>
-#include <sys/jail.h>
-#include <sys/policy.h>
-#include <sys/zfs_vfsops.h>
-
-int
-secpolicy_nfs(cred_t *cr)
-{
-
- return (priv_check_cred(cr, PRIV_NFS_DAEMON));
-}
-
-int
-secpolicy_zfs(cred_t *cr)
-{
-
- return (priv_check_cred(cr, PRIV_VFS_MOUNT));
-}
-
-int
-secpolicy_sys_config(cred_t *cr, int checkonly __unused)
-{
-
- return (priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG));
-}
-
-int
-secpolicy_zinject(cred_t *cr)
-{
-
- return (priv_check_cred(cr, PRIV_ZFS_INJECT));
-}
-
-int
-secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused)
-{
-
- return (priv_check_cred(cr, PRIV_VFS_UNMOUNT));
-}
-
-int
-secpolicy_fs_owner(struct mount *mp, cred_t *cr)
-{
-
- if (zfs_super_owner) {
- if (cr->cr_uid == mp->mnt_cred->cr_uid &&
- cr->cr_prison == mp->mnt_cred->cr_prison) {
- return (0);
- }
- }
- return (EPERM);
-}
-
-/*
- * This check is done in kern_link(), so we could just return 0 here.
- */
-extern int hardlink_check_uid;
-int
-secpolicy_basic_link(vnode_t *vp, cred_t *cr)
-{
-
- if (!hardlink_check_uid)
- return (0);
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
- return (priv_check_cred(cr, PRIV_VFS_LINK));
-}
-
-int
-secpolicy_vnode_stky_modify(cred_t *cr)
-{
-
- return (EPERM);
-}
-
-int
-secpolicy_vnode_remove(vnode_t *vp, cred_t *cr)
-{
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
- return (priv_check_cred(cr, PRIV_VFS_ADMIN));
-}
-
-int
-secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode)
-{
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
-
- if ((accmode & VREAD) && priv_check_cred(cr, PRIV_VFS_READ) != 0)
- return (EACCES);
- if ((accmode & VWRITE) &&
- priv_check_cred(cr, PRIV_VFS_WRITE) != 0) {
- return (EACCES);
- }
- if (accmode & VEXEC) {
- if (vp->v_type == VDIR) {
- if (priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0)
- return (EACCES);
- } else {
- if (priv_check_cred(cr, PRIV_VFS_EXEC) != 0)
- return (EACCES);
- }
- }
- return (0);
-}
-
-/*
- * Like secpolicy_vnode_access() but we get the actual wanted mode and the
- * current mode of the file, not the missing bits.
- */
-int
-secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
- accmode_t curmode, accmode_t wantmode)
-{
- accmode_t mode;
-
- mode = ~curmode & wantmode;
-
- if (mode == 0)
- return (0);
-
- return (secpolicy_vnode_access(cr, vp, owner, mode));
-}
-
-int
-secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
-{
- static int privs[] = {
- PRIV_VFS_ADMIN,
- PRIV_VFS_READ,
- PRIV_VFS_WRITE,
- PRIV_VFS_EXEC,
- PRIV_VFS_LOOKUP
- };
- int i;
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
-
- /* Same as secpolicy_vnode_setdac */
- if (owner == cr->cr_uid)
- return (0);
-
- for (i = 0; i < sizeof (privs)/sizeof (int); i++) {
- boolean_t allzone = B_FALSE;
- int priv;
-
- switch (priv = privs[i]) {
- case PRIV_VFS_EXEC:
- if (vp->v_type == VDIR)
- continue;
- break;
- case PRIV_VFS_LOOKUP:
- if (vp->v_type != VDIR)
- continue;
- break;
- }
- if (priv_check_cred(cr, priv) == 0)
- return (0);
- }
- return (EPERM);
-}
-
-int
-secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner)
-{
-
- if (owner == cr->cr_uid)
- return (0);
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
- return (priv_check_cred(cr, PRIV_VFS_ADMIN));
-}
-
-int
-secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
- const struct vattr *ovap, int flags,
- int unlocked_access(void *, int, cred_t *), void *node)
-{
- int mask = vap->va_mask;
- int error;
-
- if (mask & AT_SIZE) {
- if (vp->v_type == VDIR)
- return (EISDIR);
- error = unlocked_access(node, VWRITE, cr);
- if (error)
- return (error);
- }
- if (mask & AT_MODE) {
- /*
- * If not the owner of the file then check privilege
- * for two things: the privilege to set the mode at all
- * and, if we're setting setuid, we also need permissions
- * to add the set-uid bit, if we're not the owner.
- * In the specific case of creating a set-uid root
- * file, we need even more permissions.
- */
- error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
- if (error)
- return (error);
- error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr);
- if (error)
- return (error);
- } else {
- vap->va_mode = ovap->va_mode;
- }
- if (mask & (AT_UID | AT_GID)) {
- error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
- if (error)
- return (error);
-
- /*
- * To change the owner of a file, or change the group of a file to a
- * group of which we are not a member, the caller must have
- * privilege.
- */
- if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
- ((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
- !groupmember(vap->va_gid, cr))) {
- if (secpolicy_fs_owner(vp->v_mount, cr) != 0) {
- error = priv_check_cred(cr, PRIV_VFS_CHOWN);
- if (error)
- return (error);
- }
- }
-
- if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
- ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
- secpolicy_setid_clear(vap, vp, cr);
- }
- }
- if (mask & (AT_ATIME | AT_MTIME)) {
- /*
- * From utimes(2):
- * If times is NULL, ... The caller must be the owner of
- * the file, have permission to write the file, or be the
- * super-user.
- * If times is non-NULL, ... The caller must be the owner of
- * the file or be the super-user.
- */
- error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
- if (error && (vap->va_vaflags & VA_UTIMES_NULL))
- error = unlocked_access(node, VWRITE, cr);
- if (error)
- return (error);
- }
- return (0);
-}
-
-int
-secpolicy_vnode_create_gid(cred_t *cr)
-{
-
- return (EPERM);
-}
-
-int
-secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
-{
-
- if (groupmember(gid, cr))
- return (0);
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
- return (priv_check_cred(cr, PRIV_VFS_SETGID));
-}
-
-int
-secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
- boolean_t issuidroot __unused)
-{
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
- return (priv_check_cred(cr, PRIV_VFS_RETAINSUGID));
-}
-
-void
-secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr)
-{
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return;
-
- if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
- if (priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) {
- vap->va_mask |= AT_MODE;
- vap->va_mode &= ~(S_ISUID|S_ISGID);
- }
- }
-}
-
-int
-secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
- const struct vattr *ovap, cred_t *cr)
-{
- int error;
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
-
- /*
- * Privileged processes may set the sticky bit on non-directories,
- * as well as set the setgid bit on a file with a group that the process
- * is not a member of. Both of these are allowed in jail(8).
- */
- if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) {
- if (priv_check_cred(cr, PRIV_VFS_STICKYFILE))
- return (EFTYPE);
- }
- /*
- * Check for privilege if attempting to set the
- * group-id bit.
- */
- if ((vap->va_mode & S_ISGID) != 0) {
- error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid);
- if (error)
- return (error);
- }
- /*
- * Deny setting setuid if we are not the file owner.
- */
- if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) {
- error = priv_check_cred(cr, PRIV_VFS_ADMIN);
- if (error)
- return (error);
- }
- return (0);
-}
-
-int
-secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
-{
-
- return (priv_check_cred(cr, PRIV_VFS_MOUNT));
-}
-
-int
-secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner)
-{
-
- if (owner == cr->cr_uid)
- return (0);
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
-
- /* XXX: vfs_suser()? */
- return (priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER));
-}
-
-int
-secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner)
-{
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
- return (priv_check_cred(cr, PRIV_VFS_CHOWN));
-}
-
-void
-secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
-{
-
- if (priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) {
- MNT_ILOCK(vfsp);
- vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER;
- vfs_clearmntopt(vfsp, MNTOPT_SETUID);
- vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
- MNT_IUNLOCK(vfsp);
- }
-}
-
-/*
- * Check privileges for setting xvattr attributes
- */
-int
-secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
- vtype_t vtype)
-{
-
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
- return (0);
- return (priv_check_cred(cr, PRIV_VFS_SYSFLAGS));
-}
-
-int
-secpolicy_smb(cred_t *cr)
-{
-
- return (priv_check_cred(cr, PRIV_NETSMB));
-}
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c
@@ -1,194 +0,0 @@
-/*-
- * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/jail.h>
-#include <sys/kernel.h>
-#include <sys/libkern.h>
-#include <sys/limits.h>
-#include <sys/misc.h>
-#include <sys/sunddi.h>
-#include <sys/sysctl.h>
-
-int
-ddi_strtol(const char *str, char **nptr, int base, long *result)
-{
-
- *result = strtol(str, nptr, base);
- return (0);
-}
-
-int
-ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
-{
-
- if (str == hw_serial) {
- *result = prison0.pr_hostid;
- return (0);
- }
-
- *result = strtoul(str, nptr, base);
- return (0);
-}
-
-int
-ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result)
-{
-
- *result = (unsigned long long)strtouq(str, nptr, base);
- return (0);
-}
-
-int
-ddi_strtoll(const char *str, char **nptr, int base, long long *result)
-{
-
- *result = (long long)strtoq(str, nptr, base);
- return (0);
-}
-
-struct ddi_soft_state_item {
- int ssi_item;
- void *ssi_data;
- LIST_ENTRY(ddi_soft_state_item) ssi_next;
-};
-
-struct ddi_soft_state {
- size_t ss_size;
- kmutex_t ss_lock;
- LIST_HEAD(, ddi_soft_state_item) ss_list;
-};
-
-static void *
-ddi_get_soft_state_locked(struct ddi_soft_state *ss, int item)
-{
- struct ddi_soft_state_item *itemp;
-
- ASSERT(MUTEX_HELD(&ss->ss_lock));
-
- LIST_FOREACH(itemp, &ss->ss_list, ssi_next) {
- if (itemp->ssi_item == item)
- return (itemp->ssi_data);
- }
- return (NULL);
-}
-
-void *
-ddi_get_soft_state(void *state, int item)
-{
- struct ddi_soft_state *ss = state;
- void *data;
-
- mutex_enter(&ss->ss_lock);
- data = ddi_get_soft_state_locked(ss, item);
- mutex_exit(&ss->ss_lock);
- return (data);
-}
-
-int
-ddi_soft_state_zalloc(void *state, int item)
-{
- struct ddi_soft_state *ss = state;
- struct ddi_soft_state_item *itemp;
-
- itemp = kmem_alloc(sizeof(*itemp), KM_SLEEP);
- itemp->ssi_item = item;
- itemp->ssi_data = kmem_zalloc(ss->ss_size, KM_SLEEP);
-
- mutex_enter(&ss->ss_lock);
- if (ddi_get_soft_state_locked(ss, item) != NULL) {
- mutex_exit(&ss->ss_lock);
- kmem_free(itemp->ssi_data, ss->ss_size);
- kmem_free(itemp, sizeof(*itemp));
- return (DDI_FAILURE);
- }
- LIST_INSERT_HEAD(&ss->ss_list, itemp, ssi_next);
- mutex_exit(&ss->ss_lock);
- return (DDI_SUCCESS);
-}
-
-static void
-ddi_soft_state_free_locked(struct ddi_soft_state *ss, int item)
-{
- struct ddi_soft_state_item *itemp;
-
- ASSERT(MUTEX_HELD(&ss->ss_lock));
-
- LIST_FOREACH(itemp, &ss->ss_list, ssi_next) {
- if (itemp->ssi_item == item)
- break;
- }
- if (itemp != NULL) {
- LIST_REMOVE(itemp, ssi_next);
- kmem_free(itemp->ssi_data, ss->ss_size);
- kmem_free(itemp, sizeof(*itemp));
- }
-}
-
-void
-ddi_soft_state_free(void *state, int item)
-{
- struct ddi_soft_state *ss = state;
-
- mutex_enter(&ss->ss_lock);
- ddi_soft_state_free_locked(ss, item);
- mutex_exit(&ss->ss_lock);
-}
-
-int
-ddi_soft_state_init(void **statep, size_t size, size_t nitems __unused)
-{
- struct ddi_soft_state *ss;
-
- ss = kmem_alloc(sizeof(*ss), KM_SLEEP);
- mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
- ss->ss_size = size;
- LIST_INIT(&ss->ss_list);
- *statep = ss;
- return (0);
-}
-
-void
-ddi_soft_state_fini(void **statep)
-{
- struct ddi_soft_state *ss = *statep;
- struct ddi_soft_state_item *itemp;
- int item;
-
- mutex_enter(&ss->ss_lock);
- while ((itemp = LIST_FIRST(&ss->ss_list)) != NULL) {
- item = itemp->ssi_item;
- ddi_soft_state_free_locked(ss, item);
- }
- mutex_exit(&ss->ss_lock);
- mutex_destroy(&ss->ss_lock);
- kmem_free(ss, sizeof(*ss));
-
- *statep = NULL;
-}
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c
@@ -1,338 +0,0 @@
-/*-
- * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/kmem.h>
-#include <sys/sbuf.h>
-#include <sys/devctl.h>
-#include <sys/nvpair.h>
-#include <sys/sunddi.h>
-#include <sys/sysevent.h>
-#include <sys/fm/protocol.h>
-
-struct sysevent {
- nvlist_t *se_nvl;
- char se_class[128];
- char se_subclass[128];
- char se_pub[128];
-};
-
-sysevent_t *
-sysevent_alloc(char *class, char *subclass, char *pub, int flag)
-{
- struct sysevent *ev;
-
- ASSERT(class != NULL);
- ASSERT(subclass != NULL);
- ASSERT(pub != NULL);
- ASSERT(flag == SE_SLEEP);
-
- ev = kmem_alloc(sizeof(*ev), KM_SLEEP);
- ev->se_nvl = NULL;
- strlcpy(ev->se_class, class, sizeof(ev->se_class));
- strlcpy(ev->se_subclass, subclass, sizeof(ev->se_subclass));
- strlcpy(ev->se_pub, pub, sizeof(ev->se_pub));
-
- return ((sysevent_t *)ev);
-}
-
-void
-sysevent_free(sysevent_t *evp)
-{
- struct sysevent *ev = (struct sysevent *)evp;
-
- ASSERT(evp != NULL);
-
- if (ev->se_nvl != NULL)
- sysevent_free_attr(ev->se_nvl);
- kmem_free(ev, sizeof(*ev));
-}
-
-int
-sysevent_add_attr(sysevent_attr_list_t **ev_attr_list, char *name,
- sysevent_value_t *se_value, int flag)
-{
- nvlist_t *nvl;
- int error;
-
- ASSERT(ev_attr_list != NULL);
- ASSERT(name != NULL);
- ASSERT(se_value != NULL);
- ASSERT(flag == SE_SLEEP);
-
- if (strlen(name) >= MAX_ATTR_NAME)
- return (SE_EINVAL);
-
- nvl = *ev_attr_list;
- if (nvl == NULL) {
- if (nvlist_alloc(&nvl, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0)
- return (SE_ENOMEM);
- }
-
- error = 0;
-
- switch (se_value->value_type) {
- case SE_DATA_TYPE_UINT64:
- error = nvlist_add_uint64(nvl, name, se_value->value.sv_uint64);
- break;
- case SE_DATA_TYPE_STRING:
- if (strlen(se_value->value.sv_string) >= MAX_STRING_SZ)
- error = SE_EINVAL;
- if (error == 0) {
- error = nvlist_add_string(nvl, name,
- se_value->value.sv_string);
- }
- break;
- default:
-#if 0
- printf("%s: type %d is not implemented\n", __func__,
- se_value->value_type);
-#endif
- break;
- }
-
- if (error != 0) {
- nvlist_free(nvl);
- return (error);
- }
-
- *ev_attr_list = nvl;
-
- return (0);
-}
-
-void
-sysevent_free_attr(sysevent_attr_list_t *ev_attr_list)
-{
-
- nvlist_free(ev_attr_list);
-}
-
-int
-sysevent_attach_attributes(sysevent_t *evp, sysevent_attr_list_t *ev_attr_list)
-{
- struct sysevent *ev = (struct sysevent *)evp;
-
- ASSERT(ev->se_nvl == NULL);
-
- ev->se_nvl = ev_attr_list;
-
- return (0);
-}
-
-void
-sysevent_detach_attributes(sysevent_t *evp)
-{
- struct sysevent *ev = (struct sysevent *)evp;
-
- ASSERT(ev->se_nvl != NULL);
-
- ev->se_nvl = NULL;
-}
-
-int
-log_sysevent(sysevent_t *evp, int flag, sysevent_id_t *eid)
-{
- struct sysevent *ev = (struct sysevent *)evp;
- struct sbuf *sb;
- const char *type;
- char typestr[128];
- nvpair_t *elem = NULL;
-
- ASSERT(evp != NULL);
- ASSERT(ev->se_nvl != NULL);
- ASSERT(flag == SE_SLEEP);
- ASSERT(eid != NULL);
-
- sb = sbuf_new_auto();
- if (sb == NULL)
- return (SE_ENOMEM);
- type = NULL;
-
- while ((elem = nvlist_next_nvpair(ev->se_nvl, elem)) != NULL) {
- switch (nvpair_type(elem)) {
- case DATA_TYPE_BOOLEAN:
- {
- boolean_t value;
-
- (void) nvpair_value_boolean_value(elem, &value);
- sbuf_printf(sb, " %s=%s", nvpair_name(elem),
- value ? "true" : "false");
- break;
- }
- case DATA_TYPE_UINT8:
- {
- uint8_t value;
-
- (void) nvpair_value_uint8(elem, &value);
- sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value);
- break;
- }
- case DATA_TYPE_INT32:
- {
- int32_t value;
-
- (void) nvpair_value_int32(elem, &value);
- sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
- (intmax_t)value);
- break;
- }
- case DATA_TYPE_UINT32:
- {
- uint32_t value;
-
- (void) nvpair_value_uint32(elem, &value);
- sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
- (uintmax_t)value);
- break;
- }
- case DATA_TYPE_INT64:
- {
- int64_t value;
-
- (void) nvpair_value_int64(elem, &value);
- sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
- (intmax_t)value);
- break;
- }
- case DATA_TYPE_UINT64:
- {
- uint64_t value;
-
- (void) nvpair_value_uint64(elem, &value);
- sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
- (uintmax_t)value);
- break;
- }
- case DATA_TYPE_STRING:
- {
- char *value;
-
- (void) nvpair_value_string(elem, &value);
- sbuf_printf(sb, " %s=%s", nvpair_name(elem), value);
- if (strcmp(FM_CLASS, nvpair_name(elem)) == 0)
- type = value;
- break;
- }
- case DATA_TYPE_UINT8_ARRAY:
- {
- uint8_t *value;
- uint_t ii, nelem;
-
- (void) nvpair_value_uint8_array(elem, &value, &nelem);
- sbuf_printf(sb, " %s=", nvpair_name(elem));
- for (ii = 0; ii < nelem; ii++)
- sbuf_printf(sb, "%02hhx", value[ii]);
- break;
- }
- case DATA_TYPE_UINT16_ARRAY:
- {
- uint16_t *value;
- uint_t ii, nelem;
-
- (void) nvpair_value_uint16_array(elem, &value, &nelem);
- sbuf_printf(sb, " %s=", nvpair_name(elem));
- for (ii = 0; ii < nelem; ii++)
- sbuf_printf(sb, "%04hx", value[ii]);
- break;
- }
- case DATA_TYPE_UINT32_ARRAY:
- {
- uint32_t *value;
- uint_t ii, nelem;
-
- (void) nvpair_value_uint32_array(elem, &value, &nelem);
- sbuf_printf(sb, " %s=", nvpair_name(elem));
- for (ii = 0; ii < nelem; ii++)
- sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]);
- break;
- }
- case DATA_TYPE_UINT64_ARRAY:
- {
- uint64_t *value;
- uint_t ii, nelem;
-
- (void) nvpair_value_uint64_array(elem, &value, &nelem);
- sbuf_printf(sb, " %s=", nvpair_name(elem));
- for (ii = 0; ii < nelem; ii++)
- sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]);
- break;
- }
- default:
-#if 0
- printf("%s: type %d is not implemented\n", __func__,
- nvpair_type(elem));
-#endif
- break;
- }
- }
-
- if (sbuf_finish(sb) != 0) {
- sbuf_delete(sb);
- return (SE_ENOMEM);
- }
-
- if (type == NULL)
- type = ev->se_subclass;
- if (strncmp(type, "ESC_ZFS_", 8) == 0) {
- snprintf(typestr, sizeof(typestr), "misc.fs.zfs.%s", type + 8);
- type = typestr;
- }
- devctl_notify("ZFS", "ZFS", type, sbuf_data(sb));
- sbuf_delete(sb);
-
- return (0);
-}
-
-int
-_ddi_log_sysevent(char *vendor, char *class, char *subclass,
- nvlist_t *attr_list, sysevent_id_t *eidp, int flag)
-{
- sysevent_t *ev;
- int ret;
-
- ASSERT(vendor != NULL);
- ASSERT(class != NULL);
- ASSERT(subclass != NULL);
- ASSERT(attr_list != NULL);
- ASSERT(eidp != NULL);
- ASSERT(flag == DDI_SLEEP);
-
- ev = sysevent_alloc(class, subclass, vendor, SE_SLEEP);
- ASSERT(ev != NULL);
- (void)sysevent_attach_attributes(ev, attr_list);
- ret = log_sysevent(ev, SE_SLEEP, eidp);
- sysevent_detach_attributes(ev);
- sysevent_free(ev);
-
- return (ret);
-}
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -1,252 +0,0 @@
-/*-
- * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mount.h>
-#include <sys/cred.h>
-#include <sys/vfs.h>
-#include <sys/priv.h>
-#include <sys/libkern.h>
-
-MALLOC_DECLARE(M_MOUNT);
-
-void
-vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
- int flags __unused)
-{
- struct vfsopt *opt;
- size_t namesize;
- int locked;
-
- if (!(locked = mtx_owned(MNT_MTX(vfsp))))
- MNT_ILOCK(vfsp);
-
- if (vfsp->mnt_opt == NULL) {
- void *opts;
-
- MNT_IUNLOCK(vfsp);
- opts = malloc(sizeof(*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
- MNT_ILOCK(vfsp);
- if (vfsp->mnt_opt == NULL) {
- vfsp->mnt_opt = opts;
- TAILQ_INIT(vfsp->mnt_opt);
- } else {
- free(opts, M_MOUNT);
- }
- }
-
- MNT_IUNLOCK(vfsp);
-
- opt = malloc(sizeof(*opt), M_MOUNT, M_WAITOK);
- namesize = strlen(name) + 1;
- opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
- strlcpy(opt->name, name, namesize);
- opt->pos = -1;
- opt->seen = 1;
- if (arg == NULL) {
- opt->value = NULL;
- opt->len = 0;
- } else {
- opt->len = strlen(arg) + 1;
- opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
- bcopy(arg, opt->value, opt->len);
- }
-
- MNT_ILOCK(vfsp);
- TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
- if (!locked)
- MNT_IUNLOCK(vfsp);
-}
-
-void
-vfs_clearmntopt(vfs_t *vfsp, const char *name)
-{
- int locked;
-
- if (!(locked = mtx_owned(MNT_MTX(vfsp))))
- MNT_ILOCK(vfsp);
- vfs_deleteopt(vfsp->mnt_opt, name);
- if (!locked)
- MNT_IUNLOCK(vfsp);
-}
-
-int
-vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
-{
- struct vfsoptlist *opts = vfsp->mnt_optnew;
- int error;
-
- if (opts == NULL)
- return (0);
- error = vfs_getopt(opts, opt, (void **)argp, NULL);
- return (error != 0 ? 0 : 1);
-}
-
-int
-mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
- char *fspec, int fsflags)
-{
- struct vfsconf *vfsp;
- struct mount *mp;
- vnode_t *vp, *mvp;
- struct ucred *cr;
- int error;
-
- ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot");
-
- vp = *vpp;
- *vpp = NULL;
- error = 0;
-
- /*
- * Be ultra-paranoid about making sure the type and fspath
- * variables will fit in our mp buffers, including the
- * terminating NUL.
- */
- if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
- error = ENAMETOOLONG;
- if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
- error = ENODEV;
- if (error == 0 && vp->v_type != VDIR)
- error = ENOTDIR;
- /*
- * We need vnode lock to protect v_mountedhere and vnode interlock
- * to protect v_iflag.
- */
- if (error == 0) {
- VI_LOCK(vp);
- if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
- vp->v_iflag |= VI_MOUNT;
- else
- error = EBUSY;
- VI_UNLOCK(vp);
- }
- if (error != 0) {
- vput(vp);
- return (error);
- }
- vn_seqc_write_begin(vp);
- VOP_UNLOCK(vp);
-
- /*
- * Allocate and initialize the filesystem.
- * We don't want regular user that triggered snapshot mount to be able
- * to unmount it, so pass credentials of the parent mount.
- */
- mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);
-
- mp->mnt_optnew = NULL;
- vfs_setmntopt(mp, "from", fspec, 0);
- mp->mnt_optnew = mp->mnt_opt;
- mp->mnt_opt = NULL;
-
- /*
- * Set the mount level flags.
- */
- mp->mnt_flag = fsflags & MNT_UPDATEMASK;
- /*
- * Snapshots are always read-only.
- */
- mp->mnt_flag |= MNT_RDONLY;
- /*
- * We don't want snapshots to allow access to vulnerable setuid
- * programs, so we turn off setuid when mounting snapshots.
- */
- mp->mnt_flag |= MNT_NOSUID;
- /*
- * We don't want snapshots to be visible in regular
- * mount(8) and df(1) output.
- */
- mp->mnt_flag |= MNT_IGNORE;
- /*
- * XXX: This is evil, but we can't mount a snapshot as a regular user.
- * XXX: Is is safe when snapshot is mounted from within a jail?
- */
- cr = td->td_ucred;
- td->td_ucred = kcred;
- error = VFS_MOUNT(mp);
- td->td_ucred = cr;
-
- if (error != 0) {
- /*
- * Clear VI_MOUNT and decrement the use count "atomically",
- * under the vnode lock. This is not strictly required,
- * but makes it easier to reason about the life-cycle and
- * ownership of the covered vnode.
- */
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- VI_LOCK(vp);
- vp->v_iflag &= ~VI_MOUNT;
- VI_UNLOCK(vp);
- vn_seqc_write_end(vp);
- vput(vp);
- vfs_unbusy(mp);
- vfs_freeopts(mp->mnt_optnew);
- mp->mnt_vnodecovered = NULL;
- vfs_mount_destroy(mp);
- return (error);
- }
-
- if (mp->mnt_opt != NULL)
- vfs_freeopts(mp->mnt_opt);
- mp->mnt_opt = mp->mnt_optnew;
- (void)VFS_STATFS(mp, &mp->mnt_stat);
-
- /*
- * Prevent external consumers of mount options from reading
- * mnt_optnew.
- */
- mp->mnt_optnew = NULL;
-
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-#ifdef FREEBSD_NAMECACHE
- cache_purge(vp);
-#endif
- VI_LOCK(vp);
- vp->v_iflag &= ~VI_MOUNT;
- VI_UNLOCK(vp);
-
- vp->v_mountedhere = mp;
- /* Put the new filesystem on the mount list. */
- mtx_lock(&mountlist_mtx);
- TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
- mtx_unlock(&mountlist_mtx);
- vfs_event_signal(NULL, VQ_MOUNT, 0);
- if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
- panic("mount: lost mount");
- vn_seqc_write_end(vp);
- VOP_UNLOCK(vp);
- vfs_op_exit(mp);
- vfs_unbusy(mp);
- *vpp = mvp;
- return (0);
-}
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
@@ -1,256 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/sx.h>
-#include <sys/malloc.h>
-#include <sys/queue.h>
-#include <sys/jail.h>
-#include <sys/osd.h>
-#include <sys/priv.h>
-#include <sys/zone.h>
-
-static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data");
-
-/*
- * Structure to record list of ZFS datasets exported to a zone.
- */
-typedef struct zone_dataset {
- LIST_ENTRY(zone_dataset) zd_next;
- char zd_dataset[0];
-} zone_dataset_t;
-
-LIST_HEAD(zone_dataset_head, zone_dataset);
-
-static int zone_slot;
-
-int
-zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
-{
- struct zone_dataset_head *head;
- zone_dataset_t *zd, *zd2;
- struct prison *pr;
- int dofree, error;
-
- if ((error = priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
- return (error);
-
- /* Allocate memory before we grab prison's mutex. */
- zd = malloc(sizeof(*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK);
-
- sx_slock(&allprison_lock);
- pr = prison_find(jailid); /* Locks &pr->pr_mtx. */
- sx_sunlock(&allprison_lock);
- if (pr == NULL) {
- free(zd, M_ZONES);
- return (ENOENT);
- }
-
- head = osd_jail_get(pr, zone_slot);
- if (head != NULL) {
- dofree = 0;
- LIST_FOREACH(zd2, head, zd_next) {
- if (strcmp(dataset, zd2->zd_dataset) == 0) {
- free(zd, M_ZONES);
- error = EEXIST;
- goto end;
- }
- }
- } else {
- dofree = 1;
- prison_hold_locked(pr);
- mtx_unlock(&pr->pr_mtx);
- head = malloc(sizeof(*head), M_ZONES, M_WAITOK);
- LIST_INIT(head);
- mtx_lock(&pr->pr_mtx);
- error = osd_jail_set(pr, zone_slot, head);
- KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", error));
- }
- strcpy(zd->zd_dataset, dataset);
- LIST_INSERT_HEAD(head, zd, zd_next);
-end:
- if (dofree)
- prison_free_locked(pr);
- else
- mtx_unlock(&pr->pr_mtx);
- return (error);
-}
-
-int
-zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid)
-{
- struct zone_dataset_head *head;
- zone_dataset_t *zd;
- struct prison *pr;
- int error;
-
- if ((error = priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
- return (error);
-
- sx_slock(&allprison_lock);
- pr = prison_find(jailid);
- sx_sunlock(&allprison_lock);
- if (pr == NULL)
- return (ENOENT);
- head = osd_jail_get(pr, zone_slot);
- if (head == NULL) {
- error = ENOENT;
- goto end;
- }
- LIST_FOREACH(zd, head, zd_next) {
- if (strcmp(dataset, zd->zd_dataset) == 0)
- break;
- }
- if (zd == NULL)
- error = ENOENT;
- else {
- LIST_REMOVE(zd, zd_next);
- free(zd, M_ZONES);
- if (LIST_EMPTY(head))
- osd_jail_del(pr, zone_slot);
- error = 0;
- }
-end:
- mtx_unlock(&pr->pr_mtx);
- return (error);
-}
-
-/*
- * Returns true if the named dataset is visible in the current zone.
- * The 'write' parameter is set to 1 if the dataset is also writable.
- */
-int
-zone_dataset_visible(const char *dataset, int *write)
-{
- struct zone_dataset_head *head;
- zone_dataset_t *zd;
- struct prison *pr;
- size_t len;
- int ret = 0;
-
- if (dataset[0] == '\0')
- return (0);
- if (INGLOBALZONE(curthread)) {
- if (write != NULL)
- *write = 1;
- return (1);
- }
- pr = curthread->td_ucred->cr_prison;
- mtx_lock(&pr->pr_mtx);
- head = osd_jail_get(pr, zone_slot);
- if (head == NULL)
- goto end;
-
- /*
- * Walk the list once, looking for datasets which match exactly, or
- * specify a dataset underneath an exported dataset. If found, return
- * true and note that it is writable.
- */
- LIST_FOREACH(zd, head, zd_next) {
- len = strlen(zd->zd_dataset);
- if (strlen(dataset) >= len &&
- bcmp(dataset, zd->zd_dataset, len) == 0 &&
- (dataset[len] == '\0' || dataset[len] == '/' ||
- dataset[len] == '@')) {
- if (write)
- *write = 1;
- ret = 1;
- goto end;
- }
- }
-
- /*
- * Walk the list a second time, searching for datasets which are parents
- * of exported datasets. These should be visible, but read-only.
- *
- * Note that we also have to support forms such as 'pool/dataset/', with
- * a trailing slash.
- */
- LIST_FOREACH(zd, head, zd_next) {
- len = strlen(dataset);
- if (dataset[len - 1] == '/')
- len--; /* Ignore trailing slash */
- if (len < strlen(zd->zd_dataset) &&
- bcmp(dataset, zd->zd_dataset, len) == 0 &&
- zd->zd_dataset[len] == '/') {
- if (write)
- *write = 0;
- ret = 1;
- goto end;
- }
- }
-end:
- mtx_unlock(&pr->pr_mtx);
- return (ret);
-}
-
-static void
-zone_destroy(void *arg)
-{
- struct zone_dataset_head *head;
- zone_dataset_t *zd;
-
- head = arg;
- while ((zd = LIST_FIRST(head)) != NULL) {
- LIST_REMOVE(zd, zd_next);
- free(zd, M_ZONES);
- }
- free(head, M_ZONES);
-}
-
-uint32_t
-zone_get_hostid(void *ptr)
-{
-
- KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__));
-
- return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid);
-}
-
-static void
-zone_sysinit(void *arg __unused)
-{
-
- zone_slot = osd_jail_register(zone_destroy, NULL);
-}
-
-static void
-zone_sysuninit(void *arg __unused)
-{
-
- osd_jail_deregister(zone_slot);
-}
-
-SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL);
-SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL);
Index: head/sys/cddl/compat/opensolaris/sys/acl.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/acl.h
+++ head/sys/cddl/compat/opensolaris/sys/acl.h
@@ -1,39 +0,0 @@
-/*-
- * Copyright (c) 2008, 2009 Edward Tomasz Napierała <trasz@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef OPENSOLARIS_SYS_ACL_H
-#define OPENSOLARIS_SYS_ACL_H
-
-#include_next <sys/acl.h>
-
-struct acl;
-
-void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp);
-int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries);
-
-#endif /* OPENSOLARIS_SYS_ACL_H */
Index: head/sys/cddl/compat/opensolaris/sys/file.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/file.h
+++ head/sys/cddl/compat/opensolaris/sys/file.h
@@ -1,64 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_FILE_H_
-#define _OPENSOLARIS_SYS_FILE_H_
-
-#include_next <sys/file.h>
-
-#define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */
-
-#ifdef _KERNEL
-typedef struct file file_t;
-
-#include <sys/capsicum.h>
-
-static __inline file_t *
-getf(int fd, cap_rights_t *rightsp)
-{
- struct file *fp;
-
- if (fget(curthread, fd, rightsp, &fp) == 0)
- return (fp);
- return (NULL);
-}
-
-static __inline void
-releasef(int fd)
-{
- struct file *fp;
-
- /* No CAP_ rights required, as we're only releasing. */
- if (fget(curthread, fd, &cap_no_rights, &fp) == 0) {
- fdrop(fp, curthread);
- fdrop(fp, curthread);
- }
-}
-#endif /* _KERNEL */
-
-#endif /* !_OPENSOLARIS_SYS_FILE_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/kobj.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/kobj.h
+++ head/sys/cddl/compat/opensolaris/sys/kobj.h
@@ -1,60 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_KOBJ_H_
-#define _OPENSOLARIS_SYS_KOBJ_H_
-
-#include <sys/types.h>
-#include <sys/kmem.h>
-#include_next <sys/kobj.h>
-#ifdef AT_UID
-#undef AT_UID
-#endif
-#ifdef AT_GID
-#undef AT_GID
-#endif
-#include <sys/vnode.h>
-
-#define KM_NOWAIT 0x01
-#define KM_TMP 0x02
-
-void kobj_free(void *address, size_t size);
-void *kobj_alloc(size_t size, int flag);
-void *kobj_zalloc(size_t size, int flag);
-
-struct _buf {
- void *ptr;
- int mounted;
-};
-
-struct _buf *kobj_open_file(const char *path);
-int kobj_get_filesize(struct _buf *file, uint64_t *size);
-int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off);
-void kobj_close_file(struct _buf *file);
-
-#endif /* _OPENSOLARIS_SYS_KOBJ_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/lock.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/lock.h
+++ head/sys/cddl/compat/opensolaris/sys/lock.h
@@ -1,45 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_LOCK_H_
-#define _OPENSOLARIS_SYS_LOCK_H_
-
-#include_next <sys/lock.h>
-
-#ifdef _KERNEL
-
-#define LO_ALLMASK (LO_INITIALIZED | LO_WITNESS | LO_QUIET | \
- LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE | \
- LO_DUPOK | LO_CLASSMASK | LO_NOPROFILE)
-#define LO_EXPECTED (LO_INITIALIZED | LO_WITNESS | LO_RECURSABLE | \
- LO_SLEEPABLE | LO_UPGRADABLE | LO_DUPOK | \
- /* sx lock class */(2 << LO_CLASSSHIFT))
-
-#endif /* defined(_KERNEL) */
-
-#endif /* _OPENSOLARIS_SYS_LOCK_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/misc.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/misc.h
+++ head/sys/cddl/compat/opensolaris/sys/misc.h
@@ -55,7 +55,6 @@
};
extern char hw_serial[11];
-extern struct opensolaris_utsname utsname;
#endif
#endif /* _OPENSOLARIS_SYS_MISC_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/mman.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/mman.h
+++ head/sys/cddl/compat/opensolaris/sys/mman.h
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2007 John Birrell <jb@freebsd.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- *
- */
-
-#ifndef _COMPAT_OPENSOLARIS_SYS_MMAN_H_
-#define _COMPAT_OPENSOLARIS_SYS_MMAN_H_
-
-#include_next <sys/mman.h>
-
-#define mmap64(_a,_b,_c,_d,_e,_f) mmap(_a,_b,_c,_d,_e,_f)
-
-#endif
Index: head/sys/cddl/compat/opensolaris/sys/modctl.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/modctl.h
+++ head/sys/cddl/compat/opensolaris/sys/modctl.h
@@ -31,6 +31,7 @@
#define _COMPAT_OPENSOLARIS_SYS_MODCTL_H
#include <sys/param.h>
+#include <sys/queue.h>
#include <sys/linker.h>
typedef struct linker_file modctl_t;
Index: head/sys/cddl/compat/opensolaris/sys/mount.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/mount.h
+++ head/sys/cddl/compat/opensolaris/sys/mount.h
@@ -1,41 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_MOUNT_H_
-#define _OPENSOLARIS_SYS_MOUNT_H_
-
-#include <sys/param.h>
-
-#include_next <sys/mount.h>
-
-#define MS_FORCE MNT_FORCE
-#define MS_REMOUNT MNT_UPDATE
-
-typedef struct fid fid_t;
-
-#endif /* !_OPENSOLARIS_SYS_MOUNT_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/mutex.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/mutex.h
+++ head/sys/cddl/compat/opensolaris/sys/mutex.h
@@ -1,77 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_MUTEX_H_
-#define _OPENSOLARIS_SYS_MUTEX_H_
-
-#ifdef _KERNEL
-
-#include <sys/param.h>
-#include <sys/lock.h>
-#include_next <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/sx.h>
-
-typedef enum {
- MUTEX_DEFAULT = 6 /* kernel default mutex */
-} kmutex_type_t;
-
-#define MUTEX_HELD(x) (mutex_owned(x))
-#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || KERNEL_PANICKED())
-
-typedef struct sx kmutex_t;
-
-#ifndef OPENSOLARIS_WITNESS
-#define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS)
-#else
-#define MUTEX_FLAGS (SX_DUPOK | SX_NEW)
-#endif
-
-#define mutex_init(lock, desc, type, arg) do { \
- const char *_name; \
- ASSERT((type) == 0 || (type) == MUTEX_DEFAULT); \
- KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \
- LO_EXPECTED, ("lock %s already initialized", #lock)); \
- for (_name = #lock; *_name != '\0'; _name++) { \
- if (*_name >= 'a' && *_name <= 'z') \
- break; \
- } \
- if (*_name == '\0') \
- _name = #lock; \
- sx_init_flags((lock), _name, MUTEX_FLAGS); \
-} while (0)
-#define mutex_destroy(lock) sx_destroy(lock)
-#define mutex_enter(lock) sx_xlock(lock)
-#define mutex_tryenter(lock) sx_try_xlock(lock)
-#define mutex_exit(lock) sx_xunlock(lock)
-#define mutex_owned(lock) sx_xlocked(lock)
-#define mutex_owner(lock) sx_xholder(lock)
-
-#endif /* _KERNEL */
-
-#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/nvpair.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/nvpair.h
+++ head/sys/cddl/compat/opensolaris/sys/nvpair.h
@@ -1,230 +0,0 @@
-/*-
- * Copyright (c) 2014 Sandvine Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_NVPAIR_H_
-#define _OPENSOLARIS_SYS_NVPAIR_H_
-
-#ifdef _KERNEL
-
-/*
- * Some of the symbols in the Illumos nvpair library conflict with symbols
- * provided by nv(9), so we use this preprocessor hack to avoid the conflict.
- *
- * This list was generated by:
- * cat nv.h nv_impl.h nvlist_* nvpair_impl.h | \
- * sed -nE 's/^[[:alnum:]_][[:alnum:]_ ]*[[:space:]]+[*]*([[:alnum:]_]+)\(.*$/#define \1 illumos_\1/p' | \
- * sort -u
- */
-#define nvlist_add_binary illumos_nvlist_add_binary
-#define nvlist_add_bool illumos_nvlist_add_bool
-#define nvlist_add_bool_array illumos_nvlist_add_bool_array
-#define nvlist_add_descriptor illumos_nvlist_add_descriptor
-#define nvlist_add_descriptor_array illumos_nvlist_add_descriptor_array
-#define nvlist_add_null illumos_nvlist_add_null
-#define nvlist_add_number illumos_nvlist_add_number
-#define nvlist_add_number_array illumos_nvlist_add_number_array
-#define nvlist_add_nvlist illumos_nvlist_add_nvlist
-#define nvlist_add_nvlist_array illumos_nvlist_add_nvlist_array
-#define nvlist_add_nvpair illumos_nvlist_add_nvpair
-#define nvlist_add_string illumos_nvlist_add_string
-#define nvlist_add_string_array illumos_nvlist_add_string_array
-#define nvlist_add_stringf illumos_nvlist_add_stringf
-#define nvlist_add_stringv illumos_nvlist_add_stringv
-#define nvlist_clone illumos_nvlist_clone
-#define nvlist_create illumos_nvlist_create
-#define nvlist_descriptors illumos_nvlist_descriptors
-#define nvlist_destroy illumos_nvlist_destroy
-#define nvlist_dump illumos_nvlist_dump
-#define nvlist_empty illumos_nvlist_empty
-#define nvlist_error illumos_nvlist_error
-#define nvlist_exists illumos_nvlist_exists
-#define nvlist_exists_binary illumos_nvlist_exists_binary
-#define nvlist_exists_bool illumos_nvlist_exists_bool
-#define nvlist_exists_bool_array illumos_nvlist_exists_bool_array
-#define nvlist_exists_descriptor illumos_nvlist_exists_descriptor
-#define nvlist_exists_descriptor_array illumos_nvlist_exists_descriptor_array
-#define nvlist_exists_null illumos_nvlist_exists_null
-#define nvlist_exists_number illumos_nvlist_exists_number
-#define nvlist_exists_number_array illumos_nvlist_exists_number_array
-#define nvlist_exists_nvlist illumos_nvlist_exists_nvlist
-#define nvlist_exists_nvlist_array illumos_nvlist_exists_nvlist_array
-#define nvlist_exists_string illumos_nvlist_exists_string
-#define nvlist_exists_string_array illumos_nvlist_exists_string_array
-#define nvlist_exists_type illumos_nvlist_exists_type
-#define nvlist_fdump illumos_nvlist_fdump
-#define nvlist_first_nvpair illumos_nvlist_first_nvpair
-#define nvlist_flags illumos_nvlist_flags
-#define nvlist_free illumos_nvlist_free
-#define nvlist_free_binary illumos_nvlist_free_binary
-#define nvlist_free_binary_array illumos_nvlist_free_binary_array
-#define nvlist_free_bool illumos_nvlist_free_bool
-#define nvlist_free_bool_array illumos_nvlist_free_bool_array
-#define nvlist_free_descriptor illumos_nvlist_free_descriptor
-#define nvlist_free_descriptor_array illumos_nvlist_free_descriptor_array
-#define nvlist_free_null illumos_nvlist_free_null
-#define nvlist_free_number illumos_nvlist_free_number
-#define nvlist_free_number_array illumos_nvlist_free_number_array
-#define nvlist_free_nvlist illumos_nvlist_free_nvlist
-#define nvlist_free_nvlist_array illumos_nvlist_free_nvlist_array
-#define nvlist_free_nvpair illumos_nvlist_free_nvpair
-#define nvlist_free_string illumos_nvlist_free_string
-#define nvlist_free_string_array illumos_nvlist_free_string_array
-#define nvlist_free_type illumos_nvlist_free_type
-#define nvlist_get_array_next illumos_nvlist_get_array_next
-#define nvlist_get_binary illumos_nvlist_get_binary
-#define nvlist_get_bool illumos_nvlist_get_bool
-#define nvlist_get_bool_array illumos_nvlist_get_bool_array
-#define nvlist_get_descriptor illumos_nvlist_get_descriptor
-#define nvlist_get_descriptor_array illumos_nvlist_get_descriptor_array
-#define nvlist_get_number illumos_nvlist_get_number
-#define nvlist_get_number_array illumos_nvlist_get_number_array
-#define nvlist_get_nvlist illumos_nvlist_get_nvlist
-#define nvlist_get_nvpair illumos_nvlist_get_nvpair
-#define nvlist_get_nvpair_parent illumos_nvlist_get_nvpair_parent
-#define nvlist_get_pararr illumos_nvlist_get_pararr
-#define nvlist_get_parent illumos_nvlist_get_parent
-#define nvlist_get_string illumos_nvlist_get_string
-#define nvlist_in_array illumos_nvlist_in_array
-#define nvlist_move_binary illumos_nvlist_move_binary
-#define nvlist_move_bool_array illumos_nvlist_move_bool_array
-#define nvlist_move_descriptor illumos_nvlist_move_descriptor
-#define nvlist_move_descriptor_array illumos_nvlist_move_descriptor_array
-#define nvlist_move_number_array illumos_nvlist_move_number_array
-#define nvlist_move_nvlist illumos_nvlist_move_nvlist
-#define nvlist_move_nvlist_array illumos_nvlist_move_nvlist_array
-#define nvlist_move_nvpair illumos_nvlist_move_nvpair
-#define nvlist_move_string illumos_nvlist_move_string
-#define nvlist_move_string_array illumos_nvlist_move_string_array
-#define nvlist_ndescriptors illumos_nvlist_ndescriptors
-#define nvlist_next illumos_nvlist_next
-#define nvlist_next_nvpair illumos_nvlist_next_nvpair
-#define nvlist_pack illumos_nvlist_pack
-#define nvlist_prev_nvpair illumos_nvlist_prev_nvpair
-#define nvlist_recv illumos_nvlist_recv
-#define nvlist_remove_nvpair illumos_nvlist_remove_nvpair
-#define nvlist_send illumos_nvlist_send
-#define nvlist_set_array_next illumos_nvlist_set_array_next
-#define nvlist_set_error illumos_nvlist_set_error
-#define nvlist_set_flags illumos_nvlist_set_flags
-#define nvlist_set_parent illumos_nvlist_set_parent
-#define nvlist_size illumos_nvlist_size
-#define nvlist_take_binary illumos_nvlist_take_binary
-#define nvlist_take_bool illumos_nvlist_take_bool
-#define nvlist_take_bool_array illumos_nvlist_take_bool_array
-#define nvlist_take_descriptor illumos_nvlist_take_descriptor
-#define nvlist_take_descriptor_array illumos_nvlist_take_descriptor_array
-#define nvlist_take_number illumos_nvlist_take_number
-#define nvlist_take_number_array illumos_nvlist_take_number_array
-#define nvlist_take_nvlist illumos_nvlist_take_nvlist
-#define nvlist_take_nvlist_array illumos_nvlist_take_nvlist_array
-#define nvlist_take_nvpair illumos_nvlist_take_nvpair
-#define nvlist_take_string illumos_nvlist_take_string
-#define nvlist_take_string_array illumos_nvlist_take_string_array
-#define nvlist_unpack illumos_nvlist_unpack
-#define nvlist_unpack_header illumos_nvlist_unpack_header
-#define nvlist_xfer illumos_nvlist_xfer
-#define nvpair_assert illumos_nvpair_assert
-#define nvpair_clone illumos_nvpair_clone
-#define nvpair_create_binary illumos_nvpair_create_binary
-#define nvpair_create_bool illumos_nvpair_create_bool
-#define nvpair_create_bool_array illumos_nvpair_create_bool_array
-#define nvpair_create_descriptor illumos_nvpair_create_descriptor
-#define nvpair_create_descriptor_array illumos_nvpair_create_descriptor_array
-#define nvpair_create_null illumos_nvpair_create_null
-#define nvpair_create_number illumos_nvpair_create_number
-#define nvpair_create_number_array illumos_nvpair_create_number_array
-#define nvpair_create_nvlist illumos_nvpair_create_nvlist
-#define nvpair_create_nvlist_array illumos_nvpair_create_nvlist_array
-#define nvpair_create_string illumos_nvpair_create_string
-#define nvpair_create_string_array illumos_nvpair_create_string_array
-#define nvpair_create_stringf illumos_nvpair_create_stringf
-#define nvpair_create_stringv illumos_nvpair_create_stringv
-#define nvpair_free illumos_nvpair_free
-#define nvpair_free_structure illumos_nvpair_free_structure
-#define nvpair_get_binary illumos_nvpair_get_binary
-#define nvpair_get_bool illumos_nvpair_get_bool
-#define nvpair_get_bool_array illumos_nvpair_get_bool_array
-#define nvpair_get_descriptor illumos_nvpair_get_descriptor
-#define nvpair_get_descriptor_array illumos_nvpair_get_descriptor_array
-#define nvpair_get_number illumos_nvpair_get_number
-#define nvpair_get_number_array illumos_nvpair_get_number_array
-#define nvpair_get_nvlist illumos_nvpair_get_nvlist
-#define nvpair_get_string illumos_nvpair_get_string
-#define nvpair_header_size illumos_nvpair_header_size
-#define nvpair_init_datasize illumos_nvpair_init_datasize
-#define nvpair_insert illumos_nvpair_insert
-#define nvpair_move_binary illumos_nvpair_move_binary
-#define nvpair_move_bool_array illumos_nvpair_move_bool_array
-#define nvpair_move_descriptor illumos_nvpair_move_descriptor
-#define nvpair_move_descriptor_array illumos_nvpair_move_descriptor_array
-#define nvpair_move_number_array illumos_nvpair_move_number_array
-#define nvpair_move_nvlist illumos_nvpair_move_nvlist
-#define nvpair_move_nvlist_array illumos_nvpair_move_nvlist_array
-#define nvpair_move_string illumos_nvpair_move_string
-#define nvpair_move_string_array illumos_nvpair_move_string_array
-#define nvpair_name illumos_nvpair_name
-#define nvpair_next illumos_nvpair_next
-#define nvpair_nvlist illumos_nvpair_nvlist
-#define nvpair_pack_binary illumos_nvpair_pack_binary
-#define nvpair_pack_bool illumos_nvpair_pack_bool
-#define nvpair_pack_bool_array illumos_nvpair_pack_bool_array
-#define nvpair_pack_descriptor illumos_nvpair_pack_descriptor
-#define nvpair_pack_descriptor_array illumos_nvpair_pack_descriptor_array
-#define nvpair_pack_header illumos_nvpair_pack_header
-#define nvpair_pack_null illumos_nvpair_pack_null
-#define nvpair_pack_number illumos_nvpair_pack_number
-#define nvpair_pack_number_array illumos_nvpair_pack_number_array
-#define nvpair_pack_nvlist_array_next illumos_nvpair_pack_nvlist_array_next
-#define nvpair_pack_nvlist_up illumos_nvpair_pack_nvlist_up
-#define nvpair_pack_string illumos_nvpair_pack_string
-#define nvpair_pack_string_array illumos_nvpair_pack_string_array
-#define nvpair_prev illumos_nvpair_prev
-#define nvpair_remove illumos_nvpair_remove
-#define nvpair_size illumos_nvpair_size
-#define nvpair_type illumos_nvpair_type
-#define nvpair_type_string illumos_nvpair_type_string
-#define nvpair_unpack illumos_nvpair_unpack
-#define nvpair_unpack_binary illumos_nvpair_unpack_binary
-#define nvpair_unpack_bool illumos_nvpair_unpack_bool
-#define nvpair_unpack_bool_array illumos_nvpair_unpack_bool_array
-#define nvpair_unpack_descriptor illumos_nvpair_unpack_descriptor
-#define nvpair_unpack_descriptor_array illumos_nvpair_unpack_descriptor_array
-#define nvpair_unpack_header illumos_nvpair_unpack_header
-#define nvpair_unpack_null illumos_nvpair_unpack_null
-#define nvpair_unpack_number illumos_nvpair_unpack_number
-#define nvpair_unpack_number_array illumos_nvpair_unpack_number_array
-#define nvpair_unpack_nvlist illumos_nvpair_unpack_nvlist
-#define nvpair_unpack_nvlist_array illumos_nvpair_unpack_nvlist_array
-#define nvpair_unpack_string illumos_nvpair_unpack_string
-#define nvpair_unpack_string_array illumos_nvpair_unpack_string_array
-
-#endif /* _KERNEL */
-
-#include_next <sys/nvpair.h>
-
-#endif
Index: head/sys/cddl/compat/opensolaris/sys/param.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/param.h
+++ head/sys/cddl/compat/opensolaris/sys/param.h
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2007 John Birrell <jb@freebsd.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- *
- */
-
-#ifndef _COMPAT_OPENSOLARIS_SYS_PARAM_H_
-#define _COMPAT_OPENSOLARIS_SYS_PARAM_H_
-
-#include_next <sys/param.h>
-
-#define PAGESIZE PAGE_SIZE
-
-#ifdef _KERNEL
-#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT)
-#endif
-
-#endif
Index: head/sys/cddl/compat/opensolaris/sys/proc.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/proc.h
+++ head/sys/cddl/compat/opensolaris/sys/proc.h
@@ -1,105 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_PROC_H_
-#define _OPENSOLARIS_SYS_PROC_H_
-
-#include <sys/param.h>
-#include <sys/kthread.h>
-#include_next <sys/proc.h>
-#include <sys/stdint.h>
-#include <sys/smp.h>
-#include <sys/sched.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/unistd.h>
-#include <sys/debug.h>
-
-#ifdef _KERNEL
-
-#define CPU curcpu
-#define minclsyspri PRIBIO
-#define maxclsyspri PVM
-#define max_ncpus (mp_maxid + 1)
-#define boot_max_ncpus (mp_maxid + 1)
-#define syscid 1
-
-#define TS_RUN 0
-
-#define p0 proc0
-
-#define t_did td_tid
-
-typedef short pri_t;
-typedef struct thread _kthread;
-typedef struct thread kthread_t;
-typedef struct thread *kthread_id_t;
-typedef struct proc proc_t;
-
-extern struct proc *system_proc;
-
-static __inline kthread_t *
-do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg,
- size_t len, proc_t *pp, int state, pri_t pri)
-{
- kthread_t *td = NULL;
- proc_t **ppp;
- int error;
-
- /*
- * Be sure there are no surprises.
- */
- ASSERT(stk == NULL);
- ASSERT(len == 0);
- ASSERT(state == TS_RUN);
- ASSERT(pp != NULL);
-
- if (pp == &p0)
- ppp = &system_proc;
- else
- ppp = &pp;
- error = kproc_kthread_add(proc, arg, ppp, &td, RFSTOPPED,
- stksize / PAGE_SIZE, "zfskern", "solthread %p", proc);
- if (error == 0) {
- thread_lock(td);
- sched_prio(td, pri);
- sched_add(td, SRQ_BORING);
- }
- return (td);
-}
-
-#define thread_create(stk, stksize, proc, arg, len, pp, state, pri) \
- do_thread_create(stk, stksize, proc, arg, len, pp, state, pri)
-#define thread_exit() kthread_exit()
-
-int uread(proc_t *, void *, size_t, uintptr_t);
-int uwrite(proc_t *, void *, size_t, uintptr_t);
-
-#endif /* _KERNEL */
-
-#endif /* _OPENSOLARIS_SYS_PROC_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/stat.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/stat.h
+++ head/sys/cddl/compat/opensolaris/sys/stat.h
@@ -1,65 +0,0 @@
-/*
- * Copyright (C) 2007 John Birrell <jb@freebsd.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- *
- */
-
-#ifndef _COMPAT_OPENSOLARIS_SYS_STAT_H_
-#define _COMPAT_OPENSOLARIS_SYS_STAT_H_
-
-#include_next <sys/stat.h>
-
-/*
- * When bootstrapping on Linux a stat64/fstat64 functions exists in both
- * glibc and musl libc. To avoid compilation errors, use those functions instead
- * of redefining them to stat/fstat.
- * Similarly, macOS provides (deprecated) stat64 functions that we can use
- * for now.
- */
-#if !defined(__linux__) && !defined(__APPLE__)
-#define stat64 stat
-
-#define MAXOFFSET_T OFF_MAX
-
-#if !defined(_KERNEL)
-#include <sys/disk.h>
-
-static __inline int
-fstat64(int fd, struct stat *sb)
-{
- int ret;
-
- ret = fstat(fd, sb);
- if (ret == 0) {
- if (S_ISCHR(sb->st_mode))
- (void)ioctl(fd, DIOCGMEDIASIZE, &sb->st_size);
- }
- return (ret);
-}
-#endif /* !defined(_KERNEL) */
-#endif /* !defined(__linux__) && !defined(__APPLE__) */
-
-#endif /* !_COMPAT_OPENSOLARIS_SYS_STAT_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/systm.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/systm.h
+++ head/sys/cddl/compat/opensolaris/sys/systm.h
@@ -1,47 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_SYSTM_H_
-#define _OPENSOLARIS_SYS_SYSTM_H_
-
-#ifdef _KERNEL
-
-#include <sys/param.h>
-#include_next <sys/systm.h>
-
-#include <sys/string.h>
-
-#define PAGESIZE PAGE_SIZE
-#define PAGEOFFSET (PAGESIZE - 1)
-#define PAGEMASK (~PAGEOFFSET)
-
-#define delay(x) pause("soldelay", (x))
-
-#endif /* _KERNEL */
-
-#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/time.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/time.h
+++ head/sys/cddl/compat/opensolaris/sys/time.h
@@ -1,95 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_TIME_H_
-#define _OPENSOLARIS_SYS_TIME_H_
-
-#include <sys/types.h>
-#include_next <sys/time.h>
-
-#define SEC 1
-#define MILLISEC 1000
-#define MICROSEC 1000000
-#define NANOSEC 1000000000
-#define TIME_MAX LLONG_MAX
-
-#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
-#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
-
-#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
-#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
-
-#define NSEC2SEC(n) ((n) / (NANOSEC / SEC))
-#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC))
-
-typedef longlong_t hrtime_t;
-
-#if defined(__i386__) || defined(__powerpc__)
-#define TIMESPEC_OVERFLOW(ts) \
- ((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX)
-#else
-#define TIMESPEC_OVERFLOW(ts) \
- ((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX)
-#endif
-
-#define SEC_TO_TICK(sec) ((sec) * hz)
-#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
-
-#ifdef _KERNEL
-static __inline hrtime_t
-gethrtime(void) {
-
- struct timespec ts;
- hrtime_t nsec;
-
- getnanouptime(&ts);
- nsec = (hrtime_t)ts.tv_sec * NANOSEC + ts.tv_nsec;
- return (nsec);
-}
-
-#define gethrestime_sec() (time_second)
-#define gethrestime(ts) getnanotime(ts)
-#define gethrtime_waitfree() gethrtime()
-
-extern int nsec_per_tick; /* nanoseconds per clock tick */
-
-#define ddi_get_lbolt64() \
- (int64_t)(((getsbinuptime() >> 16) * hz) >> 16)
-#define ddi_get_lbolt() (clock_t)ddi_get_lbolt64()
-
-#else
-
-static __inline hrtime_t gethrtime(void) {
- struct timespec ts;
- clock_gettime(CLOCK_UPTIME,&ts);
- return (((u_int64_t) ts.tv_sec) * NANOSEC + ts.tv_nsec);
-}
-
-#endif /* _KERNEL */
-
-#endif /* !_OPENSOLARIS_SYS_TIME_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/types.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/types.h
+++ head/sys/cddl/compat/opensolaris/sys/types.h
@@ -1,101 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_TYPES_H_
-#define _OPENSOLARIS_SYS_TYPES_H_
-
-/*
- * This is a bag of dirty hacks to keep things compiling.
- */
-
-#include <sys/stdint.h>
-
-#ifdef _KERNEL
-typedef int64_t clock_t;
-#define _CLOCK_T_DECLARED
-#endif
-
-#include_next <sys/types.h>
-
-#define MAXNAMELEN 256
-
-typedef struct timespec timestruc_t;
-typedef struct timespec timespec_t;
-typedef u_int uint_t;
-typedef u_char uchar_t;
-typedef u_short ushort_t;
-typedef u_long ulong_t;
-typedef long long longlong_t;
-typedef unsigned long long u_longlong_t;
-#ifndef _OFF64_T_DECLARED
-#define _OFF64_T_DECLARED
-typedef off_t off64_t;
-#endif
-typedef id_t taskid_t;
-typedef id_t projid_t;
-typedef id_t poolid_t;
-typedef id_t zoneid_t;
-typedef id_t ctid_t;
-typedef mode_t o_mode_t;
-typedef uint64_t pgcnt_t;
-typedef u_int minor_t;
-
-#ifdef _KERNEL
-
-#define B_FALSE 0
-#define B_TRUE 1
-
-typedef short index_t;
-typedef off_t offset_t;
-#ifndef _PTRDIFF_T_DECLARED
-typedef __ptrdiff_t ptrdiff_t; /* pointer difference */
-#define _PTRDIFF_T_DECLARED
-#endif
-typedef int64_t rlim64_t;
-typedef int major_t;
-
-#else
-#ifdef NEED_SOLARIS_BOOLEAN
-#if defined(__XOPEN_OR_POSIX)
-typedef enum { _B_FALSE, _B_TRUE } boolean_t;
-#else
-typedef enum { B_FALSE, B_TRUE } boolean_t;
-#endif /* defined(__XOPEN_OR_POSIX) */
-#endif
-
-typedef longlong_t offset_t;
-typedef u_longlong_t u_offset_t;
-typedef uint64_t upad64_t;
-typedef short pri_t;
-typedef int32_t daddr32_t;
-typedef int32_t time32_t;
-typedef u_longlong_t diskaddr_t;
-
-#endif /* !_KERNEL */
-
-#endif /* !_OPENSOLARIS_SYS_TYPES_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/uio.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/uio.h
+++ head/sys/cddl/compat/opensolaris/sys/uio.h
@@ -1,89 +0,0 @@
-/*-
- * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_UIO_H_
-#define _OPENSOLARIS_SYS_UIO_H_
-
-#include_next <sys/uio.h>
-#include <sys/debug.h>
-
-#ifndef _KERNEL
-#define FOF_OFFSET 1 /* Use the offset in uio argument */
-
-struct uio {
- struct iovec *uio_iov;
- int uio_iovcnt;
- off_t uio_offset;
- int uio_resid;
- enum uio_seg uio_segflg;
- enum uio_rw uio_rw;
- void *uio_td;
-};
-#endif
-
-#define uio_loffset uio_offset
-
-typedef struct uio uio_t;
-typedef struct iovec iovec_t;
-
-typedef enum xuio_type {
- UIOTYPE_ASYNCIO,
- UIOTYPE_ZEROCOPY
-} xuio_type_t;
-
-typedef struct xuio {
- uio_t xu_uio;
-
- /* Extended uio fields */
- enum xuio_type xu_type; /* What kind of uio structure? */
- union {
- struct {
- int xu_zc_rw;
- void *xu_zc_priv;
- } xu_zc;
- } xu_ext;
-} xuio_t;
-
-#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv
-#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw
-
-#ifdef BUILDING_ZFS
-static __inline int
-zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio)
-{
-
- ASSERT(uio->uio_rw == dir);
- return (uiomove(cp, (int)n, uio));
-}
-#define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio))
-
-int uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes);
-void uioskip(uio_t *uiop, size_t n);
-#endif /* BUILDING_ZFS */
-
-#endif /* !_OPENSOLARIS_SYS_UIO_H_ */
Index: head/sys/cddl/compat/opensolaris/sys/vnode.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/vnode.h
+++ head/sys/cddl/compat/opensolaris/sys/vnode.h
@@ -1,287 +0,0 @@
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _OPENSOLARIS_SYS_VNODE_H_
-#define _OPENSOLARIS_SYS_VNODE_H_
-
-#ifdef _KERNEL
-
-struct vnode;
-struct vattr;
-
-typedef struct vnode vnode_t;
-typedef struct vattr vattr_t;
-typedef enum vtype vtype_t;
-
-#include <sys/namei.h>
-enum symfollow { NO_FOLLOW = NOFOLLOW };
-
-#include <sys/proc.h>
-#include_next <sys/vnode.h>
-#include <sys/mount.h>
-#include <sys/cred.h>
-#include <sys/fcntl.h>
-#include <sys/file.h>
-#include <sys/filedesc.h>
-#include <sys/syscallsubr.h>
-
-typedef struct vop_vector vnodeops_t;
-#define VOP_FID VOP_VPTOFH
-#define vop_fid vop_vptofh
-#define vop_fid_args vop_vptofh_args
-#define a_fid a_fhp
-
-#define IS_XATTRDIR(dvp) (0)
-
-#define v_count v_usecount
-
-#define V_APPEND VAPPEND
-
-#define rootvfs (rootvnode == NULL ? NULL : rootvnode->v_mount)
-
-static __inline int
-vn_is_readonly(vnode_t *vp)
-{
- return (vp->v_mount->mnt_flag & MNT_RDONLY);
-}
-#define vn_vfswlock(vp) (0)
-#define vn_vfsunlock(vp) do { } while (0)
-#define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL)
-#define vn_mountedvfs(vp) ((vp)->v_mountedhere)
-#define vn_has_cached_data(vp) \
- ((vp)->v_object != NULL && \
- (vp)->v_object->resident_page_count > 0)
-#define vn_exists(vp) do { } while (0)
-#define vn_invalid(vp) do { } while (0)
-#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0)
-#define vn_free(vp) do { } while (0)
-#define vn_matchops(vp, vops) ((vp)->v_op == &(vops))
-
-#define VN_HOLD(v) vref(v)
-#define VN_RELE(v) vrele(v)
-#define VN_URELE(v) vput(v)
-
-#define vnevent_create(vp, ct) do { } while (0)
-#define vnevent_link(vp, ct) do { } while (0)
-#define vnevent_remove(vp, dvp, name, ct) do { } while (0)
-#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0)
-#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0)
-#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0)
-#define vnevent_rename_dest_dir(vp, ct) do { } while (0)
-
-#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp))
-#define MANDMODE(mode) (0)
-#define MANDLOCK(vp, mode) (0)
-#define chklock(vp, op, offset, size, mode, ct) (0)
-#define cleanlocks(vp, pid, foo) do { } while (0)
-#define cleanshares(vp, pid) do { } while (0)
-
-/*
- * We will use va_spare is place of Solaris' va_mask.
- * This field is initialized in zfs_setattr().
- */
-#define va_mask va_spare
-/* TODO: va_fileid is shorter than va_nodeid !!! */
-#define va_nodeid va_fileid
-/* TODO: This field needs conversion! */
-#define va_nblocks va_bytes
-#define va_blksize va_blocksize
-#define va_seq va_gen
-
-#define MAXOFFSET_T OFF_MAX
-#define EXCL 0
-
-#define ACCESSED (AT_ATIME)
-#define STATE_CHANGED (AT_CTIME)
-#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
-
-static __inline void
-vattr_init_mask(vattr_t *vap)
-{
-
- vap->va_mask = 0;
-
- if (vap->va_type != VNON)
- vap->va_mask |= AT_TYPE;
- if (vap->va_uid != (uid_t)VNOVAL)
- vap->va_mask |= AT_UID;
- if (vap->va_gid != (gid_t)VNOVAL)
- vap->va_mask |= AT_GID;
- if (vap->va_size != (u_quad_t)VNOVAL)
- vap->va_mask |= AT_SIZE;
- if (vap->va_atime.tv_sec != VNOVAL)
- vap->va_mask |= AT_ATIME;
- if (vap->va_mtime.tv_sec != VNOVAL)
- vap->va_mask |= AT_MTIME;
- if (vap->va_mode != (u_short)VNOVAL)
- vap->va_mask |= AT_MODE;
- if (vap->va_flags != VNOVAL)
- vap->va_mask |= AT_XVATTR;
-}
-
-#define FCREAT O_CREAT
-#define FTRUNC O_TRUNC
-#define FEXCL O_EXCL
-#define FDSYNC FFSYNC
-#define FRSYNC FFSYNC
-#define FSYNC FFSYNC
-#define FOFFMAX 0x00
-#define FIGNORECASE 0x00
-
-static __inline int
-vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode,
- vnode_t **vpp, enum create crwhy, mode_t umask, struct vnode *startvp,
- int fd)
-{
- struct thread *td = curthread;
- struct nameidata nd;
- int error, operation;
-
- ASSERT(seg == UIO_SYSSPACE);
- if ((filemode & FCREAT) != 0) {
- ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX));
- ASSERT(crwhy == CRCREAT);
- operation = CREATE;
- } else {
- ASSERT(filemode == (FREAD | FOFFMAX) ||
- filemode == (FREAD | FWRITE | FOFFMAX));
- ASSERT(crwhy == 0);
- operation = LOOKUP;
- }
- ASSERT(umask == 0);
-
- pwd_ensure_dirs();
-
- if (startvp != NULL)
- vref(startvp);
- NDINIT_ATVP(&nd, operation, 0, UIO_SYSSPACE, pnamep, startvp, td);
- filemode |= O_NOFOLLOW;
- error = vn_open_cred(&nd, &filemode, createmode, 0, td->td_ucred, NULL);
- NDFREE(&nd, NDF_ONLY_PNBUF);
- if (error == 0) {
- /* We just unlock so we hold a reference. */
- VOP_UNLOCK(nd.ni_vp);
- *vpp = nd.ni_vp;
- }
- return (error);
-}
-
-static __inline int
-zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode,
- vnode_t **vpp, enum create crwhy, mode_t umask)
-{
-
- return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
- umask, NULL, -1));
-}
-#define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \
- zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask))
-
-#define RLIM64_INFINITY 0
-static __inline int
-zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len,
- offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr,
- ssize_t *residp)
-{
- struct thread *td = curthread;
- int error;
- ssize_t resid;
-
- ASSERT(ioflag == 0);
- ASSERT(ulimit == RLIM64_INFINITY);
-
- if (rw == UIO_WRITE) {
- ioflag = IO_SYNC;
- } else {
- ioflag = IO_DIRECT;
- }
- error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED,
- &resid, td);
- if (residp != NULL)
- *residp = (ssize_t)resid;
- return (error);
-}
-#define vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \
- zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp))
-
-static __inline int
-zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr)
-{
- struct mount *mp;
- int error;
-
- ASSERT(flag == FSYNC);
-
- if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
- goto drop;
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- error = VOP_FSYNC(vp, MNT_WAIT, curthread);
- VOP_UNLOCK(vp);
- vn_finished_write(mp);
-drop:
- return (error);
-}
-#define VOP_FSYNC(vp, flag, cr, ct) zfs_vop_fsync((vp), (flag), (cr))
-
-static __inline int
-zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
-{
- int error;
-
- ASSERT(count == 1);
- ASSERT(offset == 0);
-
- error = vn_close(vp, flag, cr, curthread);
- return (error);
-}
-#define VOP_CLOSE(vp, oflags, count, offset, cr, ct) \
- zfs_vop_close((vp), (oflags), (count), (offset), (cr))
-
-static __inline int
-vn_rename(char *from, char *to, enum uio_seg seg)
-{
-
- ASSERT(seg == UIO_SYSSPACE);
-
- return (kern_renameat(curthread, AT_FDCWD, from, AT_FDCWD, to, seg));
-}
-
-static __inline int
-vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
-{
-
- ASSERT(seg == UIO_SYSSPACE);
- ASSERT(dirflag == RMFILE);
-
- return (kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0,
- 0));
-}
-
-#endif /* _KERNEL */
-
-#endif /* _OPENSOLARIS_SYS_VNODE_H_ */
Index: head/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
+++ head/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
@@ -1,69 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- */
-
-#ifndef _ACL_COMMON_H
-#define _ACL_COMMON_H
-
-#include <sys/types.h>
-#include <sys/acl.h>
-#include <sys/stat.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct trivial_acl {
- uint32_t allow0; /* allow mask for bits only in owner */
- uint32_t deny1; /* deny mask for bits not in owner */
- uint32_t deny2; /* deny mask for bits not in group */
- uint32_t owner; /* allow mask matching mode */
- uint32_t group; /* allow mask matching mode */
- uint32_t everyone; /* allow mask matching mode */
-} trivial_acl_t;
-
-extern int acltrivial(const char *);
-extern void adjust_ace_pair(ace_t *pair, mode_t mode);
-extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t);
-extern int ace_trivial(ace_t *acep, int aclcnt);
-extern int ace_trivial_common(void *, int,
- uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *,
- uint32_t *mask));
-#if !defined(_KERNEL)
-extern acl_t *acl_alloc(acl_type_t);
-extern void acl_free(acl_t *aclp);
-extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir,
- uid_t owner, gid_t group);
-#endif /* !_KERNEL */
-void ksort(caddr_t v, int n, int s, int (*f)());
-int cmp2acls(void *a, void *b);
-int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count);
-void acl_trivial_access_masks(mode_t mode, boolean_t isdir,
- trivial_acl_t *masks);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ACL_COMMON_H */
Index: head/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
+++ head/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
@@ -1,1765 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/avl.h>
-#include <sys/misc.h>
-#if defined(_KERNEL)
-#include <sys/kmem.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <acl/acl_common.h>
-#include <sys/debug.h>
-#else
-#include <errno.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <strings.h>
-#include <unistd.h>
-#include <assert.h>
-#include <grp.h>
-#include <pwd.h>
-#include <acl_common.h>
-#define ASSERT assert
-#endif
-
-#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
- ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
- ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
-
-
-#define ACL_SYNCHRONIZE_SET_DENY 0x0000001
-#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002
-#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004
-#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008
-
-#define ACL_WRITE_OWNER_SET_DENY 0x0000010
-#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020
-#define ACL_WRITE_OWNER_ERR_DENY 0x0000040
-#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080
-
-#define ACL_DELETE_SET_DENY 0x0000100
-#define ACL_DELETE_SET_ALLOW 0x0000200
-#define ACL_DELETE_ERR_DENY 0x0000400
-#define ACL_DELETE_ERR_ALLOW 0x0000800
-
-#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000
-#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000
-#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000
-#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000
-
-#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000
-#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000
-#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000
-#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000
-
-#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000
-#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000
-#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000
-#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000
-
-#define ACL_READ_NAMED_READER_SET_DENY 0x1000000
-#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000
-#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000
-#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000
-
-
-#define ACE_VALID_MASK_BITS (\
- ACE_READ_DATA | \
- ACE_LIST_DIRECTORY | \
- ACE_WRITE_DATA | \
- ACE_ADD_FILE | \
- ACE_APPEND_DATA | \
- ACE_ADD_SUBDIRECTORY | \
- ACE_READ_NAMED_ATTRS | \
- ACE_WRITE_NAMED_ATTRS | \
- ACE_EXECUTE | \
- ACE_DELETE_CHILD | \
- ACE_READ_ATTRIBUTES | \
- ACE_WRITE_ATTRIBUTES | \
- ACE_DELETE | \
- ACE_READ_ACL | \
- ACE_WRITE_ACL | \
- ACE_WRITE_OWNER | \
- ACE_SYNCHRONIZE)
-
-#define ACE_MASK_UNDEFINED 0x80000000
-
-#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
- ACE_DIRECTORY_INHERIT_ACE | \
- ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
- ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
- ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
-
-/*
- * ACL conversion helpers
- */
-
-typedef enum {
- ace_unused,
- ace_user_obj,
- ace_user,
- ace_group, /* includes GROUP and GROUP_OBJ */
- ace_other_obj
-} ace_to_aent_state_t;
-
-typedef struct acevals {
- uid_t key;
- avl_node_t avl;
- uint32_t mask;
- uint32_t allowed;
- uint32_t denied;
- int aent_type;
-} acevals_t;
-
-typedef struct ace_list {
- acevals_t user_obj;
- avl_tree_t user;
- int numusers;
- acevals_t group_obj;
- avl_tree_t group;
- int numgroups;
- acevals_t other_obj;
- uint32_t acl_mask;
- int hasmask;
- int dfacl_flag;
- ace_to_aent_state_t state;
- int seen; /* bitmask of all aclent_t a_type values seen */
-} ace_list_t;
-
-/*
- * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
- * v = Ptr to array/vector of objs
- * n = # objs in the array
- * s = size of each obj (must be multiples of a word size)
- * f = ptr to function to compare two objs
- * returns (-1 = less than, 0 = equal, 1 = greater than
- */
-void
-ksort(caddr_t v, int n, int s, int (*f)())
-{
- int g, i, j, ii;
- unsigned int *p1, *p2;
- unsigned int tmp;
-
- /* No work to do */
- if (v == NULL || n <= 1)
- return;
-
- /* Sanity check on arguments */
- ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
- ASSERT(s > 0);
- for (g = n / 2; g > 0; g /= 2) {
- for (i = g; i < n; i++) {
- for (j = i - g; j >= 0 &&
- (*f)(v + j * s, v + (j + g) * s) == 1;
- j -= g) {
- p1 = (void *)(v + j * s);
- p2 = (void *)(v + (j + g) * s);
- for (ii = 0; ii < s / 4; ii++) {
- tmp = *p1;
- *p1++ = *p2;
- *p2++ = tmp;
- }
- }
- }
- }
-}
-
-/*
- * Compare two acls, all fields. Returns:
- * -1 (less than)
- * 0 (equal)
- * +1 (greater than)
- */
-int
-cmp2acls(void *a, void *b)
-{
- aclent_t *x = (aclent_t *)a;
- aclent_t *y = (aclent_t *)b;
-
- /* Compare types */
- if (x->a_type < y->a_type)
- return (-1);
- if (x->a_type > y->a_type)
- return (1);
- /* Equal types; compare id's */
- if (x->a_id < y->a_id)
- return (-1);
- if (x->a_id > y->a_id)
- return (1);
- /* Equal ids; compare perms */
- if (x->a_perm < y->a_perm)
- return (-1);
- if (x->a_perm > y->a_perm)
- return (1);
- /* Totally equal */
- return (0);
-}
-
-/*ARGSUSED*/
-static void *
-cacl_realloc(void *ptr, size_t size, size_t new_size)
-{
-#if defined(_KERNEL)
- void *tmp;
-
- tmp = kmem_alloc(new_size, KM_SLEEP);
- (void) memcpy(tmp, ptr, (size < new_size) ? size : new_size);
- kmem_free(ptr, size);
- return (tmp);
-#else
- return (realloc(ptr, new_size));
-#endif
-}
-
-static int
-cacl_malloc(void **ptr, size_t size)
-{
-#if defined(_KERNEL)
- *ptr = kmem_zalloc(size, KM_SLEEP);
- return (0);
-#else
- *ptr = calloc(1, size);
- if (*ptr == NULL)
- return (errno);
-
- return (0);
-#endif
-}
-
-/*ARGSUSED*/
-static void
-cacl_free(void *ptr, size_t size)
-{
-#if defined(_KERNEL)
- kmem_free(ptr, size);
-#else
- free(ptr);
-#endif
-}
-
-#if !defined(_KERNEL)
-acl_t *
-acl_alloc(enum acl_type type)
-{
- acl_t *aclp;
-
- if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0)
- return (NULL);
-
- aclp->acl_aclp = NULL;
- aclp->acl_cnt = 0;
-
- switch (type) {
- case ACE_T:
- aclp->acl_type = ACE_T;
- aclp->acl_entry_size = sizeof (ace_t);
- break;
- case ACLENT_T:
- aclp->acl_type = ACLENT_T;
- aclp->acl_entry_size = sizeof (aclent_t);
- break;
- default:
- acl_free(aclp);
- aclp = NULL;
- }
- return (aclp);
-}
-
-/*
- * Free acl_t structure
- */
-void
-acl_free(acl_t *aclp)
-{
- int acl_size;
-
- if (aclp == NULL)
- return;
-
- if (aclp->acl_aclp) {
- acl_size = aclp->acl_cnt * aclp->acl_entry_size;
- cacl_free(aclp->acl_aclp, acl_size);
- }
-
- cacl_free(aclp, sizeof (acl_t));
-}
-
-static uint32_t
-access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
-{
- uint32_t access_mask = 0;
- int acl_produce;
- int synchronize_set = 0, write_owner_set = 0;
- int delete_set = 0, write_attrs_set = 0;
- int read_named_set = 0, write_named_set = 0;
-
- acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
- ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
- ACL_WRITE_ATTRS_WRITER_SET_DENY);
-
- if (isallow) {
- synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
- write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
- delete_set = ACL_DELETE_SET_ALLOW;
- if (hasreadperm)
- read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
- if (haswriteperm)
- write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
- if (isowner)
- write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
- else if (haswriteperm)
- write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
- } else {
-
- synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
- write_owner_set = ACL_WRITE_OWNER_SET_DENY;
- delete_set = ACL_DELETE_SET_DENY;
- if (hasreadperm)
- read_named_set = ACL_READ_NAMED_READER_SET_DENY;
- if (haswriteperm)
- write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
- if (isowner)
- write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
- else if (haswriteperm)
- write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
- else
- /*
- * If the entity is not the owner and does not
- * have write permissions ACE_WRITE_ATTRIBUTES will
- * always go in the DENY ACE.
- */
- access_mask |= ACE_WRITE_ATTRIBUTES;
- }
-
- if (acl_produce & synchronize_set)
- access_mask |= ACE_SYNCHRONIZE;
- if (acl_produce & write_owner_set)
- access_mask |= ACE_WRITE_OWNER;
- if (acl_produce & delete_set)
- access_mask |= ACE_DELETE;
- if (acl_produce & write_attrs_set)
- access_mask |= ACE_WRITE_ATTRIBUTES;
- if (acl_produce & read_named_set)
- access_mask |= ACE_READ_NAMED_ATTRS;
- if (acl_produce & write_named_set)
- access_mask |= ACE_WRITE_NAMED_ATTRS;
-
- return (access_mask);
-}
-
-/*
- * Given an mode_t, convert it into an access_mask as used
- * by nfsace, assuming aclent_t -> nfsace semantics.
- */
-static uint32_t
-mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow)
-{
- uint32_t access = 0;
- int haswriteperm = 0;
- int hasreadperm = 0;
-
- if (isallow) {
- haswriteperm = (mode & S_IWOTH);
- hasreadperm = (mode & S_IROTH);
- } else {
- haswriteperm = !(mode & S_IWOTH);
- hasreadperm = !(mode & S_IROTH);
- }
-
- /*
- * The following call takes care of correctly setting the following
- * mask bits in the access_mask:
- * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
- * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
- */
- access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
-
- if (isallow) {
- access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
- if (isowner)
- access |= ACE_WRITE_ACL;
- } else {
- if (! isowner)
- access |= ACE_WRITE_ACL;
- }
-
- /* read */
- if (mode & S_IROTH) {
- access |= ACE_READ_DATA;
- }
- /* write */
- if (mode & S_IWOTH) {
- access |= ACE_WRITE_DATA |
- ACE_APPEND_DATA;
- if (isdir)
- access |= ACE_DELETE_CHILD;
- }
- /* exec */
- if (mode & S_IXOTH) {
- access |= ACE_EXECUTE;
- }
-
- return (access);
-}
-
-/*
- * Given an nfsace (presumably an ALLOW entry), make a
- * corresponding DENY entry at the address given.
- */
-static void
-ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
-{
- (void) memcpy(deny, allow, sizeof (ace_t));
-
- deny->a_who = allow->a_who;
-
- deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
- deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
- if (isdir)
- deny->a_access_mask ^= ACE_DELETE_CHILD;
-
- deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
- ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
- ACE_WRITE_NAMED_ATTRS);
- deny->a_access_mask |= access_mask_set((allow->a_access_mask &
- ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
- B_FALSE);
-}
-/*
- * Make an initial pass over an array of aclent_t's. Gather
- * information such as an ACL_MASK (if any), number of users,
- * number of groups, and whether the array needs to be sorted.
- */
-static int
-ln_aent_preprocess(aclent_t *aclent, int n,
- int *hasmask, mode_t *mask,
- int *numuser, int *numgroup, int *needsort)
-{
- int error = 0;
- int i;
- int curtype = 0;
-
- *hasmask = 0;
- *mask = 07;
- *needsort = 0;
- *numuser = 0;
- *numgroup = 0;
-
- for (i = 0; i < n; i++) {
- if (aclent[i].a_type < curtype)
- *needsort = 1;
- else if (aclent[i].a_type > curtype)
- curtype = aclent[i].a_type;
- if (aclent[i].a_type & USER)
- (*numuser)++;
- if (aclent[i].a_type & (GROUP | GROUP_OBJ))
- (*numgroup)++;
- if (aclent[i].a_type & CLASS_OBJ) {
- if (*hasmask) {
- error = EINVAL;
- goto out;
- } else {
- *hasmask = 1;
- *mask = aclent[i].a_perm;
- }
- }
- }
-
- if ((! *hasmask) && (*numuser + *numgroup > 1)) {
- error = EINVAL;
- goto out;
- }
-
-out:
- return (error);
-}
-
-/*
- * Convert an array of aclent_t into an array of nfsace entries,
- * following POSIX draft -> nfsv4 conversion semantics as outlined in
- * the IETF draft.
- */
-static int
-ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
-{
- int error = 0;
- mode_t mask;
- int numuser, numgroup, needsort;
- int resultsize = 0;
- int i, groupi = 0, skip;
- ace_t *acep, *result = NULL;
- int hasmask;
-
- error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
- &numuser, &numgroup, &needsort);
- if (error != 0)
- goto out;
-
- /* allow + deny for each aclent */
- resultsize = n * 2;
- if (hasmask) {
- /*
- * stick extra deny on the group_obj and on each
- * user|group for the mask (the group_obj was added
- * into the count for numgroup)
- */
- resultsize += numuser + numgroup;
- /* ... and don't count the mask itself */
- resultsize -= 2;
- }
-
- /* sort the source if necessary */
- if (needsort)
- ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
-
- if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0)
- goto out;
-
- acep = result;
-
- for (i = 0; i < n; i++) {
- /*
- * don't process CLASS_OBJ (mask); mask was grabbed in
- * ln_aent_preprocess()
- */
- if (aclent[i].a_type & CLASS_OBJ)
- continue;
-
- /* If we need an ACL_MASK emulator, prepend it now */
- if ((hasmask) &&
- (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
- acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
- acep->a_flags = 0;
- if (aclent[i].a_type & GROUP_OBJ) {
- acep->a_who = (uid_t)-1;
- acep->a_flags |=
- (ACE_IDENTIFIER_GROUP|ACE_GROUP);
- } else if (aclent[i].a_type & USER) {
- acep->a_who = aclent[i].a_id;
- } else {
- acep->a_who = aclent[i].a_id;
- acep->a_flags |= ACE_IDENTIFIER_GROUP;
- }
- if (aclent[i].a_type & ACL_DEFAULT) {
- acep->a_flags |= ACE_INHERIT_ONLY_ACE |
- ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE;
- }
- /*
- * Set the access mask for the prepended deny
- * ace. To do this, we invert the mask (found
- * in ln_aent_preprocess()) then convert it to an
- * DENY ace access_mask.
- */
- acep->a_access_mask = mode_to_ace_access((mask ^ 07),
- isdir, 0, 0);
- acep += 1;
- }
-
- /* handle a_perm -> access_mask */
- acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
- isdir, aclent[i].a_type & USER_OBJ, 1);
-
- /* emulate a default aclent */
- if (aclent[i].a_type & ACL_DEFAULT) {
- acep->a_flags |= ACE_INHERIT_ONLY_ACE |
- ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE;
- }
-
- /*
- * handle a_perm and a_id
- *
- * this must be done last, since it involves the
- * corresponding deny aces, which are handled
- * differently for each different a_type.
- */
- if (aclent[i].a_type & USER_OBJ) {
- acep->a_who = (uid_t)-1;
- acep->a_flags |= ACE_OWNER;
- ace_make_deny(acep, acep + 1, isdir, B_TRUE);
- acep += 2;
- } else if (aclent[i].a_type & USER) {
- acep->a_who = aclent[i].a_id;
- ace_make_deny(acep, acep + 1, isdir, B_FALSE);
- acep += 2;
- } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
- if (aclent[i].a_type & GROUP_OBJ) {
- acep->a_who = (uid_t)-1;
- acep->a_flags |= ACE_GROUP;
- } else {
- acep->a_who = aclent[i].a_id;
- }
- acep->a_flags |= ACE_IDENTIFIER_GROUP;
- /*
- * Set the corresponding deny for the group ace.
- *
- * The deny aces go after all of the groups, unlike
- * everything else, where they immediately follow
- * the allow ace.
- *
- * We calculate "skip", the number of slots to
- * skip ahead for the deny ace, here.
- *
- * The pattern is:
- * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
- * thus, skip is
- * (2 * numgroup) - 1 - groupi
- * (2 * numgroup) to account for MD + A
- * - 1 to account for the fact that we're on the
- * access (A), not the mask (MD)
- * - groupi to account for the fact that we have
- * passed up groupi number of MD's.
- */
- skip = (2 * numgroup) - 1 - groupi;
- ace_make_deny(acep, acep + skip, isdir, B_FALSE);
- /*
- * If we just did the last group, skip acep past
- * all of the denies; else, just move ahead one.
- */
- if (++groupi >= numgroup)
- acep += numgroup + 1;
- else
- acep += 1;
- } else if (aclent[i].a_type & OTHER_OBJ) {
- acep->a_who = (uid_t)-1;
- acep->a_flags |= ACE_EVERYONE;
- ace_make_deny(acep, acep + 1, isdir, B_FALSE);
- acep += 2;
- } else {
- error = EINVAL;
- goto out;
- }
- }
-
- *acepp = result;
- *rescount = resultsize;
-
-out:
- if (error != 0) {
- if ((result != NULL) && (resultsize > 0)) {
- cacl_free(result, resultsize * sizeof (ace_t));
- }
- }
-
- return (error);
-}
-
-static int
-convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir,
- ace_t **retacep, int *retacecnt)
-{
- ace_t *acep;
- ace_t *dfacep;
- int acecnt = 0;
- int dfacecnt = 0;
- int dfaclstart = 0;
- int dfaclcnt = 0;
- aclent_t *aclp;
- int i;
- int error;
- int acesz, dfacesz;
-
- ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
-
- for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
- if (aclp->a_type & ACL_DEFAULT)
- break;
- }
-
- if (i < aclcnt) {
- dfaclstart = i;
- dfaclcnt = aclcnt - i;
- }
-
- if (dfaclcnt && !isdir) {
- return (EINVAL);
- }
-
- error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir);
- if (error)
- return (error);
-
- if (dfaclcnt) {
- error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
- &dfacep, &dfacecnt, isdir);
- if (error) {
- if (acep) {
- cacl_free(acep, acecnt * sizeof (ace_t));
- }
- return (error);
- }
- }
-
- if (dfacecnt != 0) {
- acesz = sizeof (ace_t) * acecnt;
- dfacesz = sizeof (ace_t) * dfacecnt;
- acep = cacl_realloc(acep, acesz, acesz + dfacesz);
- if (acep == NULL)
- return (ENOMEM);
- if (dfaclcnt) {
- (void) memcpy(acep + acecnt, dfacep, dfacesz);
- }
- }
- if (dfaclcnt)
- cacl_free(dfacep, dfacecnt * sizeof (ace_t));
-
- *retacecnt = acecnt + dfacecnt;
- *retacep = acep;
- return (0);
-}
-
-static int
-ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
-{
- int error = 0;
- o_mode_t mode = 0;
- uint32_t bits, wantbits;
-
- /* read */
- if (mask & ACE_READ_DATA)
- mode |= S_IROTH;
-
- /* write */
- wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA);
- if (isdir)
- wantbits |= ACE_DELETE_CHILD;
- bits = mask & wantbits;
- if (bits != 0) {
- if (bits != wantbits) {
- error = ENOTSUP;
- goto out;
- }
- mode |= S_IWOTH;
- }
-
- /* exec */
- if (mask & ACE_EXECUTE) {
- mode |= S_IXOTH;
- }
-
- *modep = mode;
-
-out:
- return (error);
-}
-
-static void
-acevals_init(acevals_t *vals, uid_t key)
-{
- bzero(vals, sizeof (*vals));
- vals->allowed = ACE_MASK_UNDEFINED;
- vals->denied = ACE_MASK_UNDEFINED;
- vals->mask = ACE_MASK_UNDEFINED;
- vals->key = key;
-}
-
-static void
-ace_list_init(ace_list_t *al, int dfacl_flag)
-{
- acevals_init(&al->user_obj, 0);
- acevals_init(&al->group_obj, 0);
- acevals_init(&al->other_obj, 0);
- al->numusers = 0;
- al->numgroups = 0;
- al->acl_mask = 0;
- al->hasmask = 0;
- al->state = ace_unused;
- al->seen = 0;
- al->dfacl_flag = dfacl_flag;
-}
-
-/*
- * Find or create an acevals holder for a given id and avl tree.
- *
- * Note that only one thread will ever touch these avl trees, so
- * there is no need for locking.
- */
-static acevals_t *
-acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
-{
- acevals_t key, *rc;
- avl_index_t where;
-
- key.key = ace->a_who;
- rc = avl_find(avl, &key, &where);
- if (rc != NULL)
- return (rc);
-
- /* this memory is freed by ln_ace_to_aent()->ace_list_free() */
- if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0)
- return (NULL);
-
- acevals_init(rc, ace->a_who);
- avl_insert(avl, rc, where);
- (*num)++;
-
- return (rc);
-}
-
-static int
-access_mask_check(ace_t *acep, int mask_bit, int isowner)
-{
- int set_deny, err_deny;
- int set_allow, err_allow;
- int acl_consume;
- int haswriteperm, hasreadperm;
-
- if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
- haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
- hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
- } else {
- haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
- hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
- }
-
- acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
- ACL_DELETE_ERR_DENY |
- ACL_WRITE_OWNER_ERR_DENY |
- ACL_WRITE_OWNER_ERR_ALLOW |
- ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
- ACL_WRITE_ATTRS_OWNER_ERR_DENY |
- ACL_WRITE_ATTRS_WRITER_SET_DENY |
- ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
- ACL_WRITE_NAMED_WRITER_ERR_DENY |
- ACL_READ_NAMED_READER_ERR_DENY);
-
- if (mask_bit == ACE_SYNCHRONIZE) {
- set_deny = ACL_SYNCHRONIZE_SET_DENY;
- err_deny = ACL_SYNCHRONIZE_ERR_DENY;
- set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
- err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
- } else if (mask_bit == ACE_WRITE_OWNER) {
- set_deny = ACL_WRITE_OWNER_SET_DENY;
- err_deny = ACL_WRITE_OWNER_ERR_DENY;
- set_allow = ACL_WRITE_OWNER_SET_ALLOW;
- err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
- } else if (mask_bit == ACE_DELETE) {
- set_deny = ACL_DELETE_SET_DENY;
- err_deny = ACL_DELETE_ERR_DENY;
- set_allow = ACL_DELETE_SET_ALLOW;
- err_allow = ACL_DELETE_ERR_ALLOW;
- } else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
- if (isowner) {
- set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
- err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY;
- set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
- err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
- } else if (haswriteperm) {
- set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
- err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY;
- set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
- err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
- } else {
- if ((acep->a_access_mask & mask_bit) &&
- (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) {
- return (ENOTSUP);
- }
- return (0);
- }
- } else if (mask_bit == ACE_READ_NAMED_ATTRS) {
- if (!hasreadperm)
- return (0);
-
- set_deny = ACL_READ_NAMED_READER_SET_DENY;
- err_deny = ACL_READ_NAMED_READER_ERR_DENY;
- set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
- err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
- } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
- if (!haswriteperm)
- return (0);
-
- set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
- err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
- set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
- err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
- } else {
- return (EINVAL);
- }
-
- if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
- if (acl_consume & set_deny) {
- if (!(acep->a_access_mask & mask_bit)) {
- return (ENOTSUP);
- }
- } else if (acl_consume & err_deny) {
- if (acep->a_access_mask & mask_bit) {
- return (ENOTSUP);
- }
- }
- } else {
- /* ACE_ACCESS_ALLOWED_ACE_TYPE */
- if (acl_consume & set_allow) {
- if (!(acep->a_access_mask & mask_bit)) {
- return (ENOTSUP);
- }
- } else if (acl_consume & err_allow) {
- if (acep->a_access_mask & mask_bit) {
- return (ENOTSUP);
- }
- }
- }
- return (0);
-}
-
-static int
-ace_to_aent_legal(ace_t *acep)
-{
- int error = 0;
- int isowner;
-
- /* only ALLOW or DENY */
- if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) &&
- (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) {
- error = ENOTSUP;
- goto out;
- }
-
- /* check for invalid flags */
- if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) {
- error = EINVAL;
- goto out;
- }
-
- /* some flags are illegal */
- if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
- ACE_FAILED_ACCESS_ACE_FLAG |
- ACE_NO_PROPAGATE_INHERIT_ACE)) {
- error = ENOTSUP;
- goto out;
- }
-
- /* check for invalid masks */
- if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) {
- error = EINVAL;
- goto out;
- }
-
- if ((acep->a_flags & ACE_OWNER)) {
- isowner = 1;
- } else {
- isowner = 0;
- }
-
- error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner);
- if (error)
- goto out;
-
- error = access_mask_check(acep, ACE_WRITE_OWNER, isowner);
- if (error)
- goto out;
-
- error = access_mask_check(acep, ACE_DELETE, isowner);
- if (error)
- goto out;
-
- error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner);
- if (error)
- goto out;
-
- error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner);
- if (error)
- goto out;
-
- error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner);
- if (error)
- goto out;
-
- /* more detailed checking of masks */
- if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
- if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) {
- error = ENOTSUP;
- goto out;
- }
- if ((acep->a_access_mask & ACE_WRITE_DATA) &&
- (! (acep->a_access_mask & ACE_APPEND_DATA))) {
- error = ENOTSUP;
- goto out;
- }
- if ((! (acep->a_access_mask & ACE_WRITE_DATA)) &&
- (acep->a_access_mask & ACE_APPEND_DATA)) {
- error = ENOTSUP;
- goto out;
- }
- }
-
- /* ACL enforcement */
- if ((acep->a_access_mask & ACE_READ_ACL) &&
- (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) {
- error = ENOTSUP;
- goto out;
- }
- if (acep->a_access_mask & ACE_WRITE_ACL) {
- if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) &&
- (isowner)) {
- error = ENOTSUP;
- goto out;
- }
- if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) &&
- (! isowner)) {
- error = ENOTSUP;
- goto out;
- }
- }
-
-out:
- return (error);
-}
-
-static int
-ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
-{
- /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
- if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
- (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) {
- return (ENOTSUP);
- }
-
- return (ace_mask_to_mode(mask, modep, isdir));
-}
-
-static int
-acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
- uid_t owner, gid_t group, boolean_t isdir)
-{
- int error;
- uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
-
- if (isdir)
- flips |= ACE_DELETE_CHILD;
- if (vals->allowed != (vals->denied ^ flips)) {
- error = ENOTSUP;
- goto out;
- }
- if ((list->hasmask) && (list->acl_mask != vals->mask) &&
- (vals->aent_type & (USER | GROUP | GROUP_OBJ))) {
- error = ENOTSUP;
- goto out;
- }
- error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
- if (error != 0)
- goto out;
- dest->a_type = vals->aent_type;
- if (dest->a_type & (USER | GROUP)) {
- dest->a_id = vals->key;
- } else if (dest->a_type & USER_OBJ) {
- dest->a_id = owner;
- } else if (dest->a_type & GROUP_OBJ) {
- dest->a_id = group;
- } else if (dest->a_type & OTHER_OBJ) {
- dest->a_id = 0;
- } else {
- error = EINVAL;
- goto out;
- }
-
-out:
- return (error);
-}
-
-
-static int
-ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
- uid_t owner, gid_t group, boolean_t isdir)
-{
- int error = 0;
- aclent_t *aent, *result = NULL;
- acevals_t *vals;
- int resultcount;
-
- if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) !=
- (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) {
- error = ENOTSUP;
- goto out;
- }
- if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) {
- error = ENOTSUP;
- goto out;
- }
-
- resultcount = 3 + list->numusers + list->numgroups;
- /*
- * This must be the same condition as below, when we add the CLASS_OBJ
- * (aka ACL mask)
- */
- if ((list->hasmask) || (! list->dfacl_flag))
- resultcount += 1;
-
- if (cacl_malloc((void **)&result,
- resultcount * sizeof (aclent_t)) != 0) {
- error = ENOMEM;
- goto out;
- }
- aent = result;
-
- /* USER_OBJ */
- if (!(list->user_obj.aent_type & USER_OBJ)) {
- error = EINVAL;
- goto out;
- }
-
- error = acevals_to_aent(&list->user_obj, aent, list, owner, group,
- isdir);
-
- if (error != 0)
- goto out;
- ++aent;
- /* USER */
- vals = NULL;
- for (vals = avl_first(&list->user); vals != NULL;
- vals = AVL_NEXT(&list->user, vals)) {
- if (!(vals->aent_type & USER)) {
- error = EINVAL;
- goto out;
- }
- error = acevals_to_aent(vals, aent, list, owner, group,
- isdir);
- if (error != 0)
- goto out;
- ++aent;
- }
- /* GROUP_OBJ */
- if (!(list->group_obj.aent_type & GROUP_OBJ)) {
- error = EINVAL;
- goto out;
- }
- error = acevals_to_aent(&list->group_obj, aent, list, owner, group,
- isdir);
- if (error != 0)
- goto out;
- ++aent;
- /* GROUP */
- vals = NULL;
- for (vals = avl_first(&list->group); vals != NULL;
- vals = AVL_NEXT(&list->group, vals)) {
- if (!(vals->aent_type & GROUP)) {
- error = EINVAL;
- goto out;
- }
- error = acevals_to_aent(vals, aent, list, owner, group,
- isdir);
- if (error != 0)
- goto out;
- ++aent;
- }
- /*
- * CLASS_OBJ (aka ACL_MASK)
- *
- * An ACL_MASK is not fabricated if the ACL is a default ACL.
- * This is to follow UFS's behavior.
- */
- if ((list->hasmask) || (! list->dfacl_flag)) {
- if (list->hasmask) {
- uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
- if (isdir)
- flips |= ACE_DELETE_CHILD;
- error = ace_mask_to_mode(list->acl_mask ^ flips,
- &aent->a_perm, isdir);
- if (error != 0)
- goto out;
- } else {
- /* fabricate the ACL_MASK from the group permissions */
- error = ace_mask_to_mode(list->group_obj.allowed,
- &aent->a_perm, isdir);
- if (error != 0)
- goto out;
- }
- aent->a_id = 0;
- aent->a_type = CLASS_OBJ | list->dfacl_flag;
- ++aent;
- }
- /* OTHER_OBJ */
- if (!(list->other_obj.aent_type & OTHER_OBJ)) {
- error = EINVAL;
- goto out;
- }
- error = acevals_to_aent(&list->other_obj, aent, list, owner, group,
- isdir);
- if (error != 0)
- goto out;
- ++aent;
-
- *aclentp = result;
- *aclcnt = resultcount;
-
-out:
- if (error != 0) {
- if (result != NULL)
- cacl_free(result, resultcount * sizeof (aclent_t));
- }
-
- return (error);
-}
-
-
-/*
- * free all data associated with an ace_list
- */
-static void
-ace_list_free(ace_list_t *al)
-{
- acevals_t *node;
- void *cookie;
-
- if (al == NULL)
- return;
-
- cookie = NULL;
- while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL)
- cacl_free(node, sizeof (acevals_t));
- cookie = NULL;
- while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL)
- cacl_free(node, sizeof (acevals_t));
-
- avl_destroy(&al->user);
- avl_destroy(&al->group);
-
- /* free the container itself */
- cacl_free(al, sizeof (ace_list_t));
-}
-
-static int
-acevals_compare(const void *va, const void *vb)
-{
- const acevals_t *a = va, *b = vb;
-
- if (a->key == b->key)
- return (0);
-
- if (a->key > b->key)
- return (1);
-
- else
- return (-1);
-}
-
-/*
- * Convert a list of ace_t entries to equivalent regular and default
- * aclent_t lists. Return error (ENOTSUP) when conversion is not possible.
- */
-static int
-ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
- aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
- boolean_t isdir)
-{
- int error = 0;
- ace_t *acep;
- uint32_t bits;
- int i;
- ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
- acevals_t *vals;
-
- *aclentp = NULL;
- *aclcnt = 0;
- *dfaclentp = NULL;
- *dfaclcnt = 0;
-
- /* we need at least user_obj, group_obj, and other_obj */
- if (n < 6) {
- error = ENOTSUP;
- goto out;
- }
- if (ace == NULL) {
- error = EINVAL;
- goto out;
- }
-
- error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
- if (error != 0)
- goto out;
-
- avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
- offsetof(acevals_t, avl));
- avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
- offsetof(acevals_t, avl));
-
- ace_list_init(normacl, 0);
-
- error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
- if (error != 0)
- goto out;
-
- avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
- offsetof(acevals_t, avl));
- avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
- offsetof(acevals_t, avl));
- ace_list_init(dfacl, ACL_DEFAULT);
-
- /* process every ace_t... */
- for (i = 0; i < n; i++) {
- acep = &ace[i];
-
- /* rule out certain cases quickly */
- error = ace_to_aent_legal(acep);
- if (error != 0)
- goto out;
-
- /*
- * Turn off these bits in order to not have to worry about
- * them when doing the checks for compliments.
- */
- acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
- ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
- ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
-
- /* see if this should be a regular or default acl */
- bits = acep->a_flags &
- (ACE_INHERIT_ONLY_ACE |
- ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE);
- if (bits != 0) {
- /* all or nothing on these inherit bits */
- if (bits != (ACE_INHERIT_ONLY_ACE |
- ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE)) {
- error = ENOTSUP;
- goto out;
- }
- acl = dfacl;
- } else {
- acl = normacl;
- }
-
- if ((acep->a_flags & ACE_OWNER)) {
- if (acl->state > ace_user_obj) {
- error = ENOTSUP;
- goto out;
- }
- acl->state = ace_user_obj;
- acl->seen |= USER_OBJ;
- vals = &acl->user_obj;
- vals->aent_type = USER_OBJ | acl->dfacl_flag;
- } else if ((acep->a_flags & ACE_EVERYONE)) {
- acl->state = ace_other_obj;
- acl->seen |= OTHER_OBJ;
- vals = &acl->other_obj;
- vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
- } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
- if (acl->state > ace_group) {
- error = ENOTSUP;
- goto out;
- }
- if ((acep->a_flags & ACE_GROUP)) {
- acl->seen |= GROUP_OBJ;
- vals = &acl->group_obj;
- vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
- } else {
- acl->seen |= GROUP;
- vals = acevals_find(acep, &acl->group,
- &acl->numgroups);
- if (vals == NULL) {
- error = ENOMEM;
- goto out;
- }
- vals->aent_type = GROUP | acl->dfacl_flag;
- }
- acl->state = ace_group;
- } else {
- if (acl->state > ace_user) {
- error = ENOTSUP;
- goto out;
- }
- acl->state = ace_user;
- acl->seen |= USER;
- vals = acevals_find(acep, &acl->user,
- &acl->numusers);
- if (vals == NULL) {
- error = ENOMEM;
- goto out;
- }
- vals->aent_type = USER | acl->dfacl_flag;
- }
-
- if (!(acl->state > ace_unused)) {
- error = EINVAL;
- goto out;
- }
-
- if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
- /* no more than one allowed per aclent_t */
- if (vals->allowed != ACE_MASK_UNDEFINED) {
- error = ENOTSUP;
- goto out;
- }
- vals->allowed = acep->a_access_mask;
- } else {
- /*
- * it's a DENY; if there was a previous DENY, it
- * must have been an ACL_MASK.
- */
- if (vals->denied != ACE_MASK_UNDEFINED) {
- /* ACL_MASK is for USER and GROUP only */
- if ((acl->state != ace_user) &&
- (acl->state != ace_group)) {
- error = ENOTSUP;
- goto out;
- }
-
- if (! acl->hasmask) {
- acl->hasmask = 1;
- acl->acl_mask = vals->denied;
- /* check for mismatched ACL_MASK emulations */
- } else if (acl->acl_mask != vals->denied) {
- error = ENOTSUP;
- goto out;
- }
- vals->mask = vals->denied;
- }
- vals->denied = acep->a_access_mask;
- }
- }
-
- /* done collating; produce the aclent_t lists */
- if (normacl->state != ace_unused) {
- error = ace_list_to_aent(normacl, aclentp, aclcnt,
- owner, group, isdir);
- if (error != 0) {
- goto out;
- }
- }
- if (dfacl->state != ace_unused) {
- error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
- owner, group, isdir);
- if (error != 0) {
- goto out;
- }
- }
-
-out:
- if (normacl != NULL)
- ace_list_free(normacl);
- if (dfacl != NULL)
- ace_list_free(dfacl);
-
- return (error);
-}
-
-static int
-convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir,
- uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
-{
- int error = 0;
- aclent_t *aclentp, *dfaclentp;
- int aclcnt, dfaclcnt;
- int aclsz, dfaclsz;
-
- error = ln_ace_to_aent(acebufp, acecnt, owner, group,
- &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
-
- if (error)
- return (error);
-
-
- if (dfaclcnt != 0) {
- /*
- * Slap aclentp and dfaclentp into a single array.
- */
- aclsz = sizeof (aclent_t) * aclcnt;
- dfaclsz = sizeof (aclent_t) * dfaclcnt;
- aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
- if (aclentp != NULL) {
- (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz);
- } else {
- error = ENOMEM;
- }
- }
-
- if (aclentp) {
- *retaclentp = aclentp;
- *retaclcnt = aclcnt + dfaclcnt;
- }
-
- if (dfaclentp)
- cacl_free(dfaclentp, dfaclsz);
-
- return (error);
-}
-
-
-int
-acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner,
- gid_t group)
-{
- int aclcnt;
- void *acldata;
- int error;
-
- /*
- * See if we need to translate
- */
- if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) ||
- (target_flavor == _ACL_ACLENT_ENABLED &&
- aclp->acl_type == ACLENT_T))
- return (0);
-
- if (target_flavor == -1) {
- error = EINVAL;
- goto out;
- }
-
- if (target_flavor == _ACL_ACE_ENABLED &&
- aclp->acl_type == ACLENT_T) {
- error = convert_aent_to_ace(aclp->acl_aclp,
- aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt);
- if (error)
- goto out;
-
- } else if (target_flavor == _ACL_ACLENT_ENABLED &&
- aclp->acl_type == ACE_T) {
- error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
- isdir, owner, group, (aclent_t **)&acldata, &aclcnt);
- if (error)
- goto out;
- } else {
- error = ENOTSUP;
- goto out;
- }
-
- /*
- * replace old acl with newly translated acl
- */
- cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size);
- aclp->acl_aclp = acldata;
- aclp->acl_cnt = aclcnt;
- if (target_flavor == _ACL_ACE_ENABLED) {
- aclp->acl_type = ACE_T;
- aclp->acl_entry_size = sizeof (ace_t);
- } else {
- aclp->acl_type = ACLENT_T;
- aclp->acl_entry_size = sizeof (aclent_t);
- }
- return (0);
-
-out:
-
-#if !defined(_KERNEL)
- errno = error;
- return (-1);
-#else
- return (error);
-#endif
-}
-#endif /* !_KERNEL */
-
-#define SET_ACE(acl, index, who, mask, type, flags) { \
- acl[0][index].a_who = (uint32_t)who; \
- acl[0][index].a_type = type; \
- acl[0][index].a_flags = flags; \
- acl[0][index++].a_access_mask = mask; \
-}
-
-void
-acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
-{
- uint32_t read_mask = ACE_READ_DATA;
- uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
- uint32_t execute_mask = ACE_EXECUTE;
-
- (void) isdir; /* will need this later */
-
- masks->deny1 = 0;
- if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
- masks->deny1 |= read_mask;
- if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
- masks->deny1 |= write_mask;
- if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
- masks->deny1 |= execute_mask;
-
- masks->deny2 = 0;
- if (!(mode & S_IRGRP) && (mode & S_IROTH))
- masks->deny2 |= read_mask;
- if (!(mode & S_IWGRP) && (mode & S_IWOTH))
- masks->deny2 |= write_mask;
- if (!(mode & S_IXGRP) && (mode & S_IXOTH))
- masks->deny2 |= execute_mask;
-
- masks->allow0 = 0;
- if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
- masks->allow0 |= read_mask;
- if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
- masks->allow0 |= write_mask;
- if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
- masks->allow0 |= execute_mask;
-
- masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
- ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
- ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
- if (mode & S_IRUSR)
- masks->owner |= read_mask;
- if (mode & S_IWUSR)
- masks->owner |= write_mask;
- if (mode & S_IXUSR)
- masks->owner |= execute_mask;
-
- masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
- ACE_SYNCHRONIZE;
- if (mode & S_IRGRP)
- masks->group |= read_mask;
- if (mode & S_IWGRP)
- masks->group |= write_mask;
- if (mode & S_IXGRP)
- masks->group |= execute_mask;
-
- masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
- ACE_SYNCHRONIZE;
- if (mode & S_IROTH)
- masks->everyone |= read_mask;
- if (mode & S_IWOTH)
- masks->everyone |= write_mask;
- if (mode & S_IXOTH)
- masks->everyone |= execute_mask;
-}
-
-int
-acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count)
-{
- int index = 0;
- int error;
- trivial_acl_t masks;
-
- *count = 3;
- acl_trivial_access_masks(mode, isdir, &masks);
-
- if (masks.allow0)
- (*count)++;
- if (masks.deny1)
- (*count)++;
- if (masks.deny2)
- (*count)++;
-
- if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
- return (error);
-
- if (masks.allow0) {
- SET_ACE(acl, index, -1, masks.allow0,
- ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
- }
- if (masks.deny1) {
- SET_ACE(acl, index, -1, masks.deny1,
- ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER);
- }
- if (masks.deny2) {
- SET_ACE(acl, index, -1, masks.deny2,
- ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP);
- }
-
- SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE,
- ACE_OWNER);
- SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE,
- ACE_IDENTIFIER_GROUP|ACE_GROUP);
- SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
- ACE_EVERYONE);
-
- return (0);
-}
-
-/*
- * ace_trivial:
- * determine whether an ace_t acl is trivial
- *
- * Trivialness implies that the acl is composed of only
- * owner, group, everyone entries. ACL can't
- * have read_acl denied, and write_owner/write_acl/write_attributes
- * can only be owner@ entry.
- */
-int
-ace_trivial_common(void *acep, int aclcnt,
- uint64_t (*walk)(void *, uint64_t, int aclcnt,
- uint16_t *, uint16_t *, uint32_t *))
-{
- uint16_t flags;
- uint32_t mask;
- uint16_t type;
- uint64_t cookie = 0;
-
- while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) {
- switch (flags & ACE_TYPE_FLAGS) {
- case ACE_OWNER:
- case ACE_GROUP|ACE_IDENTIFIER_GROUP:
- case ACE_EVERYONE:
- break;
- default:
- return (1);
-
- }
-
- if (flags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
- ACE_INHERIT_ONLY_ACE))
- return (1);
-
- /*
- * Special check for some special bits
- *
- * Don't allow anybody to deny reading basic
- * attributes or a files ACL.
- */
- if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- (type == ACE_ACCESS_DENIED_ACE_TYPE))
- return (1);
-
- /*
- * Delete permissions are never set by default
- */
- if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
- return (1);
- /*
- * only allow owner@ to have
- * write_acl/write_owner/write_attributes/write_xattr/
- */
- if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
- (!(flags & ACE_OWNER) && (mask &
- (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
- ACE_WRITE_NAMED_ATTRS))))
- return (1);
-
- }
- return (0);
-}
-
-uint64_t
-ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
- uint16_t *type, uint32_t *mask)
-{
- ace_t *acep = datap;
-
- if (cookie >= aclcnt)
- return (0);
-
- *flags = acep[cookie].a_flags;
- *type = acep[cookie].a_type;
- *mask = acep[cookie++].a_access_mask;
-
- return (cookie);
-}
-
-int
-ace_trivial(ace_t *acep, int aclcnt)
-{
- return (ace_trivial_common(acep, aclcnt, ace_walk));
-}
Index: head/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
+++ head/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
@@ -1,133 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .file "atomic.s"
-
-#define _ASM
-#include <sys/asm_linkage.h>
-
- /*
- * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever
- * separated, it is important to edit the libc i386 platform
- * specific mapfile and remove the NODYNSORT attribute
- * from atomic_dec_64_nv.
- */
- ENTRY(atomic_dec_64)
- ALTENTRY(atomic_dec_64_nv)
- pushl %edi
- pushl %ebx
- movl 12(%esp), %edi // %edi = target address
- movl (%edi), %eax
- movl 4(%edi), %edx // %edx:%eax = old value
-1:
- xorl %ebx, %ebx
- xorl %ecx, %ecx
- not %ecx
- not %ebx // %ecx:%ebx = -1
- addl %eax, %ebx
- adcl %edx, %ecx // add in the carry from inc
- lock
- cmpxchg8b (%edi) // try to stick it in
- jne 1b
- movl %ebx, %eax
- movl %ecx, %edx // return new value
- popl %ebx
- popl %edi
- ret
- SET_SIZE(atomic_dec_64_nv)
- SET_SIZE(atomic_dec_64)
-
- /*
- * NOTE: If atomic_add_64 and atomic_add_64_nv are ever
- * separated, it is important to edit the libc i386 platform
- * specific mapfile and remove the NODYNSORT attribute
- * from atomic_add_64_nv.
- */
- ENTRY(atomic_add_64)
- ALTENTRY(atomic_add_64_nv)
- pushl %edi
- pushl %ebx
- movl 12(%esp), %edi // %edi = target address
- movl (%edi), %eax
- movl 4(%edi), %edx // %edx:%eax = old value
-1:
- movl 16(%esp), %ebx
- movl 20(%esp), %ecx // %ecx:%ebx = delta
- addl %eax, %ebx
- adcl %edx, %ecx // %ecx:%ebx = new value
- lock
- cmpxchg8b (%edi) // try to stick it in
- jne 1b
- movl %ebx, %eax
- movl %ecx, %edx // return new value
- popl %ebx
- popl %edi
- ret
- SET_SIZE(atomic_add_64_nv)
- SET_SIZE(atomic_add_64)
-
- ENTRY(atomic_cas_64)
- pushl %ebx
- pushl %esi
- movl 12(%esp), %esi
- movl 16(%esp), %eax
- movl 20(%esp), %edx
- movl 24(%esp), %ebx
- movl 28(%esp), %ecx
- lock
- cmpxchg8b (%esi)
- popl %esi
- popl %ebx
- ret
- SET_SIZE(atomic_cas_64)
-
- ENTRY(atomic_swap_64)
- pushl %esi
- pushl %ebx
- movl 12(%esp), %esi
- movl 16(%esp), %ebx
- movl 20(%esp), %ecx
- movl (%esi), %eax
- movl 4(%esi), %edx // %edx:%eax = old value
-1:
- lock
- cmpxchg8b (%esi)
- jne 1b
- popl %ebx
- popl %esi
- ret
- SET_SIZE(atomic_swap_64)
-
- ENTRY(atomic_load_64)
- pushl %esi
- movl 8(%esp), %esi
- movl %ebx, %eax // make old and new values equal, so that
- movl %ecx, %edx // destination is never changed
- lock
- cmpxchg8b (%esi)
- popl %esi
- ret
- SET_SIZE(atomic_load_64)
Index: head/sys/cddl/contrib/opensolaris/common/avl/avl.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/avl/avl.c
+++ head/sys/cddl/contrib/opensolaris/common/avl/avl.c
@@ -1,1063 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- */
-
-/*
- * AVL - generic AVL tree implementation for kernel use
- *
- * A complete description of AVL trees can be found in many CS textbooks.
- *
- * Here is a very brief overview. An AVL tree is a binary search tree that is
- * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
- * any given node, the left and right subtrees are allowed to differ in height
- * by at most 1 level.
- *
- * This relaxation from a perfectly balanced binary tree allows doing
- * insertion and deletion relatively efficiently. Searching the tree is
- * still a fast operation, roughly O(log(N)).
- *
- * The key to insertion and deletion is a set of tree manipulations called
- * rotations, which bring unbalanced subtrees back into the semi-balanced state.
- *
- * This implementation of AVL trees has the following peculiarities:
- *
- * - The AVL specific data structures are physically embedded as fields
- * in the "using" data structures. To maintain generality the code
- * must constantly translate between "avl_node_t *" and containing
- * data structure "void *"s by adding/subtracting the avl_offset.
- *
- * - Since the AVL data is always embedded in other structures, there is
- * no locking or memory allocation in the AVL routines. This must be
- * provided for by the enclosing data structure's semantics. Typically,
- * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
- * exclusive write lock. Other operations require a read lock.
- *
- * - The implementation uses iteration instead of explicit recursion,
- * since it is intended to run on limited size kernel stacks. Since
- * there is no recursion stack present to move "up" in the tree,
- * there is an explicit "parent" link in the avl_node_t.
- *
- * - The left/right children pointers of a node are in an array.
- * In the code, variables (instead of constants) are used to represent
- * left and right indices. The implementation is written as if it only
- * dealt with left handed manipulations. By changing the value assigned
- * to "left", the code also works for right handed trees. The
- * following variables/terms are frequently used:
- *
- * int left; // 0 when dealing with left children,
- * // 1 for dealing with right children
- *
- * int left_heavy; // -1 when left subtree is taller at some node,
- * // +1 when right subtree is taller
- *
- * int right; // will be the opposite of left (0 or 1)
- * int right_heavy;// will be the opposite of left_heavy (-1 or 1)
- *
- * int direction; // 0 for "<" (ie. left child); 1 for ">" (right)
- *
- * Though it is a little more confusing to read the code, the approach
- * allows using half as much code (and hence cache footprint) for tree
- * manipulations and eliminates many conditional branches.
- *
- * - The avl_index_t is an opaque "cookie" used to find nodes at or
- * adjacent to where a new value would be inserted in the tree. The value
- * is a modified "avl_node_t *". The bottom bit (normally 0 for a
- * pointer) is set to indicate if that the new node has a value greater
- * than the value of the indicated "avl_node_t *".
- *
- * Note - in addition to userland (e.g. libavl and libutil) and the kernel
- * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
- * which each have their own compilation environments and subsequent
- * requirements. Each of these environments must be considered when adding
- * dependencies from avl.c.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/stdint.h>
-#include <sys/debug.h>
-#include <sys/avl.h>
-
-/*
- * Small arrays to translate between balance (or diff) values and child indices.
- *
- * Code that deals with binary tree data structures will randomly use
- * left and right children when examining a tree. C "if()" statements
- * which evaluate randomly suffer from very poor hardware branch prediction.
- * In this code we avoid some of the branch mispredictions by using the
- * following translation arrays. They replace random branches with an
- * additional memory reference. Since the translation arrays are both very
- * small the data should remain efficiently in cache.
- */
-static const int avl_child2balance[2] = {-1, 1};
-static const int avl_balance2child[] = {0, 0, 1};
-
-
-/*
- * Walk from one node to the previous valued node (ie. an infix walk
- * towards the left). At any given node we do one of 2 things:
- *
- * - If there is a left child, go to it, then to it's rightmost descendant.
- *
- * - otherwise we return through parent nodes until we've come from a right
- * child.
- *
- * Return Value:
- * NULL - if at the end of the nodes
- * otherwise next node
- */
-void *
-avl_walk(avl_tree_t *tree, void *oldnode, int left)
-{
- size_t off = tree->avl_offset;
- avl_node_t *node = AVL_DATA2NODE(oldnode, off);
- int right = 1 - left;
- int was_child;
-
-
- /*
- * nowhere to walk to if tree is empty
- */
- if (node == NULL)
- return (NULL);
-
- /*
- * Visit the previous valued node. There are two possibilities:
- *
- * If this node has a left child, go down one left, then all
- * the way right.
- */
- if (node->avl_child[left] != NULL) {
- for (node = node->avl_child[left];
- node->avl_child[right] != NULL;
- node = node->avl_child[right])
- ;
- /*
- * Otherwise, return thru left children as far as we can.
- */
- } else {
- for (;;) {
- was_child = AVL_XCHILD(node);
- node = AVL_XPARENT(node);
- if (node == NULL)
- return (NULL);
- if (was_child == right)
- break;
- }
- }
-
- return (AVL_NODE2DATA(node, off));
-}
-
-/*
- * Return the lowest valued node in a tree or NULL.
- * (leftmost child from root of tree)
- */
-void *
-avl_first(avl_tree_t *tree)
-{
- avl_node_t *node;
- avl_node_t *prev = NULL;
- size_t off = tree->avl_offset;
-
- for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
- prev = node;
-
- if (prev != NULL)
- return (AVL_NODE2DATA(prev, off));
- return (NULL);
-}
-
-/*
- * Return the highest valued node in a tree or NULL.
- * (rightmost child from root of tree)
- */
-void *
-avl_last(avl_tree_t *tree)
-{
- avl_node_t *node;
- avl_node_t *prev = NULL;
- size_t off = tree->avl_offset;
-
- for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
- prev = node;
-
- if (prev != NULL)
- return (AVL_NODE2DATA(prev, off));
- return (NULL);
-}
-
-/*
- * Access the node immediately before or after an insertion point.
- *
- * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
- *
- * Return value:
- * NULL: no node in the given direction
- * "void *" of the found tree node
- */
-void *
-avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
-{
- int child = AVL_INDEX2CHILD(where);
- avl_node_t *node = AVL_INDEX2NODE(where);
- void *data;
- size_t off = tree->avl_offset;
-
- if (node == NULL) {
- ASSERT(tree->avl_root == NULL);
- return (NULL);
- }
- data = AVL_NODE2DATA(node, off);
- if (child != direction)
- return (data);
-
- return (avl_walk(tree, data, direction));
-}
-
-
-/*
- * Search for the node which contains "value". The algorithm is a
- * simple binary tree search.
- *
- * return value:
- * NULL: the value is not in the AVL tree
- * *where (if not NULL) is set to indicate the insertion point
- * "void *" of the found tree node
- */
-void *
-avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
-{
- avl_node_t *node;
- avl_node_t *prev = NULL;
- int child = 0;
- int diff;
- size_t off = tree->avl_offset;
-
- for (node = tree->avl_root; node != NULL;
- node = node->avl_child[child]) {
-
- prev = node;
-
- diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
- ASSERT(-1 <= diff && diff <= 1);
- if (diff == 0) {
-#ifdef DEBUG
- if (where != NULL)
- *where = 0;
-#endif
- return (AVL_NODE2DATA(node, off));
- }
- child = avl_balance2child[1 + diff];
-
- }
-
- if (where != NULL)
- *where = AVL_MKINDEX(prev, child);
-
- return (NULL);
-}
-
-
-/*
- * Perform a rotation to restore balance at the subtree given by depth.
- *
- * This routine is used by both insertion and deletion. The return value
- * indicates:
- * 0 : subtree did not change height
- * !0 : subtree was reduced in height
- *
- * The code is written as if handling left rotations, right rotations are
- * symmetric and handled by swapping values of variables right/left[_heavy]
- *
- * On input balance is the "new" balance at "node". This value is either
- * -2 or +2.
- */
-static int
-avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
-{
- int left = !(balance < 0); /* when balance = -2, left will be 0 */
- int right = 1 - left;
- int left_heavy = balance >> 1;
- int right_heavy = -left_heavy;
- avl_node_t *parent = AVL_XPARENT(node);
- avl_node_t *child = node->avl_child[left];
- avl_node_t *cright;
- avl_node_t *gchild;
- avl_node_t *gright;
- avl_node_t *gleft;
- int which_child = AVL_XCHILD(node);
- int child_bal = AVL_XBALANCE(child);
-
- /* BEGIN CSTYLED */
- /*
- * case 1 : node is overly left heavy, the left child is balanced or
- * also left heavy. This requires the following rotation.
- *
- * (node bal:-2)
- * / \
- * / \
- * (child bal:0 or -1)
- * / \
- * / \
- * cright
- *
- * becomes:
- *
- * (child bal:1 or 0)
- * / \
- * / \
- * (node bal:-1 or 0)
- * / \
- * / \
- * cright
- *
- * we detect this situation by noting that child's balance is not
- * right_heavy.
- */
- /* END CSTYLED */
- if (child_bal != right_heavy) {
-
- /*
- * compute new balance of nodes
- *
- * If child used to be left heavy (now balanced) we reduced
- * the height of this sub-tree -- used in "return...;" below
- */
- child_bal += right_heavy; /* adjust towards right */
-
- /*
- * move "cright" to be node's left child
- */
- cright = child->avl_child[right];
- node->avl_child[left] = cright;
- if (cright != NULL) {
- AVL_SETPARENT(cright, node);
- AVL_SETCHILD(cright, left);
- }
-
- /*
- * move node to be child's right child
- */
- child->avl_child[right] = node;
- AVL_SETBALANCE(node, -child_bal);
- AVL_SETCHILD(node, right);
- AVL_SETPARENT(node, child);
-
- /*
- * update the pointer into this subtree
- */
- AVL_SETBALANCE(child, child_bal);
- AVL_SETCHILD(child, which_child);
- AVL_SETPARENT(child, parent);
- if (parent != NULL)
- parent->avl_child[which_child] = child;
- else
- tree->avl_root = child;
-
- return (child_bal == 0);
- }
-
- /* BEGIN CSTYLED */
- /*
- * case 2 : When node is left heavy, but child is right heavy we use
- * a different rotation.
- *
- * (node b:-2)
- * / \
- * / \
- * / \
- * (child b:+1)
- * / \
- * / \
- * (gchild b: != 0)
- * / \
- * / \
- * gleft gright
- *
- * becomes:
- *
- * (gchild b:0)
- * / \
- * / \
- * / \
- * (child b:?) (node b:?)
- * / \ / \
- * / \ / \
- * gleft gright
- *
- * computing the new balances is more complicated. As an example:
- * if gchild was right_heavy, then child is now left heavy
- * else it is balanced
- */
- /* END CSTYLED */
- gchild = child->avl_child[right];
- gleft = gchild->avl_child[left];
- gright = gchild->avl_child[right];
-
- /*
- * move gright to left child of node and
- *
- * move gleft to right child of node
- */
- node->avl_child[left] = gright;
- if (gright != NULL) {
- AVL_SETPARENT(gright, node);
- AVL_SETCHILD(gright, left);
- }
-
- child->avl_child[right] = gleft;
- if (gleft != NULL) {
- AVL_SETPARENT(gleft, child);
- AVL_SETCHILD(gleft, right);
- }
-
- /*
- * move child to left child of gchild and
- *
- * move node to right child of gchild and
- *
- * fixup parent of all this to point to gchild
- */
- balance = AVL_XBALANCE(gchild);
- gchild->avl_child[left] = child;
- AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
- AVL_SETPARENT(child, gchild);
- AVL_SETCHILD(child, left);
-
- gchild->avl_child[right] = node;
- AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
- AVL_SETPARENT(node, gchild);
- AVL_SETCHILD(node, right);
-
- AVL_SETBALANCE(gchild, 0);
- AVL_SETPARENT(gchild, parent);
- AVL_SETCHILD(gchild, which_child);
- if (parent != NULL)
- parent->avl_child[which_child] = gchild;
- else
- tree->avl_root = gchild;
-
- return (1); /* the new tree is always shorter */
-}
-
-
-/*
- * Insert a new node into an AVL tree at the specified (from avl_find()) place.
- *
- * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
- * searches out to the leaf positions. The avl_index_t indicates the node
- * which will be the parent of the new node.
- *
- * After the node is inserted, a single rotation further up the tree may
- * be necessary to maintain an acceptable AVL balance.
- */
-void
-avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
-{
- avl_node_t *node;
- avl_node_t *parent = AVL_INDEX2NODE(where);
- int old_balance;
- int new_balance;
- int which_child = AVL_INDEX2CHILD(where);
- size_t off = tree->avl_offset;
-
- ASSERT(tree);
-#ifdef _LP64
- ASSERT(((uintptr_t)new_data & 0x7) == 0);
-#endif
-
- node = AVL_DATA2NODE(new_data, off);
-
- /*
- * First, add the node to the tree at the indicated position.
- */
- ++tree->avl_numnodes;
-
- node->avl_child[0] = NULL;
- node->avl_child[1] = NULL;
-
- AVL_SETCHILD(node, which_child);
- AVL_SETBALANCE(node, 0);
- AVL_SETPARENT(node, parent);
- if (parent != NULL) {
- ASSERT(parent->avl_child[which_child] == NULL);
- parent->avl_child[which_child] = node;
- } else {
- ASSERT(tree->avl_root == NULL);
- tree->avl_root = node;
- }
- /*
- * Now, back up the tree modifying the balance of all nodes above the
- * insertion point. If we get to a highly unbalanced ancestor, we
- * need to do a rotation. If we back out of the tree we are done.
- * If we brought any subtree into perfect balance (0), we are also done.
- */
- for (;;) {
- node = parent;
- if (node == NULL)
- return;
-
- /*
- * Compute the new balance
- */
- old_balance = AVL_XBALANCE(node);
- new_balance = old_balance + avl_child2balance[which_child];
-
- /*
- * If we introduced equal balance, then we are done immediately
- */
- if (new_balance == 0) {
- AVL_SETBALANCE(node, 0);
- return;
- }
-
- /*
- * If both old and new are not zero we went
- * from -1 to -2 balance, do a rotation.
- */
- if (old_balance != 0)
- break;
-
- AVL_SETBALANCE(node, new_balance);
- parent = AVL_XPARENT(node);
- which_child = AVL_XCHILD(node);
- }
-
- /*
- * perform a rotation to fix the tree and return
- */
- (void) avl_rotation(tree, node, new_balance);
-}
-
-/*
- * Insert "new_data" in "tree" in the given "direction" either after or
- * before (AVL_AFTER, AVL_BEFORE) the data "here".
- *
- * Insertions can only be done at empty leaf points in the tree, therefore
- * if the given child of the node is already present we move to either
- * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since
- * every other node in the tree is a leaf, this always works.
- *
- * To help developers using this interface, we assert that the new node
- * is correctly ordered at every step of the way in DEBUG kernels.
- */
-void
-avl_insert_here(
- avl_tree_t *tree,
- void *new_data,
- void *here,
- int direction)
-{
- avl_node_t *node;
- int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
-#ifdef DEBUG
- int diff;
-#endif
-
- ASSERT(tree != NULL);
- ASSERT(new_data != NULL);
- ASSERT(here != NULL);
- ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER);
-
- /*
- * If corresponding child of node is not NULL, go to the neighboring
- * node and reverse the insertion direction.
- */
- node = AVL_DATA2NODE(here, tree->avl_offset);
-
-#ifdef DEBUG
- diff = tree->avl_compar(new_data, here);
- ASSERT(-1 <= diff && diff <= 1);
- ASSERT(diff != 0);
- ASSERT(diff > 0 ? child == 1 : child == 0);
-#endif
-
- if (node->avl_child[child] != NULL) {
- node = node->avl_child[child];
- child = 1 - child;
- while (node->avl_child[child] != NULL) {
-#ifdef DEBUG
- diff = tree->avl_compar(new_data,
- AVL_NODE2DATA(node, tree->avl_offset));
- ASSERT(-1 <= diff && diff <= 1);
- ASSERT(diff != 0);
- ASSERT(diff > 0 ? child == 1 : child == 0);
-#endif
- node = node->avl_child[child];
- }
-#ifdef DEBUG
- diff = tree->avl_compar(new_data,
- AVL_NODE2DATA(node, tree->avl_offset));
- ASSERT(-1 <= diff && diff <= 1);
- ASSERT(diff != 0);
- ASSERT(diff > 0 ? child == 1 : child == 0);
-#endif
- }
- ASSERT(node->avl_child[child] == NULL);
-
- avl_insert(tree, new_data, AVL_MKINDEX(node, child));
-}
-
-/*
- * Add a new node to an AVL tree.
- */
-void
-avl_add(avl_tree_t *tree, void *new_node)
-{
- avl_index_t where;
-
- /*
- * This is unfortunate. We want to call panic() here, even for
- * non-DEBUG kernels. In userland, however, we can't depend on anything
- * in libc or else the rtld build process gets confused.
- * Thankfully, rtld provides us with its own assfail() so we can use
- * that here. We use assfail() directly to get a nice error message
- * in the core - much like what panic() does for crashdumps.
- */
- if (avl_find(tree, new_node, &where) != NULL)
-#ifdef _KERNEL
- panic("avl_find() succeeded inside avl_add()");
-#else
- (void) assfail("avl_find() succeeded inside avl_add()",
- __FILE__, __LINE__);
-#endif
- avl_insert(tree, new_node, where);
-}
-
-/*
- * Delete a node from the AVL tree. Deletion is similar to insertion, but
- * with 2 complications.
- *
- * First, we may be deleting an interior node. Consider the following subtree:
- *
- * d c c
- * / \ / \ / \
- * b e b e b e
- * / \ / \ /
- * a c a a
- *
- * When we are deleting node (d), we find and bring up an adjacent valued leaf
- * node, say (c), to take the interior node's place. In the code this is
- * handled by temporarily swapping (d) and (c) in the tree and then using
- * common code to delete (d) from the leaf position.
- *
- * Secondly, an interior deletion from a deep tree may require more than one
- * rotation to fix the balance. This is handled by moving up the tree through
- * parents and applying rotations as needed. The return value from
- * avl_rotation() is used to detect when a subtree did not change overall
- * height due to a rotation.
- */
-void
-avl_remove(avl_tree_t *tree, void *data)
-{
- avl_node_t *delete;
- avl_node_t *parent;
- avl_node_t *node;
- avl_node_t tmp;
- int old_balance;
- int new_balance;
- int left;
- int right;
- int which_child;
- size_t off = tree->avl_offset;
-
- ASSERT(tree);
-
- delete = AVL_DATA2NODE(data, off);
-
- /*
- * Deletion is easiest with a node that has at most 1 child.
- * We swap a node with 2 children with a sequentially valued
- * neighbor node. That node will have at most 1 child. Note this
- * has no effect on the ordering of the remaining nodes.
- *
- * As an optimization, we choose the greater neighbor if the tree
- * is right heavy, otherwise the left neighbor. This reduces the
- * number of rotations needed.
- */
- if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
-
- /*
- * choose node to swap from whichever side is taller
- */
- old_balance = AVL_XBALANCE(delete);
- left = avl_balance2child[old_balance + 1];
- right = 1 - left;
-
- /*
- * get to the previous value'd node
- * (down 1 left, as far as possible right)
- */
- for (node = delete->avl_child[left];
- node->avl_child[right] != NULL;
- node = node->avl_child[right])
- ;
-
- /*
- * create a temp placeholder for 'node'
- * move 'node' to delete's spot in the tree
- */
- tmp = *node;
-
- *node = *delete;
- if (node->avl_child[left] == node)
- node->avl_child[left] = &tmp;
-
- parent = AVL_XPARENT(node);
- if (parent != NULL)
- parent->avl_child[AVL_XCHILD(node)] = node;
- else
- tree->avl_root = node;
- AVL_SETPARENT(node->avl_child[left], node);
- AVL_SETPARENT(node->avl_child[right], node);
-
- /*
- * Put tmp where node used to be (just temporary).
- * It always has a parent and at most 1 child.
- */
- delete = &tmp;
- parent = AVL_XPARENT(delete);
- parent->avl_child[AVL_XCHILD(delete)] = delete;
- which_child = (delete->avl_child[1] != 0);
- if (delete->avl_child[which_child] != NULL)
- AVL_SETPARENT(delete->avl_child[which_child], delete);
- }
-
-
- /*
- * Here we know "delete" is at least partially a leaf node. It can
- * be easily removed from the tree.
- */
- ASSERT(tree->avl_numnodes > 0);
- --tree->avl_numnodes;
- parent = AVL_XPARENT(delete);
- which_child = AVL_XCHILD(delete);
- if (delete->avl_child[0] != NULL)
- node = delete->avl_child[0];
- else
- node = delete->avl_child[1];
-
- /*
- * Connect parent directly to node (leaving out delete).
- */
- if (node != NULL) {
- AVL_SETPARENT(node, parent);
- AVL_SETCHILD(node, which_child);
- }
- if (parent == NULL) {
- tree->avl_root = node;
- return;
- }
- parent->avl_child[which_child] = node;
-
-
- /*
- * Since the subtree is now shorter, begin adjusting parent balances
- * and performing any needed rotations.
- */
- do {
-
- /*
- * Move up the tree and adjust the balance
- *
- * Capture the parent and which_child values for the next
- * iteration before any rotations occur.
- */
- node = parent;
- old_balance = AVL_XBALANCE(node);
- new_balance = old_balance - avl_child2balance[which_child];
- parent = AVL_XPARENT(node);
- which_child = AVL_XCHILD(node);
-
- /*
- * If a node was in perfect balance but isn't anymore then
- * we can stop, since the height didn't change above this point
- * due to a deletion.
- */
- if (old_balance == 0) {
- AVL_SETBALANCE(node, new_balance);
- break;
- }
-
- /*
- * If the new balance is zero, we don't need to rotate
- * else
- * need a rotation to fix the balance.
- * If the rotation doesn't change the height
- * of the sub-tree we have finished adjusting.
- */
- if (new_balance == 0)
- AVL_SETBALANCE(node, new_balance);
- else if (!avl_rotation(tree, node, new_balance))
- break;
- } while (parent != NULL);
-}
-
-#define AVL_REINSERT(tree, obj) \
- avl_remove((tree), (obj)); \
- avl_add((tree), (obj))
-
-boolean_t
-avl_update_lt(avl_tree_t *t, void *obj)
-{
- void *neighbor;
-
- ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) ||
- (t->avl_compar(obj, neighbor) <= 0));
-
- neighbor = AVL_PREV(t, obj);
- if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
- AVL_REINSERT(t, obj);
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-boolean_t
-avl_update_gt(avl_tree_t *t, void *obj)
-{
- void *neighbor;
-
- ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) ||
- (t->avl_compar(obj, neighbor) >= 0));
-
- neighbor = AVL_NEXT(t, obj);
- if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
- AVL_REINSERT(t, obj);
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-boolean_t
-avl_update(avl_tree_t *t, void *obj)
-{
- void *neighbor;
-
- neighbor = AVL_PREV(t, obj);
- if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
- AVL_REINSERT(t, obj);
- return (B_TRUE);
- }
-
- neighbor = AVL_NEXT(t, obj);
- if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
- AVL_REINSERT(t, obj);
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-void
-avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
-{
- avl_node_t *temp_node;
- ulong_t temp_numnodes;
-
- ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
- ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
- ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
-
- temp_node = tree1->avl_root;
- temp_numnodes = tree1->avl_numnodes;
- tree1->avl_root = tree2->avl_root;
- tree1->avl_numnodes = tree2->avl_numnodes;
- tree2->avl_root = temp_node;
- tree2->avl_numnodes = temp_numnodes;
-}
-
-/*
- * initialize a new AVL tree
- */
-void
-avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
- size_t size, size_t offset)
-{
- ASSERT(tree);
- ASSERT(compar);
- ASSERT(size > 0);
- ASSERT(size >= offset + sizeof (avl_node_t));
-#ifdef _LP64
- ASSERT((offset & 0x7) == 0);
-#endif
-
- tree->avl_compar = compar;
- tree->avl_root = NULL;
- tree->avl_numnodes = 0;
- tree->avl_size = size;
- tree->avl_offset = offset;
-}
-
-/*
- * Delete a tree.
- */
-/* ARGSUSED */
-void
-avl_destroy(avl_tree_t *tree)
-{
- ASSERT(tree);
- ASSERT(tree->avl_numnodes == 0);
- ASSERT(tree->avl_root == NULL);
-}
-
-
-/*
- * Return the number of nodes in an AVL tree.
- */
-ulong_t
-avl_numnodes(avl_tree_t *tree)
-{
- ASSERT(tree);
- return (tree->avl_numnodes);
-}
-
-boolean_t
-avl_is_empty(avl_tree_t *tree)
-{
- ASSERT(tree);
- return (tree->avl_numnodes == 0);
-}
-
-#define CHILDBIT (1L)
-
-/*
- * Post-order tree walk used to visit all tree nodes and destroy the tree
- * in post order. This is used for destroying a tree without paying any cost
- * for rebalancing it.
- *
- * example:
- *
- * void *cookie = NULL;
- * my_data_t *node;
- *
- * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
- * free(node);
- * avl_destroy(tree);
- *
- * The cookie is really an avl_node_t to the current node's parent and
- * an indication of which child you looked at last.
- *
- * On input, a cookie value of CHILDBIT indicates the tree is done.
- */
-void *
-avl_destroy_nodes(avl_tree_t *tree, void **cookie)
-{
- avl_node_t *node;
- avl_node_t *parent;
- int child;
- void *first;
- size_t off = tree->avl_offset;
-
- /*
- * Initial calls go to the first node or it's right descendant.
- */
- if (*cookie == NULL) {
- first = avl_first(tree);
-
- /*
- * deal with an empty tree
- */
- if (first == NULL) {
- *cookie = (void *)CHILDBIT;
- return (NULL);
- }
-
- node = AVL_DATA2NODE(first, off);
- parent = AVL_XPARENT(node);
- goto check_right_side;
- }
-
- /*
- * If there is no parent to return to we are done.
- */
- parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
- if (parent == NULL) {
- if (tree->avl_root != NULL) {
- ASSERT(tree->avl_numnodes == 1);
- tree->avl_root = NULL;
- tree->avl_numnodes = 0;
- }
- return (NULL);
- }
-
- /*
- * Remove the child pointer we just visited from the parent and tree.
- */
- child = (uintptr_t)(*cookie) & CHILDBIT;
- parent->avl_child[child] = NULL;
- ASSERT(tree->avl_numnodes > 1);
- --tree->avl_numnodes;
-
- /*
- * If we just did a right child or there isn't one, go up to parent.
- */
- if (child == 1 || parent->avl_child[1] == NULL) {
- node = parent;
- parent = AVL_XPARENT(parent);
- goto done;
- }
-
- /*
- * Do parent's right child, then leftmost descendent.
- */
- node = parent->avl_child[1];
- while (node->avl_child[0] != NULL) {
- parent = node;
- node = node->avl_child[0];
- }
-
- /*
- * If here, we moved to a left child. It may have one
- * child on the right (when balance == +1).
- */
-check_right_side:
- if (node->avl_child[1] != NULL) {
- ASSERT(AVL_XBALANCE(node) == 1);
- parent = node;
- node = node->avl_child[1];
- ASSERT(node->avl_child[0] == NULL &&
- node->avl_child[1] == NULL);
- } else {
- ASSERT(AVL_XBALANCE(node) <= 0);
- }
-
-done:
- if (parent == NULL) {
- *cookie = (void *)CHILDBIT;
- ASSERT(node == tree->avl_root);
- } else {
- *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
- }
-
- return (AVL_NODE2DATA(node, off));
-}
Index: head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c
+++ head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c
@@ -1,512 +0,0 @@
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#include <sys/nvpair.h>
-#ifndef _KERNEL
-#include <sys/zfs_context.h>
-#else
-#include <sys/debug.h>
-#include <sys/kmem.h>
-#include <sys/param.h>
-#include <sys/debug.h>
-#endif
-
-/*
- * "Force" nvlist wrapper.
- *
- * These functions wrap the nvlist_* functions with assertions that assume
- * the operation is successful. This allows the caller's code to be much
- * more readable, especially for the fnvlist_lookup_* and fnvpair_value_*
- * functions, which can return the requested value (rather than filling in
- * a pointer).
- *
- * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate
- * with KM_SLEEP.
- *
- * More wrappers should be added as needed -- for example
- * nvlist_lookup_*_array and nvpair_value_*_array.
- */
-
-nvlist_t *
-fnvlist_alloc(void)
-{
- nvlist_t *nvl;
- VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP));
- return (nvl);
-}
-
-void
-fnvlist_free(nvlist_t *nvl)
-{
- nvlist_free(nvl);
-}
-
-size_t
-fnvlist_size(nvlist_t *nvl)
-{
- size_t size;
- VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE));
- return (size);
-}
-
-/*
- * Returns allocated buffer of size *sizep. Caller must free the buffer with
- * fnvlist_pack_free().
- */
-char *
-fnvlist_pack(nvlist_t *nvl, size_t *sizep)
-{
- char *packed = 0;
- VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE,
- KM_SLEEP), ==, 0);
- return (packed);
-}
-
-/*ARGSUSED*/
-void
-fnvlist_pack_free(char *pack, size_t size)
-{
-#ifdef _KERNEL
- kmem_free(pack, size);
-#else
- free(pack);
-#endif
-}
-
-nvlist_t *
-fnvlist_unpack(char *buf, size_t buflen)
-{
- nvlist_t *rv;
- VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP));
- return (rv);
-}
-
-nvlist_t *
-fnvlist_dup(nvlist_t *nvl)
-{
- nvlist_t *rv;
- VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP));
- return (rv);
-}
-
-void
-fnvlist_merge(nvlist_t *dst, nvlist_t *src)
-{
- VERIFY0(nvlist_merge(dst, src, KM_SLEEP));
-}
-
-size_t
-fnvlist_num_pairs(nvlist_t *nvl)
-{
- size_t count = 0;
- nvpair_t *pair;
-
- for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL;
- pair = nvlist_next_nvpair(nvl, pair))
- count++;
- return (count);
-}
-
-void
-fnvlist_add_boolean(nvlist_t *nvl, const char *name)
-{
- VERIFY0(nvlist_add_boolean(nvl, name));
-}
-
-void
-fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
-{
- VERIFY0(nvlist_add_boolean_value(nvl, name, val));
-}
-
-void
-fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
-{
- VERIFY0(nvlist_add_byte(nvl, name, val));
-}
-
-void
-fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
-{
- VERIFY0(nvlist_add_int8(nvl, name, val));
-}
-
-void
-fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
-{
- VERIFY0(nvlist_add_uint8(nvl, name, val));
-}
-
-void
-fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
-{
- VERIFY0(nvlist_add_int16(nvl, name, val));
-}
-
-void
-fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
-{
- VERIFY0(nvlist_add_uint16(nvl, name, val));
-}
-
-void
-fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
-{
- VERIFY0(nvlist_add_int32(nvl, name, val));
-}
-
-void
-fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
-{
- VERIFY0(nvlist_add_uint32(nvl, name, val));
-}
-
-void
-fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
-{
- VERIFY0(nvlist_add_int64(nvl, name, val));
-}
-
-void
-fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
-{
- VERIFY0(nvlist_add_uint64(nvl, name, val));
-}
-
-void
-fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
-{
- VERIFY0(nvlist_add_string(nvl, name, val));
-}
-
-void
-fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
-{
- VERIFY0(nvlist_add_nvlist(nvl, name, val));
-}
-
-void
-fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair)
-{
- VERIFY0(nvlist_add_nvpair(nvl, pair));
-}
-
-void
-fnvlist_add_boolean_array(nvlist_t *nvl, const char *name,
- boolean_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_boolean_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_byte_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_int8_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_uint8_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_int16_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_uint16_array(nvlist_t *nvl, const char *name,
- uint16_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_uint16_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_int32_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_uint32_array(nvlist_t *nvl, const char *name,
- uint32_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_uint32_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_int64_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_uint64_array(nvlist_t *nvl, const char *name,
- uint64_t *val, uint_t n)
-{
- VERIFY0(nvlist_add_uint64_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_string_array(nvlist_t *nvl, const char *name,
- char * const *val, uint_t n)
-{
- VERIFY0(nvlist_add_string_array(nvl, name, val, n));
-}
-
-void
-fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name,
- nvlist_t **val, uint_t n)
-{
- VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n));
-}
-
-void
-fnvlist_remove(nvlist_t *nvl, const char *name)
-{
- VERIFY0(nvlist_remove_all(nvl, name));
-}
-
-void
-fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair)
-{
- VERIFY0(nvlist_remove_nvpair(nvl, pair));
-}
-
-nvpair_t *
-fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name)
-{
- nvpair_t *rv;
- VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv));
- return (rv);
-}
-
-/* returns B_TRUE if the entry exists */
-boolean_t
-fnvlist_lookup_boolean(nvlist_t *nvl, const char *name)
-{
- return (nvlist_lookup_boolean(nvl, name) == 0);
-}
-
-boolean_t
-fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name)
-{
- boolean_t rv;
- VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv));
- return (rv);
-}
-
-uchar_t
-fnvlist_lookup_byte(nvlist_t *nvl, const char *name)
-{
- uchar_t rv;
- VERIFY0(nvlist_lookup_byte(nvl, name, &rv));
- return (rv);
-}
-
-int8_t
-fnvlist_lookup_int8(nvlist_t *nvl, const char *name)
-{
- int8_t rv;
- VERIFY0(nvlist_lookup_int8(nvl, name, &rv));
- return (rv);
-}
-
-int16_t
-fnvlist_lookup_int16(nvlist_t *nvl, const char *name)
-{
- int16_t rv;
- VERIFY0(nvlist_lookup_int16(nvl, name, &rv));
- return (rv);
-}
-
-int32_t
-fnvlist_lookup_int32(nvlist_t *nvl, const char *name)
-{
- int32_t rv;
- VERIFY0(nvlist_lookup_int32(nvl, name, &rv));
- return (rv);
-}
-
-int64_t
-fnvlist_lookup_int64(nvlist_t *nvl, const char *name)
-{
- int64_t rv;
- VERIFY0(nvlist_lookup_int64(nvl, name, &rv));
- return (rv);
-}
-
-uint8_t
-fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name)
-{
- uint8_t rv;
- VERIFY0(nvlist_lookup_uint8(nvl, name, &rv));
- return (rv);
-}
-
-uint16_t
-fnvlist_lookup_uint16(nvlist_t *nvl, const char *name)
-{
- uint16_t rv;
- VERIFY0(nvlist_lookup_uint16(nvl, name, &rv));
- return (rv);
-}
-
-uint32_t
-fnvlist_lookup_uint32(nvlist_t *nvl, const char *name)
-{
- uint32_t rv;
- VERIFY0(nvlist_lookup_uint32(nvl, name, &rv));
- return (rv);
-}
-
-uint64_t
-fnvlist_lookup_uint64(nvlist_t *nvl, const char *name)
-{
- uint64_t rv;
- VERIFY0(nvlist_lookup_uint64(nvl, name, &rv));
- return (rv);
-}
-
-char *
-fnvlist_lookup_string(nvlist_t *nvl, const char *name)
-{
- char *rv;
- VERIFY0(nvlist_lookup_string(nvl, name, &rv));
- return (rv);
-}
-
-nvlist_t *
-fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name)
-{
- nvlist_t *rv;
- VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv));
- return (rv);
-}
-
-boolean_t
-fnvpair_value_boolean_value(nvpair_t *nvp)
-{
- boolean_t rv;
- VERIFY0(nvpair_value_boolean_value(nvp, &rv));
- return (rv);
-}
-
-uchar_t
-fnvpair_value_byte(nvpair_t *nvp)
-{
- uchar_t rv;
- VERIFY0(nvpair_value_byte(nvp, &rv));
- return (rv);
-}
-
-int8_t
-fnvpair_value_int8(nvpair_t *nvp)
-{
- int8_t rv;
- VERIFY0(nvpair_value_int8(nvp, &rv));
- return (rv);
-}
-
-int16_t
-fnvpair_value_int16(nvpair_t *nvp)
-{
- int16_t rv;
- VERIFY0(nvpair_value_int16(nvp, &rv));
- return (rv);
-}
-
-int32_t
-fnvpair_value_int32(nvpair_t *nvp)
-{
- int32_t rv;
- VERIFY0(nvpair_value_int32(nvp, &rv));
- return (rv);
-}
-
-int64_t
-fnvpair_value_int64(nvpair_t *nvp)
-{
- int64_t rv;
- VERIFY0(nvpair_value_int64(nvp, &rv));
- return (rv);
-}
-
-uint8_t
-fnvpair_value_uint8_t(nvpair_t *nvp)
-{
- uint8_t rv;
- VERIFY0(nvpair_value_uint8(nvp, &rv));
- return (rv);
-}
-
-uint16_t
-fnvpair_value_uint16(nvpair_t *nvp)
-{
- uint16_t rv;
- VERIFY0(nvpair_value_uint16(nvp, &rv));
- return (rv);
-}
-
-uint32_t
-fnvpair_value_uint32(nvpair_t *nvp)
-{
- uint32_t rv;
- VERIFY0(nvpair_value_uint32(nvp, &rv));
- return (rv);
-}
-
-uint64_t
-fnvpair_value_uint64(nvpair_t *nvp)
-{
- uint64_t rv;
- VERIFY0(nvpair_value_uint64(nvp, &rv));
- return (rv);
-}
-
-char *
-fnvpair_value_string(nvpair_t *nvp)
-{
- char *rv;
- VERIFY0(nvpair_value_string(nvp, &rv));
- return (rv);
-}
-
-nvlist_t *
-fnvpair_value_nvlist(nvpair_t *nvp)
-{
- nvlist_t *rv;
- VERIFY0(nvpair_value_nvlist(nvp, &rv));
- return (rv);
-}
Index: head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
+++ head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
@@ -1,3600 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/debug.h>
-#include <sys/nvpair.h>
-#include <sys/nvpair_impl.h>
-#include <rpc/types.h>
-#include <rpc/xdr.h>
-
-#if defined(_KERNEL) && !defined(_BOOT)
-#include <sys/varargs.h>
-#include <sys/sunddi.h>
-#else
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#endif
-
-#ifndef offsetof
-#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
-#endif
-#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++
-
-#if defined(__FreeBSD__) && !defined(_KERNEL)
-/*
- * libnvpair is the lowest commen denominator for ZFS related libraries,
- * defining aok here makes it usable by all ZFS related libraries
- */
-int aok;
-#endif
-
-/*
- * nvpair.c - Provides kernel & userland interfaces for manipulating
- * name-value pairs.
- *
- * Overview Diagram
- *
- * +--------------+
- * | nvlist_t |
- * |--------------|
- * | nvl_version |
- * | nvl_nvflag |
- * | nvl_priv -+-+
- * | nvl_flag | |
- * | nvl_pad | |
- * +--------------+ |
- * V
- * +--------------+ last i_nvp in list
- * | nvpriv_t | +--------------------->
- * |--------------| |
- * +--+- nvp_list | | +------------+
- * | | nvp_last -+--+ + nv_alloc_t |
- * | | nvp_curr | |------------|
- * | | nvp_nva -+----> | nva_ops |
- * | | nvp_stat | | nva_arg |
- * | +--------------+ +------------+
- * |
- * +-------+
- * V
- * +---------------------+ +-------------------+
- * | i_nvp_t | +-->| i_nvp_t | +-->
- * |---------------------| | |-------------------| |
- * | nvi_next -+--+ | nvi_next -+--+
- * | nvi_prev (NULL) | <----+ nvi_prev |
- * | . . . . . . . . . . | | . . . . . . . . . |
- * | nvp (nvpair_t) | | nvp (nvpair_t) |
- * | - nvp_size | | - nvp_size |
- * | - nvp_name_sz | | - nvp_name_sz |
- * | - nvp_value_elem | | - nvp_value_elem |
- * | - nvp_type | | - nvp_type |
- * | - data ... | | - data ... |
- * +---------------------+ +-------------------+
- *
- *
- *
- * +---------------------+ +---------------------+
- * | i_nvp_t | +--> +-->| i_nvp_t (last) |
- * |---------------------| | | |---------------------|
- * | nvi_next -+--+ ... --+ | nvi_next (NULL) |
- * <-+- nvi_prev |<-- ... <----+ nvi_prev |
- * | . . . . . . . . . | | . . . . . . . . . |
- * | nvp (nvpair_t) | | nvp (nvpair_t) |
- * | - nvp_size | | - nvp_size |
- * | - nvp_name_sz | | - nvp_name_sz |
- * | - nvp_value_elem | | - nvp_value_elem |
- * | - DATA_TYPE_NVLIST | | - nvp_type |
- * | - data (embedded) | | - data ... |
- * | nvlist name | +---------------------+
- * | +--------------+ |
- * | | nvlist_t | |
- * | |--------------| |
- * | | nvl_version | |
- * | | nvl_nvflag | |
- * | | nvl_priv --+---+---->
- * | | nvl_flag | |
- * | | nvl_pad | |
- * | +--------------+ |
- * +---------------------+
- *
- *
- * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
- * allow value to be aligned on 8 byte boundary
- *
- * name_len is the length of the name string including the null terminator
- * so it must be >= 1
- */
-#define NVP_SIZE_CALC(name_len, data_len) \
- (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
-
-static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
-static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
- uint_t nelem, const void *data);
-
-#define NV_STAT_EMBEDDED 0x1
-#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp))
-#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp))
-
-#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))
-#define NVPAIR2I_NVP(nvp) \
- ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
-
-#ifdef _KERNEL
-int nvpair_max_recursion = 20;
-#else
-int nvpair_max_recursion = 100;
-#endif
-
-uint64_t nvlist_hashtable_init_size = (1 << 4);
-
-int
-nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
-{
- va_list valist;
- int err = 0;
-
- nva->nva_ops = nvo;
- nva->nva_arg = NULL;
-
- va_start(valist, nvo);
- if (nva->nva_ops->nv_ao_init != NULL)
- err = nva->nva_ops->nv_ao_init(nva, valist);
- va_end(valist);
-
- return (err);
-}
-
-void
-nv_alloc_reset(nv_alloc_t *nva)
-{
- if (nva->nva_ops->nv_ao_reset != NULL)
- nva->nva_ops->nv_ao_reset(nva);
-}
-
-void
-nv_alloc_fini(nv_alloc_t *nva)
-{
- if (nva->nva_ops->nv_ao_fini != NULL)
- nva->nva_ops->nv_ao_fini(nva);
-}
-
-nv_alloc_t *
-nvlist_lookup_nv_alloc(nvlist_t *nvl)
-{
- nvpriv_t *priv;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (NULL);
-
- return (priv->nvp_nva);
-}
-
-static void *
-nv_mem_zalloc(nvpriv_t *nvp, size_t size)
-{
- nv_alloc_t *nva = nvp->nvp_nva;
- void *buf;
-
- if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
- bzero(buf, size);
-
- return (buf);
-}
-
-static void
-nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
-{
- nv_alloc_t *nva = nvp->nvp_nva;
-
- nva->nva_ops->nv_ao_free(nva, buf, size);
-}
-
-static void
-nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
-{
- bzero(priv, sizeof (nvpriv_t));
-
- priv->nvp_nva = nva;
- priv->nvp_stat = stat;
-}
-
-static nvpriv_t *
-nv_priv_alloc(nv_alloc_t *nva)
-{
- nvpriv_t *priv;
-
- /*
- * nv_mem_alloc() cannot called here because it needs the priv
- * argument.
- */
- if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
- return (NULL);
-
- nv_priv_init(priv, nva, 0);
-
- return (priv);
-}
-
-/*
- * Embedded lists need their own nvpriv_t's. We create a new
- * nvpriv_t using the parameters and allocator from the parent
- * list's nvpriv_t.
- */
-static nvpriv_t *
-nv_priv_alloc_embedded(nvpriv_t *priv)
-{
- nvpriv_t *emb_priv;
-
- if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
- return (NULL);
-
- nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);
-
- return (emb_priv);
-}
-
-static int
-nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets)
-{
- ASSERT3P(priv->nvp_hashtable, ==, NULL);
- ASSERT0(priv->nvp_nbuckets);
- ASSERT0(priv->nvp_nentries);
-
- i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *));
- if (tab == NULL)
- return (ENOMEM);
-
- priv->nvp_hashtable = tab;
- priv->nvp_nbuckets = buckets;
- return (0);
-}
-
-static void
-nvt_tab_free(nvpriv_t *priv)
-{
- i_nvp_t **tab = priv->nvp_hashtable;
- if (tab == NULL) {
- ASSERT0(priv->nvp_nbuckets);
- ASSERT0(priv->nvp_nentries);
- return;
- }
-
- nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *));
-
- priv->nvp_hashtable = NULL;
- priv->nvp_nbuckets = 0;
- priv->nvp_nentries = 0;
-}
-
-static uint32_t
-nvt_hash(const char *p)
-{
- uint32_t g, hval = 0;
-
- while (*p) {
- hval = (hval << 4) + *p++;
- if ((g = (hval & 0xf0000000)) != 0)
- hval ^= g >> 24;
- hval &= ~g;
- }
- return (hval);
-}
-
-static boolean_t
-nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag)
-{
- boolean_t match = B_FALSE;
- if (nvflag & NV_UNIQUE_NAME_TYPE) {
- if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 &&
- NVP_TYPE(nvp1) == NVP_TYPE(nvp2))
- match = B_TRUE;
- } else {
- ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME);
- if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0)
- match = B_TRUE;
- }
- return (match);
-}
-
-static nvpair_t *
-nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- ASSERT(priv != NULL);
-
- i_nvp_t **tab = priv->nvp_hashtable;
-
- if (tab == NULL) {
- ASSERT3P(priv->nvp_list, ==, NULL);
- ASSERT0(priv->nvp_nbuckets);
- ASSERT0(priv->nvp_nentries);
- return (NULL);
- } else {
- ASSERT(priv->nvp_nbuckets != 0);
- }
-
- uint64_t hash = nvt_hash(name);
- uint64_t index = hash & (priv->nvp_nbuckets - 1);
-
- ASSERT3U(index, <, priv->nvp_nbuckets);
- i_nvp_t *entry = tab[index];
-
- for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) {
- if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 &&
- (type == DATA_TYPE_DONTCARE ||
- NVP_TYPE(&e->nvi_nvp) == type))
- return (&e->nvi_nvp);
- }
- return (NULL);
-}
-
-static nvpair_t *
-nvt_lookup_name(nvlist_t *nvl, const char *name)
-{
- return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE));
-}
-
-static int
-nvt_resize(nvpriv_t *priv, uint32_t new_size)
-{
- i_nvp_t **tab = priv->nvp_hashtable;
-
- /*
- * Migrate all the entries from the current table
- * to a newly-allocated table with the new size by
- * re-adjusting the pointers of their entries.
- */
- uint32_t size = priv->nvp_nbuckets;
- uint32_t new_mask = new_size - 1;
- ASSERT(((new_size) & ((new_size) - 1)) == 0);
-
- i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *));
- if (new_tab == NULL)
- return (ENOMEM);
-
- uint32_t nentries = 0;
- for (uint32_t i = 0; i < size; i++) {
- i_nvp_t *next, *e = tab[i];
-
- while (e != NULL) {
- next = e->nvi_hashtable_next;
-
- uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp));
- uint32_t index = hash & new_mask;
-
- e->nvi_hashtable_next = new_tab[index];
- new_tab[index] = e;
- nentries++;
-
- e = next;
- }
- tab[i] = NULL;
- }
- ASSERT3U(nentries, ==, priv->nvp_nentries);
-
- nvt_tab_free(priv);
-
- priv->nvp_hashtable = new_tab;
- priv->nvp_nbuckets = new_size;
- priv->nvp_nentries = nentries;
-
- return (0);
-}
-
-static boolean_t
-nvt_needs_togrow(nvpriv_t *priv)
-{
- /*
- * Grow only when we have more elements than buckets
- * and the # of buckets doesn't overflow.
- */
- return (priv->nvp_nentries > priv->nvp_nbuckets &&
- (UINT32_MAX >> 1) >= priv->nvp_nbuckets);
-}
-
-/*
- * Allocate a new table that's twice the size of the old one,
- * and migrate all the entries from the old one to the new
- * one by re-adjusting their pointers.
- */
-static int
-nvt_grow(nvpriv_t *priv)
-{
- uint32_t current_size = priv->nvp_nbuckets;
- /* ensure we won't overflow */
- ASSERT3U(UINT32_MAX >> 1, >=, current_size);
- return (nvt_resize(priv, current_size << 1));
-}
-
-static boolean_t
-nvt_needs_toshrink(nvpriv_t *priv)
-{
- /*
- * Shrink only when the # of elements is less than or
- * equal to 1/4 the # of buckets. Never shrink less than
- * nvlist_hashtable_init_size.
- */
- ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size);
- if (priv->nvp_nbuckets == nvlist_hashtable_init_size)
- return (B_FALSE);
- return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2));
-}
-
-/*
- * Allocate a new table that's half the size of the old one,
- * and migrate all the entries from the old one to the new
- * one by re-adjusting their pointers.
- */
-static int
-nvt_shrink(nvpriv_t *priv)
-{
- uint32_t current_size = priv->nvp_nbuckets;
- /* ensure we won't overflow */
- ASSERT3U(current_size, >=, nvlist_hashtable_init_size);
- return (nvt_resize(priv, current_size >> 1));
-}
-
-static int
-nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
-
- if (nvt_needs_toshrink(priv)) {
- int err = nvt_shrink(priv);
- if (err != 0)
- return (err);
- }
- i_nvp_t **tab = priv->nvp_hashtable;
-
- char *name = NVP_NAME(nvp);
- uint64_t hash = nvt_hash(name);
- uint64_t index = hash & (priv->nvp_nbuckets - 1);
-
- ASSERT3U(index, <, priv->nvp_nbuckets);
- i_nvp_t *bucket = tab[index];
-
- for (i_nvp_t *prev = NULL, *e = bucket;
- e != NULL; prev = e, e = e->nvi_hashtable_next) {
- if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) {
- if (prev != NULL) {
- prev->nvi_hashtable_next =
- e->nvi_hashtable_next;
- } else {
- ASSERT3P(e, ==, bucket);
- tab[index] = e->nvi_hashtable_next;
- }
- e->nvi_hashtable_next = NULL;
- priv->nvp_nentries--;
- break;
- }
- }
-
- return (0);
-}
-
-static int
-nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
-
- /* initialize nvpair table now if it doesn't exist. */
- if (priv->nvp_hashtable == NULL) {
- int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size);
- if (err != 0)
- return (err);
- }
-
- /*
- * if we don't allow duplicate entries, make sure to
- * unlink any existing entries from the table.
- */
- if (nvl->nvl_nvflag != 0) {
- int err = nvt_remove_nvpair(nvl, nvp);
- if (err != 0)
- return (err);
- }
-
- if (nvt_needs_togrow(priv)) {
- int err = nvt_grow(priv);
- if (err != 0)
- return (err);
- }
- i_nvp_t **tab = priv->nvp_hashtable;
-
- char *name = NVP_NAME(nvp);
- uint64_t hash = nvt_hash(name);
- uint64_t index = hash & (priv->nvp_nbuckets - 1);
-
- ASSERT3U(index, <, priv->nvp_nbuckets);
- i_nvp_t *bucket = tab[index];
-
- /* insert link at the beginning of the bucket */
- i_nvp_t *new_entry = NVPAIR2I_NVP(nvp);
- ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL);
- new_entry->nvi_hashtable_next = bucket;
- tab[index] = new_entry;
-
- priv->nvp_nentries++;
- return (0);
-}
-
-static void
-nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
-{
- nvl->nvl_version = NV_VERSION;
- nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
- nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
- nvl->nvl_flag = 0;
- nvl->nvl_pad = 0;
-}
-
-uint_t
-nvlist_nvflag(nvlist_t *nvl)
-{
- return (nvl->nvl_nvflag);
-}
-
-/*
- * nvlist_alloc - Allocate nvlist.
- */
-/*ARGSUSED1*/
-int
-nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xalloc(nvlp, nvflag,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
-{
- nvpriv_t *priv;
-
- if (nvlp == NULL || nva == NULL)
- return (EINVAL);
-
- if ((priv = nv_priv_alloc(nva)) == NULL)
- return (ENOMEM);
-
- if ((*nvlp = nv_mem_zalloc(priv,
- NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
- nv_mem_free(priv, priv, sizeof (nvpriv_t));
- return (ENOMEM);
- }
-
- nvlist_init(*nvlp, nvflag, priv);
-
- return (0);
-}
-
-/*
- * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair.
- */
-static nvpair_t *
-nvp_buf_alloc(nvlist_t *nvl, size_t len)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *buf;
- nvpair_t *nvp;
- size_t nvsize;
-
- /*
- * Allocate the buffer
- */
- nvsize = len + offsetof(i_nvp_t, nvi_nvp);
-
- if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
- return (NULL);
-
- nvp = &buf->nvi_nvp;
- nvp->nvp_size = len;
-
- return (nvp);
-}
-
-/*
- * nvp_buf_free - de-Allocate an i_nvp_t.
- */
-static void
-nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
-
- nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
-}
-
-/*
- * nvp_buf_link - link a new nv pair into the nvlist.
- */
-static void
-nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr = NVPAIR2I_NVP(nvp);
-
- /* Put element at end of nvlist */
- if (priv->nvp_list == NULL) {
- priv->nvp_list = priv->nvp_last = curr;
- } else {
- curr->nvi_prev = priv->nvp_last;
- priv->nvp_last->nvi_next = curr;
- priv->nvp_last = curr;
- }
-}
-
-/*
- * nvp_buf_unlink - unlink an removed nvpair out of the nvlist.
- */
-static void
-nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr = NVPAIR2I_NVP(nvp);
-
- /*
- * protect nvlist_next_nvpair() against walking on freed memory.
- */
- if (priv->nvp_curr == curr)
- priv->nvp_curr = curr->nvi_next;
-
- if (curr == priv->nvp_list)
- priv->nvp_list = curr->nvi_next;
- else
- curr->nvi_prev->nvi_next = curr->nvi_next;
-
- if (curr == priv->nvp_last)
- priv->nvp_last = curr->nvi_prev;
- else
- curr->nvi_next->nvi_prev = curr->nvi_prev;
-}
-
-/*
- * take a nvpair type and number of elements and make sure the are valid
- */
-static int
-i_validate_type_nelem(data_type_t type, uint_t nelem)
-{
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- if (nelem != 0)
- return (EINVAL);
- break;
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_STRING:
- case DATA_TYPE_HRTIME:
- case DATA_TYPE_NVLIST:
-#if !defined(_KERNEL)
- case DATA_TYPE_DOUBLE:
-#endif
- if (nelem != 1)
- return (EINVAL);
- break;
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_BYTE_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- case DATA_TYPE_STRING_ARRAY:
- case DATA_TYPE_NVLIST_ARRAY:
- /* we allow arrays with 0 elements */
- break;
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-/*
- * Verify nvp_name_sz and check the name string length.
- */
-static int
-i_validate_nvpair_name(nvpair_t *nvp)
-{
- if ((nvp->nvp_name_sz <= 0) ||
- (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
- return (EFAULT);
-
- /* verify the name string, make sure its terminated */
- if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
- return (EFAULT);
-
- return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
-}
-
-static int
-i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
-{
- switch (type) {
- case DATA_TYPE_BOOLEAN_VALUE:
- if (*(boolean_t *)data != B_TRUE &&
- *(boolean_t *)data != B_FALSE)
- return (EINVAL);
- break;
- case DATA_TYPE_BOOLEAN_ARRAY: {
- int i;
-
- for (i = 0; i < nelem; i++)
- if (((boolean_t *)data)[i] != B_TRUE &&
- ((boolean_t *)data)[i] != B_FALSE)
- return (EINVAL);
- break;
- }
- default:
- break;
- }
-
- return (0);
-}
-
-/*
- * This function takes a pointer to what should be a nvpair and it's size
- * and then verifies that all the nvpair fields make sense and can be
- * trusted. This function is used when decoding packed nvpairs.
- */
-static int
-i_validate_nvpair(nvpair_t *nvp)
-{
- data_type_t type = NVP_TYPE(nvp);
- int size1, size2;
-
- /* verify nvp_name_sz, check the name string length */
- if (i_validate_nvpair_name(nvp) != 0)
- return (EFAULT);
-
- if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
- return (EFAULT);
-
- /*
- * verify nvp_type, nvp_value_elem, and also possibly
- * verify string values and get the value size.
- */
- size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
- size1 = nvp->nvp_size - NVP_VALOFF(nvp);
- if (size2 < 0 || size1 != NV_ALIGN(size2))
- return (EFAULT);
-
- return (0);
-}
-
-static int
-nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
- return (EINVAL);
-
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
- nvpair_t *nvp = &curr->nvi_nvp;
- int err;
-
- if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp),
- NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
- return (err);
- }
-
- return (0);
-}
-
-/*
- * Frees all memory allocated for an nvpair (like embedded lists) with
- * the exception of the nvpair buffer itself.
- */
-static void
-nvpair_free(nvpair_t *nvp)
-{
- switch (NVP_TYPE(nvp)) {
- case DATA_TYPE_NVLIST:
- nvlist_free(EMBEDDED_NVL(nvp));
- break;
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
- int i;
-
- for (i = 0; i < NVP_NELEM(nvp); i++)
- nvlist_free(nvlp[i]);
- break;
- }
- default:
- break;
- }
-}
-
-/*
- * nvlist_free - free an unpacked nvlist
- */
-void
-nvlist_free(nvlist_t *nvl)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return;
-
- /*
- * Unpacked nvlist are linked through i_nvp_t
- */
- curr = priv->nvp_list;
- while (curr != NULL) {
- nvpair_t *nvp = &curr->nvi_nvp;
- curr = curr->nvi_next;
-
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- }
-
- if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
- nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
- else
- nvl->nvl_priv = 0;
-
- nvt_tab_free(priv);
- nv_mem_free(priv, priv, sizeof (nvpriv_t));
-}
-
-static int
-nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr;
-
- if (nvp == NULL)
- return (0);
-
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
- if (&curr->nvi_nvp == nvp)
- return (1);
-
- return (0);
-}
-
-/*
- * Make a copy of nvlist
- */
-/*ARGSUSED1*/
-int
-nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xdup(nvl, nvlp,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
-{
- int err;
- nvlist_t *ret;
-
- if (nvl == NULL || nvlp == NULL)
- return (EINVAL);
-
- if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
- return (err);
-
- if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
- nvlist_free(ret);
- else
- *nvlp = ret;
-
- return (err);
-}
-
-/*
- * Remove all with matching name
- */
-int
-nvlist_remove_all(nvlist_t *nvl, const char *name)
-{
- int error = ENOENT;
-
- if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
- return (EINVAL);
-
- nvpair_t *nvp;
- while ((nvp = nvt_lookup_name(nvl, name)) != NULL) {
- VERIFY0(nvlist_remove_nvpair(nvl, nvp));
- error = 0;
- }
-
- return (error);
-}
-
-/*
- * Remove first one with matching name and type
- */
-int
-nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
-{
- if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
- return (EINVAL);
-
- nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
- if (nvp == NULL)
- return (ENOENT);
-
- return (nvlist_remove_nvpair(nvl, nvp));
-}
-
-int
-nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- if (nvl == NULL || nvp == NULL)
- return (EINVAL);
-
- int err = nvt_remove_nvpair(nvl, nvp);
- if (err != 0)
- return (err);
-
- nvp_buf_unlink(nvl, nvp);
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- return (0);
-}
-
-/*
- * This function calculates the size of an nvpair value.
- *
- * The data argument controls the behavior in case of the data types
- * DATA_TYPE_STRING and
- * DATA_TYPE_STRING_ARRAY
- * Is data == NULL then the size of the string(s) is excluded.
- */
-static int
-i_get_value_size(data_type_t type, const void *data, uint_t nelem)
-{
- uint64_t value_sz;
-
- if (i_validate_type_nelem(type, nelem) != 0)
- return (-1);
-
- /* Calculate required size for holding value */
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- value_sz = 0;
- break;
- case DATA_TYPE_BOOLEAN_VALUE:
- value_sz = sizeof (boolean_t);
- break;
- case DATA_TYPE_BYTE:
- value_sz = sizeof (uchar_t);
- break;
- case DATA_TYPE_INT8:
- value_sz = sizeof (int8_t);
- break;
- case DATA_TYPE_UINT8:
- value_sz = sizeof (uint8_t);
- break;
- case DATA_TYPE_INT16:
- value_sz = sizeof (int16_t);
- break;
- case DATA_TYPE_UINT16:
- value_sz = sizeof (uint16_t);
- break;
- case DATA_TYPE_INT32:
- value_sz = sizeof (int32_t);
- break;
- case DATA_TYPE_UINT32:
- value_sz = sizeof (uint32_t);
- break;
- case DATA_TYPE_INT64:
- value_sz = sizeof (int64_t);
- break;
- case DATA_TYPE_UINT64:
- value_sz = sizeof (uint64_t);
- break;
-#if !defined(_KERNEL)
- case DATA_TYPE_DOUBLE:
- value_sz = sizeof (double);
- break;
-#endif
- case DATA_TYPE_STRING:
- if (data == NULL)
- value_sz = 0;
- else
- value_sz = strlen(data) + 1;
- break;
- case DATA_TYPE_BOOLEAN_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (boolean_t);
- break;
- case DATA_TYPE_BYTE_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uchar_t);
- break;
- case DATA_TYPE_INT8_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int8_t);
- break;
- case DATA_TYPE_UINT8_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint8_t);
- break;
- case DATA_TYPE_INT16_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int16_t);
- break;
- case DATA_TYPE_UINT16_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint16_t);
- break;
- case DATA_TYPE_INT32_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int32_t);
- break;
- case DATA_TYPE_UINT32_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint32_t);
- break;
- case DATA_TYPE_INT64_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int64_t);
- break;
- case DATA_TYPE_UINT64_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint64_t);
- break;
- case DATA_TYPE_STRING_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint64_t);
-
- if (data != NULL) {
- char *const *strs = data;
- uint_t i;
-
- /* no alignment requirement for strings */
- for (i = 0; i < nelem; i++) {
- if (strs[i] == NULL)
- return (-1);
- value_sz += strlen(strs[i]) + 1;
- }
- }
- break;
- case DATA_TYPE_HRTIME:
- value_sz = sizeof (hrtime_t);
- break;
- case DATA_TYPE_NVLIST:
- value_sz = NV_ALIGN(sizeof (nvlist_t));
- break;
- case DATA_TYPE_NVLIST_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint64_t) +
- (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
- break;
- default:
- return (-1);
- }
-
- return (value_sz > INT32_MAX ? -1 : (int)value_sz);
-}
-
-static int
-nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
-{
- nvpriv_t *priv;
- int err;
-
- if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
- nvl->nvl_priv)) == NULL)
- return (ENOMEM);
-
- nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);
-
- if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
- nvlist_free(emb_nvl);
- emb_nvl->nvl_priv = 0;
- }
-
- return (err);
-}
-
-/*
- * nvlist_add_common - Add new <name,value> pair to nvlist
- */
-static int
-nvlist_add_common(nvlist_t *nvl, const char *name,
- data_type_t type, uint_t nelem, const void *data)
-{
- nvpair_t *nvp;
- uint_t i;
-
- int nvp_sz, name_sz, value_sz;
- int err = 0;
-
- if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
- return (EINVAL);
-
- if (nelem != 0 && data == NULL)
- return (EINVAL);
-
- /*
- * Verify type and nelem and get the value size.
- * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
- * is the size of the string(s) included.
- */
- if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
- return (EINVAL);
-
- if (i_validate_nvpair_value(type, nelem, data) != 0)
- return (EINVAL);
-
- /*
- * If we're adding an nvlist or nvlist array, ensure that we are not
- * adding the input nvlist to itself, which would cause recursion,
- * and ensure that no NULL nvlist pointers are present.
- */
- switch (type) {
- case DATA_TYPE_NVLIST:
- if (data == nvl || data == NULL)
- return (EINVAL);
- break;
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **onvlp = (nvlist_t **)data;
- for (i = 0; i < nelem; i++) {
- if (onvlp[i] == nvl || onvlp[i] == NULL)
- return (EINVAL);
- }
- break;
- }
- default:
- break;
- }
-
- /* calculate sizes of the nvpair elements and the nvpair itself */
- name_sz = strlen(name) + 1;
- if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * 8 - 1))
- return (EINVAL);
-
- nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
-
- if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL)
- return (ENOMEM);
-
- ASSERT(nvp->nvp_size == nvp_sz);
- nvp->nvp_name_sz = name_sz;
- nvp->nvp_value_elem = nelem;
- nvp->nvp_type = type;
- bcopy(name, NVP_NAME(nvp), name_sz);
-
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- break;
- case DATA_TYPE_STRING_ARRAY: {
- char *const *strs = data;
- char *buf = NVP_VALUE(nvp);
- char **cstrs = (void *)buf;
-
- /* skip pre-allocated space for pointer array */
- buf += nelem * sizeof (uint64_t);
- for (i = 0; i < nelem; i++) {
- int slen = strlen(strs[i]) + 1;
- bcopy(strs[i], buf, slen);
- cstrs[i] = buf;
- buf += slen;
- }
- break;
- }
- case DATA_TYPE_NVLIST: {
- nvlist_t *nnvl = EMBEDDED_NVL(nvp);
- nvlist_t *onvl = (nvlist_t *)data;
-
- if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) {
- nvp_buf_free(nvl, nvp);
- return (err);
- }
- break;
- }
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **onvlp = (nvlist_t **)data;
- nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
- nvlist_t *embedded = (nvlist_t *)
- ((uintptr_t)nvlp + nelem * sizeof (uint64_t));
-
- for (i = 0; i < nelem; i++) {
- if ((err = nvlist_copy_embedded(nvl,
- onvlp[i], embedded)) != 0) {
- /*
- * Free any successfully created lists
- */
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- return (err);
- }
-
- nvlp[i] = embedded++;
- }
- break;
- }
- default:
- bcopy(data, NVP_VALUE(nvp), value_sz);
- }
-
- /* if unique name, remove before add */
- if (nvl->nvl_nvflag & NV_UNIQUE_NAME)
- (void) nvlist_remove_all(nvl, name);
- else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
- (void) nvlist_remove(nvl, name, type);
-
- err = nvt_add_nvpair(nvl, nvp);
- if (err != 0) {
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- return (err);
- }
- nvp_buf_link(nvl, nvp);
-
- return (0);
-}
-
-int
-nvlist_add_boolean(nvlist_t *nvl, const char *name)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL));
-}
-
-int
-nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val));
-}
-
-int
-nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val));
-}
-
-int
-nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val));
-}
-
-int
-nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val));
-}
-
-int
-nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val));
-}
-
-int
-nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val));
-}
-
-int
-nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val));
-}
-
-int
-nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val));
-}
-
-int
-nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val));
-}
-
-int
-nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
-}
-
-#if !defined(_KERNEL)
-int
-nvlist_add_double(nvlist_t *nvl, const char *name, double val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val));
-}
-#endif
-
-int
-nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val));
-}
-
-int
-nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
- boolean_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
-}
-
-int
-nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
-}
-
-int
-nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
-}
-
-int
-nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
-}
-
-int
-nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
-}
-
-int
-nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
-}
-
-int
-nvlist_add_string_array(nvlist_t *nvl, const char *name,
- char *const *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
-}
-
-int
-nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val));
-}
-
-int
-nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
-}
-
-int
-nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
-}
-
-/* reading name-value pairs */
-nvpair_t *
-nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (NULL);
-
- curr = NVPAIR2I_NVP(nvp);
-
- /*
- * Ensure that nvp is a valid nvpair on this nvlist.
- * NB: nvp_curr is used only as a hint so that we don't always
- * have to walk the list to determine if nvp is still on the list.
- */
- if (nvp == NULL)
- curr = priv->nvp_list;
- else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
- curr = curr->nvi_next;
- else
- curr = NULL;
-
- priv->nvp_curr = curr;
-
- return (curr != NULL ? &curr->nvi_nvp : NULL);
-}
-
-nvpair_t *
-nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (NULL);
-
- curr = NVPAIR2I_NVP(nvp);
-
- if (nvp == NULL)
- curr = priv->nvp_last;
- else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
- curr = curr->nvi_prev;
- else
- curr = NULL;
-
- priv->nvp_curr = curr;
-
- return (curr != NULL ? &curr->nvi_nvp : NULL);
-}
-
-boolean_t
-nvlist_empty(nvlist_t *nvl)
-{
- nvpriv_t *priv;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (B_TRUE);
-
- return (priv->nvp_list == NULL);
-}
-
-char *
-nvpair_name(nvpair_t *nvp)
-{
- return (NVP_NAME(nvp));
-}
-
-data_type_t
-nvpair_type(nvpair_t *nvp)
-{
- return (NVP_TYPE(nvp));
-}
-
-int
-nvpair_type_is_array(nvpair_t *nvp)
-{
- data_type_t type = NVP_TYPE(nvp);
-
- if ((type == DATA_TYPE_BYTE_ARRAY) ||
- (type == DATA_TYPE_INT8_ARRAY) ||
- (type == DATA_TYPE_UINT8_ARRAY) ||
- (type == DATA_TYPE_INT16_ARRAY) ||
- (type == DATA_TYPE_UINT16_ARRAY) ||
- (type == DATA_TYPE_INT32_ARRAY) ||
- (type == DATA_TYPE_UINT32_ARRAY) ||
- (type == DATA_TYPE_INT64_ARRAY) ||
- (type == DATA_TYPE_UINT64_ARRAY) ||
- (type == DATA_TYPE_BOOLEAN_ARRAY) ||
- (type == DATA_TYPE_STRING_ARRAY) ||
- (type == DATA_TYPE_NVLIST_ARRAY))
- return (1);
- return (0);
-
-}
-
-static int
-nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
-{
- if (nvp == NULL || nvpair_type(nvp) != type)
- return (EINVAL);
-
- /*
- * For non-array types, we copy the data.
- * For array types (including string), we set a pointer.
- */
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- if (nelem != NULL)
- *nelem = 0;
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_HRTIME:
-#if !defined(_KERNEL)
- case DATA_TYPE_DOUBLE:
-#endif
- if (data == NULL)
- return (EINVAL);
- bcopy(NVP_VALUE(nvp), data,
- (size_t)i_get_value_size(type, NULL, 1));
- if (nelem != NULL)
- *nelem = 1;
- break;
-
- case DATA_TYPE_NVLIST:
- case DATA_TYPE_STRING:
- if (data == NULL)
- return (EINVAL);
- *(void **)data = (void *)NVP_VALUE(nvp);
- if (nelem != NULL)
- *nelem = 1;
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_BYTE_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- case DATA_TYPE_STRING_ARRAY:
- case DATA_TYPE_NVLIST_ARRAY:
- if (nelem == NULL || data == NULL)
- return (EINVAL);
- if ((*nelem = NVP_NELEM(nvp)) != 0)
- *(void **)data = (void *)NVP_VALUE(nvp);
- else
- *(void **)data = NULL;
- break;
-
- default:
- return (ENOTSUP);
- }
-
- return (0);
-}
-
-static int
-nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
- uint_t *nelem, void *data)
-{
- if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
- return (EINVAL);
-
- if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
- return (ENOTSUP);
-
- nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
- if (nvp == NULL)
- return (ENOENT);
-
- return (nvpair_value_common(nvp, type, nelem, data));
-}
-
-int
-nvlist_lookup_boolean(nvlist_t *nvl, const char *name)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL));
-}
-
-int
-nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val)
-{
- return (nvlist_lookup_common(nvl, name,
- DATA_TYPE_BOOLEAN_VALUE, NULL, val));
-}
-
-int
-nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val));
-}
-
-int
-nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val));
-}
-
-int
-nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val));
-}
-
-int
-nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val));
-}
-
-int
-nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val));
-}
-
-int
-nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val));
-}
-
-int
-nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val));
-}
-
-int
-nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val));
-}
-
-int
-nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
-}
-
-#if !defined(_KERNEL)
-int
-nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val));
-}
-#endif
-
-int
-nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val));
-}
-
-int
-nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val));
-}
-
-int
-nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name,
- boolean_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name,
- DATA_TYPE_BOOLEAN_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_byte_array(nvlist_t *nvl, const char *name,
- uchar_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name,
- uint8_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int16_array(nvlist_t *nvl, const char *name,
- int16_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name,
- uint16_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int32_array(nvlist_t *nvl, const char *name,
- int32_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name,
- uint32_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int64_array(nvlist_t *nvl, const char *name,
- int64_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name,
- uint64_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_string_array(nvlist_t *nvl, const char *name,
- char ***a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
- nvlist_t ***a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
-}
-
-int
-nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
-{
- va_list ap;
- char *name;
- int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
- int ret = 0;
-
- va_start(ap, flag);
- while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
- data_type_t type;
- void *val;
- uint_t *nelem;
-
- switch (type = va_arg(ap, data_type_t)) {
- case DATA_TYPE_BOOLEAN:
- ret = nvlist_lookup_common(nvl, name, type, NULL, NULL);
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_HRTIME:
- case DATA_TYPE_STRING:
- case DATA_TYPE_NVLIST:
-#if !defined(_KERNEL)
- case DATA_TYPE_DOUBLE:
-#endif
- val = va_arg(ap, void *);
- ret = nvlist_lookup_common(nvl, name, type, NULL, val);
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- case DATA_TYPE_STRING_ARRAY:
- case DATA_TYPE_NVLIST_ARRAY:
- val = va_arg(ap, void *);
- nelem = va_arg(ap, uint_t *);
- ret = nvlist_lookup_common(nvl, name, type, nelem, val);
- break;
-
- default:
- ret = EINVAL;
- }
-
- if (ret == ENOENT && noentok)
- ret = 0;
- }
- va_end(ap);
-
- return (ret);
-}
-
-/*
- * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function
- * returns zero and a pointer to the matching nvpair is returned in '*ret'
- * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate
- * multiple levels of embedded nvlists, with 'sep' as the separator. As an
- * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
- * "a.d[3].e[1]". This matches the C syntax for array embed (for convience,
- * code also supports "a.d[3]e[1]" syntax).
- *
- * If 'ip' is non-NULL and the last name component is an array, return the
- * value of the "...[index]" array index in *ip. For an array reference that
- * is not indexed, *ip will be returned as -1. If there is a syntax error in
- * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
- * inside the 'name' string where the syntax error was detected.
- */
-static int
-nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
- nvpair_t **ret, int *ip, char **ep)
-{
- nvpair_t *nvp;
- const char *np;
- char *sepp;
- char *idxp, *idxep;
- nvlist_t **nva;
- long idx;
- int n;
-
- if (ip)
- *ip = -1; /* not indexed */
- if (ep)
- *ep = NULL;
-
- if ((nvl == NULL) || (name == NULL))
- return (EINVAL);
-
- sepp = NULL;
- idx = 0;
- /* step through components of name */
- for (np = name; np && *np; np = sepp) {
- /* ensure unique names */
- if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
- return (ENOTSUP);
-
- /* skip white space */
- skip_whitespace(np);
- if (*np == 0)
- break;
-
- /* set 'sepp' to end of current component 'np' */
- if (sep)
- sepp = strchr(np, sep);
- else
- sepp = NULL;
-
- /* find start of next "[ index ]..." */
- idxp = strchr(np, '[');
-
- /* if sepp comes first, set idxp to NULL */
- if (sepp && idxp && (sepp < idxp))
- idxp = NULL;
-
- /*
- * At this point 'idxp' is set if there is an index
- * expected for the current component.
- */
- if (idxp) {
- /* set 'n' to length of current 'np' name component */
- n = idxp++ - np;
-
- /* keep sepp up to date for *ep use as we advance */
- skip_whitespace(idxp);
- sepp = idxp;
-
- /* determine the index value */
-#if defined(_KERNEL) && !defined(_BOOT)
- if (ddi_strtol(idxp, &idxep, 0, &idx))
- goto fail;
-#else
- idx = strtol(idxp, &idxep, 0);
-#endif
- if (idxep == idxp)
- goto fail;
-
- /* keep sepp up to date for *ep use as we advance */
- sepp = idxep;
-
- /* skip white space index value and check for ']' */
- skip_whitespace(sepp);
- if (*sepp++ != ']')
- goto fail;
-
- /* for embedded arrays, support C syntax: "a[1].b" */
- skip_whitespace(sepp);
- if (sep && (*sepp == sep))
- sepp++;
- } else if (sepp) {
- n = sepp++ - np;
- } else {
- n = strlen(np);
- }
-
- /* trim trailing whitespace by reducing length of 'np' */
- if (n == 0)
- goto fail;
- for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
- ;
- n++;
-
- /* skip whitespace, and set sepp to NULL if complete */
- if (sepp) {
- skip_whitespace(sepp);
- if (*sepp == 0)
- sepp = NULL;
- }
-
- /*
- * At this point:
- * o 'n' is the length of current 'np' component.
- * o 'idxp' is set if there was an index, and value 'idx'.
- * o 'sepp' is set to the beginning of the next component,
- * and set to NULL if we have no more components.
- *
- * Search for nvpair with matching component name.
- */
- for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
- nvp = nvlist_next_nvpair(nvl, nvp)) {
-
- /* continue if no match on name */
- if (strncmp(np, nvpair_name(nvp), n) ||
- (strlen(nvpair_name(nvp)) != n))
- continue;
-
- /* if indexed, verify type is array oriented */
- if (idxp && !nvpair_type_is_array(nvp))
- goto fail;
-
- /*
- * Full match found, return nvp and idx if this
- * was the last component.
- */
- if (sepp == NULL) {
- if (ret)
- *ret = nvp;
- if (ip && idxp)
- *ip = (int)idx; /* return index */
- return (0); /* found */
- }
-
- /*
- * More components: current match must be
- * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
- * to support going deeper.
- */
- if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
- nvl = EMBEDDED_NVL(nvp);
- break;
- } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
- (void) nvpair_value_nvlist_array(nvp,
- &nva, (uint_t *)&n);
- if ((n < 0) || (idx >= n))
- goto fail;
- nvl = nva[idx];
- break;
- }
-
- /* type does not support more levels */
- goto fail;
- }
- if (nvp == NULL)
- goto fail; /* 'name' not found */
-
- /* search for match of next component in embedded 'nvl' list */
- }
-
-fail: if (ep && sepp)
- *ep = sepp;
- return (EINVAL);
-}
-
-/*
- * Return pointer to nvpair with specified 'name'.
- */
-int
-nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
-{
- return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
-}
-
-/*
- * Determine if named nvpair exists in nvlist (use embedded separator of '.'
- * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed
- * description.
- */
-int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
- const char *name, nvpair_t **ret, int *ip, char **ep)
-{
- return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
-}
-
-boolean_t
-nvlist_exists(nvlist_t *nvl, const char *name)
-{
- nvpriv_t *priv;
- nvpair_t *nvp;
- i_nvp_t *curr;
-
- if (name == NULL || nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (B_FALSE);
-
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
- nvp = &curr->nvi_nvp;
-
- if (strcmp(name, NVP_NAME(nvp)) == 0)
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-int
-nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
-}
-
-int
-nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
-}
-
-int
-nvpair_value_int8(nvpair_t *nvp, int8_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
-}
-
-int
-nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
-}
-
-int
-nvpair_value_int16(nvpair_t *nvp, int16_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
-}
-
-int
-nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
-}
-
-int
-nvpair_value_int32(nvpair_t *nvp, int32_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
-}
-
-int
-nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
-}
-
-int
-nvpair_value_int64(nvpair_t *nvp, int64_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
-}
-
-int
-nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
-}
-
-#if !defined(_KERNEL)
-int
-nvpair_value_double(nvpair_t *nvp, double *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
-}
-#endif
-
-int
-nvpair_value_string(nvpair_t *nvp, char **val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
-}
-
-int
-nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
-}
-
-int
-nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val));
-}
-
-/*
- * Add specified pair to the list.
- */
-int
-nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- if (nvl == NULL || nvp == NULL)
- return (EINVAL);
-
- return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp),
- NVP_NELEM(nvp), NVP_VALUE(nvp)));
-}
-
-/*
- * Merge the supplied nvlists and put the result in dst.
- * The merged list will contain all names specified in both lists,
- * the values are taken from nvl in the case of duplicates.
- * Return 0 on success.
- */
-/*ARGSUSED*/
-int
-nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag)
-{
- if (nvl == NULL || dst == NULL)
- return (EINVAL);
-
- if (dst != nvl)
- return (nvlist_copy_pairs(nvl, dst));
-
- return (0);
-}
-
-/*
- * Encoding related routines
- */
-#define NVS_OP_ENCODE 0
-#define NVS_OP_DECODE 1
-#define NVS_OP_GETSIZE 2
-
-typedef struct nvs_ops nvs_ops_t;
-
-typedef struct {
- int nvs_op;
- const nvs_ops_t *nvs_ops;
- void *nvs_private;
- nvpriv_t *nvs_priv;
- int nvs_recursion;
-} nvstream_t;
-
-/*
- * nvs operations are:
- * - nvs_nvlist
- * encoding / decoding of a nvlist header (nvlist_t)
- * calculates the size used for header and end detection
- *
- * - nvs_nvpair
- * responsible for the first part of encoding / decoding of an nvpair
- * calculates the decoded size of an nvpair
- *
- * - nvs_nvp_op
- * second part of encoding / decoding of an nvpair
- *
- * - nvs_nvp_size
- * calculates the encoding size of an nvpair
- *
- * - nvs_nvl_fini
- * encodes the end detection mark (zeros).
- */
-struct nvs_ops {
- int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *);
- int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *);
- int (*nvs_nvp_op)(nvstream_t *, nvpair_t *);
- int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *);
- int (*nvs_nvl_fini)(nvstream_t *);
-};
-
-typedef struct {
- char nvh_encoding; /* nvs encoding method */
- char nvh_endian; /* nvs endian */
- char nvh_reserved1; /* reserved for future use */
- char nvh_reserved2; /* reserved for future use */
-} nvs_header_t;
-
-static int
-nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr;
-
- /*
- * Walk nvpair in list and encode each nvpair
- */
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
- if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0)
- return (EFAULT);
-
- return (nvs->nvs_ops->nvs_nvl_fini(nvs));
-}
-
-static int
-nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
-{
- nvpair_t *nvp;
- size_t nvsize;
- int err;
-
- /*
- * Get decoded size of next pair in stream, alloc
- * memory for nvpair_t, then decode the nvpair
- */
- while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) {
- if (nvsize == 0) /* end of list */
- break;
-
- /* make sure len makes sense */
- if (nvsize < NVP_SIZE_CALC(1, 0))
- return (EFAULT);
-
- if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL)
- return (ENOMEM);
-
- if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) {
- nvp_buf_free(nvl, nvp);
- return (err);
- }
-
- if (i_validate_nvpair(nvp) != 0) {
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- return (EFAULT);
- }
-
- err = nvt_add_nvpair(nvl, nvp);
- if (err != 0) {
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- return (err);
- }
- nvp_buf_link(nvl, nvp);
- }
- return (err);
-}
-
-static int
-nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr;
- uint64_t nvsize = *buflen;
- size_t size;
-
- /*
- * Get encoded size of nvpairs in nvlist
- */
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
- if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0)
- return (EINVAL);
-
- if ((nvsize += size) > INT32_MAX)
- return (EINVAL);
- }
-
- *buflen = nvsize;
- return (0);
-}
-
-static int
-nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
-{
- int err;
-
- if (nvl->nvl_priv == 0)
- return (EFAULT);
-
- /*
- * Perform the operation, starting with header, then each nvpair
- */
- if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0)
- return (err);
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- err = nvs_encode_pairs(nvs, nvl);
- break;
-
- case NVS_OP_DECODE:
- err = nvs_decode_pairs(nvs, nvl);
- break;
-
- case NVS_OP_GETSIZE:
- err = nvs_getsize_pairs(nvs, nvl, buflen);
- break;
-
- default:
- err = EINVAL;
- }
-
- return (err);
-}
-
-static int
-nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE: {
- int err;
-
- if (nvs->nvs_recursion >= nvpair_max_recursion)
- return (EINVAL);
- nvs->nvs_recursion++;
- err = nvs_operation(nvs, embedded, NULL);
- nvs->nvs_recursion--;
- return (err);
- }
- case NVS_OP_DECODE: {
- nvpriv_t *priv;
- int err;
-
- if (embedded->nvl_version != NV_VERSION)
- return (ENOTSUP);
-
- if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL)
- return (ENOMEM);
-
- nvlist_init(embedded, embedded->nvl_nvflag, priv);
-
- if (nvs->nvs_recursion >= nvpair_max_recursion) {
- nvlist_free(embedded);
- return (EINVAL);
- }
- nvs->nvs_recursion++;
- if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
- nvlist_free(embedded);
- nvs->nvs_recursion--;
- return (err);
- }
- default:
- break;
- }
-
- return (EINVAL);
-}
-
-static int
-nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- size_t nelem = NVP_NELEM(nvp);
- nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
- int i;
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- for (i = 0; i < nelem; i++)
- if (nvs_embedded(nvs, nvlp[i]) != 0)
- return (EFAULT);
- break;
-
- case NVS_OP_DECODE: {
- size_t len = nelem * sizeof (uint64_t);
- nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len);
-
- bzero(nvlp, len); /* don't trust packed data */
- for (i = 0; i < nelem; i++) {
- if (nvs_embedded(nvs, embedded) != 0) {
- nvpair_free(nvp);
- return (EFAULT);
- }
-
- nvlp[i] = embedded++;
- }
- break;
- }
- case NVS_OP_GETSIZE: {
- uint64_t nvsize = 0;
-
- for (i = 0; i < nelem; i++) {
- size_t nvp_sz = 0;
-
- if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0)
- return (EINVAL);
-
- if ((nvsize += nvp_sz) > INT32_MAX)
- return (EINVAL);
- }
-
- *size = nvsize;
- break;
- }
- default:
- return (EINVAL);
- }
-
- return (0);
-}
-
-static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *);
-static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *);
-
-/*
- * Common routine for nvlist operations:
- * encode, decode, getsize (encoded size).
- */
-static int
-nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
- int nvs_op)
-{
- int err = 0;
- nvstream_t nvs;
- int nvl_endian;
-#if BYTE_ORDER == _LITTLE_ENDIAN
- int host_endian = 1;
-#else
- int host_endian = 0;
-#endif /* _LITTLE_ENDIAN */
- nvs_header_t *nvh = (void *)buf;
-
- if (buflen == NULL || nvl == NULL ||
- (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (EINVAL);
-
- nvs.nvs_op = nvs_op;
- nvs.nvs_recursion = 0;
-
- /*
- * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
- * a buffer is allocated. The first 4 bytes in the buffer are
- * used for encoding method and host endian.
- */
- switch (nvs_op) {
- case NVS_OP_ENCODE:
- if (buf == NULL || *buflen < sizeof (nvs_header_t))
- return (EINVAL);
-
- nvh->nvh_encoding = encoding;
- nvh->nvh_endian = nvl_endian = host_endian;
- nvh->nvh_reserved1 = 0;
- nvh->nvh_reserved2 = 0;
- break;
-
- case NVS_OP_DECODE:
- if (buf == NULL || *buflen < sizeof (nvs_header_t))
- return (EINVAL);
-
- /* get method of encoding from first byte */
- encoding = nvh->nvh_encoding;
- nvl_endian = nvh->nvh_endian;
- break;
-
- case NVS_OP_GETSIZE:
- nvl_endian = host_endian;
-
- /*
- * add the size for encoding
- */
- *buflen = sizeof (nvs_header_t);
- break;
-
- default:
- return (ENOTSUP);
- }
-
- /*
- * Create an nvstream with proper encoding method
- */
- switch (encoding) {
- case NV_ENCODE_NATIVE:
- /*
- * check endianness, in case we are unpacking
- * from a file
- */
- if (nvl_endian != host_endian)
- return (ENOTSUP);
- err = nvs_native(&nvs, nvl, buf, buflen);
- break;
- case NV_ENCODE_XDR:
- err = nvs_xdr(&nvs, nvl, buf, buflen);
- break;
- default:
- err = ENOTSUP;
- break;
- }
-
- return (err);
-}
-
-int
-nvlist_size(nvlist_t *nvl, size_t *size, int encoding)
-{
- return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE));
-}
-
-/*
- * Pack nvlist into contiguous memory
- */
-/*ARGSUSED1*/
-int
-nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
- int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xpack(nvl, bufp, buflen, encoding,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
- nv_alloc_t *nva)
-{
- nvpriv_t nvpriv;
- size_t alloc_size;
- char *buf;
- int err;
-
- if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL)
- return (EINVAL);
-
- if (*bufp != NULL)
- return (nvlist_common(nvl, *bufp, buflen, encoding,
- NVS_OP_ENCODE));
-
- /*
- * Here is a difficult situation:
- * 1. The nvlist has fixed allocator properties.
- * All other nvlist routines (like nvlist_add_*, ...) use
- * these properties.
- * 2. When using nvlist_pack() the user can specify their own
- * allocator properties (e.g. by using KM_NOSLEEP).
- *
- * We use the user specified properties (2). A clearer solution
- * will be to remove the kmflag from nvlist_pack(), but we will
- * not change the interface.
- */
- nv_priv_init(&nvpriv, nva, 0);
-
- if ((err = nvlist_size(nvl, &alloc_size, encoding)))
- return (err);
-
- if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL)
- return (ENOMEM);
-
- if ((err = nvlist_common(nvl, buf, &alloc_size, encoding,
- NVS_OP_ENCODE)) != 0) {
- nv_mem_free(&nvpriv, buf, alloc_size);
- } else {
- *buflen = alloc_size;
- *bufp = buf;
- }
-
- return (err);
-}
-
-/*
- * Unpack buf into an nvlist_t
- */
-/*ARGSUSED1*/
-int
-nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xunpack(buf, buflen, nvlp,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva)
-{
- nvlist_t *nvl;
- int err;
-
- if (nvlp == NULL)
- return (EINVAL);
-
- if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0)
- return (err);
-
- if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0)
- nvlist_free(nvl);
- else
- *nvlp = nvl;
-
- return (err);
-}
-
-/*
- * Native encoding functions
- */
-typedef struct {
- /*
- * This structure is used when decoding a packed nvpair in
- * the native format. n_base points to a buffer containing the
- * packed nvpair. n_end is a pointer to the end of the buffer.
- * (n_end actually points to the first byte past the end of the
- * buffer.) n_curr is a pointer that lies between n_base and n_end.
- * It points to the current data that we are decoding.
- * The amount of data left in the buffer is equal to n_end - n_curr.
- * n_flag is used to recognize a packed embedded list.
- */
- caddr_t n_base;
- caddr_t n_end;
- caddr_t n_curr;
- uint_t n_flag;
-} nvs_native_t;
-
-static int
-nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf,
- size_t buflen)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- nvs->nvs_private = native;
- native->n_curr = native->n_base = buf;
- native->n_end = buf + buflen;
- native->n_flag = 0;
- return (0);
-
- case NVS_OP_GETSIZE:
- nvs->nvs_private = native;
- native->n_curr = native->n_base = native->n_end = NULL;
- native->n_flag = 0;
- return (0);
- default:
- return (EINVAL);
- }
-}
-
-/*ARGSUSED*/
-static void
-nvs_native_destroy(nvstream_t *nvs)
-{
-}
-
-static int
-native_cp(nvstream_t *nvs, void *buf, size_t size)
-{
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
-
- if (native->n_curr + size > native->n_end)
- return (EFAULT);
-
- /*
- * The bcopy() below eliminates alignment requirement
- * on the buffer (stream) and is preferred over direct access.
- */
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- bcopy(buf, native->n_curr, size);
- break;
- case NVS_OP_DECODE:
- bcopy(native->n_curr, buf, size);
- break;
- default:
- return (EINVAL);
- }
-
- native->n_curr += size;
- return (0);
-}
-
-/*
- * operate on nvlist_t header
- */
-static int
-nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
-{
- nvs_native_t *native = nvs->nvs_private;
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- if (native->n_flag)
- return (0); /* packed embedded list */
-
- native->n_flag = 1;
-
- /* copy version and nvflag of the nvlist_t */
- if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 ||
- native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0)
- return (EFAULT);
-
- return (0);
-
- case NVS_OP_GETSIZE:
- /*
- * if calculate for packed embedded list
- * 4 for end of the embedded list
- * else
- * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag
- * and 4 for end of the entire list
- */
- if (native->n_flag) {
- *size += 4;
- } else {
- native->n_flag = 1;
- *size += 2 * sizeof (int32_t) + 4;
- }
-
- return (0);
-
- default:
- return (EINVAL);
- }
-}
-
-static int
-nvs_native_nvl_fini(nvstream_t *nvs)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- /*
- * Add 4 zero bytes at end of nvlist. They are used
- * for end detection by the decode routine.
- */
- if (native->n_curr + sizeof (int) > native->n_end)
- return (EFAULT);
-
- bzero(native->n_curr, sizeof (int));
- native->n_curr += sizeof (int);
- }
-
- return (0);
-}
-
-static int
-nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- char *packed = (void *)
- (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
- /*
- * Null out the pointer that is meaningless in the packed
- * structure. The address may not be aligned, so we have
- * to use bzero.
- */
- bzero(packed + offsetof(nvlist_t, nvl_priv),
- sizeof(((nvlist_t *)NULL)->nvl_priv));
- }
-
- return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
-}
-
-static int
-nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
- size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
- int i;
- /*
- * Null out pointers that are meaningless in the packed
- * structure. The addresses may not be aligned, so we have
- * to use bzero.
- */
- bzero(value, len);
-
- value += len;
- for (i = 0; i < NVP_NELEM(nvp); i++) {
- /*
- * Null out the pointer that is meaningless in the
- * packed structure. The address may not be aligned,
- * so we have to use bzero.
- */
- bzero(value + offsetof(nvlist_t, nvl_priv),
- sizeof(((nvlist_t *)NULL)->nvl_priv));
- value += sizeof(nvlist_t);
- }
- }
-
- return (nvs_embedded_nvl_array(nvs, nvp, NULL));
-}
-
-static void
-nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE: {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- uint64_t *strp = (void *)
- (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
- /*
- * Null out pointers that are meaningless in the packed
- * structure. The addresses may not be aligned, so we have
- * to use bzero.
- */
- bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t));
- break;
- }
- case NVS_OP_DECODE: {
- char **strp = (void *)NVP_VALUE(nvp);
- char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
- int i;
-
- for (i = 0; i < NVP_NELEM(nvp); i++) {
- strp[i] = buf;
- buf += strlen(buf) + 1;
- }
- break;
- }
- }
-}
-
-static int
-nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
-{
- data_type_t type;
- int value_sz;
- int ret = 0;
-
- /*
- * We do the initial bcopy of the data before we look at
- * the nvpair type, because when we're decoding, we won't
- * have the correct values for the pair until we do the bcopy.
- */
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
- return (EFAULT);
- break;
- default:
- return (EINVAL);
- }
-
- /* verify nvp_name_sz, check the name string length */
- if (i_validate_nvpair_name(nvp) != 0)
- return (EFAULT);
-
- type = NVP_TYPE(nvp);
-
- /*
- * Verify type and nelem and get the value size.
- * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
- * is the size of the string(s) excluded.
- */
- if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
- return (EFAULT);
-
- if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
- return (EFAULT);
-
- switch (type) {
- case DATA_TYPE_NVLIST:
- ret = nvpair_native_embedded(nvs, nvp);
- break;
- case DATA_TYPE_NVLIST_ARRAY:
- ret = nvpair_native_embedded_array(nvs, nvp);
- break;
- case DATA_TYPE_STRING_ARRAY:
- nvpair_native_string_array(nvs, nvp);
- break;
- default:
- break;
- }
-
- return (ret);
-}
-
-static int
-nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- uint64_t nvp_sz = nvp->nvp_size;
-
- switch (NVP_TYPE(nvp)) {
- case DATA_TYPE_NVLIST: {
- size_t nvsize = 0;
-
- if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0)
- return (EINVAL);
-
- nvp_sz += nvsize;
- break;
- }
- case DATA_TYPE_NVLIST_ARRAY: {
- size_t nvsize;
-
- if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0)
- return (EINVAL);
-
- nvp_sz += nvsize;
- break;
- }
- default:
- break;
- }
-
- if (nvp_sz > INT32_MAX)
- return (EINVAL);
-
- *size = nvp_sz;
-
- return (0);
-}
-
-static int
-nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- return (nvs_native_nvp_op(nvs, nvp));
-
- case NVS_OP_DECODE: {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- int32_t decode_len;
-
- /* try to read the size value from the stream */
- if (native->n_curr + sizeof (int32_t) > native->n_end)
- return (EFAULT);
- bcopy(native->n_curr, &decode_len, sizeof (int32_t));
-
- /* sanity check the size value */
- if (decode_len < 0 ||
- decode_len > native->n_end - native->n_curr)
- return (EFAULT);
-
- *size = decode_len;
-
- /*
- * If at the end of the stream then move the cursor
- * forward, otherwise nvpair_native_op() will read
- * the entire nvpair at the same cursor position.
- */
- if (*size == 0)
- native->n_curr += sizeof (int32_t);
- break;
- }
-
- default:
- return (EINVAL);
- }
-
- return (0);
-}
-
-static const nvs_ops_t nvs_native_ops = {
- nvs_native_nvlist,
- nvs_native_nvpair,
- nvs_native_nvp_op,
- nvs_native_nvp_size,
- nvs_native_nvl_fini
-};
-
-static int
-nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
-{
- nvs_native_t native;
- int err;
-
- nvs->nvs_ops = &nvs_native_ops;
-
- if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
- *buflen - sizeof (nvs_header_t))) != 0)
- return (err);
-
- err = nvs_operation(nvs, nvl, buflen);
-
- nvs_native_destroy(nvs);
-
- return (err);
-}
-
-/*
- * XDR encoding functions
- *
- * An xdr packed nvlist is encoded as:
- *
- * - encoding methode and host endian (4 bytes)
- * - nvl_version (4 bytes)
- * - nvl_nvflag (4 bytes)
- *
- * - encoded nvpairs, the format of one xdr encoded nvpair is:
- * - encoded size of the nvpair (4 bytes)
- * - decoded size of the nvpair (4 bytes)
- * - name string, (4 + sizeof(NV_ALIGN4(string))
- * a string is coded as size (4 bytes) and data
- * - data type (4 bytes)
- * - number of elements in the nvpair (4 bytes)
- * - data
- *
- * - 2 zero's for end of the entire list (8 bytes)
- */
-static int
-nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
-{
- /* xdr data must be 4 byte aligned */
- if ((ulong_t)buf % 4 != 0)
- return (EFAULT);
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
- nvs->nvs_private = xdr;
- return (0);
- case NVS_OP_DECODE:
- xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
- nvs->nvs_private = xdr;
- return (0);
- case NVS_OP_GETSIZE:
- nvs->nvs_private = NULL;
- return (0);
- default:
- return (EINVAL);
- }
-}
-
-static void
-nvs_xdr_destroy(nvstream_t *nvs)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- xdr_destroy((XDR *)nvs->nvs_private);
- break;
- default:
- break;
- }
-}
-
-static int
-nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE: {
- XDR *xdr = nvs->nvs_private;
-
- if (!xdr_int(xdr, &nvl->nvl_version) ||
- !xdr_u_int(xdr, &nvl->nvl_nvflag))
- return (EFAULT);
- break;
- }
- case NVS_OP_GETSIZE: {
- /*
- * 2 * 4 for nvl_version + nvl_nvflag
- * and 8 for end of the entire list
- */
- *size += 2 * 4 + 8;
- break;
- }
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-static int
-nvs_xdr_nvl_fini(nvstream_t *nvs)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- XDR *xdr = nvs->nvs_private;
- int zero = 0;
-
- if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero))
- return (EFAULT);
- }
-
- return (0);
-}
-
-/*
- * The format of xdr encoded nvpair is:
- * encode_size, decode_size, name string, data type, nelem, data
- */
-static int
-nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
-{
- data_type_t type;
- char *buf;
- char *buf_end = (char *)nvp + nvp->nvp_size;
- int value_sz;
- uint_t nelem, buflen;
- bool_t ret = FALSE;
- XDR *xdr = nvs->nvs_private;
-
- ASSERT(xdr != NULL && nvp != NULL);
-
- /* name string */
- if ((buf = NVP_NAME(nvp)) >= buf_end)
- return (EFAULT);
- buflen = buf_end - buf;
-
- if (!xdr_string(xdr, &buf, buflen - 1))
- return (EFAULT);
- nvp->nvp_name_sz = strlen(buf) + 1;
-
- /* type and nelem */
- if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
- !xdr_int(xdr, &nvp->nvp_value_elem))
- return (EFAULT);
-
- type = NVP_TYPE(nvp);
- nelem = nvp->nvp_value_elem;
-
- /*
- * Verify type and nelem and get the value size.
- * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
- * is the size of the string(s) excluded.
- */
- if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
- return (EFAULT);
-
- /* if there is no data to extract then return */
- if (nelem == 0)
- return (0);
-
- /* value */
- if ((buf = NVP_VALUE(nvp)) >= buf_end)
- return (EFAULT);
- buflen = buf_end - buf;
-
- if (buflen < value_sz)
- return (EFAULT);
-
- switch (type) {
- case DATA_TYPE_NVLIST:
- if (nvs_embedded(nvs, (void *)buf) == 0)
- return (0);
- break;
-
- case DATA_TYPE_NVLIST_ARRAY:
- if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
- return (0);
- break;
-
- case DATA_TYPE_BOOLEAN:
- ret = TRUE;
- break;
-
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- ret = xdr_char(xdr, buf);
- break;
-
- case DATA_TYPE_INT16:
- ret = xdr_short(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_UINT16:
- ret = xdr_u_short(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_INT32:
- ret = xdr_int(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_UINT32:
- ret = xdr_u_int(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_INT64:
- ret = xdr_longlong_t(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_UINT64:
- ret = xdr_u_longlong_t(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_HRTIME:
- /*
- * NOTE: must expose the definition of hrtime_t here
- */
- ret = xdr_longlong_t(xdr, (void *)buf);
- break;
-#if !defined(_KERNEL)
- case DATA_TYPE_DOUBLE:
- ret = xdr_double(xdr, (void *)buf);
- break;
-#endif
- case DATA_TYPE_STRING:
- ret = xdr_string(xdr, &buf, buflen - 1);
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- ret = xdr_opaque(xdr, buf, nelem);
- break;
-
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
- (xdrproc_t)xdr_char);
- break;
-
- case DATA_TYPE_INT16_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
- sizeof (int16_t), (xdrproc_t)xdr_short);
- break;
-
- case DATA_TYPE_UINT16_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
- sizeof (uint16_t), (xdrproc_t)xdr_u_short);
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
- sizeof (int32_t), (xdrproc_t)xdr_int);
- break;
-
- case DATA_TYPE_UINT32_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
- sizeof (uint32_t), (xdrproc_t)xdr_u_int);
- break;
-
- case DATA_TYPE_INT64_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
- sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
- break;
-
- case DATA_TYPE_UINT64_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
- sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
- break;
-
- case DATA_TYPE_STRING_ARRAY: {
- size_t len = nelem * sizeof (uint64_t);
- char **strp = (void *)buf;
- int i;
-
- if (nvs->nvs_op == NVS_OP_DECODE)
- bzero(buf, len); /* don't trust packed data */
-
- for (i = 0; i < nelem; i++) {
- if (buflen <= len)
- return (EFAULT);
-
- buf += len;
- buflen -= len;
-
- if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
- return (EFAULT);
-
- if (nvs->nvs_op == NVS_OP_DECODE)
- strp[i] = buf;
- len = strlen(buf) + 1;
- }
- ret = TRUE;
- break;
- }
- default:
- break;
- }
-
- return (ret == TRUE ? 0 : EFAULT);
-}
-
-static int
-nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- data_type_t type = NVP_TYPE(nvp);
- /*
- * encode_size + decode_size + name string size + data type + nelem
- * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
- */
- uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;
-
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- nvp_sz += 4; /* 4 is the minimum xdr unit */
- break;
-
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_HRTIME:
-#if !defined(_KERNEL)
- case DATA_TYPE_DOUBLE:
-#endif
- nvp_sz += 8;
- break;
-
- case DATA_TYPE_STRING:
- nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
- break;
-
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
- break;
-
- case DATA_TYPE_STRING_ARRAY: {
- int i;
- char **strs = (void *)NVP_VALUE(nvp);
-
- for (i = 0; i < NVP_NELEM(nvp); i++)
- nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));
-
- break;
- }
-
- case DATA_TYPE_NVLIST:
- case DATA_TYPE_NVLIST_ARRAY: {
- size_t nvsize = 0;
- int old_nvs_op = nvs->nvs_op;
- int err;
-
- nvs->nvs_op = NVS_OP_GETSIZE;
- if (type == DATA_TYPE_NVLIST)
- err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
- else
- err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
- nvs->nvs_op = old_nvs_op;
-
- if (err != 0)
- return (EINVAL);
-
- nvp_sz += nvsize;
- break;
- }
-
- default:
- return (EINVAL);
- }
-
- if (nvp_sz > INT32_MAX)
- return (EINVAL);
-
- *size = nvp_sz;
-
- return (0);
-}
-
-
-/*
- * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
- * the largest nvpair that could be encoded in the buffer.
- *
- * See comments above nvpair_xdr_op() for the format of xdr encoding.
- * The size of a xdr packed nvpair without any data is 5 words.
- *
- * Using the size of the data directly as an estimate would be ok
- * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY
- * then the actual nvpair has space for an array of pointers to index
- * the strings. These pointers are not encoded into the packed xdr buffer.
- *
- * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
- * of length 0, then each string is endcoded in xdr format as a single word.
- * Therefore when expanded to an nvpair there will be 2.25 word used for
- * each string. (a int64_t allocated for pointer usage, and a single char
- * for the null termination.)
- *
- * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
- */
-#define NVS_XDR_HDR_LEN ((size_t)(5 * 4))
-#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \
- 0 : ((size_t)(y) - NVS_XDR_HDR_LEN))
-#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \
- (NVS_XDR_DATA_LEN(x) * 2) + \
- NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4)))
-
-static int
-nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- XDR *xdr = nvs->nvs_private;
- int32_t encode_len, decode_len;
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE: {
- size_t nvsize;
-
- if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0)
- return (EFAULT);
-
- decode_len = nvp->nvp_size;
- encode_len = nvsize;
- if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
- return (EFAULT);
-
- return (nvs_xdr_nvp_op(nvs, nvp));
- }
- case NVS_OP_DECODE: {
- struct xdr_bytesrec bytesrec;
-
- /* get the encode and decode size */
- if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
- return (EFAULT);
- *size = decode_len;
-
- /* are we at the end of the stream? */
- if (*size == 0)
- return (0);
-
- /* sanity check the size parameter */
- if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec))
- return (EFAULT);
-
- if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail))
- return (EFAULT);
- break;
- }
-
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-static const struct nvs_ops nvs_xdr_ops = {
- nvs_xdr_nvlist,
- nvs_xdr_nvpair,
- nvs_xdr_nvp_op,
- nvs_xdr_nvp_size,
- nvs_xdr_nvl_fini
-};
-
-static int
-nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
-{
- XDR xdr;
- int err;
-
- nvs->nvs_ops = &nvs_xdr_ops;
-
- if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t),
- *buflen - sizeof (nvs_header_t))) != 0)
- return (err);
-
- err = nvs_operation(nvs, nvl, buflen);
-
- nvs_xdr_destroy(nvs);
-
- return (err);
-}
Index: head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c
+++ head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c
@@ -1,118 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/nvpair.h>
-#include <sys/sysmacros.h>
-#if defined(_KERNEL) && !defined(_BOOT)
-#include <sys/varargs.h>
-#else
-#include <stdarg.h>
-#include <strings.h>
-#endif
-
-/*
- * This allocator is very simple.
- * - it uses a pre-allocated buffer for memory allocations.
- * - it does _not_ free memory in the pre-allocated buffer.
- *
- * The reason for the selected implemention is simplicity.
- * This allocator is designed for the usage in interrupt context when
- * the caller may not wait for free memory.
- */
-
-/* pre-allocated buffer for memory allocations */
-typedef struct nvbuf {
- uintptr_t nvb_buf; /* address of pre-allocated buffer */
- uintptr_t nvb_lim; /* limit address in the buffer */
- uintptr_t nvb_cur; /* current address in the buffer */
-} nvbuf_t;
-
-/*
- * Initialize the pre-allocated buffer allocator. The caller needs to supply
- *
- * buf address of pre-allocated buffer
- * bufsz size of pre-allocated buffer
- *
- * nv_fixed_init() calculates the remaining members of nvbuf_t.
- */
-static int
-nv_fixed_init(nv_alloc_t *nva, va_list valist)
-{
- uintptr_t base = va_arg(valist, uintptr_t);
- uintptr_t lim = base + va_arg(valist, size_t);
- nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t));
-
- if (base == 0 || (uintptr_t)&nvb[1] > lim)
- return (EINVAL);
-
- nvb->nvb_buf = (uintptr_t)&nvb[0];
- nvb->nvb_cur = (uintptr_t)&nvb[1];
- nvb->nvb_lim = lim;
- nva->nva_arg = nvb;
-
- return (0);
-}
-
-static void *
-nv_fixed_alloc(nv_alloc_t *nva, size_t size)
-{
- nvbuf_t *nvb = nva->nva_arg;
- uintptr_t new = nvb->nvb_cur;
-
- if (size == 0 || new + size > nvb->nvb_lim)
- return (NULL);
-
- nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t));
-
- return ((void *)new);
-}
-
-/*ARGSUSED*/
-static void
-nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size)
-{
- /* don't free memory in the pre-allocated buffer */
-}
-
-static void
-nv_fixed_reset(nv_alloc_t *nva)
-{
- nvbuf_t *nvb = nva->nva_arg;
-
- nvb->nvb_cur = (uintptr_t)&nvb[1];
-}
-
-const nv_alloc_ops_t nv_fixed_ops_def = {
- nv_fixed_init, /* nv_ao_init() */
- NULL, /* nv_ao_fini() */
- nv_fixed_alloc, /* nv_ao_alloc() */
- nv_fixed_free, /* nv_ao_free() */
- nv_fixed_reset /* nv_ao_reset() */
-};
-
-const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def;
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
@@ -1,111 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#ifndef _ZFEATURE_COMMON_H
-#define _ZFEATURE_COMMON_H
-
-#include <sys/fs/zfs.h>
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct zfeature_info;
-
-typedef enum spa_feature {
- SPA_FEATURE_NONE = -1,
- SPA_FEATURE_ASYNC_DESTROY,
- SPA_FEATURE_EMPTY_BPOBJ,
- SPA_FEATURE_LZ4_COMPRESS,
- SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
- SPA_FEATURE_SPACEMAP_HISTOGRAM,
- SPA_FEATURE_ENABLED_TXG,
- SPA_FEATURE_HOLE_BIRTH,
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_EMBEDDED_DATA,
- SPA_FEATURE_BOOKMARKS,
- SPA_FEATURE_FS_SS_LIMIT,
- SPA_FEATURE_LARGE_BLOCKS,
- SPA_FEATURE_LARGE_DNODE,
- SPA_FEATURE_SHA512,
- SPA_FEATURE_SKEIN,
-#ifdef illumos
- SPA_FEATURE_EDONR,
-#endif
- SPA_FEATURE_DEVICE_REMOVAL,
- SPA_FEATURE_OBSOLETE_COUNTS,
- SPA_FEATURE_POOL_CHECKPOINT,
- SPA_FEATURE_SPACEMAP_V2,
- SPA_FEATURE_ALLOCATION_CLASSES,
- SPA_FEATURES
-} spa_feature_t;
-
-#define SPA_FEATURE_DISABLED (-1ULL)
-
-typedef enum zfeature_flags {
- /* Can open pool readonly even if this feature is not supported. */
- ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0),
- /* Is this feature necessary to read the MOS? */
- ZFEATURE_FLAG_MOS = (1 << 1),
- /* Activate this feature at the same time it is enabled. */
- ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2),
- /* Each dataset has a field set if it has ever used this feature. */
- ZFEATURE_FLAG_PER_DATASET = (1 << 3)
-} zfeature_flags_t;
-
-typedef struct zfeature_info {
- spa_feature_t fi_feature;
- const char *fi_uname; /* User-facing feature name */
- const char *fi_guid; /* On-disk feature identifier */
- const char *fi_desc; /* Feature description */
- zfeature_flags_t fi_flags;
- /* array of dependencies, terminated by SPA_FEATURE_NONE */
- const spa_feature_t *fi_depends;
-} zfeature_info_t;
-
-typedef int (zfeature_func_t)(zfeature_info_t *, void *);
-
-#define ZFS_FEATURE_DEBUG
-
-extern zfeature_info_t spa_feature_table[SPA_FEATURES];
-
-extern boolean_t zfeature_is_valid_guid(const char *);
-
-extern boolean_t zfeature_is_supported(const char *);
-extern int zfeature_lookup_name(const char *, spa_feature_t *);
-extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
-
-extern void zpool_feature_init(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFEATURE_COMMON_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
@@ -1,310 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#ifdef _KERNEL
-#include <sys/systm.h>
-#else
-#include <errno.h>
-#include <string.h>
-#endif
-#include <sys/debug.h>
-#include <sys/fs/zfs.h>
-#include <sys/types.h>
-#include "zfeature_common.h"
-
-/*
- * Set to disable all feature checks while opening pools, allowing pools with
- * unsupported features to be opened. Set for testing only.
- */
-boolean_t zfeature_checks_disable = B_FALSE;
-
-zfeature_info_t spa_feature_table[SPA_FEATURES];
-
-/*
- * Valid characters for feature guids. This list is mainly for aesthetic
- * purposes and could be expanded in the future. There are different allowed
- * characters in the guids reverse dns portion (before the colon) and its
- * short name (after the colon).
- */
-static int
-valid_char(char c, boolean_t after_colon)
-{
- return ((c >= 'a' && c <= 'z') ||
- (c >= '0' && c <= '9') ||
- (after_colon && c == '_') ||
- (!after_colon && (c == '.' || c == '-')));
-}
-
-/*
- * Every feature guid must contain exactly one colon which separates a reverse
- * dns organization name from the feature's "short" name (e.g.
- * "com.company:feature_name").
- */
-boolean_t
-zfeature_is_valid_guid(const char *name)
-{
- int i;
- boolean_t has_colon = B_FALSE;
-
- i = 0;
- while (name[i] != '\0') {
- char c = name[i++];
- if (c == ':') {
- if (has_colon)
- return (B_FALSE);
- has_colon = B_TRUE;
- continue;
- }
- if (!valid_char(c, has_colon))
- return (B_FALSE);
- }
-
- return (has_colon);
-}
-
-boolean_t
-zfeature_is_supported(const char *guid)
-{
- if (zfeature_checks_disable)
- return (B_TRUE);
-
- for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
- zfeature_info_t *feature = &spa_feature_table[i];
- if (strcmp(guid, feature->fi_guid) == 0)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-int
-zfeature_lookup_name(const char *name, spa_feature_t *res)
-{
- for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
- zfeature_info_t *feature = &spa_feature_table[i];
- if (strcmp(name, feature->fi_uname) == 0) {
- if (res != NULL)
- *res = i;
- return (0);
- }
- }
-
- return (ENOENT);
-}
-
-boolean_t
-zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
-{
- zfeature_info_t *feature = &spa_feature_table[fid];
-
- for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) {
- if (feature->fi_depends[i] == check)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-static void
-zfeature_register(spa_feature_t fid, const char *guid, const char *name,
- const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
-{
- zfeature_info_t *feature = &spa_feature_table[fid];
- static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
-
- ASSERT(name != NULL);
- ASSERT(desc != NULL);
- ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
- (flags & ZFEATURE_FLAG_MOS) == 0);
- ASSERT3U(fid, <, SPA_FEATURES);
- ASSERT(zfeature_is_valid_guid(guid));
-
- if (deps == NULL)
- deps = nodeps;
-
- feature->fi_feature = fid;
- feature->fi_guid = guid;
- feature->fi_uname = name;
- feature->fi_desc = desc;
- feature->fi_flags = flags;
- feature->fi_depends = deps;
-}
-
-void
-zpool_feature_init(void)
-{
- zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
- "com.delphix:async_destroy", "async_destroy",
- "Destroy filesystems asynchronously.",
- ZFEATURE_FLAG_READONLY_COMPAT, NULL);
-
- zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
- "com.delphix:empty_bpobj", "empty_bpobj",
- "Snapshots use less space.",
- ZFEATURE_FLAG_READONLY_COMPAT, NULL);
-
- zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
- "org.illumos:lz4_compress", "lz4_compress",
- "LZ4 compression algorithm support.",
- ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);
-
- zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
- "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
- "Crash dumps to multiple vdev pools.",
- 0, NULL);
-
- zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
- "com.delphix:spacemap_histogram", "spacemap_histogram",
- "Spacemaps maintain space histograms.",
- ZFEATURE_FLAG_READONLY_COMPAT, NULL);
-
- zfeature_register(SPA_FEATURE_ENABLED_TXG,
- "com.delphix:enabled_txg", "enabled_txg",
- "Record txg at which a feature is enabled",
- ZFEATURE_FLAG_READONLY_COMPAT, NULL);
-
- static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG,
- SPA_FEATURE_NONE };
- zfeature_register(SPA_FEATURE_HOLE_BIRTH,
- "com.delphix:hole_birth", "hole_birth",
- "Retain hole birth txg for more precise zfs send",
- ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
- hole_birth_deps);
-
- zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
- "com.delphix:extensible_dataset", "extensible_dataset",
- "Enhanced dataset functionality, used by other features.",
- 0, NULL);
-
- static const spa_feature_t bookmarks_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_BOOKMARKS,
- "com.delphix:bookmarks", "bookmarks",
- "\"zfs bookmark\" command",
- ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);
-
- static const spa_feature_t filesystem_limits_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
- "com.joyent:filesystem_limits", "filesystem_limits",
- "Filesystem and snapshot limits.",
- ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);
-
- zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
- "com.delphix:embedded_data", "embedded_data",
- "Blocks which compress very well use even less space.",
- ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
- NULL);
-
- zfeature_register(SPA_FEATURE_POOL_CHECKPOINT,
- "com.delphix:zpool_checkpoint", "zpool_checkpoint",
- "Pool state can be checkpointed, allowing rewind later.",
- ZFEATURE_FLAG_READONLY_COMPAT, NULL);
-
- zfeature_register(SPA_FEATURE_SPACEMAP_V2,
- "com.delphix:spacemap_v2", "spacemap_v2",
- "Space maps representing large segments are more efficient.",
- ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
- NULL);
-
- static const spa_feature_t large_blocks_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
- "org.open-zfs:large_blocks", "large_blocks",
- "Support for blocks larger than 128KB.",
- ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
-
- {
- static const spa_feature_t large_dnode_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_LARGE_DNODE,
- "org.zfsonlinux:large_dnode", "large_dnode",
- "Variable on-disk size of dnodes.",
- ZFEATURE_FLAG_PER_DATASET, large_dnode_deps);
- }
-
- static const spa_feature_t sha512_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_SHA512,
- "org.illumos:sha512", "sha512",
- "SHA-512/256 hash algorithm.",
- ZFEATURE_FLAG_PER_DATASET, sha512_deps);
-
- static const spa_feature_t skein_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_SKEIN,
- "org.illumos:skein", "skein",
- "Skein hash algorithm.",
- ZFEATURE_FLAG_PER_DATASET, skein_deps);
-
-#ifdef illumos
- static const spa_feature_t edonr_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_EDONR,
- "org.illumos:edonr", "edonr",
- "Edon-R hash algorithm.",
- ZFEATURE_FLAG_PER_DATASET, edonr_deps);
-#endif
-
- zfeature_register(SPA_FEATURE_DEVICE_REMOVAL,
- "com.delphix:device_removal", "device_removal",
- "Top-level vdevs can be removed, reducing logical pool size.",
- ZFEATURE_FLAG_MOS, NULL);
-
- static const spa_feature_t obsolete_counts_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_DEVICE_REMOVAL,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS,
- "com.delphix:obsolete_counts", "obsolete_counts",
- "Reduce memory used by removed devices when their blocks are "
- "freed or remapped.",
- ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
-
- {
- zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
- "org.zfsonlinux:allocation_classes", "allocation_classes",
- "Support for separate allocation classes.",
- ZFEATURE_FLAG_READONLY_COMPAT, NULL);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
@@ -1,52 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright 2019 Joyent, Inc.
- */
-
-#ifndef _ZFS_COMUTIL_H
-#define _ZFS_COMUTIL_H
-
-#include <sys/fs/zfs.h>
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Needed for ZoL errno usage in MMP kernel and user code */
-#define EREMOTEIO EREMOTE
-
-extern boolean_t zfs_allocatable_devs(nvlist_t *);
-extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *);
-
-extern int zfs_zpl_version_map(int spa_version);
-extern int zfs_spa_version_map(int zpl_version);
-#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41
-extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS];
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_COMUTIL_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
@@ -1,206 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- */
-
-/*
- * This file is intended for functions that ought to be common between user
- * land (libzfs) and the kernel. When many common routines need to be shared
- * then a separate file should to be created.
- */
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#else
-#include <string.h>
-#endif
-
-#include <sys/types.h>
-#include <sys/fs/zfs.h>
-#include <sys/nvpair.h>
-#include "zfs_comutil.h"
-
-/*
- * Are there allocatable vdevs?
- */
-boolean_t
-zfs_allocatable_devs(nvlist_t *nv)
-{
- uint64_t is_log;
- uint_t c;
- nvlist_t **child;
- uint_t children;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
- return (B_FALSE);
- }
- for (c = 0; c < children; c++) {
- is_log = 0;
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
- if (!is_log)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-void
-zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp)
-{
- nvlist_t *policy;
- nvpair_t *elem;
- char *nm;
-
- /* Defaults */
- zlpp->zlp_rewind = ZPOOL_NO_REWIND;
- zlpp->zlp_maxmeta = 0;
- zlpp->zlp_maxdata = UINT64_MAX;
- zlpp->zlp_txg = UINT64_MAX;
-
- if (nvl == NULL)
- return;
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- nm = nvpair_name(elem);
- if (strcmp(nm, ZPOOL_LOAD_POLICY) == 0) {
- if (nvpair_value_nvlist(elem, &policy) == 0)
- zpool_get_load_policy(policy, zlpp);
- return;
- } else if (strcmp(nm, ZPOOL_LOAD_REWIND_POLICY) == 0) {
- if (nvpair_value_uint32(elem, &zlpp->zlp_rewind) == 0)
- if (zlpp->zlp_rewind & ~ZPOOL_REWIND_POLICIES)
- zlpp->zlp_rewind = ZPOOL_NO_REWIND;
- } else if (strcmp(nm, ZPOOL_LOAD_REQUEST_TXG) == 0) {
- (void) nvpair_value_uint64(elem, &zlpp->zlp_txg);
- } else if (strcmp(nm, ZPOOL_LOAD_META_THRESH) == 0) {
- (void) nvpair_value_uint64(elem, &zlpp->zlp_maxmeta);
- } else if (strcmp(nm, ZPOOL_LOAD_DATA_THRESH) == 0) {
- (void) nvpair_value_uint64(elem, &zlpp->zlp_maxdata);
- }
- }
- if (zlpp->zlp_rewind == 0)
- zlpp->zlp_rewind = ZPOOL_NO_REWIND;
-}
-
-typedef struct zfs_version_spa_map {
- int version_zpl;
- int version_spa;
-} zfs_version_spa_map_t;
-
-/*
- * Keep this table in monotonically increasing version number order.
- */
-static zfs_version_spa_map_t zfs_version_table[] = {
- {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
- {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
- {ZPL_VERSION_FUID, SPA_VERSION_FUID},
- {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
- {ZPL_VERSION_SA, SPA_VERSION_SA},
- {0, 0}
-};
-
-/*
- * Return the max zpl version for a corresponding spa version
- * -1 is returned if no mapping exists.
- */
-int
-zfs_zpl_version_map(int spa_version)
-{
- int i;
- int version = -1;
-
- for (i = 0; zfs_version_table[i].version_spa; i++) {
- if (spa_version >= zfs_version_table[i].version_spa)
- version = zfs_version_table[i].version_zpl;
- }
-
- return (version);
-}
-
-/*
- * Return the min spa version for a corresponding spa version
- * -1 is returned if no mapping exists.
- */
-int
-zfs_spa_version_map(int zpl_version)
-{
- int i;
- int version = -1;
-
- for (i = 0; zfs_version_table[i].version_zpl; i++) {
- if (zfs_version_table[i].version_zpl >= zpl_version)
- return (zfs_version_table[i].version_spa);
- }
-
- return (version);
-}
-
-/*
- * This is the table of legacy internal event names; it should not be modified.
- * The internal events are now stored in the history log as strings.
- */
-const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
- "invalid event",
- "pool create",
- "vdev add",
- "pool remove",
- "pool destroy",
- "pool export",
- "pool import",
- "vdev attach",
- "vdev replace",
- "vdev detach",
- "vdev online",
- "vdev offline",
- "vdev upgrade",
- "pool clear",
- "pool scrub",
- "pool property set",
- "create",
- "clone",
- "destroy",
- "destroy_begin_sync",
- "inherit",
- "property set",
- "quota set",
- "permission update",
- "permission remove",
- "permission who remove",
- "promote",
- "receive",
- "rename",
- "reservation set",
- "replay_inc_sync",
- "replay_full_sync",
- "rollback",
- "snapshot",
- "filesystem version upgrade",
- "refquota set",
- "refreservation set",
- "pool scrub done",
- "user hold",
- "user release",
- "pool split",
-};
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
@@ -1,90 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _ZFS_DELEG_H
-#define _ZFS_DELEG_H
-
-#include <sys/fs/zfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */
-#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */
-
-/*
- * Max name length for a delegation attribute
- */
-#define ZFS_MAX_DELEG_NAME 128
-
-#define ZFS_DELEG_LOCAL 'l'
-#define ZFS_DELEG_DESCENDENT 'd'
-#define ZFS_DELEG_NA '-'
-
-typedef enum {
- ZFS_DELEG_NOTE_CREATE,
- ZFS_DELEG_NOTE_DESTROY,
- ZFS_DELEG_NOTE_SNAPSHOT,
- ZFS_DELEG_NOTE_ROLLBACK,
- ZFS_DELEG_NOTE_CLONE,
- ZFS_DELEG_NOTE_PROMOTE,
- ZFS_DELEG_NOTE_RENAME,
- ZFS_DELEG_NOTE_SEND,
- ZFS_DELEG_NOTE_RECEIVE,
- ZFS_DELEG_NOTE_ALLOW,
- ZFS_DELEG_NOTE_USERPROP,
- ZFS_DELEG_NOTE_MOUNT,
- ZFS_DELEG_NOTE_SHARE,
- ZFS_DELEG_NOTE_USERQUOTA,
- ZFS_DELEG_NOTE_GROUPQUOTA,
- ZFS_DELEG_NOTE_USERUSED,
- ZFS_DELEG_NOTE_GROUPUSED,
- ZFS_DELEG_NOTE_HOLD,
- ZFS_DELEG_NOTE_RELEASE,
- ZFS_DELEG_NOTE_DIFF,
- ZFS_DELEG_NOTE_BOOKMARK,
- ZFS_DELEG_NOTE_REMAP,
- ZFS_DELEG_NOTE_NONE
-} zfs_deleg_note_t;
-
-typedef struct zfs_deleg_perm_tab {
- char *z_perm;
- zfs_deleg_note_t z_note;
-} zfs_deleg_perm_tab_t;
-
-extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[];
-
-int zfs_deleg_verify_nvlist(nvlist_t *nvlist);
-void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
- char checkflag, void *data);
-const char *zfs_deleg_canonicalize_perm(const char *perm);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_DELEG_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
@@ -1,235 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
- */
-
-#include <sys/zfs_context.h>
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#include <sys/sunddi.h>
-#include <sys/ctype.h>
-#else
-#include <stdio.h>
-#include <unistd.h>
-#include <strings.h>
-#include <libnvpair.h>
-#include <ctype.h>
-#endif
-#include <sys/dsl_deleg.h>
-#include "zfs_prop.h"
-#include "zfs_deleg.h"
-#include "zfs_namecheck.h"
-
-zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
- {ZFS_DELEG_PERM_ALLOW},
- {ZFS_DELEG_PERM_BOOKMARK},
- {ZFS_DELEG_PERM_CLONE},
- {ZFS_DELEG_PERM_CREATE},
- {ZFS_DELEG_PERM_DESTROY},
- {ZFS_DELEG_PERM_DIFF},
- {ZFS_DELEG_PERM_MOUNT},
- {ZFS_DELEG_PERM_PROMOTE},
- {ZFS_DELEG_PERM_RECEIVE},
- {ZFS_DELEG_PERM_REMAP},
- {ZFS_DELEG_PERM_RENAME},
- {ZFS_DELEG_PERM_ROLLBACK},
- {ZFS_DELEG_PERM_SNAPSHOT},
- {ZFS_DELEG_PERM_SHARE},
- {ZFS_DELEG_PERM_SEND},
- {ZFS_DELEG_PERM_USERPROP},
- {ZFS_DELEG_PERM_USERQUOTA},
- {ZFS_DELEG_PERM_GROUPQUOTA},
- {ZFS_DELEG_PERM_USERUSED},
- {ZFS_DELEG_PERM_GROUPUSED},
- {ZFS_DELEG_PERM_HOLD},
- {ZFS_DELEG_PERM_RELEASE},
- {NULL}
-};
-
-static int
-zfs_valid_permission_name(const char *perm)
-{
- if (zfs_deleg_canonicalize_perm(perm))
- return (0);
-
- return (permset_namecheck(perm, NULL, NULL));
-}
-
-const char *
-zfs_deleg_canonicalize_perm(const char *perm)
-{
- int i;
- zfs_prop_t prop;
-
- for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) {
- if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0)
- return (perm);
- }
-
- prop = zfs_name_to_prop(perm);
- if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop))
- return (zfs_prop_to_name(prop));
- return (NULL);
-
-}
-
-static int
-zfs_validate_who(char *who)
-{
- char *p;
-
- if (who[2] != ZFS_DELEG_FIELD_SEP_CHR)
- return (-1);
-
- switch (who[0]) {
- case ZFS_DELEG_USER:
- case ZFS_DELEG_GROUP:
- case ZFS_DELEG_USER_SETS:
- case ZFS_DELEG_GROUP_SETS:
- if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
- return (-1);
- for (p = &who[3]; *p; p++)
- if (!isdigit(*p))
- return (-1);
- break;
-
- case ZFS_DELEG_NAMED_SET:
- case ZFS_DELEG_NAMED_SET_SETS:
- if (who[1] != ZFS_DELEG_NA)
- return (-1);
- return (permset_namecheck(&who[3], NULL, NULL));
-
- case ZFS_DELEG_CREATE:
- case ZFS_DELEG_CREATE_SETS:
- if (who[1] != ZFS_DELEG_NA)
- return (-1);
- if (who[3] != '\0')
- return (-1);
- break;
-
- case ZFS_DELEG_EVERYONE:
- case ZFS_DELEG_EVERYONE_SETS:
- if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
- return (-1);
- if (who[3] != '\0')
- return (-1);
- break;
-
- default:
- return (-1);
- }
-
- return (0);
-}
-
-int
-zfs_deleg_verify_nvlist(nvlist_t *nvp)
-{
- nvpair_t *who, *perm_name;
- nvlist_t *perms;
- int error;
-
- if (nvp == NULL)
- return (-1);
-
- who = nvlist_next_nvpair(nvp, NULL);
- if (who == NULL)
- return (-1);
-
- do {
- if (zfs_validate_who(nvpair_name(who)))
- return (-1);
-
- error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms);
-
- if (error && error != ENOENT)
- return (-1);
- if (error == ENOENT)
- continue;
-
- perm_name = nvlist_next_nvpair(perms, NULL);
- if (perm_name == NULL) {
- return (-1);
- }
- do {
- error = zfs_valid_permission_name(
- nvpair_name(perm_name));
- if (error)
- return (-1);
- } while ((perm_name = nvlist_next_nvpair(perms, perm_name))
- != NULL);
- } while ((who = nvlist_next_nvpair(nvp, who)) != NULL);
- return (0);
-}
-
-/*
- * Construct the base attribute name. The base attribute names
- * are the "key" to locate the jump objects which contain the actual
- * permissions. The base attribute names are encoded based on
- * type of entry and whether it is a local or descendent permission.
- *
- * Arguments:
- * attr - attribute name return string, attribute is assumed to be
- * ZFS_MAX_DELEG_NAME long.
- * type - type of entry to construct
- * inheritchr - inheritance type (local,descendent, or NA for create and
- * permission set definitions
- * data - is either a permission set name or a 64 bit uid/gid.
- */
-void
-zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
- char inheritchr, void *data)
-{
- int len = ZFS_MAX_DELEG_NAME;
- uint64_t *id = data;
-
- switch (type) {
- case ZFS_DELEG_USER:
- case ZFS_DELEG_GROUP:
- case ZFS_DELEG_USER_SETS:
- case ZFS_DELEG_GROUP_SETS:
- (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr,
- ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id);
- break;
- case ZFS_DELEG_NAMED_SET_SETS:
- case ZFS_DELEG_NAMED_SET:
- (void) snprintf(attr, len, "%c-%c%s", type,
- ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
- break;
- case ZFS_DELEG_CREATE:
- case ZFS_DELEG_CREATE_SETS:
- (void) snprintf(attr, len, "%c-%c", type,
- ZFS_DELEG_FIELD_SEP_CHR);
- break;
- case ZFS_DELEG_EVERYONE:
- case ZFS_DELEG_EVERYONE_SETS:
- (void) snprintf(attr, len, "%c%c%c", type, inheritchr,
- ZFS_DELEG_FIELD_SEP_CHR);
- break;
- default:
- ASSERT(!"bad zfs_deleg_who_type_t");
- }
-}
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h
@@ -1,58 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _ZFS_FLETCHER_H
-#define _ZFS_FLETCHER_H
-
-#include <sys/types.h>
-#include <sys/spa.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * fletcher checksum functions
- */
-
-void fletcher_init(zio_cksum_t *);
-void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *);
-void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *);
-int fletcher_2_incremental_native(void *, size_t, void *);
-int fletcher_2_incremental_byteswap(void *, size_t, void *);
-void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *);
-void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *);
-int fletcher_4_incremental_native(void *, size_t, void *);
-int fletcher_4_incremental_byteswap(void *, size_t, void *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_FLETCHER_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c
@@ -1,279 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-/*
- * Fletcher Checksums
- * ------------------
- *
- * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
- * recurrence relations:
- *
- * a = a + f
- * i i-1 i-1
- *
- * b = b + a
- * i i-1 i
- *
- * c = c + b (fletcher-4 only)
- * i i-1 i
- *
- * d = d + c (fletcher-4 only)
- * i i-1 i
- *
- * Where
- * a_0 = b_0 = c_0 = d_0 = 0
- * and
- * f_0 .. f_(n-1) are the input data.
- *
- * Using standard techniques, these translate into the following series:
- *
- * __n_ __n_
- * \ | \ |
- * a = > f b = > i * f
- * n /___| n - i n /___| n - i
- * i = 1 i = 1
- *
- *
- * __n_ __n_
- * \ | i*(i+1) \ | i*(i+1)*(i+2)
- * c = > ------- f d = > ------------- f
- * n /___| 2 n - i n /___| 6 n - i
- * i = 1 i = 1
- *
- * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
- * Since the additions are done mod (2^64), errors in the high bits may not
- * be noticed. For this reason, fletcher-2 is deprecated.
- *
- * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
- * A conservative estimate of how big the buffer can get before we overflow
- * can be estimated using f_i = 0xffffffff for all i:
- *
- * % bc
- * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
- * 2264
- * quit
- * %
- *
- * So blocks of up to 2k will not overflow. Our largest block size is
- * 128k, which has 32k 4-byte words, so we can compute the largest possible
- * accumulators, then divide by 2^64 to figure the max amount of overflow:
- *
- * % bc
- * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
- * a/2^64;b/2^64;c/2^64;d/2^64
- * 0
- * 0
- * 1365
- * 11186858
- * quit
- * %
- *
- * So a and b cannot overflow. To make sure each bit of input has some
- * effect on the contents of c and d, we can look at what the factors of
- * the coefficients in the equations for c_n and d_n are. The number of 2s
- * in the factors determines the lowest set bit in the multiplier. Running
- * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
- * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
- * the 64-bit accumulators, every bit of every f_i effects every accumulator,
- * even for 128k blocks.
- *
- * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
- * we could do our calculations mod (2^32 - 1) by adding in the carries
- * periodically, and store the number of carries in the top 32-bits.
- *
- * --------------------
- * Checksum Performance
- * --------------------
- *
- * There are two interesting components to checksum performance: cached and
- * uncached performance. With cached data, fletcher-2 is about four times
- * faster than fletcher-4. With uncached data, the performance difference is
- * negligible, since the cost of a cache fill dominates the processing time.
- * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
- * efficient pass over the data.
- *
- * In normal operation, the data which is being checksummed is in a buffer
- * which has been filled either by:
- *
- * 1. a compression step, which will be mostly cached, or
- * 2. a bcopy() or copyin(), which will be uncached (because the
- * copy is cache-bypassing).
- *
- * For both cached and uncached data, both fletcher checksums are much faster
- * than sha-256, and slower than 'off', which doesn't touch the data at all.
- */
-
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/byteorder.h>
-#include <sys/zio.h>
-#include <sys/spa.h>
-#include <zfs_fletcher.h>
-
-void
-fletcher_init(zio_cksum_t *zcp)
-{
- ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
-}
-
-int
-fletcher_2_incremental_native(void *buf, size_t size, void *data)
-{
- zio_cksum_t *zcp = data;
-
- const uint64_t *ip = buf;
- const uint64_t *ipend = ip + (size / sizeof (uint64_t));
- uint64_t a0, b0, a1, b1;
-
- a0 = zcp->zc_word[0];
- a1 = zcp->zc_word[1];
- b0 = zcp->zc_word[2];
- b1 = zcp->zc_word[3];
-
- for (; ip < ipend; ip += 2) {
- a0 += ip[0];
- a1 += ip[1];
- b0 += a0;
- b1 += a1;
- }
-
- ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
- return (0);
-}
-
-/*ARGSUSED*/
-void
-fletcher_2_native(const void *buf, size_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) fletcher_2_incremental_native((void *) buf, size, zcp);
-}
-
-int
-fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
-{
- zio_cksum_t *zcp = data;
-
- const uint64_t *ip = buf;
- const uint64_t *ipend = ip + (size / sizeof (uint64_t));
- uint64_t a0, b0, a1, b1;
-
- a0 = zcp->zc_word[0];
- a1 = zcp->zc_word[1];
- b0 = zcp->zc_word[2];
- b1 = zcp->zc_word[3];
-
- for (; ip < ipend; ip += 2) {
- a0 += BSWAP_64(ip[0]);
- a1 += BSWAP_64(ip[1]);
- b0 += a0;
- b1 += a1;
- }
-
- ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
- return (0);
-}
-
-/*ARGSUSED*/
-void
-fletcher_2_byteswap(const void *buf, size_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
-}
-
-int
-fletcher_4_incremental_native(void *buf, size_t size, void *data)
-{
- zio_cksum_t *zcp = data;
-
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- a = zcp->zc_word[0];
- b = zcp->zc_word[1];
- c = zcp->zc_word[2];
- d = zcp->zc_word[3];
-
- for (; ip < ipend; ip++) {
- a += ip[0];
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
- return (0);
-}
-
-/*ARGSUSED*/
-void
-fletcher_4_native(const void *buf, size_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) fletcher_4_incremental_native((void *) buf, size, zcp);
-}
-
-int
-fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
-{
- zio_cksum_t *zcp = data;
-
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- a = zcp->zc_word[0];
- b = zcp->zc_word[1];
- c = zcp->zc_word[2];
- d = zcp->zc_word[3];
-
- for (; ip < ipend; ip++) {
- a += BSWAP_32(ip[0]);
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
- return (0);
-}
-
-/*ARGSUSED*/
-void
-fletcher_4_byteswap(const void *buf, size_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) fletcher_4_incremental_byteswap((void *) buf, size, zcp);
-}
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
@@ -1,543 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
- * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_IOCTL_COMPAT_H
-#define _SYS_ZFS_IOCTL_COMPAT_H
-
-#include <sys/cred.h>
-#include <sys/dmu.h>
-#include <sys/zio.h>
-#include <sys/dsl_deleg.h>
-#include <sys/zfs_ioctl.h>
-
-#ifdef _KERNEL
-#include <sys/nvpair.h>
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Backwards ioctl compatibility
- */
-
-/* ioctl versions for vfs.zfs.version.ioctl */
-#define ZFS_IOCVER_UNDEF -1
-#define ZFS_IOCVER_NONE 0
-#define ZFS_IOCVER_DEADMAN 1
-#define ZFS_IOCVER_LZC 2
-#define ZFS_IOCVER_ZCMD 3
-#define ZFS_IOCVER_EDBP 4
-#define ZFS_IOCVER_RESUME 5
-#define ZFS_IOCVER_INLANES 6
-#define ZFS_IOCVER_PAD 7
-#define ZFS_IOCVER_CURRENT ZFS_IOCVER_PAD
-
-/* compatibility conversion flag */
-#define ZFS_CMD_COMPAT_NONE 0
-#define ZFS_CMD_COMPAT_V15 1
-#define ZFS_CMD_COMPAT_V28 2
-#define ZFS_CMD_COMPAT_DEADMAN 3
-#define ZFS_CMD_COMPAT_LZC 4
-#define ZFS_CMD_COMPAT_ZCMD 5
-#define ZFS_CMD_COMPAT_EDBP 6
-#define ZFS_CMD_COMPAT_RESUME 7
-#define ZFS_CMD_COMPAT_INLANES 8
-
-#define ZFS_IOC_COMPAT_PASS 254
-#define ZFS_IOC_COMPAT_FAIL 255
-
-#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff)
-
-typedef struct zfs_iocparm {
- uint32_t zfs_ioctl_version;
- uint64_t zfs_cmd;
- uint64_t zfs_cmd_size;
-} zfs_iocparm_t;
-
-typedef struct zinject_record_v15 {
- uint64_t zi_objset;
- uint64_t zi_object;
- uint64_t zi_start;
- uint64_t zi_end;
- uint64_t zi_guid;
- uint32_t zi_level;
- uint32_t zi_error;
- uint64_t zi_type;
- uint32_t zi_freq;
- uint32_t zi_failfast;
-} zinject_record_v15_t;
-
-typedef struct zfs_cmd_v15 {
- char zc_name[MAXPATHLEN];
- char zc_value[MAXPATHLEN];
- char zc_string[MAXNAMELEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history; /* really (char *) */
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
- zinject_record_v15_t zc_inject_record;
-} zfs_cmd_v15_t;
-
-typedef struct zinject_record_v28 {
- uint64_t zi_objset;
- uint64_t zi_object;
- uint64_t zi_start;
- uint64_t zi_end;
- uint64_t zi_guid;
- uint32_t zi_level;
- uint32_t zi_error;
- uint64_t zi_type;
- uint32_t zi_freq;
- uint32_t zi_failfast;
- char zi_func[MAXNAMELEN];
- uint32_t zi_iotype;
- int32_t zi_duration;
- uint64_t zi_timer;
-} zinject_record_v28_t;
-
-typedef struct zfs_cmd_v28 {
- char zc_name[MAXPATHLEN];
- char zc_value[MAXPATHLEN * 2];
- char zc_string[MAXNAMELEN];
- char zc_top_ds[MAXPATHLEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history; /* really (char *) */
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
- zinject_record_v28_t zc_inject_record;
- boolean_t zc_defer_destroy;
- boolean_t zc_temphold;
- uint64_t zc_action_handle;
- int zc_cleanup_fd;
- uint8_t zc_simple;
- uint8_t zc_pad[3]; /* alignment */
- uint64_t zc_sendobj;
- uint64_t zc_fromobj;
- uint64_t zc_createtxg;
- zfs_stat_t zc_stat;
-} zfs_cmd_v28_t;
-
-typedef struct zinject_record_deadman {
- uint64_t zi_objset;
- uint64_t zi_object;
- uint64_t zi_start;
- uint64_t zi_end;
- uint64_t zi_guid;
- uint32_t zi_level;
- uint32_t zi_error;
- uint64_t zi_type;
- uint32_t zi_freq;
- uint32_t zi_failfast;
- char zi_func[MAXNAMELEN];
- uint32_t zi_iotype;
- int32_t zi_duration;
- uint64_t zi_timer;
- uint32_t zi_cmd;
- uint32_t zi_pad;
-} zinject_record_deadman_t;
-
-typedef struct zfs_cmd_deadman {
- char zc_name[MAXPATHLEN];
- char zc_value[MAXPATHLEN * 2];
- char zc_string[MAXNAMELEN];
- char zc_top_ds[MAXPATHLEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history; /* really (char *) */
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
- /* zc_inject_record doesn't change in libzfs_core */
- zinject_record_deadman_t zc_inject_record;
- boolean_t zc_defer_destroy;
- boolean_t zc_temphold;
- uint64_t zc_action_handle;
- int zc_cleanup_fd;
- uint8_t zc_simple;
- uint8_t zc_pad[3]; /* alignment */
- uint64_t zc_sendobj;
- uint64_t zc_fromobj;
- uint64_t zc_createtxg;
- zfs_stat_t zc_stat;
-} zfs_cmd_deadman_t;
-
-typedef struct zfs_cmd_zcmd {
- char zc_name[MAXPATHLEN]; /* name of pool or dataset */
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
- int zc_pad2;
-
- /*
- * The following members are for legacy ioctls which haven't been
- * converted to the new method.
- */
- uint64_t zc_history; /* really (char *) */
- char zc_value[MAXPATHLEN * 2];
- char zc_string[MAXNAMELEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
- zinject_record_deadman_t zc_inject_record;
- boolean_t zc_defer_destroy;
- boolean_t zc_temphold;
- uint64_t zc_action_handle;
- int zc_cleanup_fd;
- uint8_t zc_simple;
- uint8_t zc_pad[3]; /* alignment */
- uint64_t zc_sendobj;
- uint64_t zc_fromobj;
- uint64_t zc_createtxg;
- zfs_stat_t zc_stat;
-} zfs_cmd_zcmd_t;
-
-typedef struct zfs_cmd_edbp {
- char zc_name[MAXPATHLEN]; /* name of pool or dataset */
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
- int zc_pad2;
-
- /*
- * The following members are for legacy ioctls which haven't been
- * converted to the new method.
- */
- uint64_t zc_history; /* really (char *) */
- char zc_value[MAXPATHLEN * 2];
- char zc_string[MAXNAMELEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
- zinject_record_deadman_t zc_inject_record;
- uint32_t zc_defer_destroy;
- uint32_t zc_flags;
- uint64_t zc_action_handle;
- int zc_cleanup_fd;
- uint8_t zc_simple;
- uint8_t zc_pad[3]; /* alignment */
- uint64_t zc_sendobj;
- uint64_t zc_fromobj;
- uint64_t zc_createtxg;
- zfs_stat_t zc_stat;
-} zfs_cmd_edbp_t;
-
-typedef struct zfs_cmd_resume {
- char zc_name[MAXPATHLEN]; /* name of pool or dataset */
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
- int zc_pad2;
-
- /*
- * The following members are for legacy ioctls which haven't been
- * converted to the new method.
- */
- uint64_t zc_history; /* really (char *) */
- char zc_value[MAXPATHLEN * 2];
- char zc_string[MAXNAMELEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- dmu_replay_record_t zc_begin_record;
- zinject_record_deadman_t zc_inject_record;
- uint32_t zc_defer_destroy;
- uint32_t zc_flags;
- uint64_t zc_action_handle;
- int zc_cleanup_fd;
- uint8_t zc_simple;
- boolean_t zc_resumable;
- uint64_t zc_sendobj;
- uint64_t zc_fromobj;
- uint64_t zc_createtxg;
- zfs_stat_t zc_stat;
-} zfs_cmd_resume_t;
-
-typedef struct zfs_cmd_inlanes {
- char zc_name[MAXPATHLEN]; /* name of pool or dataset */
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
- int zc_pad2;
-
- /*
- * The following members are for legacy ioctls which haven't been
- * converted to the new method.
- */
- uint64_t zc_history; /* really (char *) */
- char zc_value[MAXPATHLEN * 2];
- char zc_string[MAXNAMELEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- dmu_replay_record_t zc_begin_record;
- zinject_record_t zc_inject_record;
- uint32_t zc_defer_destroy;
- uint32_t zc_flags;
- uint64_t zc_action_handle;
- int zc_cleanup_fd;
- uint8_t zc_simple;
- boolean_t zc_resumable;
- uint64_t zc_sendobj;
- uint64_t zc_fromobj;
- uint64_t zc_createtxg;
- zfs_stat_t zc_stat;
-} zfs_cmd_inlanes_t;
-
-#ifdef _KERNEL
-unsigned static long zfs_ioctl_v15_to_v28[] = {
- 0, /* 0 ZFS_IOC_POOL_CREATE */
- 1, /* 1 ZFS_IOC_POOL_DESTROY */
- 2, /* 2 ZFS_IOC_POOL_IMPORT */
- 3, /* 3 ZFS_IOC_POOL_EXPORT */
- 4, /* 4 ZFS_IOC_POOL_CONFIGS */
- 5, /* 5 ZFS_IOC_POOL_STATS */
- 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */
- 7, /* 7 ZFS_IOC_POOL_SCRUB */
- 8, /* 8 ZFS_IOC_POOL_FREEZE */
- 9, /* 9 ZFS_IOC_POOL_UPGRADE */
- 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */
- 11, /* 11 ZFS_IOC_VDEV_ADD */
- 12, /* 12 ZFS_IOC_VDEV_REMOVE */
- 13, /* 13 ZFS_IOC_VDEV_SET_STATE */
- 14, /* 14 ZFS_IOC_VDEV_ATTACH */
- 15, /* 15 ZFS_IOC_VDEV_DETACH */
- 16, /* 16 ZFS_IOC_VDEV_SETPATH */
- 18, /* 17 ZFS_IOC_OBJSET_STATS */
- 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */
- 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */
- 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */
- 22, /* 21 ZFS_IOC_SET_PROP */
- ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */
- ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */
- 23, /* 24 ZFS_IOC_CREATE */
- 24, /* 25 ZFS_IOC_DESTROY */
- 25, /* 26 ZFS_IOC_ROLLBACK */
- 26, /* 27 ZFS_IOC_RENAME */
- 27, /* 28 ZFS_IOC_RECV */
- 28, /* 29 ZFS_IOC_SEND */
- 29, /* 30 ZFS_IOC_INJECT_FAULT */
- 30, /* 31 ZFS_IOC_CLEAR_FAULT */
- 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */
- 32, /* 33 ZFS_IOC_ERROR_LOG */
- 33, /* 34 ZFS_IOC_CLEAR */
- 34, /* 35 ZFS_IOC_PROMOTE */
- 35, /* 36 ZFS_IOC_DESTROY_SNAPS */
- 36, /* 37 ZFS_IOC_SNAPSHOT */
- 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */
- 38, /* 39 ZFS_IOC_OBJ_TO_PATH */
- 39, /* 40 ZFS_IOC_POOL_SET_PROPS */
- 40, /* 41 ZFS_IOC_POOL_GET_PROPS */
- 41, /* 42 ZFS_IOC_SET_FSACL */
- 42, /* 43 ZFS_IOC_GET_FSACL */
- ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */
- 43, /* 45 ZFS_IOC_SHARE */
- 44, /* 46 ZFS_IOC_IHNERIT_PROP */
- 58, /* 47 ZFS_IOC_JAIL */
- 59, /* 48 ZFS_IOC_UNJAIL */
- 45, /* 49 ZFS_IOC_SMB_ACL */
- 46, /* 50 ZFS_IOC_USERSPACE_ONE */
- 47, /* 51 ZFS_IOC_USERSPACE_MANY */
- 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */
- 17, /* 53 ZFS_IOC_SETFRU */
-};
-
-#else /* KERNEL */
-unsigned static long zfs_ioctl_v28_to_v15[] = {
- 0, /* 0 ZFS_IOC_POOL_CREATE */
- 1, /* 1 ZFS_IOC_POOL_DESTROY */
- 2, /* 2 ZFS_IOC_POOL_IMPORT */
- 3, /* 3 ZFS_IOC_POOL_EXPORT */
- 4, /* 4 ZFS_IOC_POOL_CONFIGS */
- 5, /* 5 ZFS_IOC_POOL_STATS */
- 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */
- 7, /* 7 ZFS_IOC_POOL_SCAN */
- 8, /* 8 ZFS_IOC_POOL_FREEZE */
- 9, /* 9 ZFS_IOC_POOL_UPGRADE */
- 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */
- 11, /* 11 ZFS_IOC_VDEV_ADD */
- 12, /* 12 ZFS_IOC_VDEV_REMOVE */
- 13, /* 13 ZFS_IOC_VDEV_SET_STATE */
- 14, /* 14 ZFS_IOC_VDEV_ATTACH */
- 15, /* 15 ZFS_IOC_VDEV_DETACH */
- 16, /* 16 ZFS_IOC_VDEV_SETPATH */
- 53, /* 17 ZFS_IOC_VDEV_SETFRU */
- 17, /* 18 ZFS_IOC_OBJSET_STATS */
- 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */
- 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */
- 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */
- 21, /* 22 ZFS_IOC_SET_PROP */
- 24, /* 23 ZFS_IOC_CREATE */
- 25, /* 24 ZFS_IOC_DESTROY */
- 26, /* 25 ZFS_IOC_ROLLBACK */
- 27, /* 26 ZFS_IOC_RENAME */
- 28, /* 27 ZFS_IOC_RECV */
- 29, /* 28 ZFS_IOC_SEND */
- 30, /* 39 ZFS_IOC_INJECT_FAULT */
- 31, /* 30 ZFS_IOC_CLEAR_FAULT */
- 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */
- 33, /* 32 ZFS_IOC_ERROR_LOG */
- 34, /* 33 ZFS_IOC_CLEAR */
- 35, /* 34 ZFS_IOC_PROMOTE */
- 36, /* 35 ZFS_IOC_DESTROY_SNAPS */
- 37, /* 36 ZFS_IOC_SNAPSHOT */
- 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */
- 39, /* 38 ZFS_IOC_OBJ_TO_PATH */
- 40, /* 39 ZFS_IOC_POOL_SET_PROPS */
- 41, /* 40 ZFS_IOC_POOL_GET_PROPS */
- 42, /* 41 ZFS_IOC_SET_FSACL */
- 43, /* 42 ZFS_IOC_GET_FSACL */
- 45, /* 43 ZFS_IOC_SHARE */
- 46, /* 44 ZFS_IOC_IHNERIT_PROP */
- 49, /* 45 ZFS_IOC_SMB_ACL */
- 50, /* 46 ZFS_IOC_USERSPACE_ONE */
- 51, /* 47 ZFS_IOC_USERSPACE_MANY */
- 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */
- ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */
- ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */
- ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */
- ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */
- ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */
- ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */
- ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */
- ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */
- ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */
- 47, /* 58 ZFS_IOC_JAIL */
- 48, /* 59 ZFS_IOC_UNJAIL */
-};
-#endif /* ! _KERNEL */
-
-#ifdef _KERNEL
-int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int);
-void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int);
-nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int,
- const int);
-nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int,
- const int);
-#else
-int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int);
-#endif /* _KERNEL */
-void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int);
-void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_IOCTL_COMPAT_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
@@ -1,1380 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2013 Xin Li <delphij@FreeBSD.org>. All rights reserved.
- * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Portions Copyright 2005, 2010, Oracle and/or its affiliates.
- * All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/cred.h>
-#include <sys/dmu.h>
-#include <sys/zio.h>
-#include <sys/nvpair.h>
-#include <sys/dsl_deleg.h>
-#include <sys/zfs_ioctl.h>
-#include "zfs_namecheck.h"
-#include "zfs_ioctl_compat.h"
-
-static int zfs_version_ioctl = ZFS_IOCVER_CURRENT;
-SYSCTL_DECL(_vfs_zfs_version);
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
- 0, "ZFS_IOCTL_VERSION");
-
-/*
- * FreeBSD zfs_cmd compatibility with older binaries
- * appropriately remap/extend the zfs_cmd_t structure
- */
-void
-zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
-{
- zfs_cmd_v15_t *zc_c;
- zfs_cmd_v28_t *zc28_c;
- zfs_cmd_deadman_t *zcdm_c;
- zfs_cmd_zcmd_t *zcmd_c;
- zfs_cmd_edbp_t *edbp_c;
- zfs_cmd_resume_t *resume_c;
- zfs_cmd_inlanes_t *inlanes_c;
-
- switch (cflag) {
- case ZFS_CMD_COMPAT_INLANES:
- inlanes_c = (void *)addr;
- /* zc */
- strlcpy(zc->zc_name, inlanes_c->zc_name, MAXPATHLEN);
- strlcpy(zc->zc_value, inlanes_c->zc_value, MAXPATHLEN * 2);
- strlcpy(zc->zc_string, inlanes_c->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) zc->field = inlanes_c->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- FIELD_COPY(zc_begin_record);
- FIELD_COPY(zc_inject_record);
- FIELD_COPY(zc_defer_destroy);
- FIELD_COPY(zc_flags);
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- FIELD_COPY(zc_resumable);
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
- break;
-
- case ZFS_CMD_COMPAT_RESUME:
- resume_c = (void *)addr;
- /* zc */
- strlcpy(zc->zc_name, resume_c->zc_name, MAXPATHLEN);
- strlcpy(zc->zc_value, resume_c->zc_value, MAXPATHLEN * 2);
- strlcpy(zc->zc_string, resume_c->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) zc->field = resume_c->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- FIELD_COPY(zc_begin_record);
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(zc->zc_inject_record.zi_func,
- resume_c->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- zc->zc_inject_record.zi_nlanes = 1;
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
- FIELD_COPY(zc_defer_destroy);
- FIELD_COPY(zc_flags);
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- FIELD_COPY(zc_resumable);
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
- break;
-
- case ZFS_CMD_COMPAT_EDBP:
- edbp_c = (void *)addr;
- /* zc */
- strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN);
- strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2);
- strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) zc->field = edbp_c->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record;
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(zc->zc_inject_record.zi_func,
- edbp_c->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- zc->zc_inject_record.zi_nlanes = 1;
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
- FIELD_COPY(zc_defer_destroy);
- FIELD_COPY(zc_flags);
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- zc->zc_resumable = B_FALSE;
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
- break;
-
- case ZFS_CMD_COMPAT_ZCMD:
- zcmd_c = (void *)addr;
- /* zc */
- strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN);
- strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2);
- strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) zc->field = zcmd_c->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record;
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(zc->zc_inject_record.zi_func,
- zcmd_c->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- zc->zc_inject_record.zi_nlanes = 1;
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
-
- /* boolean_t -> uint32_t */
- zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy);
- zc->zc_flags = 0;
-
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- zc->zc_resumable = B_FALSE;
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
-
- break;
-
- case ZFS_CMD_COMPAT_DEADMAN:
- zcdm_c = (void *)addr;
- /* zc */
- strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN);
- strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2);
- strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) zc->field = zcdm_c->field
- zc->zc_guid = zcdm_c->zc_guid;
- zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf;
- zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size;
- zc->zc_nvlist_src = zcdm_c->zc_nvlist_src;
- zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size;
- zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst;
- zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size;
- zc->zc_cookie = zcdm_c->zc_cookie;
- zc->zc_objset_type = zcdm_c->zc_objset_type;
- zc->zc_perm_action = zcdm_c->zc_perm_action;
- zc->zc_history = zcdm_c->zc_history;
- zc->zc_history_len = zcdm_c->zc_history_len;
- zc->zc_history_offset = zcdm_c->zc_history_offset;
- zc->zc_obj = zcdm_c->zc_obj;
- zc->zc_iflags = zcdm_c->zc_iflags;
- zc->zc_share = zcdm_c->zc_share;
- zc->zc_jailid = zcdm_c->zc_jailid;
- zc->zc_objset_stats = zcdm_c->zc_objset_stats;
- zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record;
- zc->zc_defer_destroy = zcdm_c->zc_defer_destroy;
- (void)zcdm_c->zc_temphold;
- zc->zc_action_handle = zcdm_c->zc_action_handle;
- zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd;
- zc->zc_simple = zcdm_c->zc_simple;
- zc->zc_resumable = B_FALSE;
- zc->zc_sendobj = zcdm_c->zc_sendobj;
- zc->zc_fromobj = zcdm_c->zc_fromobj;
- zc->zc_createtxg = zcdm_c->zc_createtxg;
- zc->zc_stat = zcdm_c->zc_stat;
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(zc->zc_inject_record.zi_func,
- resume_c->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- zc->zc_inject_record.zi_nlanes = 1;
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
-
- /* we always assume zc_nvlist_dst_filled is true */
- zc->zc_nvlist_dst_filled = B_TRUE;
-#undef FIELD_COPY
- break;
-
- case ZFS_CMD_COMPAT_V28:
- zc28_c = (void *)addr;
-
- /* zc */
- strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN);
- strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2);
- strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN);
- zc->zc_guid = zc28_c->zc_guid;
- zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf;
- zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size;
- zc->zc_nvlist_src = zc28_c->zc_nvlist_src;
- zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size;
- zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst;
- zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size;
- zc->zc_cookie = zc28_c->zc_cookie;
- zc->zc_objset_type = zc28_c->zc_objset_type;
- zc->zc_perm_action = zc28_c->zc_perm_action;
- zc->zc_history = zc28_c->zc_history;
- zc->zc_history_len = zc28_c->zc_history_len;
- zc->zc_history_offset = zc28_c->zc_history_offset;
- zc->zc_obj = zc28_c->zc_obj;
- zc->zc_iflags = zc28_c->zc_iflags;
- zc->zc_share = zc28_c->zc_share;
- zc->zc_jailid = zc28_c->zc_jailid;
- zc->zc_objset_stats = zc28_c->zc_objset_stats;
- zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record;
- zc->zc_defer_destroy = zc28_c->zc_defer_destroy;
- (void)zc28_c->zc_temphold;
- zc->zc_action_handle = zc28_c->zc_action_handle;
- zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd;
- zc->zc_simple = zc28_c->zc_simple;
- zc->zc_resumable = B_FALSE;
- zc->zc_sendobj = zc28_c->zc_sendobj;
- zc->zc_fromobj = zc28_c->zc_fromobj;
- zc->zc_createtxg = zc28_c->zc_createtxg;
- zc->zc_stat = zc28_c->zc_stat;
-
- /* zc->zc_inject_record */
- zc->zc_inject_record.zi_objset =
- zc28_c->zc_inject_record.zi_objset;
- zc->zc_inject_record.zi_object =
- zc28_c->zc_inject_record.zi_object;
- zc->zc_inject_record.zi_start =
- zc28_c->zc_inject_record.zi_start;
- zc->zc_inject_record.zi_end =
- zc28_c->zc_inject_record.zi_end;
- zc->zc_inject_record.zi_guid =
- zc28_c->zc_inject_record.zi_guid;
- zc->zc_inject_record.zi_level =
- zc28_c->zc_inject_record.zi_level;
- zc->zc_inject_record.zi_error =
- zc28_c->zc_inject_record.zi_error;
- zc->zc_inject_record.zi_type =
- zc28_c->zc_inject_record.zi_type;
- zc->zc_inject_record.zi_freq =
- zc28_c->zc_inject_record.zi_freq;
- zc->zc_inject_record.zi_failfast =
- zc28_c->zc_inject_record.zi_failfast;
- strlcpy(zc->zc_inject_record.zi_func,
- zc28_c->zc_inject_record.zi_func, MAXNAMELEN);
- zc->zc_inject_record.zi_iotype =
- zc28_c->zc_inject_record.zi_iotype;
- zc->zc_inject_record.zi_duration =
- zc28_c->zc_inject_record.zi_duration;
- zc->zc_inject_record.zi_timer =
- zc28_c->zc_inject_record.zi_timer;
- zc->zc_inject_record.zi_nlanes = 1;
- zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED;
- zc->zc_inject_record.zi_pad = 0;
- break;
-
- case ZFS_CMD_COMPAT_V15:
- zc_c = (void *)addr;
-
- /* zc */
- strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN);
- strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN);
- strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN);
- zc->zc_guid = zc_c->zc_guid;
- zc->zc_nvlist_conf = zc_c->zc_nvlist_conf;
- zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size;
- zc->zc_nvlist_src = zc_c->zc_nvlist_src;
- zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size;
- zc->zc_nvlist_dst = zc_c->zc_nvlist_dst;
- zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size;
- zc->zc_cookie = zc_c->zc_cookie;
- zc->zc_objset_type = zc_c->zc_objset_type;
- zc->zc_perm_action = zc_c->zc_perm_action;
- zc->zc_history = zc_c->zc_history;
- zc->zc_history_len = zc_c->zc_history_len;
- zc->zc_history_offset = zc_c->zc_history_offset;
- zc->zc_obj = zc_c->zc_obj;
- zc->zc_share = zc_c->zc_share;
- zc->zc_jailid = zc_c->zc_jailid;
- zc->zc_objset_stats = zc_c->zc_objset_stats;
- zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record;
-
- /* zc->zc_inject_record */
- zc->zc_inject_record.zi_objset =
- zc_c->zc_inject_record.zi_objset;
- zc->zc_inject_record.zi_object =
- zc_c->zc_inject_record.zi_object;
- zc->zc_inject_record.zi_start =
- zc_c->zc_inject_record.zi_start;
- zc->zc_inject_record.zi_end =
- zc_c->zc_inject_record.zi_end;
- zc->zc_inject_record.zi_guid =
- zc_c->zc_inject_record.zi_guid;
- zc->zc_inject_record.zi_level =
- zc_c->zc_inject_record.zi_level;
- zc->zc_inject_record.zi_error =
- zc_c->zc_inject_record.zi_error;
- zc->zc_inject_record.zi_type =
- zc_c->zc_inject_record.zi_type;
- zc->zc_inject_record.zi_freq =
- zc_c->zc_inject_record.zi_freq;
- zc->zc_inject_record.zi_failfast =
- zc_c->zc_inject_record.zi_failfast;
- break;
- }
-}
-
-void
-zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request,
- const int cflag)
-{
- zfs_cmd_v15_t *zc_c;
- zfs_cmd_v28_t *zc28_c;
- zfs_cmd_deadman_t *zcdm_c;
- zfs_cmd_zcmd_t *zcmd_c;
- zfs_cmd_edbp_t *edbp_c;
- zfs_cmd_resume_t *resume_c;
- zfs_cmd_inlanes_t *inlanes_c;
-
- switch (cflag) {
- case ZFS_CMD_COMPAT_INLANES:
- inlanes_c = (void *)addr;
- strlcpy(inlanes_c->zc_name, zc->zc_name, MAXPATHLEN);
- strlcpy(inlanes_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
- strlcpy(inlanes_c->zc_string, zc->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) inlanes_c->field = zc->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- FIELD_COPY(zc_begin_record);
- FIELD_COPY(zc_inject_record);
- FIELD_COPY(zc_defer_destroy);
- FIELD_COPY(zc_flags);
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
- break;
-
- case ZFS_CMD_COMPAT_RESUME:
- resume_c = (void *)addr;
- strlcpy(resume_c->zc_name, zc->zc_name, MAXPATHLEN);
- strlcpy(resume_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
- strlcpy(resume_c->zc_string, zc->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) resume_c->field = zc->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- FIELD_COPY(zc_begin_record);
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(resume_c->zc_inject_record.zi_func,
- zc->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
- FIELD_COPY(zc_defer_destroy);
- FIELD_COPY(zc_flags);
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
- break;
-
- case ZFS_CMD_COMPAT_EDBP:
- edbp_c = (void *)addr;
- strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN);
- strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
- strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) edbp_c->field = zc->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(resume_c->zc_inject_record.zi_func,
- zc->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
- FIELD_COPY(zc_defer_destroy);
- FIELD_COPY(zc_flags);
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
- break;
-
- case ZFS_CMD_COMPAT_ZCMD:
- zcmd_c = (void *)addr;
- /* zc */
- strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN);
- strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
- strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) zcmd_c->field = zc->field
- FIELD_COPY(zc_nvlist_src);
- FIELD_COPY(zc_nvlist_src_size);
- FIELD_COPY(zc_nvlist_dst);
- FIELD_COPY(zc_nvlist_dst_size);
- FIELD_COPY(zc_nvlist_dst_filled);
- FIELD_COPY(zc_pad2);
- FIELD_COPY(zc_history);
- FIELD_COPY(zc_guid);
- FIELD_COPY(zc_nvlist_conf);
- FIELD_COPY(zc_nvlist_conf_size);
- FIELD_COPY(zc_cookie);
- FIELD_COPY(zc_objset_type);
- FIELD_COPY(zc_perm_action);
- FIELD_COPY(zc_history_len);
- FIELD_COPY(zc_history_offset);
- FIELD_COPY(zc_obj);
- FIELD_COPY(zc_iflags);
- FIELD_COPY(zc_share);
- FIELD_COPY(zc_jailid);
- FIELD_COPY(zc_objset_stats);
- zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(resume_c->zc_inject_record.zi_func,
- zc->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
-
- /* boolean_t -> uint32_t */
- zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy);
- zcmd_c->zc_temphold = 0;
-
- FIELD_COPY(zc_action_handle);
- FIELD_COPY(zc_cleanup_fd);
- FIELD_COPY(zc_simple);
- FIELD_COPY(zc_sendobj);
- FIELD_COPY(zc_fromobj);
- FIELD_COPY(zc_createtxg);
- FIELD_COPY(zc_stat);
-#undef FIELD_COPY
-
- break;
-
- case ZFS_CMD_COMPAT_DEADMAN:
- zcdm_c = (void *)addr;
-
- strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN);
- strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
- strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN);
-
-#define FIELD_COPY(field) zcdm_c->field = zc->field
- zcdm_c->zc_guid = zc->zc_guid;
- zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf;
- zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
- zcdm_c->zc_nvlist_src = zc->zc_nvlist_src;
- zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
- zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst;
- zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
- zcdm_c->zc_cookie = zc->zc_cookie;
- zcdm_c->zc_objset_type = zc->zc_objset_type;
- zcdm_c->zc_perm_action = zc->zc_perm_action;
- zcdm_c->zc_history = zc->zc_history;
- zcdm_c->zc_history_len = zc->zc_history_len;
- zcdm_c->zc_history_offset = zc->zc_history_offset;
- zcdm_c->zc_obj = zc->zc_obj;
- zcdm_c->zc_iflags = zc->zc_iflags;
- zcdm_c->zc_share = zc->zc_share;
- zcdm_c->zc_jailid = zc->zc_jailid;
- zcdm_c->zc_objset_stats = zc->zc_objset_stats;
- zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
- zcdm_c->zc_defer_destroy = zc->zc_defer_destroy;
- zcdm_c->zc_temphold = 0;
- zcdm_c->zc_action_handle = zc->zc_action_handle;
- zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd;
- zcdm_c->zc_simple = zc->zc_simple;
- zcdm_c->zc_sendobj = zc->zc_sendobj;
- zcdm_c->zc_fromobj = zc->zc_fromobj;
- zcdm_c->zc_createtxg = zc->zc_createtxg;
- zcdm_c->zc_stat = zc->zc_stat;
- FIELD_COPY(zc_inject_record.zi_objset);
- FIELD_COPY(zc_inject_record.zi_object);
- FIELD_COPY(zc_inject_record.zi_start);
- FIELD_COPY(zc_inject_record.zi_end);
- FIELD_COPY(zc_inject_record.zi_guid);
- FIELD_COPY(zc_inject_record.zi_level);
- FIELD_COPY(zc_inject_record.zi_error);
- FIELD_COPY(zc_inject_record.zi_type);
- FIELD_COPY(zc_inject_record.zi_freq);
- FIELD_COPY(zc_inject_record.zi_failfast);
- strlcpy(resume_c->zc_inject_record.zi_func,
- zc->zc_inject_record.zi_func, MAXNAMELEN);
- FIELD_COPY(zc_inject_record.zi_iotype);
- FIELD_COPY(zc_inject_record.zi_duration);
- FIELD_COPY(zc_inject_record.zi_timer);
- FIELD_COPY(zc_inject_record.zi_cmd);
- FIELD_COPY(zc_inject_record.zi_pad);
-#undef FIELD_COPY
-#ifndef _KERNEL
- if (request == ZFS_IOC_RECV)
- strlcpy(zcdm_c->zc_top_ds,
- zc->zc_value + strlen(zc->zc_value) + 1,
- (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1);
-#endif
- break;
-
- case ZFS_CMD_COMPAT_V28:
- zc28_c = (void *)addr;
-
- strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN);
- strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
- strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN);
- zc28_c->zc_guid = zc->zc_guid;
- zc28_c->zc_nvlist_conf = zc->zc_nvlist_conf;
- zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
- zc28_c->zc_nvlist_src = zc->zc_nvlist_src;
- zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
- zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst;
- zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
- zc28_c->zc_cookie = zc->zc_cookie;
- zc28_c->zc_objset_type = zc->zc_objset_type;
- zc28_c->zc_perm_action = zc->zc_perm_action;
- zc28_c->zc_history = zc->zc_history;
- zc28_c->zc_history_len = zc->zc_history_len;
- zc28_c->zc_history_offset = zc->zc_history_offset;
- zc28_c->zc_obj = zc->zc_obj;
- zc28_c->zc_iflags = zc->zc_iflags;
- zc28_c->zc_share = zc->zc_share;
- zc28_c->zc_jailid = zc->zc_jailid;
- zc28_c->zc_objset_stats = zc->zc_objset_stats;
- zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
- zc28_c->zc_defer_destroy = zc->zc_defer_destroy;
- zc28_c->zc_temphold = 0;
- zc28_c->zc_action_handle = zc->zc_action_handle;
- zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd;
- zc28_c->zc_simple = zc->zc_simple;
- zc28_c->zc_sendobj = zc->zc_sendobj;
- zc28_c->zc_fromobj = zc->zc_fromobj;
- zc28_c->zc_createtxg = zc->zc_createtxg;
- zc28_c->zc_stat = zc->zc_stat;
-#ifndef _KERNEL
- if (request == ZFS_IOC_RECV)
- strlcpy(zc28_c->zc_top_ds,
- zc->zc_value + strlen(zc->zc_value) + 1,
- MAXPATHLEN * 2 - strlen(zc->zc_value) - 1);
-#endif
- /* zc_inject_record */
- zc28_c->zc_inject_record.zi_objset =
- zc->zc_inject_record.zi_objset;
- zc28_c->zc_inject_record.zi_object =
- zc->zc_inject_record.zi_object;
- zc28_c->zc_inject_record.zi_start =
- zc->zc_inject_record.zi_start;
- zc28_c->zc_inject_record.zi_end =
- zc->zc_inject_record.zi_end;
- zc28_c->zc_inject_record.zi_guid =
- zc->zc_inject_record.zi_guid;
- zc28_c->zc_inject_record.zi_level =
- zc->zc_inject_record.zi_level;
- zc28_c->zc_inject_record.zi_error =
- zc->zc_inject_record.zi_error;
- zc28_c->zc_inject_record.zi_type =
- zc->zc_inject_record.zi_type;
- zc28_c->zc_inject_record.zi_freq =
- zc->zc_inject_record.zi_freq;
- zc28_c->zc_inject_record.zi_failfast =
- zc->zc_inject_record.zi_failfast;
- strlcpy(zc28_c->zc_inject_record.zi_func,
- zc->zc_inject_record.zi_func, MAXNAMELEN);
- zc28_c->zc_inject_record.zi_iotype =
- zc->zc_inject_record.zi_iotype;
- zc28_c->zc_inject_record.zi_duration =
- zc->zc_inject_record.zi_duration;
- zc28_c->zc_inject_record.zi_timer =
- zc->zc_inject_record.zi_timer;
- break;
-
- case ZFS_CMD_COMPAT_V15:
- zc_c = (void *)addr;
-
- /* zc */
- strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN);
- strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN);
- strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN);
- zc_c->zc_guid = zc->zc_guid;
- zc_c->zc_nvlist_conf = zc->zc_nvlist_conf;
- zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
- zc_c->zc_nvlist_src = zc->zc_nvlist_src;
- zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
- zc_c->zc_nvlist_dst = zc->zc_nvlist_dst;
- zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
- zc_c->zc_cookie = zc->zc_cookie;
- zc_c->zc_objset_type = zc->zc_objset_type;
- zc_c->zc_perm_action = zc->zc_perm_action;
- zc_c->zc_history = zc->zc_history;
- zc_c->zc_history_len = zc->zc_history_len;
- zc_c->zc_history_offset = zc->zc_history_offset;
- zc_c->zc_obj = zc->zc_obj;
- zc_c->zc_share = zc->zc_share;
- zc_c->zc_jailid = zc->zc_jailid;
- zc_c->zc_objset_stats = zc->zc_objset_stats;
- zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
-
- /* zc_inject_record */
- zc_c->zc_inject_record.zi_objset =
- zc->zc_inject_record.zi_objset;
- zc_c->zc_inject_record.zi_object =
- zc->zc_inject_record.zi_object;
- zc_c->zc_inject_record.zi_start =
- zc->zc_inject_record.zi_start;
- zc_c->zc_inject_record.zi_end =
- zc->zc_inject_record.zi_end;
- zc_c->zc_inject_record.zi_guid =
- zc->zc_inject_record.zi_guid;
- zc_c->zc_inject_record.zi_level =
- zc->zc_inject_record.zi_level;
- zc_c->zc_inject_record.zi_error =
- zc->zc_inject_record.zi_error;
- zc_c->zc_inject_record.zi_type =
- zc->zc_inject_record.zi_type;
- zc_c->zc_inject_record.zi_freq =
- zc->zc_inject_record.zi_freq;
- zc_c->zc_inject_record.zi_failfast =
- zc->zc_inject_record.zi_failfast;
-
- break;
- }
-}
-
-static int
-zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag,
- nvlist_t **nvp)
-{
- char *packed;
- int error;
- nvlist_t *list = NULL;
-
- /*
- * Read in and unpack the user-supplied nvlist.
- */
- if (size == 0)
- return (EINVAL);
-
-#ifdef _KERNEL
- packed = kmem_alloc(size, KM_SLEEP);
- if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
- iflag)) != 0) {
- kmem_free(packed, size);
- return (error);
- }
-#else
- packed = (void *)(uintptr_t)nvl;
-#endif
-
- error = nvlist_unpack(packed, size, &list, 0);
-
-#ifdef _KERNEL
- kmem_free(packed, size);
-#endif
-
- if (error != 0)
- return (error);
-
- *nvp = list;
- return (0);
-}
-
-static int
-zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
-{
- char *packed = NULL;
- int error = 0;
- size_t size;
-
- VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
-
-#ifdef _KERNEL
- packed = kmem_alloc(size, KM_SLEEP);
- VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
- KM_SLEEP) == 0);
-
- if (ddi_copyout(packed,
- (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0)
- error = EFAULT;
- kmem_free(packed, size);
-#else
- packed = (void *)(uintptr_t)zc->zc_nvlist_dst;
- VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
- 0) == 0);
-#endif
-
- zc->zc_nvlist_dst_size = size;
- return (error);
-}
-
-static void
-zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl)
-{
- nvlist_t **child;
- nvlist_t *nvroot = NULL;
- vdev_stat_t *vs;
- uint_t c, children, nelem;
-
- if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++) {
- zfs_ioctl_compat_fix_stats_nvlist(child[c]);
- }
- }
-
- if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0)
- zfs_ioctl_compat_fix_stats_nvlist(nvroot);
-#ifdef _KERNEL
- if ((nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS,
-#else
- if ((nvlist_lookup_uint64_array(nvl, "stats",
-#endif
-
- (uint64_t **)&vs, &nelem) == 0)) {
- nvlist_add_uint64_array(nvl,
-#ifdef _KERNEL
- "stats",
-#else
- ZPOOL_CONFIG_VDEV_STATS,
-#endif
- (uint64_t *)vs, nelem);
-#ifdef _KERNEL
- nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS,
-#else
- nvlist_remove(nvl, "stats",
-#endif
- DATA_TYPE_UINT64_ARRAY);
- }
-}
-
-static int
-zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc)
-{
- nvlist_t *nv, *nvp = NULL;
- nvpair_t *elem;
- int error;
-
- if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst,
- zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0)
- return (error);
-
- if (nc == 5) { /* ZFS_IOC_POOL_STATS */
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
- if (nvpair_value_nvlist(elem, &nvp) == 0)
- zfs_ioctl_compat_fix_stats_nvlist(nvp);
- }
- elem = NULL;
- } else
- zfs_ioctl_compat_fix_stats_nvlist(nv);
-
- error = zfs_ioctl_compat_put_nvlist(zc, nv);
-
- nvlist_free(nv);
-
- return (error);
-}
-
-static int
-zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc)
-{
- nvlist_t *nv, *nva = NULL;
- int error;
-
- if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst,
- zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0)
- return (error);
-
-#ifdef _KERNEL
- if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) {
- nvlist_add_nvlist(nv, "used", nva);
- nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST);
- }
-
- if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) {
- nvlist_add_nvlist(nv, "available", nva);
- nvlist_remove(nv, "free", DATA_TYPE_NVLIST);
- }
-#else
- if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) {
- nvlist_add_nvlist(nv, "allocated", nva);
- nvlist_remove(nv, "used", DATA_TYPE_NVLIST);
- }
-
- if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) {
- nvlist_add_nvlist(nv, "free", nva);
- nvlist_remove(nv, "available", DATA_TYPE_NVLIST);
- }
-#endif
-
- error = zfs_ioctl_compat_put_nvlist(zc, nv);
-
- nvlist_free(nv);
-
- return (error);
-}
-
-#ifndef _KERNEL
-int
-zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag)
-{
- int nc, ret;
- void *zc_c;
- unsigned long ncmd;
- zfs_iocparm_t zp;
-
- switch (cflag) {
- case ZFS_CMD_COMPAT_NONE:
- ncmd = _IOWR('Z', request, struct zfs_iocparm);
- zp.zfs_cmd = (uint64_t)zc;
- zp.zfs_cmd_size = sizeof(zfs_cmd_t);
- zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT;
- return (ioctl(fd, ncmd, &zp));
- case ZFS_CMD_COMPAT_INLANES:
- ncmd = _IOWR('Z', request, struct zfs_iocparm);
- zp.zfs_cmd = (uint64_t)zc;
- zp.zfs_cmd_size = sizeof(zfs_cmd_inlanes_t);
- zp.zfs_ioctl_version = ZFS_IOCVER_INLANES;
- return (ioctl(fd, ncmd, &zp));
- case ZFS_CMD_COMPAT_RESUME:
- ncmd = _IOWR('Z', request, struct zfs_iocparm);
- zp.zfs_cmd = (uint64_t)zc;
- zp.zfs_cmd_size = sizeof(zfs_cmd_resume_t);
- zp.zfs_ioctl_version = ZFS_IOCVER_RESUME;
- return (ioctl(fd, ncmd, &zp));
- case ZFS_CMD_COMPAT_EDBP:
- ncmd = _IOWR('Z', request, struct zfs_iocparm);
- zp.zfs_cmd = (uint64_t)zc;
- zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t);
- zp.zfs_ioctl_version = ZFS_IOCVER_EDBP;
- return (ioctl(fd, ncmd, &zp));
- case ZFS_CMD_COMPAT_ZCMD:
- ncmd = _IOWR('Z', request, struct zfs_iocparm);
- zp.zfs_cmd = (uint64_t)zc;
- zp.zfs_cmd_size = sizeof(zfs_cmd_zcmd_t);
- zp.zfs_ioctl_version = ZFS_IOCVER_ZCMD;
- return (ioctl(fd, ncmd, &zp));
- case ZFS_CMD_COMPAT_LZC:
- ncmd = _IOWR('Z', request, struct zfs_cmd);
- return (ioctl(fd, ncmd, zc));
- case ZFS_CMD_COMPAT_DEADMAN:
- zc_c = malloc(sizeof(zfs_cmd_deadman_t));
- ncmd = _IOWR('Z', request, struct zfs_cmd_deadman);
- break;
- case ZFS_CMD_COMPAT_V28:
- zc_c = malloc(sizeof(zfs_cmd_v28_t));
- ncmd = _IOWR('Z', request, struct zfs_cmd_v28);
- break;
- case ZFS_CMD_COMPAT_V15:
- nc = zfs_ioctl_v28_to_v15[request];
- zc_c = malloc(sizeof(zfs_cmd_v15_t));
- ncmd = _IOWR('Z', nc, struct zfs_cmd_v15);
- break;
- default:
- return (EINVAL);
- }
-
- if (ZFS_IOCREQ(ncmd) == ZFS_IOC_COMPAT_FAIL)
- return (ENOTSUP);
-
- zfs_cmd_compat_put(zc, (caddr_t)zc_c, request, cflag);
-
- ret = ioctl(fd, ncmd, zc_c);
- if (cflag == ZFS_CMD_COMPAT_V15 &&
- nc == ZFS_IOC_POOL_IMPORT)
- ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS,
- struct zfs_cmd_v15), zc_c);
- zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag);
- free(zc_c);
-
- if (cflag == ZFS_CMD_COMPAT_V15) {
- switch (nc) {
- case ZFS_IOC_POOL_IMPORT:
- case ZFS_IOC_POOL_CONFIGS:
- case ZFS_IOC_POOL_STATS:
- case ZFS_IOC_POOL_TRYIMPORT:
- zfs_ioctl_compat_fix_stats(zc, nc);
- break;
- case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
- zfs_ioctl_compat_pool_get_props(zc);
- break;
- }
- }
-
- return (ret);
-}
-#else /* _KERNEL */
-int
-zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag)
-{
- int error = 0;
-
- /* are we creating a clone? */
- if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0')
- *vec = ZFS_IOC_CLONE;
-
- if (cflag == ZFS_CMD_COMPAT_V15) {
- switch (*vec) {
-
- case 7: /* ZFS_IOC_POOL_SCRUB (v15) */
- zc->zc_cookie = POOL_SCAN_SCRUB;
- break;
- }
- }
-
- return (error);
-}
-
-void
-zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag)
-{
- if (cflag == ZFS_CMD_COMPAT_V15) {
- switch (vec) {
- case ZFS_IOC_POOL_CONFIGS:
- case ZFS_IOC_POOL_STATS:
- case ZFS_IOC_POOL_TRYIMPORT:
- zfs_ioctl_compat_fix_stats(zc, vec);
- break;
- case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
- zfs_ioctl_compat_pool_get_props(zc);
- break;
- }
- }
-}
-
-nvlist_t *
-zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t * innvl, const int vec,
- const int cflag)
-{
- nvlist_t *nvl, *tmpnvl, *hnvl;
- nvpair_t *elem;
- char *poolname, *snapname;
- int err;
-
- if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC ||
- cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP ||
- cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES)
- goto out;
-
- switch (vec) {
- case ZFS_IOC_CREATE:
- nvl = fnvlist_alloc();
- fnvlist_add_int32(nvl, "type", zc->zc_objset_type);
- if (innvl != NULL) {
- fnvlist_add_nvlist(nvl, "props", innvl);
- nvlist_free(innvl);
- }
- return (nvl);
- break;
- case ZFS_IOC_CLONE:
- nvl = fnvlist_alloc();
- fnvlist_add_string(nvl, "origin", zc->zc_value);
- if (innvl != NULL) {
- fnvlist_add_nvlist(nvl, "props", innvl);
- nvlist_free(innvl);
- }
- return (nvl);
- break;
- case ZFS_IOC_SNAPSHOT:
- if (innvl == NULL)
- goto out;
- nvl = fnvlist_alloc();
- fnvlist_add_nvlist(nvl, "props", innvl);
- tmpnvl = fnvlist_alloc();
- snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);
- fnvlist_add_boolean(tmpnvl, snapname);
- kmem_free(snapname, strlen(snapname + 1));
- /* check if we are doing a recursive snapshot */
- if (zc->zc_cookie)
- dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value,
- tmpnvl);
- fnvlist_add_nvlist(nvl, "snaps", tmpnvl);
- fnvlist_free(tmpnvl);
- nvlist_free(innvl);
- /* strip dataset part from zc->zc_name */
- zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
- return (nvl);
- break;
- case ZFS_IOC_SPACE_SNAPS:
- nvl = fnvlist_alloc();
- fnvlist_add_string(nvl, "firstsnap", zc->zc_value);
- if (innvl != NULL)
- nvlist_free(innvl);
- return (nvl);
- break;
- case ZFS_IOC_DESTROY_SNAPS:
- if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN)
- goto out;
- nvl = fnvlist_alloc();
- if (innvl != NULL) {
- fnvlist_add_nvlist(nvl, "snaps", innvl);
- } else {
- /*
- * We are probably called by even older binaries,
- * allocate and populate nvlist with recursive
- * snapshots
- */
- if (zfs_component_namecheck(zc->zc_value, NULL,
- NULL) == 0) {
- tmpnvl = fnvlist_alloc();
- if (dmu_get_recursive_snaps_nvl(zc->zc_name,
- zc->zc_value, tmpnvl) == 0)
- fnvlist_add_nvlist(nvl, "snaps",
- tmpnvl);
- nvlist_free(tmpnvl);
- }
- }
- if (innvl != NULL)
- nvlist_free(innvl);
- /* strip dataset part from zc->zc_name */
- zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
- return (nvl);
- break;
- case ZFS_IOC_HOLD:
- nvl = fnvlist_alloc();
- tmpnvl = fnvlist_alloc();
- if (zc->zc_cleanup_fd != -1)
- fnvlist_add_int32(nvl, "cleanup_fd",
- (int32_t)zc->zc_cleanup_fd);
- if (zc->zc_cookie) {
- hnvl = fnvlist_alloc();
- if (dmu_get_recursive_snaps_nvl(zc->zc_name,
- zc->zc_value, hnvl) == 0) {
- elem = NULL;
- while ((elem = nvlist_next_nvpair(hnvl,
- elem)) != NULL) {
- nvlist_add_string(tmpnvl,
- nvpair_name(elem), zc->zc_string);
- }
- }
- nvlist_free(hnvl);
- } else {
- snapname = kmem_asprintf("%s@%s", zc->zc_name,
- zc->zc_value);
- nvlist_add_string(tmpnvl, snapname, zc->zc_string);
- kmem_free(snapname, strlen(snapname + 1));
- }
- fnvlist_add_nvlist(nvl, "holds", tmpnvl);
- nvlist_free(tmpnvl);
- if (innvl != NULL)
- nvlist_free(innvl);
- /* strip dataset part from zc->zc_name */
- zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
- return (nvl);
- break;
- case ZFS_IOC_RELEASE:
- nvl = fnvlist_alloc();
- tmpnvl = fnvlist_alloc();
- if (zc->zc_cookie) {
- hnvl = fnvlist_alloc();
- if (dmu_get_recursive_snaps_nvl(zc->zc_name,
- zc->zc_value, hnvl) == 0) {
- elem = NULL;
- while ((elem = nvlist_next_nvpair(hnvl,
- elem)) != NULL) {
- fnvlist_add_boolean(tmpnvl,
- zc->zc_string);
- fnvlist_add_nvlist(nvl,
- nvpair_name(elem), tmpnvl);
- }
- }
- nvlist_free(hnvl);
- } else {
- snapname = kmem_asprintf("%s@%s", zc->zc_name,
- zc->zc_value);
- fnvlist_add_boolean(tmpnvl, zc->zc_string);
- fnvlist_add_nvlist(nvl, snapname, tmpnvl);
- kmem_free(snapname, strlen(snapname + 1));
- }
- nvlist_free(tmpnvl);
- if (innvl != NULL)
- nvlist_free(innvl);
- /* strip dataset part from zc->zc_name */
- zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
- return (nvl);
- break;
- }
-out:
- return (innvl);
-}
-
-nvlist_t *
-zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t * outnvl, const int vec,
- const int cflag)
-{
- nvlist_t *tmpnvl;
-
- if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC ||
- cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP ||
- cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES)
- return (outnvl);
-
- switch (vec) {
- case ZFS_IOC_SPACE_SNAPS:
- (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie);
- (void) nvlist_lookup_uint64(outnvl, "compressed",
- &zc->zc_objset_type);
- (void) nvlist_lookup_uint64(outnvl, "uncompressed",
- &zc->zc_perm_action);
- nvlist_free(outnvl);
- /* return empty outnvl */
- tmpnvl = fnvlist_alloc();
- return (tmpnvl);
- break;
- case ZFS_IOC_CREATE:
- case ZFS_IOC_CLONE:
- case ZFS_IOC_HOLD:
- case ZFS_IOC_RELEASE:
- nvlist_free(outnvl);
- /* return empty outnvl */
- tmpnvl = fnvlist_alloc();
- return (tmpnvl);
- break;
- }
-
- return (outnvl);
-}
-#endif /* KERNEL */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
@@ -1,66 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _ZFS_NAMECHECK_H
-#define _ZFS_NAMECHECK_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
- NAME_ERR_LEADING_SLASH, /* name begins with leading slash */
- NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */
- NAME_ERR_TRAILING_SLASH, /* name ends with a slash */
- NAME_ERR_INVALCHAR, /* invalid character found */
- NAME_ERR_MULTIPLE_DELIMITERS, /* multiple '@'/'#' delimiters found */
- NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */
- NAME_ERR_RESERVED, /* entire name is reserved */
- NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */
- NAME_ERR_TOOLONG, /* name is too long */
- NAME_ERR_NO_AT, /* permission set is missing '@' */
-} namecheck_err_t;
-
-#define ZFS_PERMSET_MAXLEN 64
-
-extern int zfs_max_dataset_nesting;
-
-int get_dataset_depth(const char *);
-int pool_namecheck(const char *, namecheck_err_t *, char *);
-int entity_namecheck(const char *, namecheck_err_t *, char *);
-int dataset_namecheck(const char *, namecheck_err_t *, char *);
-int dataset_nestcheck(const char *);
-int mountpoint_namecheck(const char *, namecheck_err_t *);
-int zfs_component_namecheck(const char *, namecheck_err_t *, char *);
-int permset_namecheck(const char *, namecheck_err_t *, char *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_NAMECHECK_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
@@ -1,399 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- */
-
-/*
- * Common name validation routines for ZFS. These routines are shared by the
- * userland code as well as the ioctl() layer to ensure that we don't
- * inadvertently expose a hole through direct ioctl()s that never gets tested.
- * In userland, however, we want significantly more information about _why_ the
- * name is invalid. In the kernel, we only care whether it's valid or not.
- * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
- * the name failed to validate.
- */
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#else
-#include <string.h>
-#endif
-
-#include <sys/dsl_dir.h>
-#include <sys/param.h>
-#include <sys/nvpair.h>
-#include "zfs_namecheck.h"
-#include "zfs_deleg.h"
-
-/*
- * Deeply nested datasets can overflow the stack, so we put a limit
- * in the amount of nesting a path can have. zfs_max_dataset_nesting
- * can be tuned temporarily to fix existing datasets that exceed our
- * predefined limit.
- */
-int zfs_max_dataset_nesting = 50;
-
-static int
-valid_char(char c)
-{
- return ((c >= 'a' && c <= 'z') ||
- (c >= 'A' && c <= 'Z') ||
- (c >= '0' && c <= '9') ||
- c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
-}
-
-/*
- * Looks at a path and returns its level of nesting (depth).
- */
-int
-get_dataset_depth(const char *path)
-{
- const char *loc = path;
- int nesting = 0;
-
- /*
- * Keep track of nesting until you hit the end of the
- * path or found the snapshot/bookmark seperator.
- */
- for (int i = 0; loc[i] != '\0' &&
- loc[i] != '@' &&
- loc[i] != '#'; i++) {
- if (loc[i] == '/')
- nesting++;
- }
-
- return (nesting);
-}
-
-/*
- * Snapshot names must be made up of alphanumeric characters plus the following
- * characters:
- *
- * [-_.: ]
- *
- * Returns 0 on success, -1 on error.
- */
-int
-zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
-{
- const char *loc;
-
- if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
-
- if (path[0] == '\0') {
- if (why)
- *why = NAME_ERR_EMPTY_COMPONENT;
- return (-1);
- }
-
- for (loc = path; *loc; loc++) {
- if (!valid_char(*loc)) {
- if (why) {
- *why = NAME_ERR_INVALCHAR;
- *what = *loc;
- }
- return (-1);
- }
- }
- return (0);
-}
-
-
-/*
- * Permissions set name must start with the letter '@' followed by the
- * same character restrictions as snapshot names, except that the name
- * cannot exceed 64 characters.
- *
- * Returns 0 on success, -1 on error.
- */
-int
-permset_namecheck(const char *path, namecheck_err_t *why, char *what)
-{
- if (strlen(path) >= ZFS_PERMSET_MAXLEN) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
-
- if (path[0] != '@') {
- if (why) {
- *why = NAME_ERR_NO_AT;
- *what = path[0];
- }
- return (-1);
- }
-
- return (zfs_component_namecheck(&path[1], why, what));
-}
-
-/*
- * Dataset paths should not be deeper than zfs_max_dataset_nesting
- * in terms of nesting.
- *
- * Returns 0 on success, -1 on error.
- */
-int
-dataset_nestcheck(const char *path)
-{
- return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1);
-}
-
-/*
- * Entity names must be of the following form:
- *
- * [component/]*[component][(@|#)component]?
- *
- * Where each component is made up of alphanumeric characters plus the following
- * characters:
- *
- * [-_.:%]
- *
- * We allow '%' here as we use that character internally to create unique
- * names for temporary clones (for online recv).
- *
- * Returns 0 on success, -1 on error.
- */
-int
-entity_namecheck(const char *path, namecheck_err_t *why, char *what)
-{
- const char *end;
-
- /*
- * Make sure the name is not too long.
- */
- if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
-
- /* Explicitly check for a leading slash. */
- if (path[0] == '/') {
- if (why)
- *why = NAME_ERR_LEADING_SLASH;
- return (-1);
- }
-
- if (path[0] == '\0') {
- if (why)
- *why = NAME_ERR_EMPTY_COMPONENT;
- return (-1);
- }
-
- const char *start = path;
- boolean_t found_delim = B_FALSE;
- for (;;) {
- /* Find the end of this component */
- end = start;
- while (*end != '/' && *end != '@' && *end != '#' &&
- *end != '\0')
- end++;
-
- if (*end == '\0' && end[-1] == '/') {
- /* trailing slashes are not allowed */
- if (why)
- *why = NAME_ERR_TRAILING_SLASH;
- return (-1);
- }
-
- /* Validate the contents of this component */
- for (const char *loc = start; loc != end; loc++) {
- if (!valid_char(*loc) && *loc != '%') {
- if (why) {
- *why = NAME_ERR_INVALCHAR;
- *what = *loc;
- }
- return (-1);
- }
- }
-
- /* Snapshot or bookmark delimiter found */
- if (*end == '@' || *end == '#') {
- /* Multiple delimiters are not allowed */
- if (found_delim != 0) {
- if (why)
- *why = NAME_ERR_MULTIPLE_DELIMITERS;
- return (-1);
- }
-
- found_delim = B_TRUE;
- }
-
- /* Zero-length components are not allowed */
- if (start == end) {
- if (why)
- *why = NAME_ERR_EMPTY_COMPONENT;
- return (-1);
- }
-
- /* If we've reached the end of the string, we're OK */
- if (*end == '\0')
- return (0);
-
- /*
- * If there is a '/' in a snapshot or bookmark name
- * then report an error
- */
- if (*end == '/' && found_delim != 0) {
- if (why)
- *why = NAME_ERR_TRAILING_SLASH;
- return (-1);
- }
-
- /* Update to the next component */
- start = end + 1;
- }
-}
-
-/*
- * Dataset is any entity, except bookmark
- */
-int
-dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
-{
- int ret = entity_namecheck(path, why, what);
-
- if (ret == 0 && strchr(path, '#') != NULL) {
- if (why != NULL) {
- *why = NAME_ERR_INVALCHAR;
- *what = '#';
- }
- return (-1);
- }
-
- return (ret);
-}
-
-/*
- * mountpoint names must be of the following form:
- *
- * /[component][/]*[component][/]
- *
- * Returns 0 on success, -1 on error.
- */
-int
-mountpoint_namecheck(const char *path, namecheck_err_t *why)
-{
- const char *start, *end;
-
- /*
- * Make sure none of the mountpoint component names are too long.
- * If a component name is too long then the mkdir of the mountpoint
- * will fail but then the mountpoint property will be set to a value
- * that can never be mounted. Better to fail before setting the prop.
- * Extra slashes are OK, they will be tossed by the mountpoint mkdir.
- */
-
- if (path == NULL || *path != '/') {
- if (why)
- *why = NAME_ERR_LEADING_SLASH;
- return (-1);
- }
-
- /* Skip leading slash */
- start = &path[1];
- do {
- end = start;
- while (*end != '/' && *end != '\0')
- end++;
-
- if (end - start >= ZFS_MAX_DATASET_NAME_LEN) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
- start = end + 1;
-
- } while (*end != '\0');
-
- return (0);
-}
-
-/*
- * For pool names, we have the same set of valid characters as described in
- * dataset names, with the additional restriction that the pool name must begin
- * with a letter. The pool names 'raidz' and 'mirror' are also reserved names
- * that cannot be used.
- *
- * Returns 0 on success, -1 on error.
- */
-int
-pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
-{
- const char *c;
-
- /*
- * Make sure the name is not too long.
- * If we're creating a pool with version >= SPA_VERSION_DSL_SCRUB (v11)
- * we need to account for additional space needed by the origin ds which
- * will also be snapshotted: "poolname"+"/"+"$ORIGIN"+"@"+"$ORIGIN".
- * Play it safe and enforce this limit even if the pool version is < 11
- * so it can be upgraded without issues.
- */
- if (strlen(pool) >= (ZFS_MAX_DATASET_NAME_LEN - 2 -
- strlen(ORIGIN_DIR_NAME) * 2)) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
-
- c = pool;
- while (*c != '\0') {
- if (!valid_char(*c)) {
- if (why) {
- *why = NAME_ERR_INVALCHAR;
- *what = *c;
- }
- return (-1);
- }
- c++;
- }
-
- if (!(*pool >= 'a' && *pool <= 'z') &&
- !(*pool >= 'A' && *pool <= 'Z')) {
- if (why)
- *why = NAME_ERR_NOLETTER;
- return (-1);
- }
-
- if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) {
- if (why)
- *why = NAME_ERR_RESERVED;
- return (-1);
- }
-
- if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) {
- if (why)
- *why = NAME_ERR_DISKLIKE;
- return (-1);
- }
-
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
@@ -1,131 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZFS_PROP_H
-#define _ZFS_PROP_H
-
-#include <sys/fs/zfs.h>
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * For index types (e.g. compression and checksum), we want the numeric value
- * in the kernel, but the string value in userland.
- */
-typedef enum {
- PROP_TYPE_NUMBER, /* numeric value */
- PROP_TYPE_STRING, /* string value */
- PROP_TYPE_INDEX /* numeric value indexed by string */
-} zprop_type_t;
-
-typedef enum {
- PROP_DEFAULT,
- PROP_READONLY,
- PROP_INHERIT,
- /*
- * ONETIME properties are a sort of conglomeration of READONLY
- * and INHERIT. They can be set only during object creation,
- * after that they are READONLY. If not explicitly set during
- * creation, they can be inherited.
- */
- PROP_ONETIME
-} zprop_attr_t;
-
-typedef struct zfs_index {
- const char *pi_name;
- uint64_t pi_value;
-} zprop_index_t;
-
-typedef struct {
- const char *pd_name; /* human-readable property name */
- int pd_propnum; /* property number */
- zprop_type_t pd_proptype; /* string, boolean, index, number */
- const char *pd_strdefault; /* default for strings */
- uint64_t pd_numdefault; /* for boolean / index / number */
- zprop_attr_t pd_attr; /* default, readonly, inherit */
- int pd_types; /* bitfield of valid dataset types */
- /* fs | vol | snap; or pool */
- const char *pd_values; /* string telling acceptable values */
- const char *pd_colname; /* column header for "zfs list" */
- boolean_t pd_rightalign; /* column alignment for "zfs list" */
- boolean_t pd_visible; /* do we list this property with the */
- /* "zfs get" help message */
- const zprop_index_t *pd_table; /* for index properties, a table */
- /* defining the possible values */
- size_t pd_table_size; /* number of entries in pd_table[] */
-} zprop_desc_t;
-
-/*
- * zfs dataset property functions
- */
-void zfs_prop_init(void);
-zprop_type_t zfs_prop_get_type(zfs_prop_t);
-boolean_t zfs_prop_delegatable(zfs_prop_t prop);
-zprop_desc_t *zfs_prop_get_table(void);
-
-/*
- * zpool property functions
- */
-void zpool_prop_init(void);
-zprop_type_t zpool_prop_get_type(zpool_prop_t);
-zprop_desc_t *zpool_prop_get_table(void);
-
-/*
- * Common routines to initialize property tables
- */
-void zprop_register_impl(int, const char *, zprop_type_t, uint64_t,
- const char *, zprop_attr_t, int, const char *, const char *,
- boolean_t, boolean_t, const zprop_index_t *);
-void zprop_register_string(int, const char *, const char *,
- zprop_attr_t attr, int, const char *, const char *);
-void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int,
- const char *, const char *);
-void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int,
- const char *, const char *, const zprop_index_t *);
-void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
- int, const char *);
-
-/*
- * Common routines for zfs and zpool property management
- */
-int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t);
-int zprop_name_to_prop(const char *, zfs_type_t);
-int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
-int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
-uint64_t zprop_random_value(int, uint64_t, zfs_type_t);
-const char *zprop_values(int, zfs_type_t);
-size_t zprop_width(int, boolean_t *, zfs_type_t);
-boolean_t zprop_valid_for_type(int, zfs_type_t);
-boolean_t zfs_prop_written(const char *name);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_PROP_H */
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -1,718 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#include <sys/zio.h>
-#include <sys/spa.h>
-#include <sys/u8_textprep.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_znode.h>
-
-#include "zfs_prop.h"
-#include "zfs_deleg.h"
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#else
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#endif
-
-static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
-
-/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */
-const char *zfs_userquota_prop_prefixes[] = {
- "userused@",
- "userquota@",
- "groupused@",
- "groupquota@"
-};
-
-zprop_desc_t *
-zfs_prop_get_table(void)
-{
- return (zfs_prop_table);
-}
-
-void
-zfs_prop_init(void)
-{
- static zprop_index_t checksum_table[] = {
- { "on", ZIO_CHECKSUM_ON },
- { "off", ZIO_CHECKSUM_OFF },
- { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
- { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
- { "sha256", ZIO_CHECKSUM_SHA256 },
- { "noparity", ZIO_CHECKSUM_NOPARITY },
- { "sha512", ZIO_CHECKSUM_SHA512 },
- { "skein", ZIO_CHECKSUM_SKEIN },
-#ifdef illumos
- { "edonr", ZIO_CHECKSUM_EDONR },
-#endif
- { NULL }
- };
-
- static zprop_index_t dedup_table[] = {
- { "on", ZIO_CHECKSUM_ON },
- { "off", ZIO_CHECKSUM_OFF },
- { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
- { "sha256", ZIO_CHECKSUM_SHA256 },
- { "sha256,verify",
- ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
- { "sha512", ZIO_CHECKSUM_SHA512 },
- { "sha512,verify",
- ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
- { "skein", ZIO_CHECKSUM_SKEIN },
- { "skein,verify",
- ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
-#ifdef illumos
- { "edonr,verify",
- ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
-#endif
- { NULL }
- };
-
- static zprop_index_t compress_table[] = {
- { "on", ZIO_COMPRESS_ON },
- { "off", ZIO_COMPRESS_OFF },
- { "lzjb", ZIO_COMPRESS_LZJB },
- { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
- { "gzip-1", ZIO_COMPRESS_GZIP_1 },
- { "gzip-2", ZIO_COMPRESS_GZIP_2 },
- { "gzip-3", ZIO_COMPRESS_GZIP_3 },
- { "gzip-4", ZIO_COMPRESS_GZIP_4 },
- { "gzip-5", ZIO_COMPRESS_GZIP_5 },
- { "gzip-6", ZIO_COMPRESS_GZIP_6 },
- { "gzip-7", ZIO_COMPRESS_GZIP_7 },
- { "gzip-8", ZIO_COMPRESS_GZIP_8 },
- { "gzip-9", ZIO_COMPRESS_GZIP_9 },
- { "zle", ZIO_COMPRESS_ZLE },
- { "lz4", ZIO_COMPRESS_LZ4 },
- { NULL }
- };
-
- static zprop_index_t snapdir_table[] = {
- { "hidden", ZFS_SNAPDIR_HIDDEN },
- { "visible", ZFS_SNAPDIR_VISIBLE },
- { NULL }
- };
-
- static zprop_index_t acl_mode_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "groupmask", ZFS_ACL_GROUPMASK },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { "restricted", ZFS_ACL_RESTRICTED },
- { NULL }
- };
-
- static zprop_index_t acl_inherit_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "noallow", ZFS_ACL_NOALLOW },
- { "restricted", ZFS_ACL_RESTRICTED },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */
- { "passthrough-x", ZFS_ACL_PASSTHROUGH_X },
- { NULL }
- };
-
- static zprop_index_t case_table[] = {
- { "sensitive", ZFS_CASE_SENSITIVE },
- { "insensitive", ZFS_CASE_INSENSITIVE },
- { "mixed", ZFS_CASE_MIXED },
- { NULL }
- };
-
- static zprop_index_t copies_table[] = {
- { "1", 1 },
- { "2", 2 },
- { "3", 3 },
- { NULL }
- };
-
- /*
- * Use the unique flags we have to send to u8_strcmp() and/or
- * u8_textprep() to represent the various normalization property
- * values.
- */
- static zprop_index_t normalize_table[] = {
- { "none", 0 },
- { "formD", U8_TEXTPREP_NFD },
- { "formKC", U8_TEXTPREP_NFKC },
- { "formC", U8_TEXTPREP_NFC },
- { "formKD", U8_TEXTPREP_NFKD },
- { NULL }
- };
-
- static zprop_index_t version_table[] = {
- { "1", 1 },
- { "2", 2 },
- { "3", 3 },
- { "4", 4 },
- { "5", 5 },
- { "current", ZPL_VERSION },
- { NULL }
- };
-
- static zprop_index_t boolean_table[] = {
- { "off", 0 },
- { "on", 1 },
- { NULL }
- };
-
- static zprop_index_t logbias_table[] = {
- { "latency", ZFS_LOGBIAS_LATENCY },
- { "throughput", ZFS_LOGBIAS_THROUGHPUT },
- { NULL }
- };
-
- static zprop_index_t canmount_table[] = {
- { "off", ZFS_CANMOUNT_OFF },
- { "on", ZFS_CANMOUNT_ON },
- { "noauto", ZFS_CANMOUNT_NOAUTO },
- { NULL }
- };
-
- static zprop_index_t cache_table[] = {
- { "none", ZFS_CACHE_NONE },
- { "metadata", ZFS_CACHE_METADATA },
- { "all", ZFS_CACHE_ALL },
- { NULL }
- };
-
- static zprop_index_t sync_table[] = {
- { "standard", ZFS_SYNC_STANDARD },
- { "always", ZFS_SYNC_ALWAYS },
- { "disabled", ZFS_SYNC_DISABLED },
- { NULL }
- };
-
- static zprop_index_t volmode_table[] = {
- { "default", ZFS_VOLMODE_DEFAULT },
- { "geom", ZFS_VOLMODE_GEOM },
- { "dev", ZFS_VOLMODE_DEV },
- { "none", ZFS_VOLMODE_NONE },
- { NULL }
- };
-
- static zprop_index_t dnsize_table[] = {
- { "legacy", ZFS_DNSIZE_LEGACY },
- { "auto", ZFS_DNSIZE_AUTO },
- { "1k", ZFS_DNSIZE_1K },
- { "2k", ZFS_DNSIZE_2K },
- { "4k", ZFS_DNSIZE_4K },
- { "8k", ZFS_DNSIZE_8K },
- { "16k", ZFS_DNSIZE_16K },
- { NULL }
- };
-
- static zprop_index_t redundant_metadata_table[] = {
- { "all", ZFS_REDUNDANT_METADATA_ALL },
- { "most", ZFS_REDUNDANT_METADATA_MOST },
- { NULL }
- };
-
- /* inherit index properties */
- zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
- ZFS_REDUNDANT_METADATA_ALL,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "all | most", "REDUND_MD",
- redundant_metadata_table);
- zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "standard | always | disabled", "SYNC",
- sync_table);
- zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
- ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
- ZFS_TYPE_VOLUME,
- "on | off | fletcher2 | fletcher4 | sha256 | sha512 | "
- "skein", "CHECKSUM", checksum_table);
- zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | verify | sha256[,verify], sha512[,verify], "
- "skein[,verify]", "DEDUP", dedup_table);
- zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
- ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4",
- "COMPRESS", compress_table);
- zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "hidden | visible", "SNAPDIR", snapdir_table);
- zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "discard | groupmask | passthrough | restricted", "ACLMODE",
- acl_mode_table);
- zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
- ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "discard | noallow | restricted | passthrough | passthrough-x",
- "ACLINHERIT", acl_inherit_table);
- zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "1 | 2 | 3", "COPIES", copies_table);
- zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
- ZFS_CACHE_ALL, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
- "all | none | metadata", "PRIMARYCACHE", cache_table);
- zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
- ZFS_CACHE_ALL, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
- "all | none | metadata", "SECONDARYCACHE", cache_table);
- zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "latency | throughput", "LOGBIAS", logbias_table);
- zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
- ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
- "default | geom | dev | none", "VOLMODE", volmode_table);
-
- zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
- ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
-
- /* inherit index (boolean) properties */
- zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
- zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
- boolean_table);
- zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
- boolean_table);
- zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
- boolean_table);
- zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
- boolean_table);
- zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
- zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
- boolean_table);
- zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
- boolean_table);
- zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
- boolean_table);
-
- /* default index properties */
- zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
- zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
- PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
- "CANMOUNT", canmount_table);
-
- /* readonly index (boolean) properties */
- zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
- ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
- zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
- PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
- boolean_table);
-
- /* set once index properties */
- zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
- PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "none | formC | formD | formKC | formKD", "NORMALIZATION",
- normalize_table);
- zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
- ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
- ZFS_TYPE_SNAPSHOT,
- "sensitive | insensitive | mixed", "CASE", case_table);
-
- /* set once index (boolean) properties */
- zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "UTF8ONLY", boolean_table);
-
- /* string properties */
- zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
- zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
- ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
- zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
- "MOUNTPOINT");
- zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
- "SHARENFS");
- zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
- ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
- "filesystem | volume | snapshot | bookmark", "TYPE");
- zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "on | off | sharemgr(1M) options", "SHARESMB");
- zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
- ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
- "<sensitivity label>", "MLSLABEL");
- zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
- "receive_resume_token",
- NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<string token>", "RESUMETOK");
-
- /* readonly number properties */
- zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
- ZFS_TYPE_DATASET, "<size>", "USED");
- zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
- zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
- PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
- zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
- PROP_READONLY, ZFS_TYPE_DATASET,
- "<1.00x or higher if compressed>", "RATIO");
- zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
- PROP_READONLY, ZFS_TYPE_DATASET,
- "<1.00x or higher if compressed>", "REFRATIO");
- zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
- ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
- ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
- zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
- PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
- "USEDSNAP");
- zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
- PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
- "USEDDS");
- zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
- PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
- "USEDCHILD");
- zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
- PROP_READONLY,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
- zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
- ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
- zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
- ZFS_TYPE_DATASET, "<size>", "WRITTEN");
- zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
- PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
- "LUSED");
- zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
- 0, PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LREFER");
-
- /* default number properties */
- zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
- ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
- zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
- PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<size> | none", "RESERV");
- zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
- ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
- zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
- ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
- zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
- PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<size> | none", "REFRESERV");
- zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
- UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
- "<count> | none", "FSLIMIT");
- zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
- UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<count> | none", "SSLIMIT");
- zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
- UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
- "<count>", "FSCOUNT");
- zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
- UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<count>", "SSCOUNT");
- zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY,
- ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID");
- zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY,
- ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG");
-
- /* inherit number properties */
- zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
- SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
- zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
- "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS");
-
- /* hidden properties */
- zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
- zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
- zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
- PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
- zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
- PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
- zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
- PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
- "STMF_SBD_LU");
- zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
- PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
- "USERACCOUNTING");
- zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
- zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
- zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
- PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
- zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
- PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
-
- /* oddball properties */
- zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
- NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
- "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
-}
-
-boolean_t
-zfs_prop_delegatable(zfs_prop_t prop)
-{
- zprop_desc_t *pd = &zfs_prop_table[prop];
-
- /* The mlslabel property is never delegatable. */
- if (prop == ZFS_PROP_MLSLABEL)
- return (B_FALSE);
-
- return (pd->pd_attr != PROP_READONLY);
-}
-
-/*
- * Given a zfs dataset property name, returns the corresponding property ID.
- */
-zfs_prop_t
-zfs_name_to_prop(const char *propname)
-{
- return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
-}
-
-/*
- * For user property names, we allow all lowercase alphanumeric characters, plus
- * a few useful punctuation characters.
- */
-static int
-valid_char(char c)
-{
- return ((c >= 'a' && c <= 'z') ||
- (c >= '0' && c <= '9') ||
- c == '-' || c == '_' || c == '.' || c == ':');
-}
-
-/*
- * Returns true if this is a valid user-defined property (one with a ':').
- */
-boolean_t
-zfs_prop_user(const char *name)
-{
- int i;
- char c;
- boolean_t foundsep = B_FALSE;
-
- for (i = 0; i < strlen(name); i++) {
- c = name[i];
- if (!valid_char(c))
- return (B_FALSE);
- if (c == ':')
- foundsep = B_TRUE;
- }
-
- if (!foundsep)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Returns true if this is a valid userspace-type property (one with a '@').
- * Note that after the @, any character is valid (eg, another @, for SID
- * user@domain).
- */
-boolean_t
-zfs_prop_userquota(const char *name)
-{
- zfs_userquota_prop_t prop;
-
- for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) {
- if (strncmp(name, zfs_userquota_prop_prefixes[prop],
- strlen(zfs_userquota_prop_prefixes[prop])) == 0) {
- return (B_TRUE);
- }
- }
-
- return (B_FALSE);
-}
-
-/*
- * Returns true if this is a valid written@ property.
- * Note that after the @, any character is valid (eg, another @, for
- * written@pool/fs@origin).
- */
-boolean_t
-zfs_prop_written(const char *name)
-{
- static const char *prefix = "written@";
- return (strncmp(name, prefix, strlen(prefix)) == 0);
-}
-
-/*
- * Tables of index types, plus functions to convert between the user view
- * (strings) and internal representation (uint64_t).
- */
-int
-zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
-{
- return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
-}
-
-int
-zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
-{
- return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
-}
-
-uint64_t
-zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
-{
- return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
-}
-
-/*
- * Returns TRUE if the property applies to any of the given dataset types.
- */
-boolean_t
-zfs_prop_valid_for_type(int prop, zfs_type_t types)
-{
- return (zprop_valid_for_type(prop, types));
-}
-
-zprop_type_t
-zfs_prop_get_type(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_proptype);
-}
-
-/*
- * Returns TRUE if the property is readonly.
- */
-boolean_t
-zfs_prop_readonly(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
- zfs_prop_table[prop].pd_attr == PROP_ONETIME);
-}
-
-/*
- * Returns TRUE if the property is visible (not hidden).
- */
-boolean_t
-zfs_prop_visible(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_visible);
-}
-
-/*
- * Returns TRUE if the property is only allowed to be set once.
- */
-boolean_t
-zfs_prop_setonce(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_attr == PROP_ONETIME);
-}
-
-const char *
-zfs_prop_default_string(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_strdefault);
-}
-
-uint64_t
-zfs_prop_default_numeric(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_numdefault);
-}
-
-/*
- * Given a dataset property ID, returns the corresponding name.
- * Assuming the zfs dataset property ID is valid.
- */
-const char *
-zfs_prop_to_name(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_name);
-}
-
-/*
- * Returns TRUE if the property is inheritable.
- */
-boolean_t
-zfs_prop_inheritable(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
- zfs_prop_table[prop].pd_attr == PROP_ONETIME);
-}
-
-#ifndef _KERNEL
-
-/*
- * Returns a string describing the set of acceptable values for the given
- * zfs property, or NULL if it cannot be set.
- */
-const char *
-zfs_prop_values(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_values);
-}
-
-/*
- * Returns TRUE if this property is a string type. Note that index types
- * (compression, checksum) are treated as strings in userland, even though they
- * are stored numerically on disk.
- */
-int
-zfs_prop_is_string(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
- zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
-}
-
-/*
- * Returns the column header for the given property. Used only in
- * 'zfs list -o', but centralized here with the other property information.
- */
-const char *
-zfs_prop_column_name(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_colname);
-}
-
-/*
- * Returns whether the given property should be displayed right-justified for
- * 'zfs list'.
- */
-boolean_t
-zfs_prop_align_right(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_rightalign);
-}
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
@@ -1,250 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/zio.h>
-#include <sys/spa.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/fs/zfs.h>
-
-#include "zfs_prop.h"
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#else
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#endif
-
-static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS];
-
-zprop_desc_t *
-zpool_prop_get_table(void)
-{
- return (zpool_prop_table);
-}
-
-void
-zpool_prop_init(void)
-{
- static zprop_index_t boolean_table[] = {
- { "off", 0},
- { "on", 1},
- { NULL }
- };
-
- static zprop_index_t failuremode_table[] = {
- { "wait", ZIO_FAILURE_MODE_WAIT },
- { "continue", ZIO_FAILURE_MODE_CONTINUE },
- { "panic", ZIO_FAILURE_MODE_PANIC },
- { NULL }
- };
-
- /* string properties */
- zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
- ZFS_TYPE_POOL, "<path>", "ALTROOT");
- zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
- ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
- zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
- PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
- zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL,
- PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT");
-
- /* readonly number properties */
- zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<size>", "SIZE");
- zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<size>", "FREE");
- zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<size>", "FREEING");
- zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0,
- PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT");
- zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<size>", "LEAKED");
- zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
- PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
- zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
- PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
- zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
- PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
- zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<size>", "CAP");
- zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<guid>", "GUID");
- zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<state>", "HEALTH");
- zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
- PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
- "DEDUP");
-
- /* system partition size */
- zprop_register_number(ZPOOL_PROP_BOOTSIZE, "bootsize", 0, PROP_ONETIME,
- ZFS_TYPE_POOL, "<size>", "BOOTSIZE");
-
- /* default number properties */
- zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
- PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
- zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
- PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
-
- /* default index (boolean) properties */
- zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
- PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
- boolean_table);
- zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
- PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
- zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
- PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
- boolean_table);
- zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
- PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
- zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
- PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
- zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0,
- PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST",
- boolean_table);
-
- /* default index properties */
- zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
- ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
- "wait | continue | panic", "FAILMODE", failuremode_table);
-
- /* hidden properties */
- zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
- PROP_READONLY, ZFS_TYPE_POOL, "NAME");
- zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
- PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
- zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
- PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
- zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize",
- PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE");
-}
-
-/*
- * Given a property name and its type, returns the corresponding property ID.
- */
-zpool_prop_t
-zpool_name_to_prop(const char *propname)
-{
- return (zprop_name_to_prop(propname, ZFS_TYPE_POOL));
-}
-
-/*
- * Given a pool property ID, returns the corresponding name.
- * Assuming the pool propety ID is valid.
- */
-const char *
-zpool_prop_to_name(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_name);
-}
-
-zprop_type_t
-zpool_prop_get_type(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_proptype);
-}
-
-boolean_t
-zpool_prop_readonly(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_attr == PROP_READONLY);
-}
-
-const char *
-zpool_prop_default_string(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_strdefault);
-}
-
-uint64_t
-zpool_prop_default_numeric(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_numdefault);
-}
-
-/*
- * Returns true if this is a valid feature@ property.
- */
-boolean_t
-zpool_prop_feature(const char *name)
-{
- static const char *prefix = "feature@";
- return (strncmp(name, prefix, strlen(prefix)) == 0);
-}
-
-/*
- * Returns true if this is a valid unsupported@ property.
- */
-boolean_t
-zpool_prop_unsupported(const char *name)
-{
- static const char *prefix = "unsupported@";
- return (strncmp(name, prefix, strlen(prefix)) == 0);
-}
-
-int
-zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
- uint64_t *index)
-{
- return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL));
-}
-
-int
-zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
- const char **string)
-{
- return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
-}
-
-uint64_t
-zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
-{
- return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
-}
-
-#ifndef _KERNEL
-
-const char *
-zpool_prop_values(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_values);
-}
-
-const char *
-zpool_prop_column_name(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_colname);
-}
-
-boolean_t
-zpool_prop_align_right(zpool_prop_t prop)
-{
- return (zpool_prop_table[prop].pd_rightalign);
-}
-#endif
Index: head/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
@@ -1,430 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-/*
- * Common routines used by zfs and zpool property management.
- */
-
-#include <sys/zio.h>
-#include <sys/spa.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_znode.h>
-#include <sys/fs/zfs.h>
-
-#include "zfs_prop.h"
-#include "zfs_deleg.h"
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#include <sys/libkern.h>
-#else
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#endif
-
-static zprop_desc_t *
-zprop_get_proptable(zfs_type_t type)
-{
- if (type == ZFS_TYPE_POOL)
- return (zpool_prop_get_table());
- else
- return (zfs_prop_get_table());
-}
-
-static int
-zprop_get_numprops(zfs_type_t type)
-{
- if (type == ZFS_TYPE_POOL)
- return (ZPOOL_NUM_PROPS);
- else
- return (ZFS_NUM_PROPS);
-}
-
-void
-zprop_register_impl(int prop, const char *name, zprop_type_t type,
- uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
- int objset_types, const char *values, const char *colname,
- boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
-{
- zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types);
- zprop_desc_t *pd;
-
- pd = &prop_tbl[prop];
-
- ASSERT(pd->pd_name == NULL || pd->pd_name == name);
- ASSERT(name != NULL);
- ASSERT(colname != NULL);
-
- pd->pd_name = name;
- pd->pd_propnum = prop;
- pd->pd_proptype = type;
- pd->pd_numdefault = numdefault;
- pd->pd_strdefault = strdefault;
- pd->pd_attr = attr;
- pd->pd_types = objset_types;
- pd->pd_values = values;
- pd->pd_colname = colname;
- pd->pd_rightalign = rightalign;
- pd->pd_visible = visible;
- pd->pd_table = idx_tbl;
- pd->pd_table_size = 0;
- while (idx_tbl && (idx_tbl++)->pi_name != NULL)
- pd->pd_table_size++;
-}
-
-void
-zprop_register_string(int prop, const char *name, const char *def,
- zprop_attr_t attr, int objset_types, const char *values,
- const char *colname)
-{
- zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
- objset_types, values, colname, B_FALSE, B_TRUE, NULL);
-
-}
-
-void
-zprop_register_number(int prop, const char *name, uint64_t def,
- zprop_attr_t attr, int objset_types, const char *values,
- const char *colname)
-{
- zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
- objset_types, values, colname, B_TRUE, B_TRUE, NULL);
-}
-
-void
-zprop_register_index(int prop, const char *name, uint64_t def,
- zprop_attr_t attr, int objset_types, const char *values,
- const char *colname, const zprop_index_t *idx_tbl)
-{
- zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
- objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
-}
-
-void
-zprop_register_hidden(int prop, const char *name, zprop_type_t type,
- zprop_attr_t attr, int objset_types, const char *colname)
-{
- zprop_register_impl(prop, name, type, 0, NULL, attr,
- objset_types, NULL, colname,
- type == PROP_TYPE_NUMBER, B_FALSE, NULL);
-}
-
-
-/*
- * A comparison function we can use to order indexes into property tables.
- */
-static int
-zprop_compare(const void *arg1, const void *arg2)
-{
- const zprop_desc_t *p1 = *((zprop_desc_t **)arg1);
- const zprop_desc_t *p2 = *((zprop_desc_t **)arg2);
- boolean_t p1ro, p2ro;
-
- p1ro = (p1->pd_attr == PROP_READONLY);
- p2ro = (p2->pd_attr == PROP_READONLY);
-
- if (p1ro == p2ro)
- return (strcmp(p1->pd_name, p2->pd_name));
-
- return (p1ro ? -1 : 1);
-}
-
-/*
- * Iterate over all properties in the given property table, calling back
- * into the specified function for each property. We will continue to
- * iterate until we either reach the end or the callback function returns
- * something other than ZPROP_CONT.
- */
-int
-zprop_iter_common(zprop_func func, void *cb, boolean_t show_all,
- boolean_t ordered, zfs_type_t type)
-{
- int i, j, num_props, size, prop;
- zprop_desc_t *prop_tbl;
- zprop_desc_t **order;
-
- prop_tbl = zprop_get_proptable(type);
- num_props = zprop_get_numprops(type);
- size = num_props * sizeof (zprop_desc_t *);
-
-#if defined(_KERNEL)
- order = kmem_alloc(size, KM_SLEEP);
-#else
- if ((order = malloc(size)) == NULL)
- return (ZPROP_CONT);
-#endif
-
- for (j = 0; j < num_props; j++)
- order[j] = &prop_tbl[j];
-
- if (ordered) {
- qsort((void *)order, num_props, sizeof (zprop_desc_t *),
- zprop_compare);
- }
-
- prop = ZPROP_CONT;
- for (i = 0; i < num_props; i++) {
- if ((order[i]->pd_visible || show_all) &&
- (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) {
- prop = order[i]->pd_propnum;
- break;
- }
- }
-
-#if defined(_KERNEL)
- kmem_free(order, size);
-#else
- free(order);
-#endif
- return (prop);
-}
-
-static boolean_t
-propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
-{
- const char *propname = prop_entry->pd_name;
-#ifndef _KERNEL
- const char *colname = prop_entry->pd_colname;
- int c;
-#endif
-
- if (len == strlen(propname) &&
- strncmp(p, propname, len) == 0)
- return (B_TRUE);
-
-#ifndef _KERNEL
- if (colname == NULL || len != strlen(colname))
- return (B_FALSE);
-
- for (c = 0; c < len; c++)
- if (p[c] != tolower(colname[c]))
- break;
-
- return (colname[c] == '\0');
-#else
- return (B_FALSE);
-#endif
-}
-
-typedef struct name_to_prop_cb {
- const char *propname;
- zprop_desc_t *prop_tbl;
-} name_to_prop_cb_t;
-
-static int
-zprop_name_to_prop_cb(int prop, void *cb_data)
-{
- name_to_prop_cb_t *data = cb_data;
-
- if (propname_match(data->propname, strlen(data->propname),
- &data->prop_tbl[prop]))
- return (prop);
-
- return (ZPROP_CONT);
-}
-
-int
-zprop_name_to_prop(const char *propname, zfs_type_t type)
-{
- int prop;
- name_to_prop_cb_t cb_data;
-
- cb_data.propname = propname;
- cb_data.prop_tbl = zprop_get_proptable(type);
-
- prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data,
- B_TRUE, B_FALSE, type);
-
- return (prop == ZPROP_CONT ? ZPROP_INVAL : prop);
-}
-
-int
-zprop_string_to_index(int prop, const char *string, uint64_t *index,
- zfs_type_t type)
-{
- zprop_desc_t *prop_tbl;
- const zprop_index_t *idx_tbl;
- int i;
-
- if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
- return (-1);
-
- ASSERT(prop < zprop_get_numprops(type));
- prop_tbl = zprop_get_proptable(type);
- if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
- return (-1);
-
- for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
- if (strcmp(string, idx_tbl[i].pi_name) == 0) {
- *index = idx_tbl[i].pi_value;
- return (0);
- }
- }
-
- return (-1);
-}
-
-int
-zprop_index_to_string(int prop, uint64_t index, const char **string,
- zfs_type_t type)
-{
- zprop_desc_t *prop_tbl;
- const zprop_index_t *idx_tbl;
- int i;
-
- if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
- return (-1);
-
- ASSERT(prop < zprop_get_numprops(type));
- prop_tbl = zprop_get_proptable(type);
- if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
- return (-1);
-
- for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
- if (idx_tbl[i].pi_value == index) {
- *string = idx_tbl[i].pi_name;
- return (0);
- }
- }
-
- return (-1);
-}
-
-/*
- * Return a random valid property value. Used by ztest.
- */
-uint64_t
-zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
-{
- zprop_desc_t *prop_tbl;
- const zprop_index_t *idx_tbl;
-
- ASSERT((uint_t)prop < zprop_get_numprops(type));
- prop_tbl = zprop_get_proptable(type);
- idx_tbl = prop_tbl[prop].pd_table;
-
- if (idx_tbl == NULL)
- return (seed);
-
- return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
-}
-
-const char *
-zprop_values(int prop, zfs_type_t type)
-{
- zprop_desc_t *prop_tbl;
-
- ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
- ASSERT(prop < zprop_get_numprops(type));
-
- prop_tbl = zprop_get_proptable(type);
-
- return (prop_tbl[prop].pd_values);
-}
-
-/*
- * Returns TRUE if the property applies to any of the given dataset types.
- */
-boolean_t
-zprop_valid_for_type(int prop, zfs_type_t type)
-{
- zprop_desc_t *prop_tbl;
-
- if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
- return (B_FALSE);
-
- ASSERT(prop < zprop_get_numprops(type));
- prop_tbl = zprop_get_proptable(type);
- return ((prop_tbl[prop].pd_types & type) != 0);
-}
-
-#ifndef _KERNEL
-
-/*
- * Determines the minimum width for the column, and indicates whether it's fixed
- * or not. Only string columns are non-fixed.
- */
-size_t
-zprop_width(int prop, boolean_t *fixed, zfs_type_t type)
-{
- zprop_desc_t *prop_tbl, *pd;
- const zprop_index_t *idx;
- size_t ret;
- int i;
-
- ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
- ASSERT(prop < zprop_get_numprops(type));
-
- prop_tbl = zprop_get_proptable(type);
- pd = &prop_tbl[prop];
-
- *fixed = B_TRUE;
-
- /*
- * Start with the width of the column name.
- */
- ret = strlen(pd->pd_colname);
-
- /*
- * For fixed-width values, make sure the width is large enough to hold
- * any possible value.
- */
- switch (pd->pd_proptype) {
- case PROP_TYPE_NUMBER:
- /*
- * The maximum length of a human-readable number is 5 characters
- * ("20.4M", for example).
- */
- if (ret < 5)
- ret = 5;
- /*
- * 'creation' is handled specially because it's a number
- * internally, but displayed as a date string.
- */
- if (prop == ZFS_PROP_CREATION)
- *fixed = B_FALSE;
- break;
- case PROP_TYPE_INDEX:
- idx = prop_tbl[prop].pd_table;
- for (i = 0; idx[i].pi_name != NULL; i++) {
- if (strlen(idx[i].pi_name) > ret)
- ret = strlen(idx[i].pi_name);
- }
- break;
-
- case PROP_TYPE_STRING:
- *fixed = B_FALSE;
- break;
- }
-
- return (ret);
-}
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
@@ -67,13 +67,15 @@
* on capital-f functions.
*/
#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/types.h>
#ifndef illumos
#include <sys/time.h>
#endif
#include <sys/stat.h>
-#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
+#include <sys/endian.h>
#ifdef illumos
#include <sys/ddi.h>
#include <sys/sunddi.h>
@@ -96,7 +98,6 @@
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
-#include <sys/policy.h>
#ifdef illumos
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
@@ -119,6 +120,7 @@
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/kdb.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
@@ -129,6 +131,13 @@
#include <sys/sx.h>
#include <sys/sysctl.h>
+
+#include <sys/mount.h>
+#undef AT_UID
+#undef AT_GID
+#include <sys/vnode.h>
+#include <sys/cred.h>
+
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
@@ -299,8 +308,10 @@
#define ipaddr_t in_addr_t
#define mod_modname pathname
#define vuprintf vprintf
+#ifndef crgetzoneid
+#define crgetzoneid(_a) 0
+#endif
#define ttoproc(_a) ((_a)->td_proc)
-#define crgetzoneid(_a) 0
#define SNOCD 0
#define CPU_ON_INTR(_a) 0
@@ -491,7 +502,7 @@
if ((remp) != NULL) { \
*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
} \
-_NOTE(CONSTCOND) } while (0)
+} while (0)
/*
Index: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
@@ -35,6 +35,7 @@
#include <sys/atomic.h>
#include <sys/errno.h>
#include <sys/stat.h>
+#include <sys/endian.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
@@ -54,6 +55,8 @@
#include <sys/dtrace_impl.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
+#undef AT_UID
+#undef AT_GID
#include <sys/policy.h>
#ifdef illumos
#include <util/qsort.h>
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -1,94 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * University Copyright- Copyright (c) 1982, 1986, 1988
- * The Regents of the University of California
- * All Rights Reserved
- *
- * University Acknowledgment- Portions of this document are derived from
- * software developed by the University of California, Berkeley, and its
- * contributors.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/proc.h>
-#include <sys/taskq.h>
-#include <sys/vnode.h>
-
-/* Extensible attribute (xva) routines. */
-
-/*
- * Zero out the structure, set the size of the requested/returned bitmaps,
- * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
- * to the returned attributes array.
- */
-void
-xva_init(xvattr_t *xvap)
-{
- bzero(xvap, sizeof (xvattr_t));
- xvap->xva_mapsize = XVA_MAPSIZE;
- xvap->xva_magic = XVA_MAGIC;
- xvap->xva_vattr.va_mask = AT_XVATTR;
- xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
-}
-
-/*
- * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
- * structure. Otherwise, returns NULL.
- */
-xoptattr_t *
-xva_getxoptattr(xvattr_t *xvap)
-{
- xoptattr_t *xoap = NULL;
- if (xvap->xva_vattr.va_mask & AT_XVATTR)
- xoap = &xvap->xva_xoptattrs;
- return (xoap);
-}
-
-/*
- * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
- * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
- * the file system as a result of releasing the vnode. Note, file systems
- * already have to handle the race where the vnode is incremented before the
- * inactive routine is called and does its locking.
- *
- * Warning: Excessive use of this routine can lead to performance problems.
- * This is because taskqs throttle back allocation if too many are created.
- */
-void
-vn_rele_async(vnode_t *vp, taskq_t *taskq)
-{
- VERIFY(vp->v_count > 0);
- if (refcount_release_if_not_last(&vp->v_usecount)) {
- return;
- }
- VERIFY(taskq_dispatch((taskq_t *)taskq,
- (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
@@ -1,19 +0,0 @@
-Copyright (c) 2011 Google, Inc.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
@@ -1 +0,0 @@
-CITYHASH CHECKSUM FUNCTIONALITY IN ZFS
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
@@ -1,30 +0,0 @@
-LZ4 - Fast LZ compression algorithm
-Copyright (C) 2011-2013, Yann Collet.
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-You can contact the author at :
-- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
-- LZ4 source repository : http://code.google.com/p/lz4/
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
@@ -1 +0,0 @@
-LZ4 COMPRESSION FUNCTIONALITY IN ZFS
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
@@ -1,960 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-/*
- * ARC buffer data (ABD).
- *
- * ABDs are an abstract data structure for the ARC which can use two
- * different ways of storing the underlying data:
- *
- * (a) Linear buffer. In this case, all the data in the ABD is stored in one
- * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
- *
- * +-------------------+
- * | ABD (linear) |
- * | abd_flags = ... |
- * | abd_size = ... | +--------------------------------+
- * | abd_buf ------------->| raw buffer of size abd_size |
- * +-------------------+ +--------------------------------+
- * no abd_chunks
- *
- * (b) Scattered buffer. In this case, the data in the ABD is split into
- * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
- * to the chunks recorded in an array at the end of the ABD structure.
- *
- * +-------------------+
- * | ABD (scattered) |
- * | abd_flags = ... |
- * | abd_size = ... |
- * | abd_offset = 0 | +-----------+
- * | abd_chunks[0] ----------------------------->| chunk 0 |
- * | abd_chunks[1] ---------------------+ +-----------+
- * | ... | | +-----------+
- * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
- * +-------------------+ | +-----------+
- * | ...
- * | +-----------+
- * +----------------->| chunk N-1 |
- * +-----------+
- *
- * Using a large proportion of scattered ABDs decreases ARC fragmentation since
- * when we are at the limit of allocatable space, using equal-size chunks will
- * allow us to quickly reclaim enough space for a new large allocation (assuming
- * it is also scattered).
- *
- * In addition to directly allocating a linear or scattered ABD, it is also
- * possible to create an ABD by requesting the "sub-ABD" starting at an offset
- * within an existing ABD. In linear buffers this is simple (set abd_buf of
- * the new ABD to the starting point within the original raw buffer), but
- * scattered ABDs are a little more complex. The new ABD makes a copy of the
- * relevant abd_chunks pointers (but not the underlying data). However, to
- * provide arbitrary rather than only chunk-aligned starting offsets, it also
- * tracks an abd_offset field which represents the starting point of the data
- * within the first chunk in abd_chunks. For both linear and scattered ABDs,
- * creating an offset ABD marks the original ABD as the offset's parent, and the
- * original ABD's abd_children refcount is incremented. This data allows us to
- * ensure the root ABD isn't deleted before its children.
- *
- * Most consumers should never need to know what type of ABD they're using --
- * the ABD public API ensures that it's possible to transparently switch from
- * using a linear ABD to a scattered one when doing so would be beneficial.
- *
- * If you need to use the data within an ABD directly, if you know it's linear
- * (because you allocated it) you can use abd_to_buf() to access the underlying
- * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
- * which will allocate a raw buffer if necessary. Use the abd_return_buf*
- * functions to return any raw buffers that are no longer necessary when you're
- * done using them.
- *
- * There are a variety of ABD APIs that implement basic buffer operations:
- * compare, copy, read, write, and fill with zeroes. If you need a custom
- * function which progressively accesses the whole ABD, use the abd_iterate_*
- * functions.
- */
-
-#include <sys/abd.h>
-#include <sys/param.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_znode.h>
-
-typedef struct abd_stats {
- kstat_named_t abdstat_struct_size;
- kstat_named_t abdstat_scatter_cnt;
- kstat_named_t abdstat_scatter_data_size;
- kstat_named_t abdstat_scatter_chunk_waste;
- kstat_named_t abdstat_linear_cnt;
- kstat_named_t abdstat_linear_data_size;
-} abd_stats_t;
-
-static abd_stats_t abd_stats = {
- /* Amount of memory occupied by all of the abd_t struct allocations */
- { "struct_size", KSTAT_DATA_UINT64 },
- /*
- * The number of scatter ABDs which are currently allocated, excluding
- * ABDs which don't own their data (for instance the ones which were
- * allocated through abd_get_offset()).
- */
- { "scatter_cnt", KSTAT_DATA_UINT64 },
- /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
- { "scatter_data_size", KSTAT_DATA_UINT64 },
- /*
- * The amount of space wasted at the end of the last chunk across all
- * scatter ABDs tracked by scatter_cnt.
- */
- { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
- /*
- * The number of linear ABDs which are currently allocated, excluding
- * ABDs which don't own their data (for instance the ones which were
- * allocated through abd_get_offset() and abd_get_from_buf()). If an
- * ABD takes ownership of its buf then it will become tracked.
- */
- { "linear_cnt", KSTAT_DATA_UINT64 },
- /* Amount of data stored in all linear ABDs tracked by linear_cnt */
- { "linear_data_size", KSTAT_DATA_UINT64 },
-};
-
-#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
-#define ABDSTAT_INCR(stat, val) \
- atomic_add_64(&abd_stats.stat.value.ui64, (val))
-#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
-#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
-
-/*
- * It is possible to make all future ABDs be linear by setting this to B_FALSE.
- * Otherwise, ABDs are allocated scattered by default unless the caller uses
- * abd_alloc_linear().
- */
-boolean_t zfs_abd_scatter_enabled = B_TRUE;
-
-/*
- * The size of the chunks ABD allocates. Because the sizes allocated from the
- * kmem_cache can't change, this tunable can only be modified at boot. Changing
- * it at runtime would cause ABD iteration to work incorrectly for ABDs which
- * were allocated with the old size, so a safeguard has been put in place which
- * will cause the machine to panic if you change it and try to access the data
- * within a scattered ABD.
- */
-size_t zfs_abd_chunk_size = 4096;
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-SYSCTL_DECL(_vfs_zfs);
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
- &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
- &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
-#endif
-
-#ifdef _KERNEL
-extern vmem_t *zio_alloc_arena;
-#endif
-
-kmem_cache_t *abd_chunk_cache;
-static kstat_t *abd_ksp;
-
-extern inline boolean_t abd_is_linear(abd_t *abd);
-extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
-extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
-extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size);
-extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
-extern inline void abd_zero(abd_t *abd, size_t size);
-
-static void *
-abd_alloc_chunk()
-{
- void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
- ASSERT3P(c, !=, NULL);
- return (c);
-}
-
-static void
-abd_free_chunk(void *c)
-{
- kmem_cache_free(abd_chunk_cache, c);
-}
-
-void
-abd_init(void)
-{
-#ifdef illumos
- vmem_t *data_alloc_arena = NULL;
-
-#ifdef _KERNEL
- data_alloc_arena = zio_alloc_arena;
-#endif
-
- /*
- * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
- * so that no allocator metadata is stored with the buffers.
- */
- abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
- NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);
-#else
- abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
- NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
-#endif
- abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
- sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
- if (abd_ksp != NULL) {
- abd_ksp->ks_data = &abd_stats;
- kstat_install(abd_ksp);
- }
-}
-
-void
-abd_fini(void)
-{
- if (abd_ksp != NULL) {
- kstat_delete(abd_ksp);
- abd_ksp = NULL;
- }
-
- kmem_cache_destroy(abd_chunk_cache);
- abd_chunk_cache = NULL;
-}
-
-static inline size_t
-abd_chunkcnt_for_bytes(size_t size)
-{
- return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
-}
-
-static inline size_t
-abd_scatter_chunkcnt(abd_t *abd)
-{
- ASSERT(!abd_is_linear(abd));
- return (abd_chunkcnt_for_bytes(
- abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
-}
-
-static inline void
-abd_verify(abd_t *abd)
-{
- ASSERT3U(abd->abd_size, >, 0);
- ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
- ABD_FLAG_OWNER | ABD_FLAG_META));
- IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
- IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
- if (abd_is_linear(abd)) {
- ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
- } else {
- ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
- zfs_abd_chunk_size);
- size_t n = abd_scatter_chunkcnt(abd);
- for (int i = 0; i < n; i++) {
- ASSERT3P(
- abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
- }
- }
-}
-
-static inline abd_t *
-abd_alloc_struct(size_t chunkcnt)
-{
- size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
- abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
- ASSERT3P(abd, !=, NULL);
- ABDSTAT_INCR(abdstat_struct_size, size);
-
- return (abd);
-}
-
-static inline void
-abd_free_struct(abd_t *abd)
-{
- size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
- int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
- kmem_free(abd, size);
- ABDSTAT_INCR(abdstat_struct_size, -size);
-}
-
-/*
- * Allocate an ABD, along with its own underlying data buffers. Use this if you
- * don't care whether the ABD is linear or not.
- */
-abd_t *
-abd_alloc(size_t size, boolean_t is_metadata)
-{
- if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size)
- return (abd_alloc_linear(size, is_metadata));
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- size_t n = abd_chunkcnt_for_bytes(size);
- abd_t *abd = abd_alloc_struct(n);
-
- abd->abd_flags = ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- abd->abd_u.abd_scatter.abd_offset = 0;
- abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
-
- for (int i = 0; i < n; i++) {
- void *c = abd_alloc_chunk();
- ASSERT3P(c, !=, NULL);
- abd->abd_u.abd_scatter.abd_chunks[i] = c;
- }
-
- ABDSTAT_BUMP(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, size);
- ABDSTAT_INCR(abdstat_scatter_chunk_waste,
- n * zfs_abd_chunk_size - size);
-
- return (abd);
-}
-
-static void
-abd_free_scatter(abd_t *abd)
-{
- size_t n = abd_scatter_chunkcnt(abd);
- for (int i = 0; i < n; i++) {
- abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
- ABDSTAT_INCR(abdstat_scatter_chunk_waste,
- abd->abd_size - n * zfs_abd_chunk_size);
-
- abd_free_struct(abd);
-}
-
-/*
- * Allocate an ABD that must be linear, along with its own underlying data
- * buffer. Only use this when it would be very annoying to write your ABD
- * consumer with a scattered ABD.
- */
-abd_t *
-abd_alloc_linear(size_t size, boolean_t is_metadata)
-{
- abd_t *abd = abd_alloc_struct(0);
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- if (is_metadata) {
- abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
- } else {
- abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
- }
-
- ABDSTAT_BUMP(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, size);
-
- return (abd);
-}
-
-static void
-abd_free_linear(abd_t *abd)
-{
- if (abd->abd_flags & ABD_FLAG_META) {
- zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
- } else {
- zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
-
- abd_free_struct(abd);
-}
-
-/*
- * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
- * abd_alloc_linear().
- */
-void
-abd_free(abd_t *abd)
-{
- abd_verify(abd);
- ASSERT3P(abd->abd_parent, ==, NULL);
- ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
- if (abd_is_linear(abd))
- abd_free_linear(abd);
- else
- abd_free_scatter(abd);
-}
-
-/*
- * Allocate an ABD of the same format (same metadata flag, same scatterize
- * setting) as another ABD.
- */
-abd_t *
-abd_alloc_sametype(abd_t *sabd, size_t size)
-{
- boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
- if (abd_is_linear(sabd)) {
- return (abd_alloc_linear(size, is_metadata));
- } else {
- return (abd_alloc(size, is_metadata));
- }
-}
-
-/*
- * If we're going to use this ABD for doing I/O using the block layer, the
- * consumer of the ABD data doesn't care if it's scattered or not, and we don't
- * plan to store this ABD in memory for a long period of time, we should
- * allocate the ABD type that requires the least data copying to do the I/O.
- *
- * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
- * using a scatter/gather list we should switch to that and replace this call
- * with vanilla abd_alloc().
- */
-abd_t *
-abd_alloc_for_io(size_t size, boolean_t is_metadata)
-{
- return (abd_alloc_linear(size, is_metadata));
-}
-
-/*
- * Allocate a new ABD to point to offset off of sabd. It shares the underlying
- * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
- * any derived ABDs exist.
- */
-abd_t *
-abd_get_offset(abd_t *sabd, size_t off)
-{
- abd_t *abd;
-
- abd_verify(sabd);
- ASSERT3U(off, <=, sabd->abd_size);
-
- if (abd_is_linear(sabd)) {
- abd = abd_alloc_struct(0);
-
- /*
- * Even if this buf is filesystem metadata, we only track that
- * if we own the underlying data buffer, which is not true in
- * this case. Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = ABD_FLAG_LINEAR;
-
- abd->abd_u.abd_linear.abd_buf =
- (char *)sabd->abd_u.abd_linear.abd_buf + off;
- } else {
- size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
- size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
- (new_offset / zfs_abd_chunk_size);
-
- abd = abd_alloc_struct(chunkcnt);
-
- /*
- * Even if this buf is filesystem metadata, we only track that
- * if we own the underlying data buffer, which is not true in
- * this case. Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = 0;
-
- abd->abd_u.abd_scatter.abd_offset =
- new_offset % zfs_abd_chunk_size;
- abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
-
- /* Copy the scatterlist starting at the correct offset */
- (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
- &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
- zfs_abd_chunk_size],
- chunkcnt * sizeof (void *));
- }
-
- abd->abd_size = sabd->abd_size - off;
- abd->abd_parent = sabd;
- zfs_refcount_create(&abd->abd_children);
- (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
-
- return (abd);
-}
-
-/*
- * Allocate a linear ABD structure for buf. You must free this with abd_put()
- * since the resulting ABD doesn't own its own buffer.
- */
-abd_t *
-abd_get_from_buf(void *buf, size_t size)
-{
- abd_t *abd = abd_alloc_struct(0);
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- /*
- * Even if this buf is filesystem metadata, we only track that if we
- * own the underlying data buffer, which is not true in this case.
- * Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = ABD_FLAG_LINEAR;
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- abd->abd_u.abd_linear.abd_buf = buf;
-
- return (abd);
-}
-
-/*
- * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
- * free the underlying scatterlist or buffer.
- */
-void
-abd_put(abd_t *abd)
-{
- abd_verify(abd);
- ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
-
- if (abd->abd_parent != NULL) {
- (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
- abd->abd_size, abd);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- abd_free_struct(abd);
-}
-
-/*
- * Get the raw buffer associated with a linear ABD.
- */
-void *
-abd_to_buf(abd_t *abd)
-{
- ASSERT(abd_is_linear(abd));
- abd_verify(abd);
- return (abd->abd_u.abd_linear.abd_buf);
-}
-
-/*
- * Borrow a raw buffer from an ABD without copying the contents of the ABD
- * into the buffer. If the ABD is scattered, this will allocate a raw buffer
- * whose contents are undefined. To copy over the existing data in the ABD, use
- * abd_borrow_buf_copy() instead.
- */
-void *
-abd_borrow_buf(abd_t *abd, size_t n)
-{
- void *buf;
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- buf = abd_to_buf(abd);
- } else {
- buf = zio_buf_alloc(n);
- }
- (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
-
- return (buf);
-}
-
-void *
-abd_borrow_buf_copy(abd_t *abd, size_t n)
-{
- void *buf = abd_borrow_buf(abd, n);
- if (!abd_is_linear(abd)) {
- abd_copy_to_buf(buf, abd, n);
- }
- return (buf);
-}
-
-/*
- * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
- * not change the contents of the ABD and will ASSERT that you didn't modify
- * the buffer since it was borrowed. If you want any changes you made to buf to
- * be copied back to abd, use abd_return_buf_copy() instead.
- */
-void
-abd_return_buf(abd_t *abd, void *buf, size_t n)
-{
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- ASSERT3P(buf, ==, abd_to_buf(abd));
- } else {
- ASSERT0(abd_cmp_buf(abd, buf, n));
- zio_buf_free(buf, n);
- }
- (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
-}
-
-void
-abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
-{
- if (!abd_is_linear(abd)) {
- abd_copy_from_buf(abd, buf, n);
- }
- abd_return_buf(abd, buf, n);
-}
-
-/*
- * Give this ABD ownership of the buffer that it's storing. Can only be used on
- * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
- * with abd_alloc_linear() which subsequently released ownership of their buf
- * with abd_release_ownership_of_buf().
- */
-void
-abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
-{
- ASSERT(abd_is_linear(abd));
- ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
- abd_verify(abd);
-
- abd->abd_flags |= ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
-
- ABDSTAT_BUMP(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
-}
-
-void
-abd_release_ownership_of_buf(abd_t *abd)
-{
- ASSERT(abd_is_linear(abd));
- ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
- abd_verify(abd);
-
- abd->abd_flags &= ~ABD_FLAG_OWNER;
- /* Disable this flag since we no longer own the data buffer */
- abd->abd_flags &= ~ABD_FLAG_META;
-
- ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
-}
-
-struct abd_iter {
- abd_t *iter_abd; /* ABD being iterated through */
- size_t iter_pos; /* position (relative to abd_offset) */
- void *iter_mapaddr; /* addr corresponding to iter_pos */
- size_t iter_mapsize; /* length of data valid at mapaddr */
-};
-
-static inline size_t
-abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
-{
- ASSERT(!abd_is_linear(aiter->iter_abd));
- return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
- aiter->iter_pos) % zfs_abd_chunk_size);
-}
-
-static inline size_t
-abd_iter_scatter_chunk_index(struct abd_iter *aiter)
-{
- ASSERT(!abd_is_linear(aiter->iter_abd));
- return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
- aiter->iter_pos) / zfs_abd_chunk_size);
-}
-
-/*
- * Initialize the abd_iter.
- */
-static void
-abd_iter_init(struct abd_iter *aiter, abd_t *abd)
-{
- abd_verify(abd);
- aiter->iter_abd = abd;
- aiter->iter_pos = 0;
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
-}
-
-/*
- * Advance the iterator by a certain amount. Cannot be called when a chunk is
- * in use. This can be safely called when the aiter has already exhausted, in
- * which case this does nothing.
- */
-static void
-abd_iter_advance(struct abd_iter *aiter, size_t amount)
-{
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
- ASSERT0(aiter->iter_mapsize);
-
- /* There's nothing left to advance to, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- aiter->iter_pos += amount;
-}
-
-/*
- * Map the current chunk into aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
-static void
-abd_iter_map(struct abd_iter *aiter)
-{
- void *paddr;
- size_t offset = 0;
-
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
- ASSERT0(aiter->iter_mapsize);
-
- /* Panic if someone has changed zfs_abd_chunk_size */
- IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
- aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);
-
- /* There's nothing left to iterate over, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- if (abd_is_linear(aiter->iter_abd)) {
- offset = aiter->iter_pos;
- aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
- paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
- } else {
- size_t index = abd_iter_scatter_chunk_index(aiter);
- offset = abd_iter_scatter_chunk_offset(aiter);
- aiter->iter_mapsize = zfs_abd_chunk_size - offset;
- paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
- }
- aiter->iter_mapaddr = (char *)paddr + offset;
-}
-
-/*
- * Unmap the current chunk from aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
-static void
-abd_iter_unmap(struct abd_iter *aiter)
-{
- /* There's nothing left to unmap, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- ASSERT3P(aiter->iter_mapaddr, !=, NULL);
- ASSERT3U(aiter->iter_mapsize, >, 0);
-
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
-}
-
-int
-abd_iterate_func(abd_t *abd, size_t off, size_t size,
- abd_iter_func_t *func, void *private)
-{
- int ret = 0;
- struct abd_iter aiter;
-
- abd_verify(abd);
- ASSERT3U(off + size, <=, abd->abd_size);
-
- abd_iter_init(&aiter, abd);
- abd_iter_advance(&aiter, off);
-
- while (size > 0) {
- abd_iter_map(&aiter);
-
- size_t len = MIN(aiter.iter_mapsize, size);
- ASSERT3U(len, >, 0);
-
- ret = func(aiter.iter_mapaddr, len, private);
-
- abd_iter_unmap(&aiter);
-
- if (ret != 0)
- break;
-
- size -= len;
- abd_iter_advance(&aiter, len);
- }
-
- return (ret);
-}
-
-struct buf_arg {
- void *arg_buf;
-};
-
-static int
-abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
-{
- struct buf_arg *ba_ptr = private;
-
- (void) memcpy(ba_ptr->arg_buf, buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (0);
-}
-
-/*
- * Copy abd to buf. (off is the offset in abd.)
- */
-void
-abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { buf };
-
- (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
- &ba_ptr);
-}
-
-static int
-abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
-{
- int ret;
- struct buf_arg *ba_ptr = private;
-
- ret = memcmp(buf, ba_ptr->arg_buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (ret);
-}
-
-/*
- * Compare the contents of abd to buf. (off is the offset in abd.)
- */
-int
-abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { (void *) buf };
-
- return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
-}
-
-static int
-abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
-{
- struct buf_arg *ba_ptr = private;
-
- (void) memcpy(buf, ba_ptr->arg_buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (0);
-}
-
-/*
- * Copy from buf to abd. (off is the offset in abd.)
- */
-void
-abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { (void *) buf };
-
- (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
- &ba_ptr);
-}
-
-/*ARGSUSED*/
-static int
-abd_zero_off_cb(void *buf, size_t size, void *private)
-{
- (void) memset(buf, 0, size);
- return (0);
-}
-
-/*
- * Zero out the abd from a particular offset to the end.
- */
-void
-abd_zero_off(abd_t *abd, size_t off, size_t size)
-{
- (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
-}
-
-/*
- * Iterate over two ABDs and call func incrementally on the two ABDs' data in
- * equal-sized chunks (passed to func as raw buffers). func could be called many
- * times during this iteration.
- */
-int
-abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
- size_t size, abd_iter_func2_t *func, void *private)
-{
- int ret = 0;
- struct abd_iter daiter, saiter;
-
- abd_verify(dabd);
- abd_verify(sabd);
-
- ASSERT3U(doff + size, <=, dabd->abd_size);
- ASSERT3U(soff + size, <=, sabd->abd_size);
-
- abd_iter_init(&daiter, dabd);
- abd_iter_init(&saiter, sabd);
- abd_iter_advance(&daiter, doff);
- abd_iter_advance(&saiter, soff);
-
- while (size > 0) {
- abd_iter_map(&daiter);
- abd_iter_map(&saiter);
-
- size_t dlen = MIN(daiter.iter_mapsize, size);
- size_t slen = MIN(saiter.iter_mapsize, size);
- size_t len = MIN(dlen, slen);
- ASSERT(dlen > 0 || slen > 0);
-
- ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
- private);
-
- abd_iter_unmap(&saiter);
- abd_iter_unmap(&daiter);
-
- if (ret != 0)
- break;
-
- size -= len;
- abd_iter_advance(&daiter, len);
- abd_iter_advance(&saiter, len);
- }
-
- return (ret);
-}
-
-/*ARGSUSED*/
-static int
-abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
-{
- (void) memcpy(dbuf, sbuf, size);
- return (0);
-}
-
-/*
- * Copy from sabd to dabd starting from soff and doff.
- */
-void
-abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
-{
- (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
- abd_copy_off_cb, NULL);
-}
-
-/*ARGSUSED*/
-static int
-abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
-{
- return (memcmp(bufa, bufb, size));
-}
-
-/*
- * Compares the first size bytes of two ABDs.
- */
-int
-abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
-{
- return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c
@@ -1,234 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/aggsum.h>
-
-/*
- * Aggregate-sum counters are a form of fanned-out counter, used when atomic
- * instructions on a single field cause enough CPU cache line contention to
- * slow system performance. Due to their increased overhead and the expense
- * involved with precisely reading from them, they should only be used in cases
- * where the write rate (increment/decrement) is much higher than the read rate
- * (get value).
- *
- * Aggregate sum counters are comprised of two basic parts, the core and the
- * buckets. The core counter contains a lock for the entire counter, as well
- * as the current upper and lower bounds on the value of the counter. The
- * aggsum_bucket structure contains a per-bucket lock to protect the contents of
- * the bucket, the current amount that this bucket has changed from the global
- * counter (called the delta), and the amount of increment and decrement we have
- * "borrowed" from the core counter.
- *
- * The basic operation of an aggsum is simple. Threads that wish to modify the
- * counter will modify one bucket's counter (determined by their current CPU, to
- * help minimize lock and cache contention). If the bucket already has
- * sufficient capacity borrowed from the core structure to handle their request,
- * they simply modify the delta and return. If the bucket does not, we clear
- * the bucket's current state (to prevent the borrowed amounts from getting too
- * large), and borrow more from the core counter. Borrowing is done by adding to
- * the upper bound (or subtracting from the lower bound) of the core counter,
- * and setting the borrow value for the bucket to the amount added (or
- * subtracted). Clearing the bucket is the opposite; we add the current delta
- * to both the lower and upper bounds of the core counter, subtract the borrowed
- * incremental from the upper bound, and add the borrowed decrement from the
- * lower bound. Note that only borrowing and clearing require access to the
- * core counter; since all other operations access CPU-local resources,
- * performance can be much higher than a traditional counter.
- *
- * Threads that wish to read from the counter have a slightly more challenging
- * task. It is fast to determine the upper and lower bounds of the aggum; this
- * does not require grabbing any locks. This suffices for cases where an
- * approximation of the aggsum's value is acceptable. However, if one needs to
- * know whether some specific value is above or below the current value in the
- * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
- * comparing the target value to the upper and lower bounds of the aggsum, and
- * then clearing a bucket. This proceeds until the target is outside of the
- * upper and lower bounds and we return a response, or the last bucket has been
- * cleared and we know that the target is equal to the aggsum's value. Finally,
- * the most expensive operation is determining the precise value of the aggsum.
- * To do this, we clear every bucket and then return the upper bound (which must
- * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
- * expensive is clearing buckets. This involves grabbing the global lock
- * (serializing against themselves and borrow operations), grabbing a bucket's
- * lock (preventing threads on those CPUs from modifying their delta), and
- * zeroing out the borrowed value (forcing that thread to borrow on its next
- * request, which will also be expensive). This is what makes aggsums well
- * suited for write-many read-rarely operations.
- */
-
-/*
- * We will borrow aggsum_borrow_multiplier times the current request, so we will
- * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
- * aggsum_delta().
- */
-static uint_t aggsum_borrow_multiplier = 10;
-
-void
-aggsum_init(aggsum_t *as, uint64_t value)
-{
- bzero(as, sizeof (*as));
- as->as_lower_bound = as->as_upper_bound = value;
- mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
- as->as_numbuckets = boot_ncpus;
- as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
- KM_SLEEP);
- for (int i = 0; i < as->as_numbuckets; i++) {
- mutex_init(&as->as_buckets[i].asc_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
-}
-
-void
-aggsum_fini(aggsum_t *as)
-{
- for (int i = 0; i < as->as_numbuckets; i++)
- mutex_destroy(&as->as_buckets[i].asc_lock);
- kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
- mutex_destroy(&as->as_lock);
-}
-
-int64_t
-aggsum_lower_bound(aggsum_t *as)
-{
- return (as->as_lower_bound);
-}
-
-int64_t
-aggsum_upper_bound(aggsum_t *as)
-{
- return (as->as_upper_bound);
-}
-
-static void
-aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
-{
- ASSERT(MUTEX_HELD(&as->as_lock));
- ASSERT(MUTEX_HELD(&asb->asc_lock));
-
- /*
- * We use atomic instructions for this because we read the upper and
- * lower bounds without the lock, so we need stores to be atomic.
- */
- atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
- asb->asc_delta + asb->asc_borrowed);
- atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
- asb->asc_delta - asb->asc_borrowed);
- asb->asc_delta = 0;
- asb->asc_borrowed = 0;
-}
-
-uint64_t
-aggsum_value(aggsum_t *as)
-{
- int64_t rv;
-
- mutex_enter(&as->as_lock);
- if (as->as_lower_bound == as->as_upper_bound) {
- rv = as->as_lower_bound;
- for (int i = 0; i < as->as_numbuckets; i++) {
- ASSERT0(as->as_buckets[i].asc_delta);
- ASSERT0(as->as_buckets[i].asc_borrowed);
- }
- mutex_exit(&as->as_lock);
- return (rv);
- }
- for (int i = 0; i < as->as_numbuckets; i++) {
- struct aggsum_bucket *asb = &as->as_buckets[i];
- mutex_enter(&asb->asc_lock);
- aggsum_flush_bucket(as, asb);
- mutex_exit(&asb->asc_lock);
- }
- VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
- rv = as->as_lower_bound;
- mutex_exit(&as->as_lock);
-
- return (rv);
-}
-
-void
-aggsum_add(aggsum_t *as, int64_t delta)
-{
- struct aggsum_bucket *asb =
- &as->as_buckets[CPU_SEQID % as->as_numbuckets];
- int64_t borrow;
-
- /* Try fast path if we already borrowed enough before. */
- mutex_enter(&asb->asc_lock);
- if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
- asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
- asb->asc_delta += delta;
- mutex_exit(&asb->asc_lock);
- return;
- }
- mutex_exit(&asb->asc_lock);
-
- /*
- * We haven't borrowed enough. Take the global lock and borrow
- * considering what is requested now and what we borrowed before.
- */
- borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
- mutex_enter(&as->as_lock);
- mutex_enter(&asb->asc_lock);
- delta += asb->asc_delta;
- asb->asc_delta = 0;
- if (borrow >= asb->asc_borrowed)
- borrow -= asb->asc_borrowed;
- else
- borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
- asb->asc_borrowed += borrow;
- atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
- delta - borrow);
- atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
- delta + borrow);
- mutex_exit(&asb->asc_lock);
- mutex_exit(&as->as_lock);
-}
-
-/*
- * Compare the aggsum value to target efficiently. Returns -1 if the value
- * represented by the aggsum is less than target, 1 if it's greater, and 0 if
- * they are equal.
- */
-int
-aggsum_compare(aggsum_t *as, uint64_t target)
-{
- if (as->as_upper_bound < target)
- return (-1);
- if (as->as_lower_bound > target)
- return (1);
- mutex_enter(&as->as_lock);
- for (int i = 0; i < as->as_numbuckets; i++) {
- struct aggsum_bucket *asb = &as->as_buckets[i];
- mutex_enter(&asb->asc_lock);
- aggsum_flush_bucket(as, asb);
- mutex_exit(&asb->asc_lock);
- if (as->as_upper_bound < target) {
- mutex_exit(&as->as_lock);
- return (-1);
- }
- if (as->as_lower_bound > target) {
- mutex_exit(&as->as_lock);
- return (1);
- }
- }
- VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
- ASSERT3U(as->as_lower_bound, ==, target);
- mutex_exit(&as->as_lock);
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -1,8569 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
- */
-
-/*
- * DVA-based Adjustable Replacement Cache
- *
- * While much of the theory of operation used here is
- * based on the self-tuning, low overhead replacement cache
- * presented by Megiddo and Modha at FAST 2003, there are some
- * significant differences:
- *
- * 1. The Megiddo and Modha model assumes any page is evictable.
- * Pages in its cache cannot be "locked" into memory. This makes
- * the eviction algorithm simple: evict the last page in the list.
- * This also make the performance characteristics easy to reason
- * about. Our cache is not so simple. At any given moment, some
- * subset of the blocks in the cache are un-evictable because we
- * have handed out a reference to them. Blocks are only evictable
- * when there are no external references active. This makes
- * eviction far more problematic: we choose to evict the evictable
- * blocks that are the "lowest" in the list.
- *
- * There are times when it is not possible to evict the requested
- * space. In these circumstances we are unable to adjust the cache
- * size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slows the flow of new data
- * into the cache until we can make space available.
- *
- * 2. The Megiddo and Modha model assumes a fixed cache size.
- * Pages are evicted when the cache is full and there is a cache
- * miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory pressure from the
- * operating system: decreasing its size when system memory is
- * tight.
- *
- * 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefore exactly the same size. So
- * when adjusting the cache size following a cache miss, its simply
- * a matter of choosing a single page to evict. In our model, we
- * have variable sized cache blocks (rangeing from 512 bytes to
- * 128K bytes). We therefore choose a set of blocks to evict to make
- * space for a cache miss that approximates as closely as possible
- * the space used by the new block.
- *
- * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
- * by N. Megiddo & D. Modha, FAST 2003
- */
-
-/*
- * The locking model:
- *
- * A new reference to a cache buffer can be obtained in two
- * ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal ARC algorithms for
- * adjusting the cache use method 2. We therefore provide two
- * types of locks: 1) the hash table lock array, and 2) the
- * ARC list locks.
- *
- * Buffers do not have their own mutexes, rather they rely on the
- * hash table mutexes for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexes).
- *
- * buf_hash_find() returns the appropriate mutex (held) when it
- * locates the requested buffer in the hash table. It returns
- * NULL for the mutex if the buffer was not in the table.
- *
- * buf_hash_remove() expects the appropriate hash mutex to be
- * already held before it is invoked.
- *
- * Each ARC state also has a mutex which is used to protect the
- * buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an ARC list lock you
- * must use: mutex_tryenter() to avoid deadlock. Also note that
- * the active state mutex must be held before the ghost state mutex.
- *
- * It as also possible to register a callback which is run when the
- * arc_meta_limit is reached and no buffers can be safely evicted. In
- * this case the arc user should drop a reference on some arc buffers so
- * they can be reclaimed and the arc_meta_limit honored. For example,
- * when using the ZPL each dentry holds a references on a znode. These
- * dentries must be pruned before the arc buffer holding the znode can
- * be safely evicted.
- *
- * Note that the majority of the performance stats are manipulated
- * with atomic operations.
- *
- * The L2ARC uses the l2ad_mtx on each vdev for the following:
- *
- * - L2ARC buflist creation
- * - L2ARC buflist eviction
- * - L2ARC write completion, which walks L2ARC buflists
- * - ARC header destruction, as it removes from L2ARC buflists
- * - ARC header release, as it removes from L2ARC buflists
- */
-
-/*
- * ARC operation:
- *
- * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
- * This structure can point either to a block that is still in the cache or to
- * one that is only accessible in an L2 ARC device, or it can provide
- * information about a block that was recently evicted. If a block is
- * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
- * information to retrieve it from the L2ARC device. This information is
- * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
- * that is in this state cannot access the data directly.
- *
- * Blocks that are actively being referenced or have not been evicted
- * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
- * the arc_buf_hdr_t that will point to the data block in memory. A block can
- * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
- * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
- * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
- *
- * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
- * ability to store the physical data (b_pabd) associated with the DVA of the
- * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
- * it will match its on-disk compression characteristics. This behavior can be
- * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
- * compressed ARC functionality is disabled, the b_pabd will point to an
- * uncompressed version of the on-disk data.
- *
- * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
- * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
- * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
- * consumer. The ARC will provide references to this data and will keep it
- * cached until it is no longer in use. The ARC caches only the L1ARC's physical
- * data block and will evict any arc_buf_t that is no longer referenced. The
- * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
- * "overhead_size" kstat.
- *
- * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
- * compressed form. The typical case is that consumers will want uncompressed
- * data, and when that happens a new data buffer is allocated where the data is
- * decompressed for them to use. Currently the only consumer who wants
- * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
- * exists on disk. When this happens, the arc_buf_t's data buffer is shared
- * with the arc_buf_hdr_t.
- *
- * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
- * first one is owned by a compressed send consumer (and therefore references
- * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
- * used by any other consumer (and has its own uncompressed copy of the data
- * buffer).
- *
- * arc_buf_hdr_t
- * +-----------+
- * | fields |
- * | common to |
- * | L1- and |
- * | L2ARC |
- * +-----------+
- * | l2arc_buf_hdr_t
- * | |
- * +-----------+
- * | l1arc_buf_hdr_t
- * | | arc_buf_t
- * | b_buf +------------>+-----------+ arc_buf_t
- * | b_pabd +-+ |b_next +---->+-----------+
- * +-----------+ | |-----------| |b_next +-->NULL
- * | |b_comp = T | +-----------+
- * | |b_data +-+ |b_comp = F |
- * | +-----------+ | |b_data +-+
- * +->+------+ | +-----------+ |
- * compressed | | | |
- * data | |<--------------+ | uncompressed
- * +------+ compressed, | data
- * shared +-->+------+
- * data | |
- * | |
- * +------+
- *
- * When a consumer reads a block, the ARC must first look to see if the
- * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
- * arc_buf_t and either copies uncompressed data into a new data buffer from an
- * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
- * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
- * hdr is compressed and the desired compression characteristics of the
- * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
- * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
- * the last buffer in the hdr's b_buf list, however a shared compressed buf can
- * be anywhere in the hdr's list.
- *
- * The diagram below shows an example of an uncompressed ARC hdr that is
- * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
- * the last element in the buf list):
- *
- * arc_buf_hdr_t
- * +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- * l2arc_buf_hdr_t| |
- * | |
- * +-----------+
- * l1arc_buf_hdr_t| |
- * | | arc_buf_t (shared)
- * | b_buf +------------>+---------+ arc_buf_t
- * | | |b_next +---->+---------+
- * | b_pabd +-+ |---------| |b_next +-->NULL
- * +-----------+ | | | +---------+
- * | |b_data +-+ | |
- * | +---------+ | |b_data +-+
- * +->+------+ | +---------+ |
- * | | | |
- * uncompressed | | | |
- * data +------+ | |
- * ^ +->+------+ |
- * | uncompressed | | |
- * | data | | |
- * | +------+ |
- * +---------------------------------+
- *
- * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
- * since the physical block is about to be rewritten. The new data contents
- * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
- * it may compress the data before writing it to disk. The ARC will be called
- * with the transformed data and will bcopy the transformed on-disk block into
- * a newly allocated b_pabd. Writes are always done into buffers which have
- * either been loaned (and hence are new and don't have other readers) or
- * buffers which have been released (and hence have their own hdr, if there
- * were originally other readers of the buf's original hdr). This ensures that
- * the ARC only needs to update a single buf and its hdr after a write occurs.
- *
- * When the L2ARC is in use, it will also take advantage of the b_pabd. The
- * L2ARC will always write the contents of b_pabd to the L2ARC. This means
- * that when compressed ARC is enabled that the L2ARC blocks are identical
- * to the on-disk block in the main data pool. This provides a significant
- * advantage since the ARC can leverage the bp's checksum when reading from the
- * L2ARC to determine if the contents are valid. However, if the compressed
- * ARC is disabled, then the L2ARC's block must be transformed to look
- * like the physical block in the main data pool before comparing the
- * checksum and determining its validity.
- */
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_compress.h>
-#include <sys/zio_checksum.h>
-#include <sys/zfs_context.h>
-#include <sys/arc.h>
-#include <sys/refcount.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/dsl_pool.h>
-#include <sys/zio_checksum.h>
-#include <sys/multilist.h>
-#include <sys/abd.h>
-#ifdef _KERNEL
-#include <sys/dnlc.h>
-#include <sys/racct.h>
-#endif
-#include <sys/callb.h>
-#include <sys/kstat.h>
-#include <sys/trim_map.h>
-#include <sys/zthr.h>
-#include <zfs_fletcher.h>
-#include <sys/sdt.h>
-#include <sys/aggsum.h>
-#include <sys/cityhash.h>
-
-#include <machine/vmparam.h>
-
-#ifdef illumos
-#ifndef _KERNEL
-/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
-boolean_t arc_watch = B_FALSE;
-int arc_procfd;
-#endif
-#endif /* illumos */
-
-/*
- * This thread's job is to keep enough free memory in the system, by
- * calling arc_kmem_reap_now() plus arc_shrink(), which improves
- * arc_available_memory().
- */
-static zthr_t *arc_reap_zthr;
-
-/*
- * This thread's job is to keep arc_size under arc_c, by calling
- * arc_adjust(), which improves arc_is_overflowing().
- */
-static zthr_t *arc_adjust_zthr;
-
-static kmutex_t arc_adjust_lock;
-static kcondvar_t arc_adjust_waiters_cv;
-static boolean_t arc_adjust_needed = B_FALSE;
-
-static kmutex_t arc_dnlc_evicts_lock;
-static kcondvar_t arc_dnlc_evicts_cv;
-static boolean_t arc_dnlc_evicts_thread_exit;
-
-uint_t arc_reduce_dnlc_percent = 3;
-
-/*
- * The number of headers to evict in arc_evict_state_impl() before
- * dropping the sublist lock and evicting from another sublist. A lower
- * value means we're more likely to evict the "correct" header (i.e. the
- * oldest header in the arc state), but comes with higher overhead
- * (i.e. more invocations of arc_evict_state_impl()).
- */
-int zfs_arc_evict_batch_limit = 10;
-
-/* number of seconds before growing cache again */
-int arc_grow_retry = 60;
-
-/*
- * Minimum time between calls to arc_kmem_reap_soon(). Note that this will
- * be converted to ticks, so with the default hz=100, a setting of 15 ms
- * will actually wait 2 ticks, or 20ms.
- */
-int arc_kmem_cache_reap_retry_ms = 1000;
-
-/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
-int zfs_arc_overflow_shift = 8;
-
-/* shift of arc_c for calculating both min and max arc_p */
-int arc_p_min_shift = 4;
-
-/* log2(fraction of arc to reclaim) */
-int arc_shrink_shift = 7;
-
-/*
- * log2(fraction of ARC which must be free to allow growing).
- * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
- * when reading a new block into the ARC, we will evict an equal-sized block
- * from the ARC.
- *
- * This must be less than arc_shrink_shift, so that when we shrink the ARC,
- * we will still not allow it to grow.
- */
-int arc_no_grow_shift = 5;
-
-
-/*
- * minimum lifespan of a prefetch block in clock ticks
- * (initialized in arc_init())
- */
-static int zfs_arc_min_prefetch_ms = 1;
-static int zfs_arc_min_prescient_prefetch_ms = 6;
-
-/*
- * If this percent of memory is free, don't throttle.
- */
-int arc_lotsfree_percent = 10;
-
-static boolean_t arc_initialized;
-extern boolean_t zfs_prefetch_disable;
-
-/*
- * The arc has filled available memory and has now warmed up.
- */
-static boolean_t arc_warm;
-
-/*
- * log2 fraction of the zio arena to keep free.
- */
-int arc_zio_arena_free_shift = 2;
-
-/*
- * These tunables are for performance analysis.
- */
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
-uint64_t zfs_arc_meta_min = 0;
-uint64_t zfs_arc_dnode_limit = 0;
-uint64_t zfs_arc_dnode_reduce_percent = 10;
-int zfs_arc_grow_retry = 0;
-int zfs_arc_shrink_shift = 0;
-int zfs_arc_no_grow_shift = 0;
-int zfs_arc_p_min_shift = 0;
-uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
-u_int zfs_arc_free_target = 0;
-
-/* Absolute min for arc min / max is 16MB. */
-static uint64_t arc_abs_min = 16 << 20;
-
-/*
- * ARC dirty data constraints for arc_tempreserve_space() throttle
- */
-uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
-uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
-uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
-
-boolean_t zfs_compressed_arc_enabled = B_TRUE;
-
-static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-static void
-arc_free_target_init(void *unused __unused)
-{
-
- zfs_arc_free_target = vm_cnt.v_free_target;
-}
-SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
- arc_free_target_init, NULL);
-
-TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
-TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
-TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
-TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
-TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
- 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
- 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
- CTLTYPE_U32 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
- 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
- "log2(fraction of ARC which must be free to allow growing)");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
- &zfs_arc_average_blocksize, 0,
- "ARC average blocksize");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
- &arc_shrink_shift, 0,
- "log2(fraction of arc to reclaim)");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
- &arc_grow_retry, 0,
- "Wait in seconds before considering growing ARC");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
- &zfs_compressed_arc_enabled, 0,
- "Enable compressed ARC");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN,
- &arc_kmem_cache_reap_retry_ms, 0,
- "Interval between ARC kmem_cache reapings");
-
-/*
- * We don't have a tunable for arc_free_target due to the dependency on
- * pagedaemon initialisation.
- */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
- CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
- sysctl_vfs_zfs_arc_free_target, "IU",
- "Desired number of free pages below which ARC triggers reclaim");
-
-static int
-sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
-{
- u_int val;
- int err;
-
- val = zfs_arc_free_target;
- err = sysctl_handle_int(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val < minfree)
- return (EINVAL);
- if (val > vm_cnt.v_page_count)
- return (EINVAL);
-
- zfs_arc_free_target = val;
-
- return (0);
-}
-
-/*
- * Must be declared here, before the definition of corresponding kstat
- * macro which uses the same names will confuse the compiler.
- */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
- sysctl_vfs_zfs_arc_meta_limit, "QU",
- "ARC metadata limit");
-#endif
-
-/*
- * Note that buffers can be in one of 6 states:
- * ARC_anon - anonymous (discussed below)
- * ARC_mru - recently used, currently cached
- * ARC_mru_ghost - recentely used, no longer in cache
- * ARC_mfu - frequently used, currently cached
- * ARC_mfu_ghost - frequently used, no longer in cache
- * ARC_l2c_only - exists in L2ARC but not other states
- * When there are no active references to the buffer, they are
- * are linked onto a list in one of these arc states. These are
- * the only buffers that can be evicted or deleted. Within each
- * state there are multiple lists, one for meta-data and one for
- * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
- * etc.) is tracked separately so that it can be managed more
- * explicitly: favored over data, limited explicitly.
- *
- * Anonymous buffers are buffers that are not associated with
- * a DVA. These are buffers that hold dirty block copies
- * before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru
- * that cannot be freed. Generally, they will aquire a DVA
- * as they are written and migrate onto the arc_mru list.
- *
- * The ARC_l2c_only state is for buffers that are in the second
- * level ARC but no longer in any of the ARC_m* lists. The second
- * level ARC itself may also contain buffers that are in any of
- * the ARC_m* states - meaning that a buffer can exist in two
- * places. The reason for the ARC_l2c_only state is to keep the
- * buffer header in the hash table, so that reads that hit the
- * second level ARC benefit from these fast lookups.
- */
-
-typedef struct arc_state {
- /*
- * list of evictable buffers
- */
- multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
- /*
- * total amount of evictable data in this state
- */
- zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
- /*
- * total amount of data in this state; this includes: evictable,
- * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
- */
- zfs_refcount_t arcs_size;
- /*
- * supports the "dbufs" kstat
- */
- arc_state_type_t arcs_state;
-} arc_state_t;
-
-/*
- * Percentage that can be consumed by dnodes of ARC meta buffers.
- */
-int zfs_arc_meta_prune = 10000;
-unsigned long zfs_arc_dnode_limit_percent = 10;
-int zfs_arc_meta_strategy = ARC_STRATEGY_META_ONLY;
-int zfs_arc_meta_adjust_restarts = 4096;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_meta_strategy, CTLFLAG_RWTUN,
- &zfs_arc_meta_strategy, 0,
- "ARC metadata reclamation strategy "
- "(0 = metadata only, 1 = balance data and metadata)");
-
-/* The 6 states: */
-static arc_state_t ARC_anon;
-static arc_state_t ARC_mru;
-static arc_state_t ARC_mru_ghost;
-static arc_state_t ARC_mfu;
-static arc_state_t ARC_mfu_ghost;
-static arc_state_t ARC_l2c_only;
-
-typedef struct arc_stats {
- kstat_named_t arcstat_hits;
- kstat_named_t arcstat_misses;
- kstat_named_t arcstat_demand_data_hits;
- kstat_named_t arcstat_demand_data_misses;
- kstat_named_t arcstat_demand_metadata_hits;
- kstat_named_t arcstat_demand_metadata_misses;
- kstat_named_t arcstat_prefetch_data_hits;
- kstat_named_t arcstat_prefetch_data_misses;
- kstat_named_t arcstat_prefetch_metadata_hits;
- kstat_named_t arcstat_prefetch_metadata_misses;
- kstat_named_t arcstat_mru_hits;
- kstat_named_t arcstat_mru_ghost_hits;
- kstat_named_t arcstat_mfu_hits;
- kstat_named_t arcstat_mfu_ghost_hits;
- kstat_named_t arcstat_allocated;
- kstat_named_t arcstat_deleted;
- /*
- * Number of buffers that could not be evicted because the hash lock
- * was held by another thread. The lock may not necessarily be held
- * by something using the same buffer, since hash locks are shared
- * by multiple buffers.
- */
- kstat_named_t arcstat_mutex_miss;
- /*
- * Number of buffers skipped when updating the access state due to the
- * header having already been released after acquiring the hash lock.
- */
- kstat_named_t arcstat_access_skip;
- /*
- * Number of buffers skipped because they have I/O in progress, are
- * indirect prefetch buffers that have not lived long enough, or are
- * not from the spa we're trying to evict from.
- */
- kstat_named_t arcstat_evict_skip;
- /*
- * Number of times arc_evict_state() was unable to evict enough
- * buffers to reach it's target amount.
- */
- kstat_named_t arcstat_evict_not_enough;
- kstat_named_t arcstat_evict_l2_cached;
- kstat_named_t arcstat_evict_l2_eligible;
- kstat_named_t arcstat_evict_l2_ineligible;
- kstat_named_t arcstat_evict_l2_skip;
- kstat_named_t arcstat_hash_elements;
- kstat_named_t arcstat_hash_elements_max;
- kstat_named_t arcstat_hash_collisions;
- kstat_named_t arcstat_hash_chains;
- kstat_named_t arcstat_hash_chain_max;
- kstat_named_t arcstat_p;
- kstat_named_t arcstat_c;
- kstat_named_t arcstat_c_min;
- kstat_named_t arcstat_c_max;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_size;
- /*
- * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
- * Note that the compressed bytes may match the uncompressed bytes
- * if the block is either not compressed or compressed arc is disabled.
- */
- kstat_named_t arcstat_compressed_size;
- /*
- * Uncompressed size of the data stored in b_pabd. If compressed
- * arc is disabled then this value will be identical to the stat
- * above.
- */
- kstat_named_t arcstat_uncompressed_size;
- /*
- * Number of bytes stored in all the arc_buf_t's. This is classified
- * as "overhead" since this data is typically short-lived and will
- * be evicted from the arc when it becomes unreferenced unless the
- * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
- * values have been set (see comment in dbuf.c for more information).
- */
- kstat_named_t arcstat_overhead_size;
- /*
- * Number of bytes consumed by internal ARC structures necessary
- * for tracking purposes; these structures are not actually
- * backed by ARC buffers. This includes arc_buf_hdr_t structures
- * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
- * caches), and arc_buf_t structures (allocated via arc_buf_t
- * cache).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_hdr_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_DATA. This is generally consumed by buffers backing
- * on disk user data (e.g. plain file contents).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_data_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_METADATA. This is generally consumed by buffers
- * backing on disk data that is used for internal ZFS
- * structures (e.g. ZAP, dnode, indirect blocks, etc).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_metadata_size;
- /*
- * Number of bytes consumed by dmu_buf_impl_t objects.
- */
- kstat_named_t arcstat_dbuf_size;
- /*
- * Number of bytes consumed by dnode_t objects.
- */
- kstat_named_t arcstat_dnode_size;
- /*
- * Number of bytes consumed by bonus buffers.
- */
- kstat_named_t arcstat_bonus_size;
-#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
- /*
- * Sum of the previous three counters, provided for compatibility.
- */
- kstat_named_t arcstat_other_size;
-#endif
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_anon state. This includes *all* buffers in the arc_anon
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mru state. This includes *all* buffers in the arc_mru
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mru_ghost state. The key thing to note
- * here, is the fact that this size doesn't actually indicate
- * RAM consumption. The ghost lists only consist of headers and
- * don't actually have ARC buffers linked off of these headers.
- * Thus, *if* the headers had associated ARC buffers, these
- * buffers *would have* consumed this number of bytes.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mfu state. This includes *all* buffers in the arc_mfu
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_size;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
- * state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_METADATA, and reside in the
- * arc_mfu state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mfu_ghost state. See the comment above
- * arcstat_mru_ghost_size for more details.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_metadata;
- kstat_named_t arcstat_l2_hits;
- kstat_named_t arcstat_l2_misses;
- kstat_named_t arcstat_l2_feeds;
- kstat_named_t arcstat_l2_rw_clash;
- kstat_named_t arcstat_l2_read_bytes;
- kstat_named_t arcstat_l2_write_bytes;
- kstat_named_t arcstat_l2_writes_sent;
- kstat_named_t arcstat_l2_writes_done;
- kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_lock_retry;
- kstat_named_t arcstat_l2_evict_lock_retry;
- kstat_named_t arcstat_l2_evict_reading;
- kstat_named_t arcstat_l2_evict_l1cached;
- kstat_named_t arcstat_l2_free_on_write;
- kstat_named_t arcstat_l2_abort_lowmem;
- kstat_named_t arcstat_l2_cksum_bad;
- kstat_named_t arcstat_l2_io_error;
- kstat_named_t arcstat_l2_lsize;
- kstat_named_t arcstat_l2_psize;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_l2_hdr_size;
- kstat_named_t arcstat_l2_write_trylock_fail;
- kstat_named_t arcstat_l2_write_passed_headroom;
- kstat_named_t arcstat_l2_write_spa_mismatch;
- kstat_named_t arcstat_l2_write_in_l2;
- kstat_named_t arcstat_l2_write_hdr_io_in_progress;
- kstat_named_t arcstat_l2_write_not_cacheable;
- kstat_named_t arcstat_l2_write_full;
- kstat_named_t arcstat_l2_write_buffer_iter;
- kstat_named_t arcstat_l2_write_pios;
- kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
- kstat_named_t arcstat_l2_write_buffer_list_iter;
- kstat_named_t arcstat_l2_write_buffer_list_null_iter;
- kstat_named_t arcstat_memory_throttle_count;
- kstat_named_t arcstat_memory_direct_count;
- kstat_named_t arcstat_memory_indirect_count;
- kstat_named_t arcstat_memory_all_bytes;
- kstat_named_t arcstat_memory_free_bytes;
- kstat_named_t arcstat_memory_available_bytes;
- kstat_named_t arcstat_no_grow;
- kstat_named_t arcstat_tempreserve;
- kstat_named_t arcstat_loaned_bytes;
- kstat_named_t arcstat_prune;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_meta_used;
- kstat_named_t arcstat_meta_limit;
- kstat_named_t arcstat_dnode_limit;
- kstat_named_t arcstat_meta_max;
- kstat_named_t arcstat_meta_min;
- kstat_named_t arcstat_async_upgrade_sync;
- kstat_named_t arcstat_demand_hit_predictive_prefetch;
- kstat_named_t arcstat_demand_hit_prescient_prefetch;
-} arc_stats_t;
-
-static arc_stats_t arc_stats = {
- { "hits", KSTAT_DATA_UINT64 },
- { "misses", KSTAT_DATA_UINT64 },
- { "demand_data_hits", KSTAT_DATA_UINT64 },
- { "demand_data_misses", KSTAT_DATA_UINT64 },
- { "demand_metadata_hits", KSTAT_DATA_UINT64 },
- { "demand_metadata_misses", KSTAT_DATA_UINT64 },
- { "prefetch_data_hits", KSTAT_DATA_UINT64 },
- { "prefetch_data_misses", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
- { "mru_hits", KSTAT_DATA_UINT64 },
- { "mru_ghost_hits", KSTAT_DATA_UINT64 },
- { "mfu_hits", KSTAT_DATA_UINT64 },
- { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
- { "allocated", KSTAT_DATA_UINT64 },
- { "deleted", KSTAT_DATA_UINT64 },
- { "mutex_miss", KSTAT_DATA_UINT64 },
- { "access_skip", KSTAT_DATA_UINT64 },
- { "evict_skip", KSTAT_DATA_UINT64 },
- { "evict_not_enough", KSTAT_DATA_UINT64 },
- { "evict_l2_cached", KSTAT_DATA_UINT64 },
- { "evict_l2_eligible", KSTAT_DATA_UINT64 },
- { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
- { "evict_l2_skip", KSTAT_DATA_UINT64 },
- { "hash_elements", KSTAT_DATA_UINT64 },
- { "hash_elements_max", KSTAT_DATA_UINT64 },
- { "hash_collisions", KSTAT_DATA_UINT64 },
- { "hash_chains", KSTAT_DATA_UINT64 },
- { "hash_chain_max", KSTAT_DATA_UINT64 },
- { "p", KSTAT_DATA_UINT64 },
- { "c", KSTAT_DATA_UINT64 },
- { "c_min", KSTAT_DATA_UINT64 },
- { "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 },
- { "compressed_size", KSTAT_DATA_UINT64 },
- { "uncompressed_size", KSTAT_DATA_UINT64 },
- { "overhead_size", KSTAT_DATA_UINT64 },
- { "hdr_size", KSTAT_DATA_UINT64 },
- { "data_size", KSTAT_DATA_UINT64 },
- { "metadata_size", KSTAT_DATA_UINT64 },
- { "dbuf_size", KSTAT_DATA_UINT64 },
- { "dnode_size", KSTAT_DATA_UINT64 },
- { "bonus_size", KSTAT_DATA_UINT64 },
-#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
- { "other_size", KSTAT_DATA_UINT64 },
-#endif
- { "anon_size", KSTAT_DATA_UINT64 },
- { "anon_evictable_data", KSTAT_DATA_UINT64 },
- { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mru_size", KSTAT_DATA_UINT64 },
- { "mru_evictable_data", KSTAT_DATA_UINT64 },
- { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mru_ghost_size", KSTAT_DATA_UINT64 },
- { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
- { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mfu_size", KSTAT_DATA_UINT64 },
- { "mfu_evictable_data", KSTAT_DATA_UINT64 },
- { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mfu_ghost_size", KSTAT_DATA_UINT64 },
- { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
- { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
- { "l2_hits", KSTAT_DATA_UINT64 },
- { "l2_misses", KSTAT_DATA_UINT64 },
- { "l2_feeds", KSTAT_DATA_UINT64 },
- { "l2_rw_clash", KSTAT_DATA_UINT64 },
- { "l2_read_bytes", KSTAT_DATA_UINT64 },
- { "l2_write_bytes", KSTAT_DATA_UINT64 },
- { "l2_writes_sent", KSTAT_DATA_UINT64 },
- { "l2_writes_done", KSTAT_DATA_UINT64 },
- { "l2_writes_error", KSTAT_DATA_UINT64 },
- { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
- { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
- { "l2_evict_reading", KSTAT_DATA_UINT64 },
- { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
- { "l2_free_on_write", KSTAT_DATA_UINT64 },
- { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
- { "l2_cksum_bad", KSTAT_DATA_UINT64 },
- { "l2_io_error", KSTAT_DATA_UINT64 },
- { "l2_size", KSTAT_DATA_UINT64 },
- { "l2_asize", KSTAT_DATA_UINT64 },
- { "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
- { "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
- { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
- { "l2_write_in_l2", KSTAT_DATA_UINT64 },
- { "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
- { "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
- { "l2_write_full", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
- { "l2_write_pios", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 },
- { "memory_direct_count", KSTAT_DATA_UINT64 },
- { "memory_indirect_count", KSTAT_DATA_UINT64 },
- { "memory_all_bytes", KSTAT_DATA_UINT64 },
- { "memory_free_bytes", KSTAT_DATA_UINT64 },
- { "memory_available_bytes", KSTAT_DATA_UINT64 },
- { "arc_no_grow", KSTAT_DATA_UINT64 },
- { "arc_tempreserve", KSTAT_DATA_UINT64 },
- { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
- { "arc_prune", KSTAT_DATA_UINT64 },
- { "arc_meta_used", KSTAT_DATA_UINT64 },
- { "arc_meta_limit", KSTAT_DATA_UINT64 },
- { "arc_dnode_limit", KSTAT_DATA_UINT64 },
- { "arc_meta_max", KSTAT_DATA_UINT64 },
- { "arc_meta_min", KSTAT_DATA_UINT64 },
- { "async_upgrade_sync", KSTAT_DATA_UINT64 },
- { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
- { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
-};
-
-#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
-
-#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val))
-
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
-#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
-
-#define ARCSTAT_MAX(stat, val) { \
- uint64_t m; \
- while ((val) > (m = arc_stats.stat.value.ui64) && \
- (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
- continue; \
-}
-
-#define ARCSTAT_MAXSTAT(stat) \
- ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
-
-/*
- * We define a macro to allow ARC hits/misses to be easily broken down by
- * two separate conditions, giving a total of four different subtypes for
- * each of hits and misses (so eight statistics total).
- */
-#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
- if (cond1) { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
- } \
- } else { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
- } \
- }
-
-kstat_t *arc_ksp;
-static arc_state_t *arc_anon;
-static arc_state_t *arc_mru;
-static arc_state_t *arc_mru_ghost;
-static arc_state_t *arc_mfu;
-static arc_state_t *arc_mfu_ghost;
-static arc_state_t *arc_l2c_only;
-
-/*
- * There are several ARC variables that are critical to export as kstats --
- * but we don't want to have to grovel around in the kstat whenever we wish to
- * manipulate them. For these variables, we therefore define them to be in
- * terms of the statistic variable. This assures that we are not introducing
- * the possibility of inconsistency by having shadow copies of the variables,
- * while still allowing the code to be readable.
- */
-#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
-#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
-#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
-#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
-#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
-#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
-#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
-#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */
-#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
-
-/* compressed size of entire arc */
-#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
-/* uncompressed size of entire arc */
-#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
-/* number of bytes in the arc from arc_buf_t's */
-#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
-
-/*
- * There are also some ARC variables that we want to export, but that are
- * updated so often that having the canonical representation be the statistic
- * variable causes a performance bottleneck. We want to use aggsum_t's for these
- * instead, but still be able to export the kstat in the same way as before.
- * The solution is to always use the aggsum version, except in the kstat update
- * callback.
- */
-aggsum_t arc_size;
-aggsum_t arc_meta_used;
-aggsum_t astat_data_size;
-aggsum_t astat_metadata_size;
-aggsum_t astat_hdr_size;
-aggsum_t astat_bonus_size;
-aggsum_t astat_dnode_size;
-aggsum_t astat_dbuf_size;
-aggsum_t astat_l2_hdr_size;
-
-static list_t arc_prune_list;
-static kmutex_t arc_prune_mtx;
-static taskq_t *arc_prune_taskq;
-
-static int arc_no_grow; /* Don't try to grow cache size */
-static hrtime_t arc_growtime;
-static uint64_t arc_tempreserve;
-static uint64_t arc_loaned_bytes;
-
-typedef struct arc_callback arc_callback_t;
-
-struct arc_callback {
- void *acb_private;
- arc_read_done_func_t *acb_done;
- arc_buf_t *acb_buf;
- boolean_t acb_compressed;
- zio_t *acb_zio_dummy;
- zio_t *acb_zio_head;
- arc_callback_t *acb_next;
-};
-
-typedef struct arc_write_callback arc_write_callback_t;
-
-struct arc_write_callback {
- void *awcb_private;
- arc_write_done_func_t *awcb_ready;
- arc_write_done_func_t *awcb_children_ready;
- arc_write_done_func_t *awcb_physdone;
- arc_write_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
-};
-
-/*
- * ARC buffers are separated into multiple structs as a memory saving measure:
- * - Common fields struct, always defined, and embedded within it:
- * - L2-only fields, always allocated but undefined when not in L2ARC
- * - L1-only fields, only allocated when in L1ARC
- *
- * Buffer in L1 Buffer only in L2
- * +------------------------+ +------------------------+
- * | arc_buf_hdr_t | | arc_buf_hdr_t |
- * | | | |
- * | | | |
- * | | | |
- * +------------------------+ +------------------------+
- * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
- * | (undefined if L1-only) | | |
- * +------------------------+ +------------------------+
- * | l1arc_buf_hdr_t |
- * | |
- * | |
- * | |
- * | |
- * +------------------------+
- *
- * Because it's possible for the L2ARC to become extremely large, we can wind
- * up eating a lot of memory in L2ARC buffer headers, so the size of a header
- * is minimized by only allocating the fields necessary for an L1-cached buffer
- * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
- * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
- * words in pointers. arc_hdr_realloc() is used to switch a header between
- * these two allocation states.
- */
-typedef struct l1arc_buf_hdr {
- kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
-#ifdef ZFS_DEBUG
- /*
- * Used for debugging with kmem_flags - by allocating and freeing
- * b_thawed when the buffer is thawed, we get a record of the stack
- * trace that thawed it.
- */
- void *b_thawed;
-#endif
-
- arc_buf_t *b_buf;
- uint32_t b_bufcnt;
- /* for waiting on writes to complete */
- kcondvar_t b_cv;
- uint8_t b_byteswap;
-
- /* protected by arc state mutex */
- arc_state_t *b_state;
- multilist_node_t b_arc_node;
-
- /* updated atomically */
- clock_t b_arc_access;
- uint32_t b_mru_hits;
- uint32_t b_mru_ghost_hits;
- uint32_t b_mfu_hits;
- uint32_t b_mfu_ghost_hits;
- uint32_t b_l2_hits;
-
- /* self protecting */
- zfs_refcount_t b_refcnt;
-
- arc_callback_t *b_acb;
- abd_t *b_pabd;
-} l1arc_buf_hdr_t;
-
-typedef struct l2arc_dev l2arc_dev_t;
-
-typedef struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
- uint32_t b_hits;
-
- list_node_t b_l2node;
-} l2arc_buf_hdr_t;
-
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
-
- arc_buf_contents_t b_type;
- arc_buf_hdr_t *b_hash_next;
- arc_flags_t b_flags;
-
- /*
- * This field stores the size of the data buffer after
- * compression, and is set in the arc's zio completion handlers.
- * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
- *
- * While the block pointers can store up to 32MB in their psize
- * field, we can only store up to 32MB minus 512B. This is due
- * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
- * a field of zeros represents 512B in the bp). We can't use a
- * bias of 1 since we need to reserve a psize of zero, here, to
- * represent holes and embedded blocks.
- *
- * This isn't a problem in practice, since the maximum size of a
- * buffer is limited to 16MB, so we never need to store 32MB in
- * this field. Even in the upstream illumos code base, the
- * maximum size of a buffer is limited to 16MB.
- */
- uint16_t b_psize;
-
- /*
- * This field stores the size of the data buffer before
- * compression, and cannot change once set. It is in units
- * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
- */
- uint16_t b_lsize; /* immutable */
- uint64_t b_spa; /* immutable */
-
- /* L2ARC fields. Undefined when not in L2ARC. */
- l2arc_buf_hdr_t b_l2hdr;
- /* L1ARC fields. Undefined when in l2arc_only state */
- l1arc_buf_hdr_t b_l1hdr;
-};
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-static int
-sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = arc_meta_limit;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val <= 0 || val > arc_c_max)
- return (EINVAL);
-
- arc_meta_limit = val;
-
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- mutex_exit(&arc_adjust_lock);
- zthr_wakeup(arc_adjust_zthr);
-
- return (0);
-}
-
-static int
-sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
-{
- uint32_t val;
- int err;
-
- val = arc_no_grow_shift;
- err = sysctl_handle_32(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val >= arc_shrink_shift)
- return (EINVAL);
-
- arc_no_grow_shift = val;
- return (0);
-}
-
-static int
-sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = zfs_arc_max;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (zfs_arc_max == 0) {
- /* Loader tunable so blindly set */
- zfs_arc_max = val;
- return (0);
- }
-
- if (val < arc_abs_min || val > kmem_size())
- return (EINVAL);
- if (val < arc_c_min)
- return (EINVAL);
- if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
- return (EINVAL);
-
- arc_c_max = val;
-
- arc_c = arc_c_max;
- arc_p = (arc_c >> 1);
-
- if (zfs_arc_meta_limit == 0) {
- /* limit meta-data to 1/4 of the arc capacity */
- arc_meta_limit = arc_c_max / 4;
- }
-
- /* if kmem_flags are set, lets try to use less memory */
- if (kmem_debugging())
- arc_c = arc_c / 2;
-
- zfs_arc_max = arc_c;
-
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- mutex_exit(&arc_adjust_lock);
- zthr_wakeup(arc_adjust_zthr);
-
- return (0);
-}
-
-static int
-sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = zfs_arc_min;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (zfs_arc_min == 0) {
- /* Loader tunable so blindly set */
- zfs_arc_min = val;
- return (0);
- }
-
- if (val < arc_abs_min || val > arc_c_max)
- return (EINVAL);
-
- arc_c_min = val;
-
- if (zfs_arc_meta_min == 0)
- arc_meta_min = arc_c_min / 2;
-
- if (arc_c < arc_c_min)
- arc_c = arc_c_min;
-
- zfs_arc_min = arc_c_min;
-
- return (0);
-}
-#endif
-
-#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
- (state) == arc_l2c_only)
-
-#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
-#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
-#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
-#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
-#define HDR_PRESCIENT_PREFETCH(hdr) \
- ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
-#define HDR_COMPRESSION_ENABLED(hdr) \
- ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
-
-#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
-#define HDR_L2_READING(hdr) \
- (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
- ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
-#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
-#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
-#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
-#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
-
-#define HDR_ISTYPE_METADATA(hdr) \
- ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
-#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
-
-#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
-#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
-
-/* For storing compression mode in b_flags */
-#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
-
-#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
- HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
-#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
- HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
-
-#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
-#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
-#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
-
-/*
- * Other sizes
- */
-
-#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
-
-/*
- * Hash table routines
- */
-
-#define HT_LOCK_PAD CACHE_LINE_SIZE
-
-struct ht_lock {
- kmutex_t ht_lock;
-#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-#define BUF_LOCKS 256
-typedef struct buf_hash_table {
- uint64_t ht_mask;
- arc_buf_hdr_t **ht_table;
- struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
-} buf_hash_table_t;
-
-static buf_hash_table_t buf_hash_table;
-
-#define BUF_HASH_INDEX(spa, dva, birth) \
- (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
-#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define HDR_LOCK(hdr) \
- (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
-
-uint64_t zfs_crc64_table[256];
-
-/*
- * Level 2 ARC
- */
-
-#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 2 /* num of writes */
-/*
- * If we discover during ARC scan any buffers to be compressed, we boost
- * our headroom for the next scanning cycle by this percentage multiple.
- */
-#define L2ARC_HEADROOM_BOOST 200
-#define L2ARC_FEED_SECS 1 /* caching interval secs */
-#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
-
-#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
-#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
-
-/* L2ARC Performance Tunables */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
-uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN,
- &l2arc_write_max, 0, "max write size");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN,
- &l2arc_write_boost, 0, "extra write during warmup");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN,
- &l2arc_headroom, 0, "number of dev writes");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN,
- &l2arc_feed_secs, 0, "interval seconds");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN,
- &l2arc_feed_min_ms, 0, "min interval milliseconds");
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN,
- &l2arc_noprefetch, 0, "don't cache prefetch bufs");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN,
- &l2arc_feed_again, 0, "turbo warmup");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN,
- &l2arc_norw, 0, "no reads during writes");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
- &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
- &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
- &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of anonymous state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
- &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
- &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
- &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mru state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mru ghost state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
- &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
- &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
- &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mfu state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mfu ghost state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
- &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
-
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
- &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
- &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
-
-/*
- * L2ARC Internals
- */
-struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- kmutex_t l2ad_mtx; /* lock for buffer list */
- list_t l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
- zfs_refcount_t l2ad_alloc; /* allocated bytes */
-};
-
-static list_t L2ARC_dev_list; /* device list */
-static list_t *l2arc_dev_list; /* device list pointer */
-static kmutex_t l2arc_dev_mtx; /* device list mutex */
-static l2arc_dev_t *l2arc_dev_last; /* last device used */
-static list_t L2ARC_free_on_write; /* free after write buf list */
-static list_t *l2arc_free_on_write; /* free after write list ptr */
-static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
-static uint64_t l2arc_ndev; /* number of devices */
-
-typedef struct l2arc_read_callback {
- arc_buf_hdr_t *l2rcb_hdr; /* read header */
- blkptr_t l2rcb_bp; /* original blkptr */
- zbookmark_phys_t l2rcb_zb; /* original bookmark */
- int l2rcb_flags; /* original flags */
- abd_t *l2rcb_abd; /* temporary buffer */
-} l2arc_read_callback_t;
-
-typedef struct l2arc_write_callback {
- l2arc_dev_t *l2wcb_dev; /* device info */
- arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
-} l2arc_write_callback_t;
-
-typedef struct l2arc_data_free {
- /* protected by l2arc_free_on_write_mtx */
- abd_t *l2df_abd;
- size_t l2df_size;
- arc_buf_contents_t l2df_type;
- list_node_t l2df_list_node;
-} l2arc_data_free_t;
-
-static kmutex_t l2arc_feed_thr_lock;
-static kcondvar_t l2arc_feed_thr_cv;
-static uint8_t l2arc_thread_exit;
-
-static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
-static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
-static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
-static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
-static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
-static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
-static void arc_hdr_free_pabd(arc_buf_hdr_t *);
-static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t);
-static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static boolean_t arc_is_overflowing();
-static void arc_buf_watch(arc_buf_t *);
-static void arc_prune_async(int64_t);
-
-static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
-static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
-static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
-static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
-
-static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
-static void l2arc_read_done(zio_t *);
-
-static void
-l2arc_trim(const arc_buf_hdr_t *hdr)
-{
- l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
-
- ASSERT(HDR_HAS_L2HDR(hdr));
- ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
-
- if (HDR_GET_PSIZE(hdr) != 0) {
- trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
- HDR_GET_PSIZE(hdr), 0);
- }
-}
-
-/*
- * We use Cityhash for this. It's fast, and has good hash properties without
- * requiring any large static buffers.
- */
-static uint64_t
-buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
-{
- return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
-}
-
-#define HDR_EMPTY(hdr) \
- ((hdr)->b_dva.dva_word[0] == 0 && \
- (hdr)->b_dva.dva_word[1] == 0)
-
-#define HDR_EQUAL(spa, dva, birth, hdr) \
- ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
- ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
- ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
-
-static void
-buf_discard_identity(arc_buf_hdr_t *hdr)
-{
- hdr->b_dva.dva_word[0] = 0;
- hdr->b_dva.dva_word[1] = 0;
- hdr->b_birth = 0;
-}
-
-static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
-{
- const dva_t *dva = BP_IDENTITY(bp);
- uint64_t birth = BP_PHYSICAL_BIRTH(bp);
- uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
- kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *hdr;
-
- mutex_enter(hash_lock);
- for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
- hdr = hdr->b_hash_next) {
- if (HDR_EQUAL(spa, dva, birth, hdr)) {
- *lockp = hash_lock;
- return (hdr);
- }
- }
- mutex_exit(hash_lock);
- *lockp = NULL;
- return (NULL);
-}
-
-/*
- * Insert an entry into the hash table. If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- * If lockp == NULL, the caller is assumed to already hold the hash lock.
- */
-static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
-{
- uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
- kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *fhdr;
- uint32_t i;
-
- ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
- ASSERT(hdr->b_birth != 0);
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
-
- if (lockp != NULL) {
- *lockp = hash_lock;
- mutex_enter(hash_lock);
- } else {
- ASSERT(MUTEX_HELD(hash_lock));
- }
-
- for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
- fhdr = fhdr->b_hash_next, i++) {
- if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
- return (fhdr);
- }
-
- hdr->b_hash_next = buf_hash_table.ht_table[idx];
- buf_hash_table.ht_table[idx] = hdr;
- arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
-
- /* collect some hash table performance data */
- if (i > 0) {
- ARCSTAT_BUMP(arcstat_hash_collisions);
- if (i == 1)
- ARCSTAT_BUMP(arcstat_hash_chains);
-
- ARCSTAT_MAX(arcstat_hash_chain_max, i);
- }
-
- ARCSTAT_BUMP(arcstat_hash_elements);
- ARCSTAT_MAXSTAT(arcstat_hash_elements);
-
- return (NULL);
-}
-
-static void
-buf_hash_remove(arc_buf_hdr_t *hdr)
-{
- arc_buf_hdr_t *fhdr, **hdrp;
- uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
-
- ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
- ASSERT(HDR_IN_HASH_TABLE(hdr));
-
- hdrp = &buf_hash_table.ht_table[idx];
- while ((fhdr = *hdrp) != hdr) {
- ASSERT3P(fhdr, !=, NULL);
- hdrp = &fhdr->b_hash_next;
- }
- *hdrp = hdr->b_hash_next;
- hdr->b_hash_next = NULL;
- arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
-
- /* collect some hash table performance data */
- ARCSTAT_BUMPDOWN(arcstat_hash_elements);
-
- if (buf_hash_table.ht_table[idx] &&
- buf_hash_table.ht_table[idx]->b_hash_next == NULL)
- ARCSTAT_BUMPDOWN(arcstat_hash_chains);
-}
-
-/*
- * Global data structures and functions for the buf kmem cache.
- */
-static kmem_cache_t *hdr_full_cache;
-static kmem_cache_t *hdr_l2only_cache;
-static kmem_cache_t *buf_cache;
-
-static void
-buf_fini(void)
-{
- int i;
-
- kmem_free(buf_hash_table.ht_table,
- (buf_hash_table.ht_mask + 1) * sizeof (void *));
- for (i = 0; i < BUF_LOCKS; i++)
- mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
- kmem_cache_destroy(hdr_full_cache);
- kmem_cache_destroy(hdr_l2only_cache);
- kmem_cache_destroy(buf_cache);
-}
-
-/*
- * Constructor callback - called when the cache is empty
- * and a new buf is requested.
- */
-/* ARGSUSED */
-static int
-hdr_full_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- bzero(hdr, HDR_FULL_SIZE);
- cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
- zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
- mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- multilist_link_init(&hdr->b_l1hdr.b_arc_node);
- arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- bzero(hdr, HDR_L2ONLY_SIZE);
- arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-buf_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_t *buf = vbuf;
-
- bzero(buf, sizeof (arc_buf_t));
- mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
-
- return (0);
-}
-
-/*
- * Destructor callback - called when a cached buf is
- * no longer required.
- */
-/* ARGSUSED */
-static void
-hdr_full_dest(void *vbuf, void *unused)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- ASSERT(HDR_EMPTY(hdr));
- cv_destroy(&hdr->b_l1hdr.b_cv);
- zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
- mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
-}
-
-/* ARGSUSED */
-static void
-hdr_l2only_dest(void *vbuf, void *unused)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- ASSERT(HDR_EMPTY(hdr));
- arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
-}
-
-/* ARGSUSED */
-static void
-buf_dest(void *vbuf, void *unused)
-{
- arc_buf_t *buf = vbuf;
-
- mutex_destroy(&buf->b_evict_lock);
- arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
-}
-
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
- dprintf("hdr_recl called\n");
- /*
- * umem calls the reclaim func when we destroy the buf cache,
- * which is after we do arc_fini().
- */
- if (arc_initialized)
- zthr_wakeup(arc_reap_zthr);
-}
-
-static void
-buf_init(void)
-{
- uint64_t *ct;
- uint64_t hsize = 1ULL << 12;
- int i, j;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average block size of zfs_arc_average_blocksize (default 8K).
- * By default, the table will take up
- * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
- */
- while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
- hsize <<= 1;
-retry:
- buf_hash_table.ht_mask = hsize - 1;
- buf_hash_table.ht_table =
- kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
- if (buf_hash_table.ht_table == NULL) {
- ASSERT(hsize > (1ULL << 8));
- hsize >>= 1;
- goto retry;
- }
-
- hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
- 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
- hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
- HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
- NULL, NULL, 0);
- buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
- 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
-
- for (i = 0; i < 256; i++)
- for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
- *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
-
- for (i = 0; i < BUF_LOCKS; i++) {
- mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
-}
-
-/*
- * This is the size that the buf occupies in memory. If the buf is compressed,
- * it will correspond to the compressed size. You should use this method of
- * getting the buf size unless you explicitly need the logical size.
- */
-int32_t
-arc_buf_size(arc_buf_t *buf)
-{
- return (ARC_BUF_COMPRESSED(buf) ?
- HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
-}
-
-int32_t
-arc_buf_lsize(arc_buf_t *buf)
-{
- return (HDR_GET_LSIZE(buf->b_hdr));
-}
-
-enum zio_compress
-arc_get_compression(arc_buf_t *buf)
-{
- return (ARC_BUF_COMPRESSED(buf) ?
- HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
-}
-
-#define ARC_MINTIME (hz>>4) /* 62 ms */
-
-static inline boolean_t
-arc_buf_is_shared(arc_buf_t *buf)
-{
- boolean_t shared = (buf->b_data != NULL &&
- buf->b_hdr->b_l1hdr.b_pabd != NULL &&
- abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
- buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
- IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
- IMPLY(shared, ARC_BUF_SHARED(buf));
- IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
-
- /*
- * It would be nice to assert arc_can_share() too, but the "hdr isn't
- * already being shared" requirement prevents us from doing that.
- */
-
- return (shared);
-}
-
-/*
- * Free the checksum associated with this header. If there is no checksum, this
- * is a no-op.
- */
-static inline void
-arc_cksum_free(arc_buf_hdr_t *hdr)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
- if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
- kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
- hdr->b_l1hdr.b_freeze_cksum = NULL;
- }
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-}
-
-/*
- * Return true iff at least one of the bufs on hdr is not compressed.
- */
-static boolean_t
-arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
-{
- for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
- if (!ARC_BUF_COMPRESSED(b)) {
- return (B_TRUE);
- }
- }
- return (B_FALSE);
-}
-
-/*
- * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
- * matches the checksum that is stored in the hdr. If there is no checksum,
- * or if the buf is compressed, this is a no-op.
- */
-static void
-arc_cksum_verify(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- zio_cksum_t zc;
-
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- arc_hdr_has_uncompressed_buf(hdr));
- return;
- }
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
- if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
- return;
- }
-
- fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
- if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
- panic("buffer modified while frozen!");
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-}
-
-static boolean_t
-arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
-{
- enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
- boolean_t valid_cksum;
-
- ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
- VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
-
- /*
- * We rely on the blkptr's checksum to determine if the block
- * is valid or not. When compressed arc is enabled, the l2arc
- * writes the block to the l2arc just as it appears in the pool.
- * This allows us to use the blkptr's checksum to validate the
- * data that we just read off of the l2arc without having to store
- * a separate checksum in the arc_buf_hdr_t. However, if compressed
- * arc is disabled, then the data written to the l2arc is always
- * uncompressed and won't match the block as it exists in the main
- * pool. When this is the case, we must first compress it if it is
- * compressed on the main pool before we can validate the checksum.
- */
- if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
- uint64_t csize;
-
- abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
- csize = zio_compress_data(compress, zio->io_abd,
- abd_to_buf(cdata), lsize);
-
- ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
- if (csize < HDR_GET_PSIZE(hdr)) {
- /*
- * Compressed blocks are always a multiple of the
- * smallest ashift in the pool. Ideally, we would
- * like to round up the csize to the next
- * spa_min_ashift but that value may have changed
- * since the block was last written. Instead,
- * we rely on the fact that the hdr's psize
- * was set to the psize of the block when it was
- * last written. We set the csize to that value
- * and zero out any part that should not contain
- * data.
- */
- abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
- csize = HDR_GET_PSIZE(hdr);
- }
- zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
- }
-
- /*
- * Block pointers always store the checksum for the logical data.
- * If the block pointer has the gang bit set, then the checksum
- * it represents is for the reconstituted data and not for an
- * individual gang member. The zio pipeline, however, must be able to
- * determine the checksum of each of the gang constituents so it
- * treats the checksum comparison differently than what we need
- * for l2arc blocks. This prevents us from using the
- * zio_checksum_error() interface directly. Instead we must call the
- * zio_checksum_error_impl() so that we can ensure the checksum is
- * generated using the correct checksum algorithm and accounts for the
- * logical I/O size and not just a gang fragment.
- */
- valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
- BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
- zio->io_offset, NULL) == 0);
- zio_pop_transforms(zio);
- return (valid_cksum);
-}
-
-/*
- * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
- * checksum and attaches it to the buf's hdr so that we can ensure that the buf
- * isn't modified later on. If buf is compressed or there is already a checksum
- * on the hdr, this is a no-op (we only checksum uncompressed bufs).
- */
-static void
-arc_cksum_compute(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
- if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
- ASSERT(arc_hdr_has_uncompressed_buf(hdr));
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
- return;
- } else if (ARC_BUF_COMPRESSED(buf)) {
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
- return;
- }
-
- ASSERT(!ARC_BUF_COMPRESSED(buf));
- hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
- KM_SLEEP);
- fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
- hdr->b_l1hdr.b_freeze_cksum);
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-#ifdef illumos
- arc_buf_watch(buf);
-#endif
-}
-
-#ifdef illumos
-#ifndef _KERNEL
-typedef struct procctl {
- long cmd;
- prwatch_t prwatch;
-} procctl_t;
-#endif
-
-/* ARGSUSED */
-static void
-arc_buf_unwatch(arc_buf_t *buf)
-{
-#ifndef _KERNEL
- if (arc_watch) {
- int result;
- procctl_t ctl;
- ctl.cmd = PCWATCH;
- ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = 0;
- ctl.prwatch.pr_wflags = 0;
- result = write(arc_procfd, &ctl, sizeof (ctl));
- ASSERT3U(result, ==, sizeof (ctl));
- }
-#endif
-}
-
-/* ARGSUSED */
-static void
-arc_buf_watch(arc_buf_t *buf)
-{
-#ifndef _KERNEL
- if (arc_watch) {
- int result;
- procctl_t ctl;
- ctl.cmd = PCWATCH;
- ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = arc_buf_size(buf);
- ctl.prwatch.pr_wflags = WA_WRITE;
- result = write(arc_procfd, &ctl, sizeof (ctl));
- ASSERT3U(result, ==, sizeof (ctl));
- }
-#endif
-}
-#endif /* illumos */
-
-static arc_buf_contents_t
-arc_buf_type(arc_buf_hdr_t *hdr)
-{
- arc_buf_contents_t type;
- if (HDR_ISTYPE_METADATA(hdr)) {
- type = ARC_BUFC_METADATA;
- } else {
- type = ARC_BUFC_DATA;
- }
- VERIFY3U(hdr->b_type, ==, type);
- return (type);
-}
-
-boolean_t
-arc_is_metadata(arc_buf_t *buf)
-{
- return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
-}
-
-static uint32_t
-arc_bufc_to_flags(arc_buf_contents_t type)
-{
- switch (type) {
- case ARC_BUFC_DATA:
- /* metadata field is 0 if buffer contains normal data */
- return (0);
- case ARC_BUFC_METADATA:
- return (ARC_FLAG_BUFC_METADATA);
- default:
- break;
- }
- panic("undefined ARC buffer type!");
- return ((uint32_t)-1);
-}
-
-void
-arc_buf_thaw(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-
- arc_cksum_verify(buf);
-
- /*
- * Compressed buffers do not manipulate the b_freeze_cksum or
- * allocate b_thawed.
- */
- if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- arc_hdr_has_uncompressed_buf(hdr));
- return;
- }
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- arc_cksum_free(hdr);
-
- mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
-#ifdef ZFS_DEBUG
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (hdr->b_l1hdr.b_thawed != NULL)
- kmem_free(hdr->b_l1hdr.b_thawed, 1);
- hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
- }
-#endif
-
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
-}
-
-void
-arc_buf_freeze(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock;
-
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- arc_hdr_has_uncompressed_buf(hdr));
- return;
- }
-
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
- hdr->b_l1hdr.b_state == arc_anon);
- arc_cksum_compute(buf);
- mutex_exit(hash_lock);
-}
-
-/*
- * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
- * the following functions should be used to ensure that the flags are
- * updated in a thread-safe way. When manipulating the flags either
- * the hash_lock must be held or the hdr must be undiscoverable. This
- * ensures that we're not racing with any other threads when updating
- * the flags.
- */
-static inline void
-arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
-{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
- hdr->b_flags |= flags;
-}
-
-static inline void
-arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
-{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
- hdr->b_flags &= ~flags;
-}
-
-/*
- * Setting the compression bits in the arc_buf_hdr_t's b_flags is
- * done in a special way since we have to clear and set bits
- * at the same time. Consumers that wish to set the compression bits
- * must use this function to ensure that the flags are updated in
- * thread-safe manner.
- */
-static void
-arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
-{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * Holes and embedded blocks will always have a psize = 0 so
- * we ignore the compression of the blkptr and set the
- * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
- * Holes and embedded blocks remain anonymous so we don't
- * want to uncompress them. Mark them as uncompressed.
- */
- if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
- arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
- HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
- ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- } else {
- arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
- HDR_SET_COMPRESS(hdr, cmp);
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
- ASSERT(HDR_COMPRESSION_ENABLED(hdr));
- }
-}
-
-/*
- * Looks for another buf on the same hdr which has the data decompressed, copies
- * from it, and returns true. If no such buf exists, returns false.
- */
-static boolean_t
-arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- boolean_t copied = B_FALSE;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3P(buf->b_data, !=, NULL);
- ASSERT(!ARC_BUF_COMPRESSED(buf));
-
- for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
- from = from->b_next) {
- /* can't use our own data buffer */
- if (from == buf) {
- continue;
- }
-
- if (!ARC_BUF_COMPRESSED(from)) {
- bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
- copied = B_TRUE;
- break;
- }
- }
-
- /*
- * There were no decompressed bufs, so there should not be a
- * checksum on the hdr either.
- */
- EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
-
- return (copied);
-}
-
-/*
- * Given a buf that has a data buffer attached to it, this function will
- * efficiently fill the buf with data of the specified compression setting from
- * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
- * are already sharing a data buf, no copy is performed.
- *
- * If the buf is marked as compressed but uncompressed data was requested, this
- * will allocate a new data buffer for the buf, remove that flag, and fill the
- * buf with uncompressed data. You can't request a compressed buf on a hdr with
- * uncompressed data, and (since we haven't added support for it yet) if you
- * want compressed data your buf must already be marked as compressed and have
- * the correct-sized data buffer.
- */
-static int
-arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
- dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
-
- ASSERT3P(buf->b_data, !=, NULL);
- IMPLY(compressed, hdr_compressed);
- IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
-
- if (hdr_compressed == compressed) {
- if (!arc_buf_is_shared(buf)) {
- abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
- arc_buf_size(buf));
- }
- } else {
- ASSERT(hdr_compressed);
- ASSERT(!compressed);
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
-
- /*
- * If the buf is sharing its data with the hdr, unlink it and
- * allocate a new data buffer for the buf.
- */
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_COMPRESSED(buf));
-
- /* We need to give the buf it's own b_data */
- buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
- buf->b_data =
- arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
-
- /* Previously overhead was 0; just add new overhead */
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- } else if (ARC_BUF_COMPRESSED(buf)) {
- /* We need to reallocate the buf's b_data */
- arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
- buf);
- buf->b_data =
- arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
-
- /* We increased the size of b_data; update overhead */
- ARCSTAT_INCR(arcstat_overhead_size,
- HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
- }
-
- /*
- * Regardless of the buf's previous compression settings, it
- * should not be compressed at the end of this function.
- */
- buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
-
- /*
- * Try copying the data from another buf which already has a
- * decompressed version. If that's not possible, it's time to
- * bite the bullet and decompress the data from the hdr.
- */
- if (arc_buf_try_copy_decompressed_data(buf)) {
- /* Skip byteswapping and checksumming (already done) */
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
- return (0);
- } else {
- int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pabd, buf->b_data,
- HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
-
- /*
- * Absent hardware errors or software bugs, this should
- * be impossible, but log it anyway so we can debug it.
- */
- if (error != 0) {
- zfs_dbgmsg(
- "hdr %p, compress %d, psize %d, lsize %d",
- hdr, HDR_GET_COMPRESS(hdr),
- HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
- return (SET_ERROR(EIO));
- }
- }
- }
-
- /* Byteswap the buf's data if necessary */
- if (bswap != DMU_BSWAP_NUMFUNCS) {
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
- dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
- }
-
- /* Compute the hdr's checksum if necessary */
- arc_cksum_compute(buf);
-
- return (0);
-}
-
-int
-arc_decompress(arc_buf_t *buf)
-{
- return (arc_buf_fill(buf, B_FALSE));
-}
-
-/*
- * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
- */
-static uint64_t
-arc_hdr_size(arc_buf_hdr_t *hdr)
-{
- uint64_t size;
-
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
- HDR_GET_PSIZE(hdr) > 0) {
- size = HDR_GET_PSIZE(hdr);
- } else {
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
- size = HDR_GET_LSIZE(hdr);
- }
- return (size);
-}
-
-/*
- * Increment the amount of evictable space in the arc_state_t's refcount.
- * We account for the space used by the hdr and the arc buf individually
- * so that we can add and remove them from the refcount individually.
- */
-static void
-arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (GHOST_STATE(state)) {
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- HDR_GET_LSIZE(hdr), hdr);
- return;
- }
-
- ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pabd != NULL) {
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- arc_hdr_size(hdr), hdr);
- }
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- if (arc_buf_is_shared(buf))
- continue;
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- arc_buf_size(buf), buf);
- }
-}
-
-/*
- * Decrement the amount of evictable space in the arc_state_t's refcount.
- * We account for the space used by the hdr and the arc buf individually
- * so that we can add and remove them from the refcount individually.
- */
-static void
-arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (GHOST_STATE(state)) {
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- HDR_GET_LSIZE(hdr), hdr);
- return;
- }
-
- ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pabd != NULL) {
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- arc_hdr_size(hdr), hdr);
- }
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- if (arc_buf_is_shared(buf))
- continue;
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- arc_buf_size(buf), buf);
- }
-}
-
-/*
- * Add a reference to this hdr indicating that someone is actively
- * referencing that memory. When the refcount transitions from 0 to 1,
- * we remove it from the respective arc_state_t list to indicate that
- * it is not evictable.
- */
-static void
-add_reference(arc_buf_hdr_t *hdr, void *tag)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- if (!MUTEX_HELD(HDR_LOCK(hdr))) {
- ASSERT(hdr->b_l1hdr.b_state == arc_anon);
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- }
-
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
- (state != arc_anon)) {
- /* We don't use the L2-only state list. */
- if (state != arc_l2c_only) {
- multilist_remove(state->arcs_list[arc_buf_type(hdr)],
- hdr);
- arc_evictable_space_decrement(hdr, state);
- }
- /* remove the prefetch flag if we get a reference */
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
- }
-}
-
-/*
- * Remove a reference from this hdr. When the reference transitions from
- * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
- * list making it eligible for eviction.
- */
-static int
-remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
-{
- int cnt;
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
- ASSERT(!GHOST_STATE(state));
-
- /*
- * arc_l2c_only counts as a ghost state so we don't need to explicitly
- * check to prevent usage of the arc_l2c_only list.
- */
- if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
- (state != arc_anon)) {
- multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
- arc_evictable_space_increment(hdr, state);
- }
- return (cnt);
-}
-
-/*
- * Returns detailed information about a specific arc buffer. When the
- * state_index argument is set the function will calculate the arc header
- * list position for its arc state. Since this requires a linear traversal
- * callers are strongly encourage not to do this. However, it can be helpful
- * for targeted analysis so the functionality is provided.
- */
-void
-arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
-{
- arc_buf_hdr_t *hdr = ab->b_hdr;
- l1arc_buf_hdr_t *l1hdr = NULL;
- l2arc_buf_hdr_t *l2hdr = NULL;
- arc_state_t *state = NULL;
-
- memset(abi, 0, sizeof (arc_buf_info_t));
-
- if (hdr == NULL)
- return;
-
- abi->abi_flags = hdr->b_flags;
-
- if (HDR_HAS_L1HDR(hdr)) {
- l1hdr = &hdr->b_l1hdr;
- state = l1hdr->b_state;
- }
- if (HDR_HAS_L2HDR(hdr))
- l2hdr = &hdr->b_l2hdr;
-
- if (l1hdr) {
- abi->abi_bufcnt = l1hdr->b_bufcnt;
- abi->abi_access = l1hdr->b_arc_access;
- abi->abi_mru_hits = l1hdr->b_mru_hits;
- abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
- abi->abi_mfu_hits = l1hdr->b_mfu_hits;
- abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
- abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
- }
-
- if (l2hdr) {
- abi->abi_l2arc_dattr = l2hdr->b_daddr;
- abi->abi_l2arc_hits = l2hdr->b_hits;
- }
-
- abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
- abi->abi_state_contents = arc_buf_type(hdr);
- abi->abi_size = arc_hdr_size(hdr);
-}
-
-/*
- * Move the supplied buffer to the indicated state. The hash lock
- * for the buffer must be held by the caller.
- */
-static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
- kmutex_t *hash_lock)
-{
- arc_state_t *old_state;
- int64_t refcnt;
- uint32_t bufcnt;
- boolean_t update_old, update_new;
- arc_buf_contents_t buftype = arc_buf_type(hdr);
-
- /*
- * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
- * in arc_read() when bringing a buffer out of the L2ARC. However, the
- * L1 hdr doesn't always exist when we change state to arc_anon before
- * destroying a header, in which case reallocating to add the L1 hdr is
- * pointless.
- */
- if (HDR_HAS_L1HDR(hdr)) {
- old_state = hdr->b_l1hdr.b_state;
- refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
- bufcnt = hdr->b_l1hdr.b_bufcnt;
- update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
- } else {
- old_state = arc_l2c_only;
- refcnt = 0;
- bufcnt = 0;
- update_old = B_FALSE;
- }
- update_new = update_old;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT3P(new_state, !=, old_state);
- ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
- ASSERT(old_state != arc_anon || bufcnt <= 1);
-
- /*
- * If this buffer is evictable, transfer it from the
- * old state list to the new state list.
- */
- if (refcnt == 0) {
- if (old_state != arc_anon && old_state != arc_l2c_only) {
- ASSERT(HDR_HAS_L1HDR(hdr));
- multilist_remove(old_state->arcs_list[buftype], hdr);
-
- if (GHOST_STATE(old_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- update_old = B_TRUE;
- }
- arc_evictable_space_decrement(hdr, old_state);
- }
- if (new_state != arc_anon && new_state != arc_l2c_only) {
-
- /*
- * An L1 header always exists here, since if we're
- * moving to some L1-cached state (i.e. not l2c_only or
- * anonymous), we realloc the header to add an L1hdr
- * beforehand.
- */
- ASSERT(HDR_HAS_L1HDR(hdr));
- multilist_insert(new_state->arcs_list[buftype], hdr);
-
- if (GHOST_STATE(new_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- update_new = B_TRUE;
- }
- arc_evictable_space_increment(hdr, new_state);
- }
- }
-
- ASSERT(!HDR_EMPTY(hdr));
- if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
- buf_hash_remove(hdr);
-
- /* adjust state sizes (ignore arc_l2c_only) */
-
- if (update_new && new_state != arc_l2c_only) {
- ASSERT(HDR_HAS_L1HDR(hdr));
- if (GHOST_STATE(new_state)) {
- ASSERT0(bufcnt);
-
- /*
- * When moving a header to a ghost state, we first
- * remove all arc buffers. Thus, we'll have a
- * bufcnt of zero, and no arc buffer to use for
- * the reference. As a result, we use the arc
- * header pointer for the reference.
- */
- (void) zfs_refcount_add_many(&new_state->arcs_size,
- HDR_GET_LSIZE(hdr), hdr);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- } else {
- uint32_t buffers = 0;
-
- /*
- * Each individual buffer holds a unique reference,
- * thus we must remove each of these references one
- * at a time.
- */
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- ASSERT3U(bufcnt, !=, 0);
- buffers++;
-
- /*
- * When the arc_buf_t is sharing the data
- * block with the hdr, the owner of the
- * reference belongs to the hdr. Only
- * add to the refcount if the arc_buf_t is
- * not shared.
- */
- if (arc_buf_is_shared(buf))
- continue;
-
- (void) zfs_refcount_add_many(
- &new_state->arcs_size,
- arc_buf_size(buf), buf);
- }
- ASSERT3U(bufcnt, ==, buffers);
-
- if (hdr->b_l1hdr.b_pabd != NULL) {
- (void) zfs_refcount_add_many(
- &new_state->arcs_size,
- arc_hdr_size(hdr), hdr);
- } else {
- ASSERT(GHOST_STATE(old_state));
- }
- }
- }
-
- if (update_old && old_state != arc_l2c_only) {
- ASSERT(HDR_HAS_L1HDR(hdr));
- if (GHOST_STATE(old_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
-
- /*
- * When moving a header off of a ghost state,
- * the header will not contain any arc buffers.
- * We use the arc header pointer for the reference
- * which is exactly what we did when we put the
- * header on the ghost state.
- */
-
- (void) zfs_refcount_remove_many(&old_state->arcs_size,
- HDR_GET_LSIZE(hdr), hdr);
- } else {
- uint32_t buffers = 0;
-
- /*
- * Each individual buffer holds a unique reference,
- * thus we must remove each of these references one
- * at a time.
- */
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- ASSERT3U(bufcnt, !=, 0);
- buffers++;
-
- /*
- * When the arc_buf_t is sharing the data
- * block with the hdr, the owner of the
- * reference belongs to the hdr. Only
- * add to the refcount if the arc_buf_t is
- * not shared.
- */
- if (arc_buf_is_shared(buf))
- continue;
-
- (void) zfs_refcount_remove_many(
- &old_state->arcs_size, arc_buf_size(buf),
- buf);
- }
- ASSERT3U(bufcnt, ==, buffers);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- (void) zfs_refcount_remove_many(
- &old_state->arcs_size, arc_hdr_size(hdr), hdr);
- }
- }
-
- if (HDR_HAS_L1HDR(hdr))
- hdr->b_l1hdr.b_state = new_state;
-
- /*
- * L2 headers should never be on the L2 state list since they don't
- * have L1 headers allocated.
- */
- ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
- multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
-}
-
-void
-arc_space_consume(uint64_t space, arc_space_type_t type)
-{
- ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
-
- switch (type) {
- case ARC_SPACE_DATA:
- aggsum_add(&astat_data_size, space);
- break;
- case ARC_SPACE_META:
- aggsum_add(&astat_metadata_size, space);
- break;
- case ARC_SPACE_BONUS:
- aggsum_add(&astat_bonus_size, space);
- break;
- case ARC_SPACE_DNODE:
- aggsum_add(&astat_dnode_size, space);
- break;
- case ARC_SPACE_DBUF:
- aggsum_add(&astat_dbuf_size, space);
- break;
- case ARC_SPACE_HDRS:
- aggsum_add(&astat_hdr_size, space);
- break;
- case ARC_SPACE_L2HDRS:
- aggsum_add(&astat_l2_hdr_size, space);
- break;
- }
-
- if (type != ARC_SPACE_DATA)
- aggsum_add(&arc_meta_used, space);
-
- aggsum_add(&arc_size, space);
-}
-
-void
-arc_space_return(uint64_t space, arc_space_type_t type)
-{
- ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
-
- switch (type) {
- case ARC_SPACE_DATA:
- aggsum_add(&astat_data_size, -space);
- break;
- case ARC_SPACE_META:
- aggsum_add(&astat_metadata_size, -space);
- break;
- case ARC_SPACE_BONUS:
- aggsum_add(&astat_bonus_size, -space);
- break;
- case ARC_SPACE_DNODE:
- aggsum_add(&astat_dnode_size, -space);
- break;
- case ARC_SPACE_DBUF:
- aggsum_add(&astat_dbuf_size, -space);
- break;
- case ARC_SPACE_HDRS:
- aggsum_add(&astat_hdr_size, -space);
- break;
- case ARC_SPACE_L2HDRS:
- aggsum_add(&astat_l2_hdr_size, -space);
- break;
- }
-
- if (type != ARC_SPACE_DATA) {
- ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
- /*
- * We use the upper bound here rather than the precise value
- * because the arc_meta_max value doesn't need to be
- * precise. It's only consumed by humans via arcstats.
- */
- if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
- arc_meta_max = aggsum_upper_bound(&arc_meta_used);
- aggsum_add(&arc_meta_used, -space);
- }
-
- ASSERT(aggsum_compare(&arc_size, space) >= 0);
- aggsum_add(&arc_size, -space);
-}
-
-/*
- * Given a hdr and a buf, returns whether that buf can share its b_data buffer
- * with the hdr's b_pabd.
- */
-static boolean_t
-arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- /*
- * The criteria for sharing a hdr's data are:
- * 1. the hdr's compression matches the buf's compression
- * 2. the hdr doesn't need to be byteswapped
- * 3. the hdr isn't already being shared
- * 4. the buf is either compressed or it is the last buf in the hdr list
- *
- * Criterion #4 maintains the invariant that shared uncompressed
- * bufs must be the final buf in the hdr's b_buf list. Reading this, you
- * might ask, "if a compressed buf is allocated first, won't that be the
- * last thing in the list?", but in that case it's impossible to create
- * a shared uncompressed buf anyway (because the hdr must be compressed
- * to have the compressed buf). You might also think that #3 is
- * sufficient to make this guarantee, however it's possible
- * (specifically in the rare L2ARC write race mentioned in
- * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
- * is sharable, but wasn't at the time of its allocation. Rather than
- * allow a new shared uncompressed buf to be created and then shuffle
- * the list around to make it the last element, this simply disallows
- * sharing if the new buf isn't the first to be added.
- */
- ASSERT3P(buf->b_hdr, ==, hdr);
- boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
- boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
- return (buf_compressed == hdr_compressed &&
- hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
- !HDR_SHARED_DATA(hdr) &&
- (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
-}
-
-/*
- * Allocate a buf for this hdr. If you care about the data that's in the hdr,
- * or if you want a compressed buffer, pass those flags in. Returns 0 if the
- * copy was made successfully, or an error code otherwise.
- */
-static int
-arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
- boolean_t fill, arc_buf_t **ret)
-{
- arc_buf_t *buf;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
- VERIFY(hdr->b_type == ARC_BUFC_DATA ||
- hdr->b_type == ARC_BUFC_METADATA);
- ASSERT3P(ret, !=, NULL);
- ASSERT3P(*ret, ==, NULL);
-
- buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_next = hdr->b_l1hdr.b_buf;
- buf->b_flags = 0;
-
- add_reference(hdr, tag);
-
- /*
- * We're about to change the hdr's b_flags. We must either
- * hold the hash_lock or be undiscoverable.
- */
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * Only honor requests for compressed bufs if the hdr is actually
- * compressed.
- */
- if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
-
- /*
- * If the hdr's data can be shared then we share the data buffer and
- * set the appropriate bit in the hdr's b_flags to indicate the hdr is
- * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
- * buffer to store the buf's data.
- *
- * There are two additional restrictions here because we're sharing
- * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
- * actively involved in an L2ARC write, because if this buf is used by
- * an arc_write() then the hdr's data buffer will be released when the
- * write completes, even though the L2ARC write might still be using it.
- * Second, the hdr's ABD must be linear so that the buf's user doesn't
- * need to be ABD-aware.
- */
- boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
- abd_is_linear(hdr->b_l1hdr.b_pabd);
-
- /* Set up b_data and sharing */
- if (can_share) {
- buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
- buf->b_flags |= ARC_BUF_FLAG_SHARED;
- arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
- } else {
- buf->b_data =
- arc_get_data_buf(hdr, arc_buf_size(buf), buf);
- ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
- }
- VERIFY3P(buf->b_data, !=, NULL);
-
- hdr->b_l1hdr.b_buf = buf;
- hdr->b_l1hdr.b_bufcnt += 1;
-
- /*
- * If the user wants the data from the hdr, we need to either copy or
- * decompress the data.
- */
- if (fill) {
- return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
- }
-
- return (0);
-}
-
-static char *arc_onloan_tag = "onloan";
-
-static inline void
-arc_loaned_bytes_update(int64_t delta)
-{
- atomic_add_64(&arc_loaned_bytes, delta);
-
- /* assert that it did not wrap around */
- ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
-}
-
-/*
- * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
- * flight data by arc_tempreserve_space() until they are "returned". Loaned
- * buffers must be returned to the arc before they can be used by the DMU or
- * freed.
- */
-arc_buf_t *
-arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
-{
- arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
- is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
-
- arc_loaned_bytes_update(arc_buf_size(buf));
-
- return (buf);
-}
-
-arc_buf_t *
-arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
- enum zio_compress compression_type)
-{
- arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
- psize, lsize, compression_type);
-
- arc_loaned_bytes_update(arc_buf_size(buf));
-
- return (buf);
-}
-
-
-/*
- * Return a loaned arc buffer to the arc.
- */
-void
-arc_return_buf(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(buf->b_data, !=, NULL);
- ASSERT(HDR_HAS_L1HDR(hdr));
- (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
- (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
-
- arc_loaned_bytes_update(-arc_buf_size(buf));
-}
-
-/* Detach an arc_buf from a dbuf (tag) */
-void
-arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(buf->b_data, !=, NULL);
- ASSERT(HDR_HAS_L1HDR(hdr));
- (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
- (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
-
- arc_loaned_bytes_update(arc_buf_size(buf));
-}
-
-static void
-l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
-{
- l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
-
- df->l2df_abd = abd;
- df->l2df_size = size;
- df->l2df_type = type;
- mutex_enter(&l2arc_free_on_write_mtx);
- list_insert_head(l2arc_free_on_write, df);
- mutex_exit(&l2arc_free_on_write_mtx);
-}
-
-static void
-arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t size = arc_hdr_size(hdr);
-
- /* protected by hash lock, if in the hash table */
- if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(state != arc_anon && state != arc_l2c_only);
-
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- size, hdr);
- }
- (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
- if (type == ARC_BUFC_METADATA) {
- arc_space_return(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_space_return(size, ARC_SPACE_DATA);
- }
-
- l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
-}
-
-/*
- * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
- * data buffer, we transfer the refcount ownership to the hdr and update
- * the appropriate kstats.
- */
-static void
-arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- ASSERT(arc_can_share(hdr, buf));
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * Start sharing the data buffer. We transfer the
- * refcount ownership to the hdr since it always owns
- * the refcount whenever an arc_buf_t is shared.
- */
- zfs_refcount_transfer_ownership(&state->arcs_size, buf, hdr);
- hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
- abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
- HDR_ISTYPE_METADATA(hdr));
- arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
- buf->b_flags |= ARC_BUF_FLAG_SHARED;
-
- /*
- * Since we've transferred ownership to the hdr we need
- * to increment its compressed and uncompressed kstats and
- * decrement the overhead size.
- */
- ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
-}
-
-static void
-arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- ASSERT(arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * We are no longer sharing this buffer so we need
- * to transfer its ownership to the rightful owner.
- */
- zfs_refcount_transfer_ownership(&state->arcs_size, hdr, buf);
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
- abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
- abd_put(hdr->b_l1hdr.b_pabd);
- hdr->b_l1hdr.b_pabd = NULL;
- buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
-
- /*
- * Since the buffer is no longer shared between
- * the arc buf and the hdr, count it as overhead.
- */
- ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
-}
-
-/*
- * Remove an arc_buf_t from the hdr's buf list and return the last
- * arc_buf_t on the list. If no buffers remain on the list then return
- * NULL.
- */
-static arc_buf_t *
-arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
- arc_buf_t *lastbuf = NULL;
-
- /*
- * Remove the buf from the hdr list and locate the last
- * remaining buffer on the list.
- */
- while (*bufp != NULL) {
- if (*bufp == buf)
- *bufp = buf->b_next;
-
- /*
- * If we've removed a buffer in the middle of
- * the list then update the lastbuf and update
- * bufp.
- */
- if (*bufp != NULL) {
- lastbuf = *bufp;
- bufp = &(*bufp)->b_next;
- }
- }
- buf->b_next = NULL;
- ASSERT3P(lastbuf, !=, buf);
- IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
- IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
- IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
-
- return (lastbuf);
-}
-
-/*
- * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's
- * list and free it.
- */
-static void
-arc_buf_destroy_impl(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- /*
- * Free up the data associated with the buf but only if we're not
- * sharing this with the hdr. If we are sharing it with the hdr, the
- * hdr is responsible for doing the free.
- */
- if (buf->b_data != NULL) {
- /*
- * We're about to change the hdr's b_flags. We must either
- * hold the hash_lock or be undiscoverable.
- */
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- arc_cksum_verify(buf);
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
-
- if (arc_buf_is_shared(buf)) {
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
- } else {
- uint64_t size = arc_buf_size(buf);
- arc_free_data_buf(hdr, buf->b_data, size, buf);
- ARCSTAT_INCR(arcstat_overhead_size, -size);
- }
- buf->b_data = NULL;
-
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
- hdr->b_l1hdr.b_bufcnt -= 1;
- }
-
- arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
-
- if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
- /*
- * If the current arc_buf_t is sharing its data buffer with the
- * hdr, then reassign the hdr's b_pabd to share it with the new
- * buffer at the end of the list. The shared buffer is always
- * the last one on the hdr's buffer list.
- *
- * There is an equivalent case for compressed bufs, but since
- * they aren't guaranteed to be the last buf in the list and
- * that is an exceedingly rare case, we just allow that space be
- * wasted temporarily.
- */
- if (lastbuf != NULL) {
- /* Only one buf can be shared at once */
- VERIFY(!arc_buf_is_shared(lastbuf));
- /* hdr is uncompressed so can't have compressed buf */
- VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
-
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- arc_hdr_free_pabd(hdr);
-
- /*
- * We must setup a new shared block between the
- * last buffer and the hdr. The data would have
- * been allocated by the arc buf so we need to transfer
- * ownership to the hdr since it's now being shared.
- */
- arc_share_buf(hdr, lastbuf);
- }
- } else if (HDR_SHARED_DATA(hdr)) {
- /*
- * Uncompressed shared buffers are always at the end
- * of the list. Compressed buffers don't have the
- * same requirements. This makes it hard to
- * simply assert that the lastbuf is shared so
- * we rely on the hdr's compression flags to determine
- * if we have a compressed, shared buffer.
- */
- ASSERT3P(lastbuf, !=, NULL);
- ASSERT(arc_buf_is_shared(lastbuf) ||
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
- }
-
- /*
- * Free the checksum if we're removing the last uncompressed buf from
- * this hdr.
- */
- if (!arc_hdr_has_uncompressed_buf(hdr)) {
- arc_cksum_free(hdr);
- }
-
- /* clean up the buf */
- buf->b_hdr = NULL;
- kmem_cache_free(buf_cache, buf);
-}
-
-static void
-arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t do_adapt)
-{
- ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(!HDR_SHARED_DATA(hdr));
-
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, do_adapt);
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
-
- ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
-}
-
-static void
-arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
-
- /*
- * If the hdr is currently being written to the l2arc then
- * we defer freeing the data by adding it to the l2arc_free_on_write
- * list. The l2arc will free the data once it's finished
- * writing it to the l2arc device.
- */
- if (HDR_L2_WRITING(hdr)) {
- arc_hdr_free_on_write(hdr);
- ARCSTAT_BUMP(arcstat_l2_free_on_write);
- } else {
- arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
- arc_hdr_size(hdr), hdr);
- }
- hdr->b_l1hdr.b_pabd = NULL;
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
-
- ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
-}
-
-static arc_buf_hdr_t *
-arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
- enum zio_compress compression_type, arc_buf_contents_t type)
-{
- arc_buf_hdr_t *hdr;
-
- VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
-
- hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
- ASSERT(HDR_EMPTY(hdr));
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
- HDR_SET_PSIZE(hdr, psize);
- HDR_SET_LSIZE(hdr, lsize);
- hdr->b_spa = spa;
- hdr->b_type = type;
- hdr->b_flags = 0;
- arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
- arc_hdr_set_compress(hdr, compression_type);
-
- hdr->b_l1hdr.b_state = arc_anon;
- hdr->b_l1hdr.b_arc_access = 0;
- hdr->b_l1hdr.b_bufcnt = 0;
- hdr->b_l1hdr.b_buf = NULL;
-
- /*
- * Allocate the hdr's buffer. This will contain either
- * the compressed or uncompressed data depending on the block
- * it references and compressed arc enablement.
- */
- arc_hdr_alloc_pabd(hdr, B_TRUE);
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-
- return (hdr);
-}
-
-/*
- * Transition between the two allocation states for the arc_buf_hdr struct.
- * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
- * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
- * version is used when a cache buffer is only in the L2ARC in order to reduce
- * memory usage.
- */
-static arc_buf_hdr_t *
-arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
-{
- ASSERT(HDR_HAS_L2HDR(hdr));
-
- arc_buf_hdr_t *nhdr;
- l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
-
- ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
- (old == hdr_l2only_cache && new == hdr_full_cache));
-
- nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
-
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
- buf_hash_remove(hdr);
-
- bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
-
- if (new == hdr_full_cache) {
- arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
- /*
- * arc_access and arc_change_state need to be aware that a
- * header has just come out of L2ARC, so we set its state to
- * l2c_only even though it's about to change.
- */
- nhdr->b_l1hdr.b_state = arc_l2c_only;
-
- /* Verify previous threads set to NULL before freeing */
- ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
- } else {
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
-
- /*
- * If we've reached here, We must have been called from
- * arc_evict_hdr(), as such we should have already been
- * removed from any ghost list we were previously on
- * (which protects us from racing with arc_evict_state),
- * thus no locking is needed during this check.
- */
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-
- /*
- * A buffer must not be moved into the arc_l2c_only
- * state if it's not finished being written out to the
- * l2arc device. Otherwise, the b_l1hdr.b_pabd field
- * might try to be accessed, even though it was removed.
- */
- VERIFY(!HDR_L2_WRITING(hdr));
- VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
-
-#ifdef ZFS_DEBUG
- if (hdr->b_l1hdr.b_thawed != NULL) {
- kmem_free(hdr->b_l1hdr.b_thawed, 1);
- hdr->b_l1hdr.b_thawed = NULL;
- }
-#endif
-
- arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
- }
- /*
- * The header has been reallocated so we need to re-insert it into any
- * lists it was on.
- */
- (void) buf_hash_insert(nhdr, NULL);
-
- ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
-
- mutex_enter(&dev->l2ad_mtx);
-
- /*
- * We must place the realloc'ed header back into the list at
- * the same spot. Otherwise, if it's placed earlier in the list,
- * l2arc_write_buffers() could find it during the function's
- * write phase, and try to write it out to the l2arc.
- */
- list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
- list_remove(&dev->l2ad_buflist, hdr);
-
- mutex_exit(&dev->l2ad_mtx);
-
- /*
- * Since we're using the pointer address as the tag when
- * incrementing and decrementing the l2ad_alloc refcount, we
- * must remove the old pointer (that we're about to destroy) and
- * add the new pointer to the refcount. Otherwise we'd remove
- * the wrong pointer address when calling arc_hdr_destroy() later.
- */
-
- (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
- hdr);
- (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr),
- nhdr);
-
- buf_discard_identity(hdr);
- kmem_cache_free(old, hdr);
-
- return (nhdr);
-}
-
-/*
- * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
- * The buf is returned thawed since we expect the consumer to modify it.
- */
-arc_buf_t *
-arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
-{
- arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
- ZIO_COMPRESS_OFF, type);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
-
- arc_buf_t *buf = NULL;
- VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
- arc_buf_thaw(buf);
-
- return (buf);
-}
-
-/*
- * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
- * for bufs containing metadata.
- */
-arc_buf_t *
-arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
- enum zio_compress compression_type)
-{
- ASSERT3U(lsize, >, 0);
- ASSERT3U(lsize, >=, psize);
- ASSERT(compression_type > ZIO_COMPRESS_OFF);
- ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
-
- arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
- compression_type, ARC_BUFC_DATA);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
-
- arc_buf_t *buf = NULL;
- VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
- arc_buf_thaw(buf);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
-
- if (!arc_buf_is_shared(buf)) {
- /*
- * To ensure that the hdr has the correct data in it if we call
- * arc_decompress() on this buf before it's been written to
- * disk, it's easiest if we just set up sharing between the
- * buf and the hdr.
- */
- ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
- arc_hdr_free_pabd(hdr);
- arc_share_buf(hdr, buf);
- }
-
- return (buf);
-}
-
-static void
-arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
-{
- l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
- l2arc_dev_t *dev = l2hdr->b_dev;
- uint64_t psize = arc_hdr_size(hdr);
-
- ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
- ASSERT(HDR_HAS_L2HDR(hdr));
-
- list_remove(&dev->l2ad_buflist, hdr);
-
- ARCSTAT_INCR(arcstat_l2_psize, -psize);
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
-
- vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
-
- (void) zfs_refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
- arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
-}
-
-static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
-{
- if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(hdr->b_l1hdr.b_buf == NULL ||
- hdr->b_l1hdr.b_bufcnt > 0);
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- }
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
-
- if (!HDR_EMPTY(hdr))
- buf_discard_identity(hdr);
-
- if (HDR_HAS_L2HDR(hdr)) {
- l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
- boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
-
- if (!buflist_held)
- mutex_enter(&dev->l2ad_mtx);
-
- /*
- * Even though we checked this conditional above, we
- * need to check this again now that we have the
- * l2ad_mtx. This is because we could be racing with
- * another thread calling l2arc_evict() which might have
- * destroyed this header's L2 portion as we were waiting
- * to acquire the l2ad_mtx. If that happens, we don't
- * want to re-destroy the header's L2 portion.
- */
- if (HDR_HAS_L2HDR(hdr)) {
- l2arc_trim(hdr);
- arc_hdr_l2hdr_destroy(hdr);
- }
-
- if (!buflist_held)
- mutex_exit(&dev->l2ad_mtx);
- }
-
- if (HDR_HAS_L1HDR(hdr)) {
- arc_cksum_free(hdr);
-
- while (hdr->b_l1hdr.b_buf != NULL)
- arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
-
-#ifdef ZFS_DEBUG
- if (hdr->b_l1hdr.b_thawed != NULL) {
- kmem_free(hdr->b_l1hdr.b_thawed, 1);
- hdr->b_l1hdr.b_thawed = NULL;
- }
-#endif
-
- if (hdr->b_l1hdr.b_pabd != NULL) {
- arc_hdr_free_pabd(hdr);
- }
- }
-
- ASSERT3P(hdr->b_hash_next, ==, NULL);
- if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- kmem_cache_free(hdr_full_cache, hdr);
- } else {
- kmem_cache_free(hdr_l2only_cache, hdr);
- }
-}
-
-void
-arc_buf_destroy(arc_buf_t *buf, void* tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
-
- if (hdr->b_l1hdr.b_state == arc_anon) {
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- VERIFY0(remove_reference(hdr, NULL, tag));
- arc_hdr_destroy(hdr);
- return;
- }
-
- mutex_enter(hash_lock);
- ASSERT3P(hdr, ==, buf->b_hdr);
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
- ASSERT3P(buf->b_data, !=, NULL);
-
- (void) remove_reference(hdr, hash_lock, tag);
- arc_buf_destroy_impl(buf);
- mutex_exit(hash_lock);
-}
-
-/*
- * Evict the arc_buf_hdr that is provided as a parameter. The resultant
- * state of the header is dependent on its state prior to entering this
- * function. The following transitions are possible:
- *
- * - arc_mru -> arc_mru_ghost
- * - arc_mfu -> arc_mfu_ghost
- * - arc_mru_ghost -> arc_l2c_only
- * - arc_mru_ghost -> deleted
- * - arc_mfu_ghost -> arc_l2c_only
- * - arc_mfu_ghost -> deleted
- */
-static int64_t
-arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
-{
- arc_state_t *evicted_state, *state;
- int64_t bytes_evicted = 0;
- int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
- zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- state = hdr->b_l1hdr.b_state;
- if (GHOST_STATE(state)) {
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
-
- /*
- * l2arc_write_buffers() relies on a header's L1 portion
- * (i.e. its b_pabd field) during it's write phase.
- * Thus, we cannot push a header onto the arc_l2c_only
- * state (removing it's L1 piece) until the header is
- * done being written to the l2arc.
- */
- if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
- ARCSTAT_BUMP(arcstat_evict_l2_skip);
- return (bytes_evicted);
- }
-
- ARCSTAT_BUMP(arcstat_deleted);
- bytes_evicted += HDR_GET_LSIZE(hdr);
-
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
-
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- if (HDR_HAS_L2HDR(hdr)) {
- /*
- * This buffer is cached on the 2nd Level ARC;
- * don't destroy the header.
- */
- arc_change_state(arc_l2c_only, hdr, hash_lock);
- /*
- * dropping from L1+L2 cached to L2-only,
- * realloc to remove the L1 header.
- */
- hdr = arc_hdr_realloc(hdr, hdr_full_cache,
- hdr_l2only_cache);
- } else {
- arc_change_state(arc_anon, hdr, hash_lock);
- arc_hdr_destroy(hdr);
- }
- return (bytes_evicted);
- }
-
- ASSERT(state == arc_mru || state == arc_mfu);
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
- /* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(hdr) ||
- ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
- ARCSTAT_BUMP(arcstat_evict_skip);
- return (bytes_evicted);
- }
-
- ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
- while (hdr->b_l1hdr.b_buf) {
- arc_buf_t *buf = hdr->b_l1hdr.b_buf;
- if (!mutex_tryenter(&buf->b_evict_lock)) {
- ARCSTAT_BUMP(arcstat_mutex_miss);
- break;
- }
- if (buf->b_data != NULL)
- bytes_evicted += HDR_GET_LSIZE(hdr);
- mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy_impl(buf);
- }
-
- if (HDR_HAS_L2HDR(hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
- } else {
- if (l2arc_write_eligible(hdr->b_spa, hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_eligible,
- HDR_GET_LSIZE(hdr));
- } else {
- ARCSTAT_INCR(arcstat_evict_l2_ineligible,
- HDR_GET_LSIZE(hdr));
- }
- }
-
- if (hdr->b_l1hdr.b_bufcnt == 0) {
- arc_cksum_free(hdr);
-
- bytes_evicted += arc_hdr_size(hdr);
-
- /*
- * If this hdr is being evicted and has a compressed
- * buffer then we discard it here before we change states.
- * This ensures that the accounting is updated correctly
- * in arc_free_data_impl().
- */
- arc_hdr_free_pabd(hdr);
-
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
- }
-
- return (bytes_evicted);
-}
-
-static uint64_t
-arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
- uint64_t spa, int64_t bytes)
-{
- multilist_sublist_t *mls;
- uint64_t bytes_evicted = 0;
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- int evict_count = 0;
-
- ASSERT3P(marker, !=, NULL);
- IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
-
- mls = multilist_sublist_lock(ml, idx);
-
- for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
- hdr = multilist_sublist_prev(mls, marker)) {
- if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
- (evict_count >= zfs_arc_evict_batch_limit))
- break;
-
- /*
- * To keep our iteration location, move the marker
- * forward. Since we're not holding hdr's hash lock, we
- * must be very careful and not remove 'hdr' from the
- * sublist. Otherwise, other consumers might mistake the
- * 'hdr' as not being on a sublist when they call the
- * multilist_link_active() function (they all rely on
- * the hash lock protecting concurrent insertions and
- * removals). multilist_sublist_move_forward() was
- * specifically implemented to ensure this is the case
- * (only 'marker' will be removed and re-inserted).
- */
- multilist_sublist_move_forward(mls, marker);
-
- /*
- * The only case where the b_spa field should ever be
- * zero, is the marker headers inserted by
- * arc_evict_state(). It's possible for multiple threads
- * to be calling arc_evict_state() concurrently (e.g.
- * dsl_pool_close() and zio_inject_fault()), so we must
- * skip any markers we see from these other threads.
- */
- if (hdr->b_spa == 0)
- continue;
-
- /* we're only interested in evicting buffers of a certain spa */
- if (spa != 0 && hdr->b_spa != spa) {
- ARCSTAT_BUMP(arcstat_evict_skip);
- continue;
- }
-
- hash_lock = HDR_LOCK(hdr);
-
- /*
- * We aren't calling this function from any code path
- * that would already be holding a hash lock, so we're
- * asserting on this assumption to be defensive in case
- * this ever changes. Without this check, it would be
- * possible to incorrectly increment arcstat_mutex_miss
- * below (e.g. if the code changed such that we called
- * this function with a hash lock held).
- */
- ASSERT(!MUTEX_HELD(hash_lock));
-
- if (mutex_tryenter(hash_lock)) {
- uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
- mutex_exit(hash_lock);
-
- bytes_evicted += evicted;
-
- /*
- * If evicted is zero, arc_evict_hdr() must have
- * decided to skip this header, don't increment
- * evict_count in this case.
- */
- if (evicted != 0)
- evict_count++;
-
- /*
- * If arc_size isn't overflowing, signal any
- * threads that might happen to be waiting.
- *
- * For each header evicted, we wake up a single
- * thread. If we used cv_broadcast, we could
- * wake up "too many" threads causing arc_size
- * to significantly overflow arc_c; since
- * arc_get_data_impl() doesn't check for overflow
- * when it's woken up (it doesn't because it's
- * possible for the ARC to be overflowing while
- * full of un-evictable buffers, and the
- * function should proceed in this case).
- *
- * If threads are left sleeping, due to not
- * using cv_broadcast here, they will be woken
- * up via cv_broadcast in arc_adjust_cb() just
- * before arc_adjust_zthr sleeps.
- */
- mutex_enter(&arc_adjust_lock);
- if (!arc_is_overflowing())
- cv_signal(&arc_adjust_waiters_cv);
- mutex_exit(&arc_adjust_lock);
- } else {
- ARCSTAT_BUMP(arcstat_mutex_miss);
- }
- }
-
- multilist_sublist_unlock(mls);
-
- return (bytes_evicted);
-}
-
-/*
- * Evict buffers from the given arc state, until we've removed the
- * specified number of bytes. Move the removed buffers to the
- * appropriate evict state.
- *
- * This function makes a "best effort". It skips over any buffers
- * it can't get a hash_lock on, and so, may not catch all candidates.
- * It may also return without evicting as much space as requested.
- *
- * If bytes is specified using the special value ARC_EVICT_ALL, this
- * will evict all available (i.e. unlocked and evictable) buffers from
- * the given arc state; which is used by arc_flush().
- */
-static uint64_t
-arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
- arc_buf_contents_t type)
-{
- uint64_t total_evicted = 0;
- multilist_t *ml = state->arcs_list[type];
- int num_sublists;
- arc_buf_hdr_t **markers;
-
- IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
-
- num_sublists = multilist_get_num_sublists(ml);
-
- /*
- * If we've tried to evict from each sublist, made some
- * progress, but still have not hit the target number of bytes
- * to evict, we want to keep trying. The markers allow us to
- * pick up where we left off for each individual sublist, rather
- * than starting from the tail each time.
- */
- markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
- for (int i = 0; i < num_sublists; i++) {
- markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
- /*
- * A b_spa of 0 is used to indicate that this header is
- * a marker. This fact is used in arc_adjust_type() and
- * arc_evict_state_impl().
- */
- markers[i]->b_spa = 0;
-
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
- multilist_sublist_insert_tail(mls, markers[i]);
- multilist_sublist_unlock(mls);
- }
-
- /*
- * While we haven't hit our target number of bytes to evict, or
- * we're evicting all available buffers.
- */
- while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
- int sublist_idx = multilist_get_random_index(ml);
- uint64_t scan_evicted = 0;
-
- /*
- * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
- * Request that 10% of the LRUs be scanned by the superblock
- * shrinker.
- */
- if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
- arc_dnode_limit) > 0) {
- arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
- arc_dnode_limit) / sizeof (dnode_t) /
- zfs_arc_dnode_reduce_percent);
- }
-
- /*
- * Start eviction using a randomly selected sublist,
- * this is to try and evenly balance eviction across all
- * sublists. Always starting at the same sublist
- * (e.g. index 0) would cause evictions to favor certain
- * sublists over others.
- */
- for (int i = 0; i < num_sublists; i++) {
- uint64_t bytes_remaining;
- uint64_t bytes_evicted;
-
- if (bytes == ARC_EVICT_ALL)
- bytes_remaining = ARC_EVICT_ALL;
- else if (total_evicted < bytes)
- bytes_remaining = bytes - total_evicted;
- else
- break;
-
- bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
- markers[sublist_idx], spa, bytes_remaining);
-
- scan_evicted += bytes_evicted;
- total_evicted += bytes_evicted;
-
- /* we've reached the end, wrap to the beginning */
- if (++sublist_idx >= num_sublists)
- sublist_idx = 0;
- }
-
- /*
- * If we didn't evict anything during this scan, we have
- * no reason to believe we'll evict more during another
- * scan, so break the loop.
- */
- if (scan_evicted == 0) {
- /* This isn't possible, let's make that obvious */
- ASSERT3S(bytes, !=, 0);
-
- /*
- * When bytes is ARC_EVICT_ALL, the only way to
- * break the loop is when scan_evicted is zero.
- * In that case, we actually have evicted enough,
- * so we don't want to increment the kstat.
- */
- if (bytes != ARC_EVICT_ALL) {
- ASSERT3S(total_evicted, <, bytes);
- ARCSTAT_BUMP(arcstat_evict_not_enough);
- }
-
- break;
- }
- }
-
- for (int i = 0; i < num_sublists; i++) {
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
- multilist_sublist_remove(mls, markers[i]);
- multilist_sublist_unlock(mls);
-
- kmem_cache_free(hdr_full_cache, markers[i]);
- }
- kmem_free(markers, sizeof (*markers) * num_sublists);
-
- return (total_evicted);
-}
-
-/*
- * Flush all "evictable" data of the given type from the arc state
- * specified. This will not evict any "active" buffers (i.e. referenced).
- *
- * When 'retry' is set to B_FALSE, the function will make a single pass
- * over the state and evict any buffers that it can. Since it doesn't
- * continually retry the eviction, it might end up leaving some buffers
- * in the ARC due to lock misses.
- *
- * When 'retry' is set to B_TRUE, the function will continually retry the
- * eviction until *all* evictable buffers have been removed from the
- * state. As a result, if concurrent insertions into the state are
- * allowed (e.g. if the ARC isn't shutting down), this function might
- * wind up in an infinite loop, continually trying to evict buffers.
- */
-static uint64_t
-arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
- boolean_t retry)
-{
- uint64_t evicted = 0;
-
- while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
- evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
-
- if (!retry)
- break;
- }
-
- return (evicted);
-}
-
-/*
- * Helper function for arc_prune_async() it is responsible for safely
- * handling the execution of a registered arc_prune_func_t.
- */
-static void
-arc_prune_task(void *ptr)
-{
- arc_prune_t *ap = (arc_prune_t *)ptr;
- arc_prune_func_t *func = ap->p_pfunc;
-
- if (func != NULL)
- func(ap->p_adjust, ap->p_private);
-
- zfs_refcount_remove(&ap->p_refcnt, func);
-}
-
-/*
- * Notify registered consumers they must drop holds on a portion of the ARC
- * buffered they reference. This provides a mechanism to ensure the ARC can
- * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
- * is analogous to dnlc_reduce_cache() but more generic.
- *
- * This operation is performed asynchronously so it may be safely called
- * in the context of the arc_reclaim_thread(). A reference is taken here
- * for each registered arc_prune_t and the arc_prune_task() is responsible
- * for releasing it once the registered arc_prune_func_t has completed.
- */
-static void
-arc_prune_async(int64_t adjust)
-{
- arc_prune_t *ap;
-
- mutex_enter(&arc_prune_mtx);
- for (ap = list_head(&arc_prune_list); ap != NULL;
- ap = list_next(&arc_prune_list, ap)) {
-
- if (zfs_refcount_count(&ap->p_refcnt) >= 2)
- continue;
-
- zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
- ap->p_adjust = adjust;
- if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
- ap, TQ_SLEEP) == TASKQID_INVALID) {
- zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
- continue;
- }
- ARCSTAT_BUMP(arcstat_prune);
- }
- mutex_exit(&arc_prune_mtx);
-}
-
-/*
- * Evict the specified number of bytes from the state specified,
- * restricting eviction to the spa and type given. This function
- * prevents us from trying to evict more from a state's list than
- * is "evictable", and to skip evicting altogether when passed a
- * negative value for "bytes". In contrast, arc_evict_state() will
- * evict everything it can, when passed a negative value for "bytes".
- */
-static uint64_t
-arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
- arc_buf_contents_t type)
-{
- int64_t delta;
-
- if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
- bytes);
- return (arc_evict_state(state, spa, delta, type));
- }
-
- return (0);
-}
-
-/*
- * The goal of this function is to evict enough meta data buffers from the
- * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
- * more complicated than it appears because it is common for data buffers
- * to have holds on meta data buffers. In addition, dnode meta data buffers
- * will be held by the dnodes in the block preventing them from being freed.
- * This means we can't simply traverse the ARC and expect to always find
- * enough unheld meta data buffer to release.
- *
- * Therefore, this function has been updated to make alternating passes
- * over the ARC releasing data buffers and then newly unheld meta data
- * buffers. This ensures forward progress is maintained and meta_used
- * will decrease. Normally this is sufficient, but if required the ARC
- * will call the registered prune callbacks causing dentry and inodes to
- * be dropped from the VFS cache. This will make dnode meta data buffers
- * available for reclaim.
- */
-static uint64_t
-arc_adjust_meta_balanced(uint64_t meta_used)
-{
- int64_t delta, prune = 0, adjustmnt;
- uint64_t total_evicted = 0;
- arc_buf_contents_t type = ARC_BUFC_DATA;
- int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
-
-restart:
- /*
- * This slightly differs than the way we evict from the mru in
- * arc_adjust because we don't have a "target" value (i.e. no
- * "meta" arc_p). As a result, I think we can completely
- * cannibalize the metadata in the MRU before we evict the
- * metadata from the MFU. I think we probably need to implement a
- * "metadata arc_p" value to do this properly.
- */
- adjustmnt = meta_used - arc_meta_limit;
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
- adjustmnt);
- total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
- adjustmnt -= delta;
- }
-
- /*
- * We can't afford to recalculate adjustmnt here. If we do,
- * new metadata buffers can sneak into the MRU or ANON lists,
- * thus penalize the MFU metadata. Although the fudge factor is
- * small, it has been empirically shown to be significant for
- * certain workloads (e.g. creating many empty directories). As
- * such, we use the original calculation for adjustmnt, and
- * simply decrement the amount of data evicted from the MRU.
- */
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
- adjustmnt);
- total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
- }
-
- adjustmnt = meta_used - arc_meta_limit;
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
- delta = MIN(adjustmnt,
- zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
- total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
- adjustmnt -= delta;
- }
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
- delta = MIN(adjustmnt,
- zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
- total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
- }
-
- /*
- * If after attempting to make the requested adjustment to the ARC
- * the meta limit is still being exceeded then request that the
- * higher layers drop some cached objects which have holds on ARC
- * meta buffers. Requests to the upper layers will be made with
- * increasingly large scan sizes until the ARC is below the limit.
- */
- if (meta_used > arc_meta_limit) {
- if (type == ARC_BUFC_DATA) {
- type = ARC_BUFC_METADATA;
- } else {
- type = ARC_BUFC_DATA;
-
- if (zfs_arc_meta_prune) {
- prune += zfs_arc_meta_prune;
- arc_prune_async(prune);
- }
- }
-
- if (restarts > 0) {
- restarts--;
- goto restart;
- }
- }
- return (total_evicted);
-}
-
-/*
- * Evict metadata buffers from the cache, such that arc_meta_used is
- * capped by the arc_meta_limit tunable.
- */
-static uint64_t
-arc_adjust_meta_only(uint64_t meta_used)
-{
- uint64_t total_evicted = 0;
- int64_t target;
-
- /*
- * If we're over the meta limit, we want to evict enough
- * metadata to get back under the meta limit. We don't want to
- * evict so much that we drop the MRU below arc_p, though. If
- * we're over the meta limit more than we're over arc_p, we
- * evict some from the MRU here, and some from the MFU below.
- */
- target = MIN((int64_t)(meta_used - arc_meta_limit),
- (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
-
- total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
-
- /*
- * Similar to the above, we want to evict enough bytes to get us
- * below the meta limit, but not so much as to drop us below the
- * space allotted to the MFU (which is defined as arc_c - arc_p).
- */
- target = MIN((int64_t)(meta_used - arc_meta_limit),
- (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
- (arc_c - arc_p)));
-
- total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
-
- return (total_evicted);
-}
-
-static uint64_t
-arc_adjust_meta(uint64_t meta_used)
-{
- if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
- return (arc_adjust_meta_only(meta_used));
- else
- return (arc_adjust_meta_balanced(meta_used));
-}
-
-/*
- * Return the type of the oldest buffer in the given arc state
- *
- * This function will select a random sublist of type ARC_BUFC_DATA and
- * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
- * is compared, and the type which contains the "older" buffer will be
- * returned.
- */
-static arc_buf_contents_t
-arc_adjust_type(arc_state_t *state)
-{
- multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
- multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
- int data_idx = multilist_get_random_index(data_ml);
- int meta_idx = multilist_get_random_index(meta_ml);
- multilist_sublist_t *data_mls;
- multilist_sublist_t *meta_mls;
- arc_buf_contents_t type;
- arc_buf_hdr_t *data_hdr;
- arc_buf_hdr_t *meta_hdr;
-
- /*
- * We keep the sublist lock until we're finished, to prevent
- * the headers from being destroyed via arc_evict_state().
- */
- data_mls = multilist_sublist_lock(data_ml, data_idx);
- meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
-
- /*
- * These two loops are to ensure we skip any markers that
- * might be at the tail of the lists due to arc_evict_state().
- */
-
- for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
- data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
- if (data_hdr->b_spa != 0)
- break;
- }
-
- for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
- meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
- if (meta_hdr->b_spa != 0)
- break;
- }
-
- if (data_hdr == NULL && meta_hdr == NULL) {
- type = ARC_BUFC_DATA;
- } else if (data_hdr == NULL) {
- ASSERT3P(meta_hdr, !=, NULL);
- type = ARC_BUFC_METADATA;
- } else if (meta_hdr == NULL) {
- ASSERT3P(data_hdr, !=, NULL);
- type = ARC_BUFC_DATA;
- } else {
- ASSERT3P(data_hdr, !=, NULL);
- ASSERT3P(meta_hdr, !=, NULL);
-
- /* The headers can't be on the sublist without an L1 header */
- ASSERT(HDR_HAS_L1HDR(data_hdr));
- ASSERT(HDR_HAS_L1HDR(meta_hdr));
-
- if (data_hdr->b_l1hdr.b_arc_access <
- meta_hdr->b_l1hdr.b_arc_access) {
- type = ARC_BUFC_DATA;
- } else {
- type = ARC_BUFC_METADATA;
- }
- }
-
- multilist_sublist_unlock(meta_mls);
- multilist_sublist_unlock(data_mls);
-
- return (type);
-}
-
-/*
- * Evict buffers from the cache, such that arc_size is capped by arc_c.
- */
-static uint64_t
-arc_adjust(void)
-{
- uint64_t total_evicted = 0;
- uint64_t bytes;
- int64_t target;
- uint64_t asize = aggsum_value(&arc_size);
- uint64_t ameta = aggsum_value(&arc_meta_used);
-
- /*
- * If we're over arc_meta_limit, we want to correct that before
- * potentially evicting data buffers below.
- */
- total_evicted += arc_adjust_meta(ameta);
-
- /*
- * Adjust MRU size
- *
- * If we're over the target cache size, we want to evict enough
- * from the list to get back to our target size. We don't want
- * to evict too much from the MRU, such that it drops below
- * arc_p. So, if we're over our target cache size more than
- * the MRU is over arc_p, we'll evict enough to get back to
- * arc_p here, and then evict more from the MFU below.
- */
- target = MIN((int64_t)(asize - arc_c),
- (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
-
- /*
- * If we're below arc_meta_min, always prefer to evict data.
- * Otherwise, try to satisfy the requested number of bytes to
- * evict from the type which contains older buffers; in an
- * effort to keep newer buffers in the cache regardless of their
- * type. If we cannot satisfy the number of bytes from this
- * type, spill over into the next type.
- */
- if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
- ameta > arc_meta_min) {
- bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * metadata, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
- } else {
- bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * data, we try to get the rest from metadata.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
- }
-
- /*
- * Re-sum ARC stats after the first round of evictions.
- */
- asize = aggsum_value(&arc_size);
- ameta = aggsum_value(&arc_meta_used);
-
- /*
- * Adjust MFU size
- *
- * Now that we've tried to evict enough from the MRU to get its
- * size back to arc_p, if we're still above the target cache
- * size, we evict the rest from the MFU.
- */
- target = asize - arc_c;
-
- if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
- ameta > arc_meta_min) {
- bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * metadata, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
- } else {
- bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * data, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
- }
-
- /*
- * Adjust ghost lists
- *
- * In addition to the above, the ARC also defines target values
- * for the ghost lists. The sum of the mru list and mru ghost
- * list should never exceed the target size of the cache, and
- * the sum of the mru list, mfu list, mru ghost list, and mfu
- * ghost list should never exceed twice the target size of the
- * cache. The following logic enforces these limits on the ghost
- * caches, and evicts from them as needed.
- */
- target = zfs_refcount_count(&arc_mru->arcs_size) +
- zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
-
- bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
-
- /*
- * We assume the sum of the mru list and mfu list is less than
- * or equal to arc_c (we enforced this above), which means we
- * can use the simpler of the two equations below:
- *
- * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
- * mru ghost + mfu ghost <= arc_c
- */
- target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
- zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
-
- bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
-
- return (total_evicted);
-}
-
-void
-arc_flush(spa_t *spa, boolean_t retry)
-{
- uint64_t guid = 0;
-
- /*
- * If retry is B_TRUE, a spa must not be specified since we have
- * no good way to determine if all of a spa's buffers have been
- * evicted from an arc state.
- */
- ASSERT(!retry || spa == 0);
-
- if (spa != NULL)
- guid = spa_load_guid(spa);
-
- (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
-
- (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
-
- (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
-
- (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
-}
-
-static void
-arc_reduce_target_size(int64_t to_free)
-{
- uint64_t asize = aggsum_value(&arc_size);
- if (arc_c > arc_c_min) {
- DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
- arc_c_min, uint64_t, arc_p, uint64_t, to_free);
- if (arc_c > arc_c_min + to_free)
- atomic_add_64(&arc_c, -to_free);
- else
- arc_c = arc_c_min;
-
- atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
- if (asize < arc_c)
- arc_c = MAX(asize, arc_c_min);
- if (arc_p > arc_c)
- arc_p = (arc_c >> 1);
-
- DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
- arc_p);
-
- ASSERT(arc_c >= arc_c_min);
- ASSERT((int64_t)arc_p >= 0);
- }
-
- if (asize > arc_c) {
- DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize,
- uint64_t, arc_c);
- /* See comment in arc_adjust_cb_check() on why lock+flag */
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- mutex_exit(&arc_adjust_lock);
- zthr_wakeup(arc_adjust_zthr);
- }
-}
-
-typedef enum free_memory_reason_t {
- FMR_UNKNOWN,
- FMR_NEEDFREE,
- FMR_LOTSFREE,
- FMR_SWAPFS_MINFREE,
- FMR_PAGES_PP_MAXIMUM,
- FMR_HEAP_ARENA,
- FMR_ZIO_ARENA,
-} free_memory_reason_t;
-
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
-
-/*
- * Additional reserve of pages for pp_reserve.
- */
-int64_t arc_pages_pp_reserve = 64;
-
-/*
- * Additional reserve of pages for swapfs.
- */
-int64_t arc_swapfs_reserve = 64;
-
-/*
- * Return the amount of memory that can be consumed before reclaim will be
- * needed. Positive if there is sufficient free memory, negative indicates
- * the amount of memory that needs to be freed up.
- */
-static int64_t
-arc_available_memory(void)
-{
- int64_t lowest = INT64_MAX;
- int64_t n;
- free_memory_reason_t r = FMR_UNKNOWN;
-
-#ifdef _KERNEL
-#ifdef __FreeBSD__
- /*
- * Cooperate with pagedaemon when it's time for it to scan
- * and reclaim some pages.
- */
- n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
- if (n < lowest) {
- lowest = n;
- r = FMR_LOTSFREE;
- }
-
-#else
- if (needfree > 0) {
- n = PAGESIZE * (-needfree);
- if (n < lowest) {
- lowest = n;
- r = FMR_NEEDFREE;
- }
- }
-
- /*
- * check that we're out of range of the pageout scanner. It starts to
- * schedule paging if freemem is less than lotsfree and needfree.
- * lotsfree is the high-water mark for pageout, and needfree is the
- * number of needed free pages. We add extra pages here to make sure
- * the scanner doesn't start up while we're freeing memory.
- */
- n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
- if (n < lowest) {
- lowest = n;
- r = FMR_LOTSFREE;
- }
-
- /*
- * check to make sure that swapfs has enough space so that anon
- * reservations can still succeed. anon_resvmem() checks that the
- * availrmem is greater than swapfs_minfree, and the number of reserved
- * swap pages. We also add a bit of extra here just to prevent
- * circumstances from getting really dire.
- */
- n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
- desfree - arc_swapfs_reserve);
- if (n < lowest) {
- lowest = n;
- r = FMR_SWAPFS_MINFREE;
- }
-
-
- /*
- * Check that we have enough availrmem that memory locking (e.g., via
- * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
- * stores the number of pages that cannot be locked; when availrmem
- * drops below pages_pp_maximum, page locking mechanisms such as
- * page_pp_lock() will fail.)
- */
- n = PAGESIZE * (availrmem - pages_pp_maximum -
- arc_pages_pp_reserve);
- if (n < lowest) {
- lowest = n;
- r = FMR_PAGES_PP_MAXIMUM;
- }
-
-#endif /* __FreeBSD__ */
-#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- n = uma_avail() - (long)(uma_limit() / 4);
- if (n < lowest) {
- lowest = n;
- r = FMR_HEAP_ARENA;
- }
-#endif
-
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then enforce that the size of available vmem for this arena remains
- * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
- *
- * Note that reducing the arc_zio_arena_free_shift keeps more virtual
- * memory (in the zio_arena) free, which can avoid memory
- * fragmentation issues.
- */
- if (zio_arena != NULL) {
- n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
- (vmem_size(zio_arena, VMEM_ALLOC) >>
- arc_zio_arena_free_shift);
- if (n < lowest) {
- lowest = n;
- r = FMR_ZIO_ARENA;
- }
- }
-
-#else /* _KERNEL */
- /* Every 100 calls, free a small amount */
- if (spa_get_random(100) == 0)
- lowest = -1024;
-#endif /* _KERNEL */
-
- last_free_memory = lowest;
- last_free_reason = r;
- DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
- return (lowest);
-}
-
-
-/*
- * Determine if the system is under memory pressure and is asking
- * to reclaim memory. A return value of B_TRUE indicates that the system
- * is under memory pressure and that the arc should adjust accordingly.
- */
-static boolean_t
-arc_reclaim_needed(void)
-{
- return (arc_available_memory() < 0);
-}
-
-extern kmem_cache_t *zio_buf_cache[];
-extern kmem_cache_t *zio_data_buf_cache[];
-extern kmem_cache_t *range_seg_cache;
-extern kmem_cache_t *abd_chunk_cache;
-
-static __noinline void
-arc_kmem_reap_soon(void)
-{
- size_t i;
- kmem_cache_t *prev_cache = NULL;
- kmem_cache_t *prev_data_cache = NULL;
-
- DTRACE_PROBE(arc__kmem_reap_start);
-#ifdef _KERNEL
- if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
- /*
- * We are exceeding our meta-data cache limit.
- * Purge some DNLC entries to release holds on meta-data.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
- }
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
-
- for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
- if (zio_buf_cache[i] != prev_cache) {
- prev_cache = zio_buf_cache[i];
- kmem_cache_reap_soon(zio_buf_cache[i]);
- }
- if (zio_data_buf_cache[i] != prev_data_cache) {
- prev_data_cache = zio_data_buf_cache[i];
- kmem_cache_reap_soon(zio_data_buf_cache[i]);
- }
- }
- kmem_cache_reap_soon(abd_chunk_cache);
- kmem_cache_reap_soon(buf_cache);
- kmem_cache_reap_soon(hdr_full_cache);
- kmem_cache_reap_soon(hdr_l2only_cache);
- kmem_cache_reap_soon(range_seg_cache);
-
-#ifdef illumos
- if (zio_arena != NULL) {
- /*
- * Ask the vmem arena to reclaim unused memory from its
- * quantum caches.
- */
- vmem_qcache_reap(zio_arena);
- }
-#endif
- DTRACE_PROBE(arc__kmem_reap_end);
-}
-
-/* ARGSUSED */
-static boolean_t
-arc_adjust_cb_check(void *arg, zthr_t *zthr)
-{
- /*
- * This is necessary in order for the mdb ::arc dcmd to
- * show up to date information. Since the ::arc command
- * does not call the kstat's update function, without
- * this call, the command may show stale stats for the
- * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
- * with this change, the data might be up to 1 second
- * out of date(the arc_adjust_zthr has a maximum sleep
- * time of 1 second); but that should suffice. The
- * arc_state_t structures can be queried directly if more
- * accurate information is needed.
- */
- if (arc_ksp != NULL)
- arc_ksp->ks_update(arc_ksp, KSTAT_READ);
-
- /*
- * We have to rely on arc_get_data_impl() to tell us when to adjust,
- * rather than checking if we are overflowing here, so that we are
- * sure to not leave arc_get_data_impl() waiting on
- * arc_adjust_waiters_cv. If we have become "not overflowing" since
- * arc_get_data_impl() checked, we need to wake it up. We could
- * broadcast the CV here, but arc_get_data_impl() may have not yet
- * gone to sleep. We would need to use a mutex to ensure that this
- * function doesn't broadcast until arc_get_data_impl() has gone to
- * sleep (e.g. the arc_adjust_lock). However, the lock ordering of
- * such a lock would necessarily be incorrect with respect to the
- * zthr_lock, which is held before this function is called, and is
- * held by arc_get_data_impl() when it calls zthr_wakeup().
- */
- return (arc_adjust_needed);
-}
-
-/*
- * Keep arc_size under arc_c by running arc_adjust which evicts data
- * from the ARC. */
-/* ARGSUSED */
-static void
-arc_adjust_cb(void *arg, zthr_t *zthr)
-{
- uint64_t evicted = 0;
-
- /* Evict from cache */
- evicted = arc_adjust();
-
- /*
- * If evicted is zero, we couldn't evict anything
- * via arc_adjust(). This could be due to hash lock
- * collisions, but more likely due to the majority of
- * arc buffers being unevictable. Therefore, even if
- * arc_size is above arc_c, another pass is unlikely to
- * be helpful and could potentially cause us to enter an
- * infinite loop. Additionally, zthr_iscancelled() is
- * checked here so that if the arc is shutting down, the
- * broadcast will wake any remaining arc adjust waiters.
- */
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
- evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
- if (!arc_adjust_needed) {
- /*
- * We're either no longer overflowing, or we
- * can't evict anything more, so we should wake
- * up any waiters.
- */
- cv_broadcast(&arc_adjust_waiters_cv);
- }
- mutex_exit(&arc_adjust_lock);
-}
-
-/* ARGSUSED */
-static boolean_t
-arc_reap_cb_check(void *arg, zthr_t *zthr)
-{
- int64_t free_memory = arc_available_memory();
-
- /*
- * If a kmem reap is already active, don't schedule more. We must
- * check for this because kmem_cache_reap_soon() won't actually
- * block on the cache being reaped (this is to prevent callers from
- * becoming implicitly blocked by a system-wide kmem reap -- which,
- * on a system with many, many full magazines, can take minutes).
- */
- if (!kmem_cache_reap_active() &&
- free_memory < 0) {
- arc_no_grow = B_TRUE;
- arc_warm = B_TRUE;
- /*
- * Wait at least zfs_grow_retry (default 60) seconds
- * before considering growing.
- */
- arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
- return (B_TRUE);
- } else if (free_memory < arc_c >> arc_no_grow_shift) {
- arc_no_grow = B_TRUE;
- } else if (gethrtime() >= arc_growtime) {
- arc_no_grow = B_FALSE;
- }
-
- return (B_FALSE);
-}
-
-/*
- * Keep enough free memory in the system by reaping the ARC's kmem
- * caches. To cause more slabs to be reapable, we may reduce the
- * target size of the cache (arc_c), causing the arc_adjust_cb()
- * to free more buffers.
- */
-/* ARGSUSED */
-static void
-arc_reap_cb(void *arg, zthr_t *zthr)
-{
- int64_t free_memory;
-
- /*
- * Kick off asynchronous kmem_reap()'s of all our caches.
- */
- arc_kmem_reap_soon();
-
- /*
- * Wait at least arc_kmem_cache_reap_retry_ms between
- * arc_kmem_reap_soon() calls. Without this check it is possible to
- * end up in a situation where we spend lots of time reaping
- * caches, while we're near arc_c_min. Waiting here also gives the
- * subsequent free memory check a chance of finding that the
- * asynchronous reap has already freed enough memory, and we don't
- * need to call arc_reduce_target_size().
- */
- delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
-
- /*
- * Reduce the target size as needed to maintain the amount of free
- * memory in the system at a fraction of the arc_size (1/128th by
- * default). If oversubscribed (free_memory < 0) then reduce the
- * target arc_size by the deficit amount plus the fractional
- * amount. If free memory is positive but less then the fractional
- * amount, reduce by what is needed to hit the fractional amount.
- */
- free_memory = arc_available_memory();
-
- int64_t to_free =
- (arc_c >> arc_shrink_shift) - free_memory;
- if (to_free > 0) {
-#ifdef _KERNEL
-#ifdef illumos
- to_free = MAX(to_free, ptob(needfree));
-#endif
-#endif
- arc_reduce_target_size(to_free);
- }
-}
-
-static u_int arc_dnlc_evicts_arg;
-extern struct vfsops zfs_vfsops;
-
-static void
-arc_dnlc_evicts_thread(void *dummy __unused)
-{
- callb_cpr_t cpr;
- u_int percent;
-
- CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
-
- mutex_enter(&arc_dnlc_evicts_lock);
- while (!arc_dnlc_evicts_thread_exit) {
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
- CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
- if (arc_dnlc_evicts_arg != 0) {
- percent = arc_dnlc_evicts_arg;
- mutex_exit(&arc_dnlc_evicts_lock);
-#ifdef _KERNEL
- vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
-#endif
- mutex_enter(&arc_dnlc_evicts_lock);
- /*
- * Clear our token only after vnlru_free()
- * pass is done, to avoid false queueing of
- * the requests.
- */
- arc_dnlc_evicts_arg = 0;
- }
- }
- arc_dnlc_evicts_thread_exit = FALSE;
- cv_broadcast(&arc_dnlc_evicts_cv);
- CALLB_CPR_EXIT(&cpr);
- thread_exit();
-}
-
-void
-dnlc_reduce_cache(void *arg)
-{
- u_int percent;
-
- percent = (u_int)(uintptr_t)arg;
- mutex_enter(&arc_dnlc_evicts_lock);
- if (arc_dnlc_evicts_arg == 0) {
- arc_dnlc_evicts_arg = percent;
- cv_broadcast(&arc_dnlc_evicts_cv);
- }
- mutex_exit(&arc_dnlc_evicts_lock);
-}
-
-/*
- * Adapt arc info given the number of bytes we are trying to add and
- * the state that we are comming from. This function is only called
- * when we are adding new content to the cache.
- */
-static void
-arc_adapt(int bytes, arc_state_t *state)
-{
- int mult;
- uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
- int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
- int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
-
- if (state == arc_l2c_only)
- return;
-
- ASSERT(bytes > 0);
- /*
- * Adapt the target size of the MRU list:
- * - if we just hit in the MRU ghost list, then increase
- * the target size of the MRU list.
- * - if we just hit in the MFU ghost list, then increase
- * the target size of the MFU list by decreasing the
- * target size of the MRU list.
- */
- if (state == arc_mru_ghost) {
- mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
- mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
-
- arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
- } else if (state == arc_mfu_ghost) {
- uint64_t delta;
-
- mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
- mult = MIN(mult, 10);
-
- delta = MIN(bytes * mult, arc_p);
- arc_p = MAX(arc_p_min, arc_p - delta);
- }
- ASSERT((int64_t)arc_p >= 0);
-
- /*
- * Wake reap thread if we do not have any available memory
- */
- if (arc_reclaim_needed()) {
- zthr_wakeup(arc_reap_zthr);
- return;
- }
-
- if (arc_no_grow)
- return;
-
- if (arc_c >= arc_c_max)
- return;
-
- /*
- * If we're within (2 * maxblocksize) bytes of the target
- * cache size, increment the target cache size
- */
- if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
- 0) {
- DTRACE_PROBE1(arc__inc_adapt, int, bytes);
- atomic_add_64(&arc_c, (int64_t)bytes);
- if (arc_c > arc_c_max)
- arc_c = arc_c_max;
- else if (state == arc_anon)
- atomic_add_64(&arc_p, (int64_t)bytes);
- if (arc_p > arc_c)
- arc_p = arc_c;
- }
- ASSERT((int64_t)arc_p >= 0);
-}
-
-/*
- * Check if arc_size has grown past our upper threshold, determined by
- * zfs_arc_overflow_shift.
- */
-static boolean_t
-arc_is_overflowing(void)
-{
- /* Always allow at least one block of overflow */
- int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
- arc_c >> zfs_arc_overflow_shift);
-
- /*
- * We just compare the lower bound here for performance reasons. Our
- * primary goals are to make sure that the arc never grows without
- * bound, and that it can reach its maximum size. This check
- * accomplishes both goals. The maximum amount we could run over by is
- * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
- * in the ARC. In practice, that's in the tens of MB, which is low
- * enough to be safe.
- */
- return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow);
-}
-
-static abd_t *
-arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- arc_get_data_impl(hdr, size, tag, do_adapt);
- if (type == ARC_BUFC_METADATA) {
- return (abd_alloc(size, B_TRUE));
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- return (abd_alloc(size, B_FALSE));
- }
-}
-
-static void *
-arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- arc_get_data_impl(hdr, size, tag, B_TRUE);
- if (type == ARC_BUFC_METADATA) {
- return (zio_buf_alloc(size));
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- return (zio_data_buf_alloc(size));
- }
-}
-
-/*
- * Allocate a block and return it to the caller. If we are hitting the
- * hard limit for the cache size, we must sleep, waiting for the eviction
- * thread to catch up. If we're past the target size but below the hard
- * limit, we'll only signal the reclaim thread and continue on.
- */
-static void
-arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- if (do_adapt)
- arc_adapt(size, state);
-
- /*
- * If arc_size is currently overflowing, and has grown past our
- * upper limit, we must be adding data faster than the evict
- * thread can evict. Thus, to ensure we don't compound the
- * problem by adding more data and forcing arc_size to grow even
- * further past it's target size, we halt and wait for the
- * eviction thread to catch up.
- *
- * It's also possible that the reclaim thread is unable to evict
- * enough buffers to get arc_size below the overflow limit (e.g.
- * due to buffers being un-evictable, or hash lock collisions).
- * In this case, we want to proceed regardless if we're
- * overflowing; thus we don't use a while loop here.
- */
- if (arc_is_overflowing()) {
- mutex_enter(&arc_adjust_lock);
-
- /*
- * Now that we've acquired the lock, we may no longer be
- * over the overflow limit, lets check.
- *
- * We're ignoring the case of spurious wake ups. If that
- * were to happen, it'd let this thread consume an ARC
- * buffer before it should have (i.e. before we're under
- * the overflow limit and were signalled by the reclaim
- * thread). As long as that is a rare occurrence, it
- * shouldn't cause any harm.
- */
- if (arc_is_overflowing()) {
- arc_adjust_needed = B_TRUE;
- zthr_wakeup(arc_adjust_zthr);
- (void) cv_wait(&arc_adjust_waiters_cv,
- &arc_adjust_lock);
- }
- mutex_exit(&arc_adjust_lock);
- }
-
- VERIFY3U(hdr->b_type, ==, type);
- if (type == ARC_BUFC_METADATA) {
- arc_space_consume(size, ARC_SPACE_META);
- } else {
- arc_space_consume(size, ARC_SPACE_DATA);
- }
-
- /*
- * Update the state size. Note that ghost states have a
- * "ghost size" and so don't need to be updated.
- */
- if (!GHOST_STATE(state)) {
-
- (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
-
- /*
- * If this is reached via arc_read, the link is
- * protected by the hash lock. If reached via
- * arc_buf_alloc, the header should not be accessed by
- * any other thread. And, if reached via arc_read_done,
- * the hash lock will protect it if it's found in the
- * hash table; otherwise no other thread should be
- * trying to [add|remove]_reference it.
- */
- if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- size, tag);
- }
-
- /*
- * If we are growing the cache, and we are adding anonymous
- * data, and we have outgrown arc_p, update arc_p
- */
- if (aggsum_upper_bound(&arc_size) < arc_c &&
- hdr->b_l1hdr.b_state == arc_anon &&
- (zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
- arc_p = MIN(arc_c, arc_p + size);
- }
- ARCSTAT_BUMP(arcstat_allocated);
-}
-
-static void
-arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
-{
- arc_free_data_impl(hdr, size, tag);
- abd_free(abd);
-}
-
-static void
-arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- arc_free_data_impl(hdr, size, tag);
- if (type == ARC_BUFC_METADATA) {
- zio_buf_free(buf, size);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(buf, size);
- }
-}
-
-/*
- * Free the arc data buffer.
- */
-static void
-arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- /* protected by hash lock, if in the hash table */
- if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(state != arc_anon && state != arc_l2c_only);
-
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- size, tag);
- }
- (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
-
- VERIFY3U(hdr->b_type, ==, type);
- if (type == ARC_BUFC_METADATA) {
- arc_space_return(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_space_return(size, ARC_SPACE_DATA);
- }
-}
-
-/*
- * This routine is called whenever a buffer is accessed.
- * NOTE: the hash lock is dropped in this function.
- */
-static void
-arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
-{
- clock_t now;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (hdr->b_l1hdr.b_state == arc_anon) {
- /*
- * This buffer is not in the cache, and does not
- * appear in our "ghost" list. Add the new buffer
- * to the MRU state.
- */
-
- ASSERT0(hdr->b_l1hdr.b_arc_access);
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mru, hdr, hash_lock);
-
- } else if (hdr->b_l1hdr.b_state == arc_mru) {
- now = ddi_get_lbolt();
-
- /*
- * If this buffer is here because of a prefetch, then either:
- * - clear the flag if this is a "referencing" read
- * (any subsequent access will bump this into the MFU state).
- * or
- * - move the buffer to the head of the list if this is
- * another prefetch (to make it less likely to be evicted).
- */
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
- /* link protected by hash lock */
- ASSERT(multilist_link_active(
- &hdr->b_l1hdr.b_arc_node));
- } else {
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREFETCH |
- ARC_FLAG_PRESCIENT_PREFETCH);
- ARCSTAT_BUMP(arcstat_mru_hits);
- }
- hdr->b_l1hdr.b_arc_access = now;
- return;
- }
-
- /*
- * This buffer has been "accessed" only once so far,
- * but it is still in the cache. Move it to the MFU
- * state.
- */
- if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
- /*
- * More than 125ms have passed since we
- * instantiated this buffer. Move it to the
- * most frequently used state.
- */
- hdr->b_l1hdr.b_arc_access = now;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mfu, hdr, hash_lock);
- }
- atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
- ARCSTAT_BUMP(arcstat_mru_hits);
- } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
- arc_state_t *new_state;
- /*
- * This buffer has been "accessed" recently, but
- * was evicted from the cache. Move it to the
- * MFU state.
- */
-
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- new_state = arc_mru;
- if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREFETCH |
- ARC_FLAG_PRESCIENT_PREFETCH);
- }
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
- } else {
- new_state = arc_mfu;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- }
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- arc_change_state(new_state, hdr, hash_lock);
-
- atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
- ARCSTAT_BUMP(arcstat_mru_ghost_hits);
- } else if (hdr->b_l1hdr.b_state == arc_mfu) {
- /*
- * This buffer has been accessed more than once and is
- * still in the cache. Keep it in the MFU state.
- *
- * NOTE: an add_reference() that occurred when we did
- * the arc_read() will have kicked this off the list.
- * If it was a prefetch, we will explicitly move it to
- * the head of the list now.
- */
-
- atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
- ARCSTAT_BUMP(arcstat_mfu_hits);
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
- arc_state_t *new_state = arc_mfu;
- /*
- * This buffer has been accessed more than once but has
- * been evicted from the cache. Move it back to the
- * MFU state.
- */
-
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- /*
- * This is a prefetch access...
- * move this block back to the MRU state.
- */
- new_state = arc_mru;
- }
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(new_state, hdr, hash_lock);
-
- atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
- ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
- } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
- /*
- * This buffer is on the 2nd Level ARC.
- */
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mfu, hdr, hash_lock);
- } else {
- ASSERT(!"invalid arc state");
- }
-}
-
-/*
- * This routine is called by dbuf_hold() to update the arc_access() state
- * which otherwise would be skipped for entries in the dbuf cache.
- */
-void
-arc_buf_access(arc_buf_t *buf)
-{
- mutex_enter(&buf->b_evict_lock);
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- /*
- * Avoid taking the hash_lock when possible as an optimization.
- * The header must be checked again under the hash_lock in order
- * to handle the case where it is concurrently being released.
- */
- if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
- mutex_exit(&buf->b_evict_lock);
- ARCSTAT_BUMP(arcstat_access_skip);
- return;
- }
-
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
- if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
- mutex_exit(hash_lock);
- mutex_exit(&buf->b_evict_lock);
- ARCSTAT_BUMP(arcstat_access_skip);
- return;
- }
-
- mutex_exit(&buf->b_evict_lock);
-
- ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
- hdr->b_l1hdr.b_state == arc_mfu);
-
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
-
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
-}
-
-/* a generic arc_read_done_func_t which you can use */
-/* ARGSUSED */
-void
-arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
- arc_buf_t *buf, void *arg)
-{
- if (buf == NULL)
- return;
-
- bcopy(buf->b_data, arg, arc_buf_size(buf));
- arc_buf_destroy(buf, arg);
-}
-
-/* a generic arc_read_done_func_t */
-/* ARGSUSED */
-void
-arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
- arc_buf_t *buf, void *arg)
-{
- arc_buf_t **bufp = arg;
- if (buf == NULL) {
- ASSERT(zio == NULL || zio->io_error != 0);
- *bufp = NULL;
- } else {
- ASSERT(zio == NULL || zio->io_error == 0);
- *bufp = buf;
- ASSERT(buf->b_data != NULL);
- }
-}
-
-static void
-arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
-{
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
- ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- } else {
- if (HDR_COMPRESSION_ENABLED(hdr)) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
- BP_GET_COMPRESS(bp));
- }
- ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
- ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
- }
-}
-
-static void
-arc_read_done(zio_t *zio)
-{
- arc_buf_hdr_t *hdr = zio->io_private;
- kmutex_t *hash_lock = NULL;
- arc_callback_t *callback_list;
- arc_callback_t *acb;
- boolean_t freeable = B_FALSE;
- boolean_t no_zio_error = (zio->io_error == 0);
-
- /*
- * The hdr was inserted into hash-table and removed from lists
- * prior to starting I/O. We should find this header, since
- * it's in the hash table, and it should be legit since it's
- * not possible to evict it during the I/O. The only possible
- * reason for it not to be found is if we were freed during the
- * read.
- */
- if (HDR_IN_HASH_TABLE(hdr)) {
- ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
- ASSERT3U(hdr->b_dva.dva_word[0], ==,
- BP_IDENTITY(zio->io_bp)->dva_word[0]);
- ASSERT3U(hdr->b_dva.dva_word[1], ==,
- BP_IDENTITY(zio->io_bp)->dva_word[1]);
-
- arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
- &hash_lock);
-
- ASSERT((found == hdr &&
- DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
- (found == hdr && HDR_L2_READING(hdr)));
- ASSERT3P(hash_lock, !=, NULL);
- }
-
- if (no_zio_error) {
- /* byteswap if necessary */
- if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
- if (BP_GET_LEVEL(zio->io_bp) > 0) {
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
- } else {
- hdr->b_l1hdr.b_byteswap =
- DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
- }
- } else {
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- }
- }
-
- arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
- if (l2arc_noprefetch && HDR_PREFETCH(hdr))
- arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
-
- callback_list = hdr->b_l1hdr.b_acb;
- ASSERT3P(callback_list, !=, NULL);
-
- if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- arc_access(hdr, hash_lock);
- }
-
- /*
- * If a read request has a callback (i.e. acb_done is not NULL), then we
- * make a buf containing the data according to the parameters which were
- * passed in. The implementation of arc_buf_alloc_impl() ensures that we
- * aren't needlessly decompressing the data multiple times.
- */
- int callback_cnt = 0;
- for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
- if (!acb->acb_done)
- continue;
-
- callback_cnt++;
-
- if (no_zio_error) {
- int error = arc_buf_alloc_impl(hdr, acb->acb_private,
- acb->acb_compressed, zio->io_error == 0,
- &acb->acb_buf);
- if (error != 0) {
- /*
- * Decompression failed. Set io_error
- * so that when we call acb_done (below),
- * we will indicate that the read failed.
- * Note that in the unusual case where one
- * callback is compressed and another
- * uncompressed, we will mark all of them
- * as failed, even though the uncompressed
- * one can't actually fail. In this case,
- * the hdr will not be anonymous, because
- * if there are multiple callbacks, it's
- * because multiple threads found the same
- * arc buf in the hash table.
- */
- zio->io_error = error;
- }
- }
- }
- /*
- * If there are multiple callbacks, we must have the hash lock,
- * because the only way for multiple threads to find this hdr is
- * in the hash table. This ensures that if there are multiple
- * callbacks, the hdr is not anonymous. If it were anonymous,
- * we couldn't use arc_buf_destroy() in the error case below.
- */
- ASSERT(callback_cnt < 2 || hash_lock != NULL);
-
- hdr->b_l1hdr.b_acb = NULL;
- arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- if (callback_cnt == 0) {
- ASSERT(HDR_PREFETCH(hdr));
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- }
-
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
- callback_list != NULL);
-
- if (no_zio_error) {
- arc_hdr_verify(hdr, zio->io_bp);
- } else {
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
- if (hdr->b_l1hdr.b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
- if (HDR_IN_HASH_TABLE(hdr))
- buf_hash_remove(hdr);
- freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
- }
-
- /*
- * Broadcast before we drop the hash_lock to avoid the possibility
- * that the hdr (and hence the cv) might be freed before we get to
- * the cv_broadcast().
- */
- cv_broadcast(&hdr->b_l1hdr.b_cv);
-
- if (hash_lock != NULL) {
- mutex_exit(hash_lock);
- } else {
- /*
- * This block was freed while we waited for the read to
- * complete. It has been removed from the hash table and
- * moved to the anonymous state (so that it won't show up
- * in the cache).
- */
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
- }
-
- /* execute each callback and free its structure */
- while ((acb = callback_list) != NULL) {
- if (acb->acb_done != NULL) {
- if (zio->io_error != 0 && acb->acb_buf != NULL) {
- /*
- * If arc_buf_alloc_impl() fails during
- * decompression, the buf will still be
- * allocated, and needs to be freed here.
- */
- arc_buf_destroy(acb->acb_buf, acb->acb_private);
- acb->acb_buf = NULL;
- }
- acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
- acb->acb_buf, acb->acb_private);
- }
-
- if (acb->acb_zio_dummy != NULL) {
- acb->acb_zio_dummy->io_error = zio->io_error;
- zio_nowait(acb->acb_zio_dummy);
- }
-
- callback_list = acb->acb_next;
- kmem_free(acb, sizeof (arc_callback_t));
- }
-
- if (freeable)
- arc_hdr_destroy(hdr);
-}
-
-/*
- * "Read" the block at the specified DVA (in bp) via the
- * cache. If the block is found in the cache, invoke the provided
- * callback immediately and return. Note that the `zio' parameter
- * in the callback will be NULL in this case, since no IO was
- * required. If the block is not in the cache pass the read request
- * on to the spa with a substitute callback function, so that the
- * requested block will be added to the cache.
- *
- * If a read request arrives for a block that has a read in-progress,
- * either wait for the in-progress read to complete (and return the
- * results); or, if this is a read with a "done" func, add a record
- * to the read to invoke the "done" func when the read completes,
- * and return; or just return.
- *
- * arc_read_done() will invoke all the requested "done" functions
- * for readers of this block.
- */
-int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
- void *private, zio_priority_t priority, int zio_flags,
- arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
-{
- arc_buf_hdr_t *hdr = NULL;
- kmutex_t *hash_lock = NULL;
- zio_t *rzio;
- uint64_t guid = spa_load_guid(spa);
- boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
- int rc = 0;
-
- ASSERT(!BP_IS_EMBEDDED(bp) ||
- BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
-
-top:
- if (!BP_IS_EMBEDDED(bp)) {
- /*
- * Embedded BP's have no DVA and require no I/O to "read".
- * Create an anonymous arc buf to back it.
- */
- hdr = buf_hash_find(guid, bp, &hash_lock);
- }
-
- if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
- arc_buf_t *buf = NULL;
- *arc_flags |= ARC_FLAG_CACHED;
-
- if (HDR_IO_IN_PROGRESS(hdr)) {
- zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
-
- ASSERT3P(head_zio, !=, NULL);
- if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
- priority == ZIO_PRIORITY_SYNC_READ) {
- /*
- * This is a sync read that needs to wait for
- * an in-flight async read. Request that the
- * zio have its priority upgraded.
- */
- zio_change_priority(head_zio, priority);
- DTRACE_PROBE1(arc__async__upgrade__sync,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_async_upgrade_sync);
- }
- if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREDICTIVE_PREFETCH);
- }
-
- if (*arc_flags & ARC_FLAG_WAIT) {
- cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
- ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
-
- if (done) {
- arc_callback_t *acb = NULL;
-
- acb = kmem_zalloc(sizeof (arc_callback_t),
- KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_compressed = compressed_read;
- if (pio != NULL)
- acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, NULL, zio_flags);
-
- ASSERT3P(acb->acb_done, !=, NULL);
- acb->acb_zio_head = head_zio;
- acb->acb_next = hdr->b_l1hdr.b_acb;
- hdr->b_l1hdr.b_acb = acb;
- mutex_exit(hash_lock);
- return (0);
- }
- mutex_exit(hash_lock);
- return (0);
- }
-
- ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
- hdr->b_l1hdr.b_state == arc_mfu);
-
- if (done) {
- if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
- /*
- * This is a demand read which does not have to
- * wait for i/o because we did a predictive
- * prefetch i/o for it, which has completed.
- */
- DTRACE_PROBE1(
- arc__demand__hit__predictive__prefetch,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(
- arcstat_demand_hit_predictive_prefetch);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREDICTIVE_PREFETCH);
- }
-
- if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
- ARCSTAT_BUMP(
- arcstat_demand_hit_prescient_prefetch);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PRESCIENT_PREFETCH);
- }
-
- ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
- /* Get a buf with the desired data in it. */
- rc = arc_buf_alloc_impl(hdr, private,
- compressed_read, B_TRUE, &buf);
- if (rc != 0) {
- arc_buf_destroy(buf, private);
- buf = NULL;
- }
- ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
- rc == 0 || rc != ENOENT);
- } else if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
- arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
- }
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
- if (*arc_flags & ARC_FLAG_L2CACHE)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
- data, metadata, hits);
-
- if (done)
- done(NULL, zb, bp, buf, private);
- } else {
- uint64_t lsize = BP_GET_LSIZE(bp);
- uint64_t psize = BP_GET_PSIZE(bp);
- arc_callback_t *acb;
- vdev_t *vd = NULL;
- uint64_t addr = 0;
- boolean_t devw = B_FALSE;
- uint64_t size;
-
- if (hdr == NULL) {
- /* this block is not in the cache */
- arc_buf_hdr_t *exists = NULL;
- arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
- hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
- BP_GET_COMPRESS(bp), type);
-
- if (!BP_IS_EMBEDDED(bp)) {
- hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
- exists = buf_hash_insert(hdr, &hash_lock);
- }
- if (exists != NULL) {
- /* somebody beat us to the hash insert */
- mutex_exit(hash_lock);
- buf_discard_identity(hdr);
- arc_hdr_destroy(hdr);
- goto top; /* restart the IO request */
- }
- } else {
- /*
- * This block is in the ghost cache. If it was L2-only
- * (and thus didn't have an L1 hdr), we realloc the
- * header to add an L1 hdr.
- */
- if (!HDR_HAS_L1HDR(hdr)) {
- hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
- hdr_full_cache);
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
-
- /*
- * This is a delicate dance that we play here.
- * This hdr is in the ghost list so we access it
- * to move it out of the ghost list before we
- * initiate the read. If it's a prefetch then
- * it won't have a callback so we'll remove the
- * reference that arc_buf_alloc_impl() created. We
- * do this after we've called arc_access() to
- * avoid hitting an assert in remove_reference().
- */
- arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
- arc_access(hdr, hash_lock);
- arc_hdr_alloc_pabd(hdr, B_FALSE);
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- size = arc_hdr_size(hdr);
-
- /*
- * If compression is enabled on the hdr, then will do
- * RAW I/O and will store the compressed data in the hdr's
- * data block. Otherwise, the hdr's data block will contain
- * the uncompressed data.
- */
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
- zio_flags |= ZIO_FLAG_RAW;
- }
-
- if (*arc_flags & ARC_FLAG_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
- if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
-
- if (*arc_flags & ARC_FLAG_L2CACHE)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
- if (BP_GET_LEVEL(bp) > 0)
- arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
- if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
- ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
-
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_compressed = compressed_read;
-
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- hdr->b_l1hdr.b_acb = acb;
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-
- if (HDR_HAS_L2HDR(hdr) &&
- (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
- devw = hdr->b_l2hdr.b_dev->l2ad_writing;
- addr = hdr->b_l2hdr.b_daddr;
- /*
- * Lock out L2ARC device removal.
- */
- if (vdev_is_dead(vd) ||
- !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
- vd = NULL;
- }
-
- /*
- * We count both async reads and scrub IOs as asynchronous so
- * that both can be upgraded in the event of a cache hit while
- * the read IO is still in-flight.
- */
- if (priority == ZIO_PRIORITY_ASYNC_READ ||
- priority == ZIO_PRIORITY_SCRUB)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
- else
- arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
-
- /*
- * At this point, we have a level 1 cache miss. Try again in
- * L2ARC if possible.
- */
- ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
-
- DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
- uint64_t, lsize, zbookmark_phys_t *, zb);
- ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
- data, metadata, misses);
-#ifdef _KERNEL
-#ifdef RACCT
- if (racct_enable) {
- PROC_LOCK(curproc);
- racct_add_force(curproc, RACCT_READBPS, size);
- racct_add_force(curproc, RACCT_READIOPS, 1);
- PROC_UNLOCK(curproc);
- }
-#endif /* RACCT */
- curthread->td_ru.ru_inblock++;
-#endif
-
- if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
- /*
- * Read from the L2ARC if the following are true:
- * 1. The L2ARC vdev was previously cached.
- * 2. This buffer still has L2ARC metadata.
- * 3. This buffer isn't currently writing to the L2ARC.
- * 4. The L2ARC entry wasn't evicted, which may
- * also have invalidated the vdev.
- * 5. This isn't prefetch and l2arc_noprefetch is set.
- */
- if (HDR_HAS_L2HDR(hdr) &&
- !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
- !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
- l2arc_read_callback_t *cb;
- abd_t *abd;
- uint64_t asize;
-
- DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_l2_hits);
- atomic_inc_32(&hdr->b_l2hdr.b_hits);
-
- cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
- KM_SLEEP);
- cb->l2rcb_hdr = hdr;
- cb->l2rcb_bp = *bp;
- cb->l2rcb_zb = *zb;
- cb->l2rcb_flags = zio_flags;
-
- asize = vdev_psize_to_asize(vd, size);
- if (asize != size) {
- abd = abd_alloc_for_io(asize,
- HDR_ISTYPE_METADATA(hdr));
- cb->l2rcb_abd = abd;
- } else {
- abd = hdr->b_l1hdr.b_pabd;
- }
-
- ASSERT(addr >= VDEV_LABEL_START_SIZE &&
- addr + asize <= vd->vdev_psize -
- VDEV_LABEL_END_SIZE);
-
- /*
- * l2arc read. The SCL_L2ARC lock will be
- * released by l2arc_read_done().
- * Issue a null zio if the underlying buffer
- * was squashed to zero size by compression.
- */
- ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
- ZIO_COMPRESS_EMPTY);
- rzio = zio_read_phys(pio, vd, addr,
- asize, abd,
- ZIO_CHECKSUM_OFF,
- l2arc_read_done, cb, priority,
- zio_flags | ZIO_FLAG_DONT_CACHE |
- ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY, B_FALSE);
- acb->acb_zio_head = rzio;
-
- if (hash_lock != NULL)
- mutex_exit(hash_lock);
-
- DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
- zio_t *, rzio);
- ARCSTAT_INCR(arcstat_l2_read_bytes, size);
-
- if (*arc_flags & ARC_FLAG_NOWAIT) {
- zio_nowait(rzio);
- return (0);
- }
-
- ASSERT(*arc_flags & ARC_FLAG_WAIT);
- if (zio_wait(rzio) == 0)
- return (0);
-
- /* l2arc read error; goto zio_read() */
- if (hash_lock != NULL)
- mutex_enter(hash_lock);
- } else {
- DTRACE_PROBE1(l2arc__miss,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_l2_misses);
- if (HDR_L2_WRITING(hdr))
- ARCSTAT_BUMP(arcstat_l2_rw_clash);
- spa_config_exit(spa, SCL_L2ARC, vd);
- }
- } else {
- if (vd != NULL)
- spa_config_exit(spa, SCL_L2ARC, vd);
- if (l2arc_ndev != 0) {
- DTRACE_PROBE1(l2arc__miss,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_l2_misses);
- }
- }
-
- rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
- arc_read_done, hdr, priority, zio_flags, zb);
- acb->acb_zio_head = rzio;
-
- if (hash_lock != NULL)
- mutex_exit(hash_lock);
-
- if (*arc_flags & ARC_FLAG_WAIT)
- return (zio_wait(rzio));
-
- ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
- zio_nowait(rzio);
- }
- return (0);
-}
-
-arc_prune_t *
-arc_add_prune_callback(arc_prune_func_t *func, void *private)
-{
- arc_prune_t *p;
-
- p = kmem_alloc(sizeof (*p), KM_SLEEP);
- p->p_pfunc = func;
- p->p_private = private;
- list_link_init(&p->p_node);
- zfs_refcount_create(&p->p_refcnt);
-
- mutex_enter(&arc_prune_mtx);
- zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
- list_insert_head(&arc_prune_list, p);
- mutex_exit(&arc_prune_mtx);
-
- return (p);
-}
-
-void
-arc_remove_prune_callback(arc_prune_t *p)
-{
- boolean_t wait = B_FALSE;
- mutex_enter(&arc_prune_mtx);
- list_remove(&arc_prune_list, p);
- if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
- wait = B_TRUE;
- mutex_exit(&arc_prune_mtx);
-
- /* wait for arc_prune_task to finish */
- if (wait)
- taskq_wait(arc_prune_taskq);
- ASSERT0(zfs_refcount_count(&p->p_refcnt));
- zfs_refcount_destroy(&p->p_refcnt);
- kmem_free(p, sizeof (*p));
-}
-
-/*
- * Notify the arc that a block was freed, and thus will never be used again.
- */
-void
-arc_freed(spa_t *spa, const blkptr_t *bp)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- uint64_t guid = spa_load_guid(spa);
-
- ASSERT(!BP_IS_EMBEDDED(bp));
-
- hdr = buf_hash_find(guid, bp, &hash_lock);
- if (hdr == NULL)
- return;
-
- /*
- * We might be trying to free a block that is still doing I/O
- * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
- * dmu_sync-ed block). If this block is being prefetched, then it
- * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
- * until the I/O completes. A block may also have a reference if it is
- * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
- * have written the new block to its final resting place on disk but
- * without the dedup flag set. This would have left the hdr in the MRU
- * state and discoverable. When the txg finally syncs it detects that
- * the block was overridden in open context and issues an override I/O.
- * Since this is a dedup block, the override I/O will determine if the
- * block is already in the DDT. If so, then it will replace the io_bp
- * with the bp from the DDT and allow the I/O to finish. When the I/O
- * reaches the done callback, dbuf_write_override_done, it will
- * check to see if the io_bp and io_bp_override are identical.
- * If they are not, then it indicates that the bp was replaced with
- * the bp in the DDT and the override bp is freed. This allows
- * us to arrive here with a reference on a block that is being
- * freed. So if we have an I/O in progress, or a reference to
- * this hdr, then we don't destroy the hdr.
- */
- if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
- zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
- arc_change_state(arc_anon, hdr, hash_lock);
- arc_hdr_destroy(hdr);
- mutex_exit(hash_lock);
- } else {
- mutex_exit(hash_lock);
- }
-
-}
-
-/*
- * Release this buffer from the cache, making it an anonymous buffer. This
- * must be done after a read and prior to modifying the buffer contents.
- * If the buffer has more than one reference, we must make
- * a new hdr for the buffer.
- */
-void
-arc_release(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- /*
- * It would be nice to assert that if it's DMU metadata (level >
- * 0 || it's the dnode file), then it must be syncing context.
- * But we don't know that information at this level.
- */
-
- mutex_enter(&buf->b_evict_lock);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- /*
- * We don't grab the hash lock prior to this check, because if
- * the buffer's header is in the arc_anon state, it won't be
- * linked into the hash table.
- */
- if (hdr->b_l1hdr.b_state == arc_anon) {
- mutex_exit(&buf->b_evict_lock);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
- ASSERT(!HDR_HAS_L2HDR(hdr));
- ASSERT(HDR_EMPTY(hdr));
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
-
- hdr->b_l1hdr.b_arc_access = 0;
-
- /*
- * If the buf is being overridden then it may already
- * have a hdr that is not empty.
- */
- buf_discard_identity(hdr);
- arc_buf_thaw(buf);
-
- return;
- }
-
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
- /*
- * This assignment is only valid as long as the hash_lock is
- * held, we must be careful not to reference state or the
- * b_state field after dropping the lock.
- */
- arc_state_t *state = hdr->b_l1hdr.b_state;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3P(state, !=, arc_anon);
-
- /* this buffer is not on any list */
- ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
-
- if (HDR_HAS_L2HDR(hdr)) {
- mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
-
- /*
- * We have to recheck this conditional again now that
- * we're holding the l2ad_mtx to prevent a race with
- * another thread which might be concurrently calling
- * l2arc_evict(). In that case, l2arc_evict() might have
- * destroyed the header's L2 portion as we were waiting
- * to acquire the l2ad_mtx.
- */
- if (HDR_HAS_L2HDR(hdr)) {
- l2arc_trim(hdr);
- arc_hdr_l2hdr_destroy(hdr);
- }
-
- mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
- }
-
- /*
- * Do we have more than one buf?
- */
- if (hdr->b_l1hdr.b_bufcnt > 1) {
- arc_buf_hdr_t *nhdr;
- uint64_t spa = hdr->b_spa;
- uint64_t psize = HDR_GET_PSIZE(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
- enum zio_compress compress = HDR_GET_COMPRESS(hdr);
- arc_buf_contents_t type = arc_buf_type(hdr);
- VERIFY3U(hdr->b_type, ==, type);
-
- ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
- (void) remove_reference(hdr, hash_lock, tag);
-
- if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
- ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
- ASSERT(ARC_BUF_LAST(buf));
- }
-
- /*
- * Pull the data off of this hdr and attach it to
- * a new anonymous hdr. Also find the last buffer
- * in the hdr's buffer list.
- */
- arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
- ASSERT3P(lastbuf, !=, NULL);
-
- /*
- * If the current arc_buf_t and the hdr are sharing their data
- * buffer, then we must stop sharing that block.
- */
- if (arc_buf_is_shared(buf)) {
- VERIFY(!arc_buf_is_shared(lastbuf));
-
- /*
- * First, sever the block sharing relationship between
- * buf and the arc_buf_hdr_t.
- */
- arc_unshare_buf(hdr, buf);
-
- /*
- * Now we need to recreate the hdr's b_pabd. Since we
- * have lastbuf handy, we try to share with it, but if
- * we can't then we allocate a new b_pabd and copy the
- * data from buf into it.
- */
- if (arc_can_share(hdr, lastbuf)) {
- arc_share_buf(hdr, lastbuf);
- } else {
- arc_hdr_alloc_pabd(hdr, B_TRUE);
- abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
- buf->b_data, psize);
- }
- VERIFY3P(lastbuf->b_data, !=, NULL);
- } else if (HDR_SHARED_DATA(hdr)) {
- /*
- * Uncompressed shared buffers are always at the end
- * of the list. Compressed buffers don't have the
- * same requirements. This makes it hard to
- * simply assert that the lastbuf is shared so
- * we rely on the hdr's compression flags to determine
- * if we have a compressed, shared buffer.
- */
- ASSERT(arc_buf_is_shared(lastbuf) ||
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
- ASSERT(!ARC_BUF_SHARED(buf));
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT3P(state, !=, arc_l2c_only);
-
- (void) zfs_refcount_remove_many(&state->arcs_size,
- arc_buf_size(buf), buf);
-
- if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
- ASSERT3P(state, !=, arc_l2c_only);
- (void) zfs_refcount_remove_many(
- &state->arcs_esize[type],
- arc_buf_size(buf), buf);
- }
-
- hdr->b_l1hdr.b_bufcnt -= 1;
- arc_cksum_verify(buf);
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
-
- mutex_exit(hash_lock);
-
- /*
- * Allocate a new hdr. The new hdr will contain a b_pabd
- * buffer which will be freed in arc_write().
- */
- nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
- ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(nhdr->b_l1hdr.b_bufcnt);
- ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
- VERIFY3U(nhdr->b_type, ==, type);
- ASSERT(!HDR_SHARED_DATA(nhdr));
-
- nhdr->b_l1hdr.b_buf = buf;
- nhdr->b_l1hdr.b_bufcnt = 1;
- (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
- buf->b_hdr = nhdr;
-
- mutex_exit(&buf->b_evict_lock);
- (void) zfs_refcount_add_many(&arc_anon->arcs_size,
- arc_buf_size(buf), buf);
- } else {
- mutex_exit(&buf->b_evict_lock);
- ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
- /* protected by hash lock, or hdr is on arc_anon */
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
- hdr->b_l1hdr.b_arc_access = 0;
- mutex_exit(hash_lock);
-
- buf_discard_identity(hdr);
- arc_buf_thaw(buf);
- }
-}
-
-int
-arc_released(arc_buf_t *buf)
-{
- int released;
-
- mutex_enter(&buf->b_evict_lock);
- released = (buf->b_data != NULL &&
- buf->b_hdr->b_l1hdr.b_state == arc_anon);
- mutex_exit(&buf->b_evict_lock);
- return (released);
-}
-
-#ifdef ZFS_DEBUG
-int
-arc_referenced(arc_buf_t *buf)
-{
- int referenced;
-
- mutex_enter(&buf->b_evict_lock);
- referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
- mutex_exit(&buf->b_evict_lock);
- return (referenced);
-}
-#endif
-
-static void
-arc_write_ready(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
- arc_buf_hdr_t *hdr = buf->b_hdr;
- uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
-
- /*
- * If we're reexecuting this zio because the pool suspended, then
- * cleanup any state that was previously set the first time the
- * callback was invoked.
- */
- if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
- arc_cksum_free(hdr);
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
- if (hdr->b_l1hdr.b_pabd != NULL) {
- if (arc_buf_is_shared(buf)) {
- arc_unshare_buf(hdr, buf);
- } else {
- arc_hdr_free_pabd(hdr);
- }
- }
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT(!arc_buf_is_shared(buf));
-
- callback->awcb_ready(zio, buf, callback->awcb_private);
-
- if (HDR_IO_IN_PROGRESS(hdr))
- ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
-
- arc_cksum_compute(buf);
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-
- enum zio_compress compress;
- if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
- compress = ZIO_COMPRESS_OFF;
- } else {
- ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
- compress = BP_GET_COMPRESS(zio->io_bp);
- }
- HDR_SET_PSIZE(hdr, psize);
- arc_hdr_set_compress(hdr, compress);
-
-
- /*
- * Fill the hdr with data. If the hdr is compressed, the data we want
- * is available from the zio, otherwise we can take it from the buf.
- *
- * We might be able to share the buf's data with the hdr here. However,
- * doing so would cause the ARC to be full of linear ABDs if we write a
- * lot of shareable data. As a compromise, we check whether scattered
- * ABDs are allowed, and assume that if they are then the user wants
- * the ARC to be primarily filled with them regardless of the data being
- * written. Therefore, if they're allowed then we allocate one and copy
- * the data into it; otherwise, we share the data directly if we can.
- */
- if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
- arc_hdr_alloc_pabd(hdr, B_TRUE);
-
- /*
- * Ideally, we would always copy the io_abd into b_pabd, but the
- * user may have disabled compressed ARC, thus we must check the
- * hdr's compression setting rather than the io_bp's.
- */
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
- ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
- ZIO_COMPRESS_OFF);
- ASSERT3U(psize, >, 0);
-
- abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
- } else {
- ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
-
- abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
- arc_buf_size(buf));
- }
- } else {
- ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
- ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
-
- arc_share_buf(hdr, buf);
- }
-
- arc_hdr_verify(hdr, zio->io_bp);
-}
-
-static void
-arc_write_children_ready(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
-
- callback->awcb_children_ready(zio, buf, callback->awcb_private);
-}
-
-/*
- * The SPA calls this callback for each physical write that happens on behalf
- * of a logical write. See the comment in dbuf_write_physdone() for details.
- */
-static void
-arc_write_physdone(zio_t *zio)
-{
- arc_write_callback_t *cb = zio->io_private;
- if (cb->awcb_physdone != NULL)
- cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
-}
-
-static void
-arc_write_done(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
-
- if (zio->io_error == 0) {
- arc_hdr_verify(hdr, zio->io_bp);
-
- if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
- buf_discard_identity(hdr);
- } else {
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
- }
- } else {
- ASSERT(HDR_EMPTY(hdr));
- }
-
- /*
- * If the block to be written was all-zero or compressed enough to be
- * embedded in the BP, no write was performed so there will be no
- * dva/birth/checksum. The buffer must therefore remain anonymous
- * (and uncached).
- */
- if (!HDR_EMPTY(hdr)) {
- arc_buf_hdr_t *exists;
- kmutex_t *hash_lock;
-
- ASSERT3U(zio->io_error, ==, 0);
-
- arc_cksum_verify(buf);
-
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists != NULL) {
- /*
- * This can only happen if we overwrite for
- * sync-to-convergence, because we remove
- * buffers from the hash table when we arc_free().
- */
- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
- panic("bad overwrite, hdr=%p exists=%p",
- (void *)hdr, (void *)exists);
- ASSERT(zfs_refcount_is_zero(
- &exists->b_l1hdr.b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
- } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
- /* nopwrite */
- ASSERT(zio->io_prop.zp_nopwrite);
- if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
- panic("bad nopwrite, hdr=%p exists=%p",
- (void *)hdr, (void *)exists);
- } else {
- /* Dedup */
- ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
- ASSERT(hdr->b_l1hdr.b_state == arc_anon);
- ASSERT(BP_GET_DEDUP(zio->io_bp));
- ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
- }
- }
- arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- /* if it's not anon, we are doing a scrub */
- if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- } else {
- arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- }
-
- ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
-
- abd_put(zio->io_abd);
- kmem_free(callback, sizeof (arc_write_callback_t));
-}
-
-zio_t *
-arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
- arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
- arc_write_done_func_t *done, void *private, zio_priority_t priority,
- int zio_flags, const zbookmark_phys_t *zb)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- arc_write_callback_t *callback;
- zio_t *zio;
- zio_prop_t localprop = *zp;
-
- ASSERT3P(ready, !=, NULL);
- ASSERT3P(done, !=, NULL);
- ASSERT(!HDR_IO_ERROR(hdr));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
- if (l2arc)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
- if (ARC_BUF_COMPRESSED(buf)) {
- /*
- * We're writing a pre-compressed buffer. Make the
- * compression algorithm requested by the zio_prop_t match
- * the pre-compressed buffer's compression algorithm.
- */
- localprop.zp_compress = HDR_GET_COMPRESS(hdr);
-
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
- zio_flags |= ZIO_FLAG_RAW;
- }
- callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
- callback->awcb_ready = ready;
- callback->awcb_children_ready = children_ready;
- callback->awcb_physdone = physdone;
- callback->awcb_done = done;
- callback->awcb_private = private;
- callback->awcb_buf = buf;
-
- /*
- * The hdr's b_pabd is now stale, free it now. A new data block
- * will be allocated when the zio pipeline calls arc_write_ready().
- */
- if (hdr->b_l1hdr.b_pabd != NULL) {
- /*
- * If the buf is currently sharing the data block with
- * the hdr then we need to break that relationship here.
- * The hdr will remain with a NULL data pointer and the
- * buf will take sole ownership of the block.
- */
- if (arc_buf_is_shared(buf)) {
- arc_unshare_buf(hdr, buf);
- } else {
- arc_hdr_free_pabd(hdr);
- }
- VERIFY3P(buf->b_data, !=, NULL);
- arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
- }
- ASSERT(!arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
-
- zio = zio_write(pio, spa, txg, bp,
- abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
- HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
- (children_ready != NULL) ? arc_write_children_ready : NULL,
- arc_write_physdone, arc_write_done, callback,
- priority, zio_flags, zb);
-
- return (zio);
-}
-
-static int
-arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
-{
-#ifdef _KERNEL
- uint64_t available_memory = ptob(freemem);
-
-#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
- available_memory = MIN(available_memory, uma_avail());
-#endif
-
- if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
- return (0);
-
- if (txg > spa->spa_lowmem_last_txg) {
- spa->spa_lowmem_last_txg = txg;
- spa->spa_lowmem_page_load = 0;
- }
- /*
- * If we are in pageout, we know that memory is already tight,
- * the arc is already going to be evicting, so we just want to
- * continue to let page writes occur as quickly as possible.
- */
- if (curproc == pageproc) {
- if (spa->spa_lowmem_page_load >
- MAX(ptob(minfree), available_memory) / 4)
- return (SET_ERROR(ERESTART));
- /* Note: reserve is inflated, so we deflate */
- atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
- return (0);
- } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
- /* memory is low, delay before restarting */
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (SET_ERROR(EAGAIN));
- }
- spa->spa_lowmem_page_load = 0;
-#endif /* _KERNEL */
- return (0);
-}
-
-void
-arc_tempreserve_clear(uint64_t reserve)
-{
- atomic_add_64(&arc_tempreserve, -reserve);
- ASSERT((int64_t)arc_tempreserve >= 0);
-}
-
-int
-arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
-{
- int error;
- uint64_t anon_size;
-
- if (reserve > arc_c/4 && !arc_no_grow) {
- arc_c = MIN(arc_c_max, reserve * 4);
- DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
- }
- if (reserve > arc_c)
- return (SET_ERROR(ENOMEM));
-
- /*
- * Don't count loaned bufs as in flight dirty data to prevent long
- * network delays from blocking transactions that are ready to be
- * assigned to a txg.
- */
-
- /* assert that it has not wrapped around */
- ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
-
- anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
- arc_loaned_bytes), 0);
-
- /*
- * Writes will, almost always, require additional memory allocations
- * in order to compress/encrypt/etc the data. We therefore need to
- * make sure that there is sufficient available memory for this.
- */
- error = arc_memory_throttle(spa, reserve, txg);
- if (error != 0)
- return (error);
-
- /*
- * Throttle writes when the amount of dirty data in the cache
- * gets too large. We try to keep the cache less than half full
- * of dirty blocks so that our sync times don't grow too large.
- *
- * In the case of one pool being built on another pool, we want
- * to make sure we don't end up throttling the lower (backing)
- * pool when the upper pool is the majority contributor to dirty
- * data. To insure we make forward progress during throttling, we
- * also check the current pool's net dirty data and only throttle
- * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
- * data in the cache.
- *
- * Note: if two requests come in concurrently, we might let them
- * both succeed, when one of them should fail. Not a huge deal.
- */
- uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
- uint64_t spa_dirty_anon = spa_dirty_data(spa);
-
- if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
- anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
- spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
- uint64_t meta_esize =
- zfs_refcount_count(
- &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
- uint64_t data_esize =
- zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
- dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
- "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve >> 10, meta_esize >> 10,
- data_esize >> 10, reserve >> 10, arc_c >> 10);
- return (SET_ERROR(ERESTART));
- }
- atomic_add_64(&arc_tempreserve, reserve);
- return (0);
-}
-
-static void
-arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
- kstat_named_t *evict_data, kstat_named_t *evict_metadata)
-{
- size->value.ui64 = zfs_refcount_count(&state->arcs_size);
- evict_data->value.ui64 =
- zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
- evict_metadata->value.ui64 =
- zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
-}
-
-static int
-arc_kstat_update(kstat_t *ksp, int rw)
-{
- arc_stats_t *as = ksp->ks_data;
-
- if (rw == KSTAT_WRITE) {
- return (EACCES);
- } else {
- arc_kstat_update_state(arc_anon,
- &as->arcstat_anon_size,
- &as->arcstat_anon_evictable_data,
- &as->arcstat_anon_evictable_metadata);
- arc_kstat_update_state(arc_mru,
- &as->arcstat_mru_size,
- &as->arcstat_mru_evictable_data,
- &as->arcstat_mru_evictable_metadata);
- arc_kstat_update_state(arc_mru_ghost,
- &as->arcstat_mru_ghost_size,
- &as->arcstat_mru_ghost_evictable_data,
- &as->arcstat_mru_ghost_evictable_metadata);
- arc_kstat_update_state(arc_mfu,
- &as->arcstat_mfu_size,
- &as->arcstat_mfu_evictable_data,
- &as->arcstat_mfu_evictable_metadata);
- arc_kstat_update_state(arc_mfu_ghost,
- &as->arcstat_mfu_ghost_size,
- &as->arcstat_mfu_ghost_evictable_data,
- &as->arcstat_mfu_ghost_evictable_metadata);
-
- ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
- ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
- ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
- ARCSTAT(arcstat_metadata_size) =
- aggsum_value(&astat_metadata_size);
- ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
- ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
- ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
- ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
-#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
- ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) +
- aggsum_value(&astat_dnode_size) +
- aggsum_value(&astat_dbuf_size);
-#endif
- ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
- }
-
- return (0);
-}
-
-/*
- * This function *must* return indices evenly distributed between all
- * sublists of the multilist. This is needed due to how the ARC eviction
- * code is laid out; arc_evict_state() assumes ARC buffers are evenly
- * distributed between all sublists and uses this assumption when
- * deciding which sublist to evict from and how much to evict from it.
- */
-unsigned int
-arc_state_multilist_index_func(multilist_t *ml, void *obj)
-{
- arc_buf_hdr_t *hdr = obj;
-
- /*
- * We rely on b_dva to generate evenly distributed index
- * numbers using buf_hash below. So, as an added precaution,
- * let's make sure we never add empty buffers to the arc lists.
- */
- ASSERT(!HDR_EMPTY(hdr));
-
- /*
- * The assumption here, is the hash value for a given
- * arc_buf_hdr_t will remain constant throughout it's lifetime
- * (i.e. it's b_spa, b_dva, and b_birth fields don't change).
- * Thus, we don't need to store the header's sublist index
- * on insertion, as this index can be recalculated on removal.
- *
- * Also, the low order bits of the hash value are thought to be
- * distributed evenly. Otherwise, in the case that the multilist
- * has a power of two number of sublists, each sublists' usage
- * would not be evenly distributed.
- */
- return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
- multilist_get_num_sublists(ml));
-}
-
-#ifdef _KERNEL
-static eventhandler_tag arc_event_lowmem = NULL;
-
-static void
-arc_lowmem(void *arg __unused, int howto __unused)
-{
- int64_t free_memory, to_free;
-
- arc_no_grow = B_TRUE;
- arc_warm = B_TRUE;
- arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
- free_memory = arc_available_memory();
- to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
- DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
- arc_reduce_target_size(to_free);
-
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- zthr_wakeup(arc_adjust_zthr);
-
- /*
- * It is unsafe to block here in arbitrary threads, because we can come
- * here from ARC itself and may hold ARC locks and thus risk a deadlock
- * with ARC reclaim thread.
- */
- if (curproc == pageproc)
- (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock);
- mutex_exit(&arc_adjust_lock);
-}
-#endif
-
-static void
-arc_state_init(void)
-{
- arc_anon = &ARC_anon;
- arc_mru = &ARC_mru;
- arc_mru_ghost = &ARC_mru_ghost;
- arc_mfu = &ARC_mfu;
- arc_mfu_ghost = &ARC_mfu_ghost;
- arc_l2c_only = &ARC_l2c_only;
-
- arc_mru->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mru->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
-
- zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
-
- zfs_refcount_create(&arc_anon->arcs_size);
- zfs_refcount_create(&arc_mru->arcs_size);
- zfs_refcount_create(&arc_mru_ghost->arcs_size);
- zfs_refcount_create(&arc_mfu->arcs_size);
- zfs_refcount_create(&arc_mfu_ghost->arcs_size);
- zfs_refcount_create(&arc_l2c_only->arcs_size);
-
- aggsum_init(&arc_meta_used, 0);
- aggsum_init(&arc_size, 0);
- aggsum_init(&astat_data_size, 0);
- aggsum_init(&astat_metadata_size, 0);
- aggsum_init(&astat_hdr_size, 0);
- aggsum_init(&astat_bonus_size, 0);
- aggsum_init(&astat_dnode_size, 0);
- aggsum_init(&astat_dbuf_size, 0);
- aggsum_init(&astat_l2_hdr_size, 0);
-}
-
-static void
-arc_state_fini(void)
-{
- zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
-
- zfs_refcount_destroy(&arc_anon->arcs_size);
- zfs_refcount_destroy(&arc_mru->arcs_size);
- zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
- zfs_refcount_destroy(&arc_mfu->arcs_size);
- zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
- zfs_refcount_destroy(&arc_l2c_only->arcs_size);
-
- multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
- multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
- multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
- multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
- aggsum_fini(&arc_meta_used);
- aggsum_fini(&arc_size);
- aggsum_fini(&astat_data_size);
- aggsum_fini(&astat_metadata_size);
- aggsum_fini(&astat_hdr_size);
- aggsum_fini(&astat_bonus_size);
- aggsum_fini(&astat_dnode_size);
- aggsum_fini(&astat_dbuf_size);
- aggsum_fini(&astat_l2_hdr_size);
-}
-
-uint64_t
-arc_max_bytes(void)
-{
- return (arc_c_max);
-}
-
-void
-arc_init(void)
-{
- int i, prefetch_tunable_set = 0;
-
- /*
- * allmem is "all memory that we could possibly use".
- */
-#ifdef illumos
-#ifdef _KERNEL
- uint64_t allmem = ptob(physmem - swapfs_minfree);
-#else
- uint64_t allmem = (physmem * PAGESIZE) / 2;
-#endif
-#else
- uint64_t allmem = kmem_size();
-#endif
- mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
-
- mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
-
- /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
- arc_c_min = MAX(allmem / 32, arc_abs_min);
- /* set max to 5/8 of all memory, or all but 1GB, whichever is more */
- if (allmem >= 1 << 30)
- arc_c_max = allmem - (1 << 30);
- else
- arc_c_max = arc_c_min;
- arc_c_max = MAX(allmem * 5 / 8, arc_c_max);
-
- /*
- * In userland, there's only the memory pressure that we artificially
- * create (see arc_available_memory()). Don't let arc_c get too
- * small, because it can cause transactions to be larger than
- * arc_c, causing arc_tempreserve_space() to fail.
- */
-#ifndef _KERNEL
- arc_c_min = arc_c_max / 2;
-#endif
-
-#ifdef _KERNEL
- /*
- * Allow the tunables to override our calculations if they are
- * reasonable.
- */
- if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) {
- arc_c_max = zfs_arc_max;
- arc_c_min = MIN(arc_c_min, arc_c_max);
- }
- if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
- arc_c_min = zfs_arc_min;
-#endif
-
- arc_c = arc_c_max;
- arc_p = (arc_c >> 1);
-
- /* limit meta-data to 1/4 of the arc capacity */
- arc_meta_limit = arc_c_max / 4;
-
-#ifdef _KERNEL
- /*
- * Metadata is stored in the kernel's heap. Don't let us
- * use more than half the heap for the ARC.
- */
-#ifdef __FreeBSD__
- arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2);
- arc_dnode_limit = arc_meta_limit / 10;
-#else
- arc_meta_limit = MIN(arc_meta_limit,
- vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
-#endif
-#endif
-
- /* Allow the tunable to override if it is reasonable */
- if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
- arc_meta_limit = zfs_arc_meta_limit;
-
- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
- arc_c_min = arc_meta_limit / 2;
-
- if (zfs_arc_meta_min > 0) {
- arc_meta_min = zfs_arc_meta_min;
- } else {
- arc_meta_min = arc_c_min / 2;
- }
-
- /* Valid range: <arc_meta_min> - <arc_c_max> */
- if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
- (zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
- (zfs_arc_dnode_limit <= arc_c_max))
- arc_dnode_limit = zfs_arc_dnode_limit;
-
- if (zfs_arc_grow_retry > 0)
- arc_grow_retry = zfs_arc_grow_retry;
-
- if (zfs_arc_shrink_shift > 0)
- arc_shrink_shift = zfs_arc_shrink_shift;
-
- if (zfs_arc_no_grow_shift > 0)
- arc_no_grow_shift = zfs_arc_no_grow_shift;
- /*
- * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
- */
- if (arc_no_grow_shift >= arc_shrink_shift)
- arc_no_grow_shift = arc_shrink_shift - 1;
-
- if (zfs_arc_p_min_shift > 0)
- arc_p_min_shift = zfs_arc_p_min_shift;
-
- /* if kmem_flags are set, lets try to use less memory */
- if (kmem_debugging())
- arc_c = arc_c / 2;
- if (arc_c < arc_c_min)
- arc_c = arc_c_min;
-
- zfs_arc_min = arc_c_min;
- zfs_arc_max = arc_c_max;
-
- arc_state_init();
-
- /*
- * The arc must be "uninitialized", so that hdr_recl() (which is
- * registered by buf_init()) will not access arc_reap_zthr before
- * it is created.
- */
- ASSERT(!arc_initialized);
- buf_init();
-
- list_create(&arc_prune_list, sizeof (arc_prune_t),
- offsetof(arc_prune_t, p_node));
- mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
- max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
-
- arc_dnlc_evicts_thread_exit = FALSE;
-
- arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
- sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
-
- if (arc_ksp != NULL) {
- arc_ksp->ks_data = &arc_stats;
- arc_ksp->ks_update = arc_kstat_update;
- kstat_install(arc_ksp);
- }
-
- arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check,
- arc_adjust_cb, NULL, SEC2NSEC(1));
- arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
- arc_reap_cb, NULL, SEC2NSEC(1));
-
-#ifdef _KERNEL
- arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
- EVENTHANDLER_PRI_FIRST);
-#endif
-
- (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
- TS_RUN, minclsyspri);
-
- arc_initialized = B_TRUE;
- arc_warm = B_FALSE;
-
- /*
- * Calculate maximum amount of dirty data per pool.
- *
- * If it has been set by /etc/system, take that.
- * Otherwise, use a percentage of physical memory defined by
- * zfs_dirty_data_max_percent (default 10%) with a cap at
- * zfs_dirty_data_max_max (default 4GB).
- */
- if (zfs_dirty_data_max == 0) {
- zfs_dirty_data_max = ptob(physmem) *
- zfs_dirty_data_max_percent / 100;
- zfs_dirty_data_max = MIN(zfs_dirty_data_max,
- zfs_dirty_data_max_max);
- }
-
-#ifdef _KERNEL
- if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
- prefetch_tunable_set = 1;
-
-#ifdef __i386__
- if (prefetch_tunable_set == 0) {
- printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
- "-- to enable,\n");
- printf(" add \"vfs.zfs.prefetch_disable=0\" "
- "to /boot/loader.conf.\n");
- zfs_prefetch_disable = 1;
- }
-#else
- if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
- prefetch_tunable_set == 0) {
- printf("ZFS NOTICE: Prefetch is disabled by default if less "
- "than 4GB of RAM is present;\n"
- " to enable, add \"vfs.zfs.prefetch_disable=0\" "
- "to /boot/loader.conf.\n");
- zfs_prefetch_disable = 1;
- }
-#endif
- /* Warn about ZFS memory and address space requirements. */
- if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
- "expect unstable behavior.\n");
- }
- if (allmem < 512 * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
- "expect unstable behavior.\n");
- printf(" Consider tuning vm.kmem_size and "
- "vm.kmem_size_max\n");
- printf(" in /boot/loader.conf.\n");
- }
-#endif
-}
-
-void
-arc_fini(void)
-{
- arc_prune_t *p;
-
-#ifdef _KERNEL
- if (arc_event_lowmem != NULL)
- EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
-#endif
-
- /* Use B_TRUE to ensure *all* buffers are evicted */
- arc_flush(NULL, B_TRUE);
-
- mutex_enter(&arc_dnlc_evicts_lock);
- arc_dnlc_evicts_thread_exit = TRUE;
- /*
- * The user evicts thread will set arc_user_evicts_thread_exit
- * to FALSE when it is finished exiting; we're waiting for that.
- */
- while (arc_dnlc_evicts_thread_exit) {
- cv_signal(&arc_dnlc_evicts_cv);
- cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
- }
- mutex_exit(&arc_dnlc_evicts_lock);
-
- arc_initialized = B_FALSE;
-
- if (arc_ksp != NULL) {
- kstat_delete(arc_ksp);
- arc_ksp = NULL;
- }
-
- taskq_wait(arc_prune_taskq);
- taskq_destroy(arc_prune_taskq);
-
- mutex_enter(&arc_prune_mtx);
- while ((p = list_head(&arc_prune_list)) != NULL) {
- list_remove(&arc_prune_list, p);
- zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
- zfs_refcount_destroy(&p->p_refcnt);
- kmem_free(p, sizeof (*p));
- }
- mutex_exit(&arc_prune_mtx);
-
- list_destroy(&arc_prune_list);
- mutex_destroy(&arc_prune_mtx);
-
- (void) zthr_cancel(arc_adjust_zthr);
- zthr_destroy(arc_adjust_zthr);
-
- mutex_destroy(&arc_dnlc_evicts_lock);
- cv_destroy(&arc_dnlc_evicts_cv);
-
- (void) zthr_cancel(arc_reap_zthr);
- zthr_destroy(arc_reap_zthr);
-
- mutex_destroy(&arc_adjust_lock);
- cv_destroy(&arc_adjust_waiters_cv);
-
- /*
- * buf_fini() must proceed arc_state_fini() because buf_fin() may
- * trigger the release of kmem magazines, which can callback to
- * arc_space_return() which accesses aggsums freed in act_state_fini().
- */
- buf_fini();
- arc_state_fini();
-
- ASSERT0(arc_loaned_bytes);
-}
-
-/*
- * Level 2 ARC
- *
- * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
- * It uses dedicated storage devices to hold cached data, which are populated
- * using large infrequent writes. The main role of this cache is to boost
- * the performance of random read workloads. The intended L2ARC devices
- * include short-stroked disks, solid state disks, and other media with
- * substantially faster read latency than disk.
- *
- * +-----------------------+
- * | ARC |
- * +-----------------------+
- * | ^ ^
- * | | |
- * l2arc_feed_thread() arc_read()
- * | | |
- * | l2arc read |
- * V | |
- * +---------------+ |
- * | L2ARC | |
- * +---------------+ |
- * | ^ |
- * l2arc_write() | |
- * | | |
- * V | |
- * +-------+ +-------+
- * | vdev | | vdev |
- * | cache | | cache |
- * +-------+ +-------+
- * +=========+ .-----.
- * : L2ARC : |-_____-|
- * : devices : | Disks |
- * +=========+ `-_____-'
- *
- * Read requests are satisfied from the following sources, in order:
- *
- * 1) ARC
- * 2) vdev cache of L2ARC devices
- * 3) L2ARC devices
- * 4) vdev cache of disks
- * 5) disks
- *
- * Some L2ARC device types exhibit extremely slow write performance.
- * To accommodate for this there are some significant differences between
- * the L2ARC and traditional cache design:
- *
- * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
- * the ARC behave as usual, freeing buffers and placing headers on ghost
- * lists. The ARC does not send buffers to the L2ARC during eviction as
- * this would add inflated write latencies for all ARC memory pressure.
- *
- * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
- * It does this by periodically scanning buffers from the eviction-end of
- * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there. It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction. If a compressible buffer is
- * found during scanning and selected for writing to an L2ARC device, we
- * temporarily boost scanning headroom during the next scan cycle to make
- * sure we adapt to compression effects (which might significantly reduce
- * the data volume we write to L2ARC). The thread that does this is
- * l2arc_feed_thread(), illustrated below; example sizes are included to
- * provide a better sense of ratio than this diagram:
- *
- * head --> tail
- * +---------------------+----------+
- * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
- * +---------------------+----------+ | o L2ARC eligible
- * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
- * +---------------------+----------+ |
- * 15.9 Gbytes ^ 32 Mbytes |
- * headroom |
- * l2arc_feed_thread()
- * |
- * l2arc write hand <--[oooo]--'
- * | 8 Mbyte
- * | write max
- * V
- * +==============================+
- * L2ARC dev |####|#|###|###| |####| ... |
- * +==============================+
- * 32 Gbytes
- *
- * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
- * evicted, then the L2ARC has cached a buffer much sooner than it probably
- * needed to, potentially wasting L2ARC device bandwidth and storage. It is
- * safe to say that this is an uncommon case, since buffers at the end of
- * the ARC lists have moved there due to inactivity.
- *
- * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
- * then the L2ARC simply misses copying some buffers. This serves as a
- * pressure valve to prevent heavy read workloads from both stalling the ARC
- * with waits and clogging the L2ARC with writes. This also helps prevent
- * the potential for the L2ARC to churn if it attempts to cache content too
- * quickly, such as during backups of the entire pool.
- *
- * 5. After system boot and before the ARC has filled main memory, there are
- * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
- * lists can remain mostly static. Instead of searching from tail of these
- * lists as pictured, the l2arc_feed_thread() will search from the list heads
- * for eligible buffers, greatly increasing its chance of finding them.
- *
- * The L2ARC device write speed is also boosted during this time so that
- * the L2ARC warms up faster. Since there have been no ARC evictions yet,
- * there are no L2ARC reads, and no fear of degrading read performance
- * through increased writes.
- *
- * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
- * the vdev queue can aggregate them into larger and fewer writes. Each
- * device is written to in a rotor fashion, sweeping writes through
- * available space then repeating.
- *
- * 7. The L2ARC does not store dirty content. It never needs to flush
- * write buffers back to disk based storage.
- *
- * 8. If an ARC buffer is written (and dirtied) which also exists in the
- * L2ARC, the now stale L2ARC buffer is immediately dropped.
- *
- * The performance of the L2ARC can be tweaked by a number of tunables, which
- * may be necessary for different workloads:
- *
- * l2arc_write_max max write bytes per interval
- * l2arc_write_boost extra write bytes during device warmup
- * l2arc_noprefetch skip caching prefetched buffers
- * l2arc_headroom number of max device writes to precache
- * l2arc_headroom_boost when we find compressed buffers during ARC
- * scanning, we multiply headroom by this
- * percentage factor for the next scan cycle,
- * since more compressed buffers are likely to
- * be present
- * l2arc_feed_secs seconds between L2ARC writing
- *
- * Tunables may be removed or added as future performance improvements are
- * integrated, and also may become zpool properties.
- *
- * There are three key functions that control how the L2ARC warms up:
- *
- * l2arc_write_eligible() check if a buffer is eligible to cache
- * l2arc_write_size() calculate how much to write
- * l2arc_write_interval() calculate sleep delay between writes
- *
- * These three functions determine what to write, how much, and how quickly
- * to send writes.
- */
-
-static boolean_t
-l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
-{
- /*
- * A buffer is *not* eligible for the L2ARC if it:
- * 1. belongs to a different spa.
- * 2. is already cached on the L2ARC.
- * 3. has an I/O in progress (it may be an incomplete read).
- * 4. is flagged not eligible (zfs property).
- */
- if (hdr->b_spa != spa_guid) {
- ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
- return (B_FALSE);
- }
- if (HDR_HAS_L2HDR(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_write_in_l2);
- return (B_FALSE);
- }
- if (HDR_IO_IN_PROGRESS(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
- return (B_FALSE);
- }
- if (!HDR_L2CACHE(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-static uint64_t
-l2arc_write_size(void)
-{
- uint64_t size;
-
- /*
- * Make sure our globals have meaningful values in case the user
- * altered them.
- */
- size = l2arc_write_max;
- if (size == 0) {
- cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
- "be greater than zero, resetting it to the default (%d)",
- L2ARC_WRITE_SIZE);
- size = l2arc_write_max = L2ARC_WRITE_SIZE;
- }
-
- if (arc_warm == B_FALSE)
- size += l2arc_write_boost;
-
- return (size);
-
-}
-
-static clock_t
-l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
-{
- clock_t interval, next, now;
-
- /*
- * If the ARC lists are busy, increase our write rate; if the
- * lists are stale, idle back. This is achieved by checking
- * how much we previously wrote - if it was more than half of
- * what we wanted, schedule the next write much sooner.
- */
- if (l2arc_feed_again && wrote > (wanted / 2))
- interval = (hz * l2arc_feed_min_ms) / 1000;
- else
- interval = hz * l2arc_feed_secs;
-
- now = ddi_get_lbolt();
- next = MAX(now, MIN(now + interval, began + interval));
-
- return (next);
-}
-
-/*
- * Cycle through L2ARC devices. This is how L2ARC load balances.
- * If a device is returned, this also returns holding the spa config lock.
- */
-static l2arc_dev_t *
-l2arc_dev_get_next(void)
-{
- l2arc_dev_t *first, *next = NULL;
-
- /*
- * Lock out the removal of spas (spa_namespace_lock), then removal
- * of cache devices (l2arc_dev_mtx). Once a device has been selected,
- * both locks will be dropped and a spa config lock held instead.
- */
- mutex_enter(&spa_namespace_lock);
- mutex_enter(&l2arc_dev_mtx);
-
- /* if there are no vdevs, there is nothing to do */
- if (l2arc_ndev == 0)
- goto out;
-
- first = NULL;
- next = l2arc_dev_last;
- do {
- /* loop around the list looking for a non-faulted vdev */
- if (next == NULL) {
- next = list_head(l2arc_dev_list);
- } else {
- next = list_next(l2arc_dev_list, next);
- if (next == NULL)
- next = list_head(l2arc_dev_list);
- }
-
- /* if we have come back to the start, bail out */
- if (first == NULL)
- first = next;
- else if (next == first)
- break;
-
- } while (vdev_is_dead(next->l2ad_vdev));
-
- /* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
- next = NULL;
-
- l2arc_dev_last = next;
-
-out:
- mutex_exit(&l2arc_dev_mtx);
-
- /*
- * Grab the config lock to prevent the 'next' device from being
- * removed while we are writing to it.
- */
- if (next != NULL)
- spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
- mutex_exit(&spa_namespace_lock);
-
- return (next);
-}
-
-/*
- * Free buffers that were tagged for destruction.
- */
-static void
-l2arc_do_free_on_write()
-{
- list_t *buflist;
- l2arc_data_free_t *df, *df_prev;
-
- mutex_enter(&l2arc_free_on_write_mtx);
- buflist = l2arc_free_on_write;
-
- for (df = list_tail(buflist); df; df = df_prev) {
- df_prev = list_prev(buflist, df);
- ASSERT3P(df->l2df_abd, !=, NULL);
- abd_free(df->l2df_abd);
- list_remove(buflist, df);
- kmem_free(df, sizeof (l2arc_data_free_t));
- }
-
- mutex_exit(&l2arc_free_on_write_mtx);
-}
-
-/*
- * A write to a cache device has completed. Update all headers to allow
- * reads from these buffers to begin.
- */
-static void
-l2arc_write_done(zio_t *zio)
-{
- l2arc_write_callback_t *cb;
- l2arc_dev_t *dev;
- list_t *buflist;
- arc_buf_hdr_t *head, *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- int64_t bytes_dropped = 0;
-
- cb = zio->io_private;
- ASSERT3P(cb, !=, NULL);
- dev = cb->l2wcb_dev;
- ASSERT3P(dev, !=, NULL);
- head = cb->l2wcb_head;
- ASSERT3P(head, !=, NULL);
- buflist = &dev->l2ad_buflist;
- ASSERT3P(buflist, !=, NULL);
- DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
- l2arc_write_callback_t *, cb);
-
- if (zio->io_error != 0)
- ARCSTAT_BUMP(arcstat_l2_writes_error);
-
- /*
- * All writes completed, or an error was hit.
- */
-top:
- mutex_enter(&dev->l2ad_mtx);
- for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(buflist, hdr);
-
- hash_lock = HDR_LOCK(hdr);
-
- /*
- * We cannot use mutex_enter or else we can deadlock
- * with l2arc_write_buffers (due to swapping the order
- * the hash lock and l2ad_mtx are taken).
- */
- if (!mutex_tryenter(hash_lock)) {
- /*
- * Missed the hash lock. We must retry so we
- * don't leave the ARC_FLAG_L2_WRITING bit set.
- */
- ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
-
- /*
- * We don't want to rescan the headers we've
- * already marked as having been written out, so
- * we reinsert the head node so we can pick up
- * where we left off.
- */
- list_remove(buflist, head);
- list_insert_after(buflist, hdr, head);
-
- mutex_exit(&dev->l2ad_mtx);
-
- /*
- * We wait for the hash lock to become available
- * to try and prevent busy waiting, and increase
- * the chance we'll be able to acquire the lock
- * the next time around.
- */
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
-
- /*
- * We could not have been moved into the arc_l2c_only
- * state while in-flight due to our ARC_FLAG_L2_WRITING
- * bit being set. Let's just ensure that's being enforced.
- */
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (zio->io_error != 0) {
- /*
- * Error - drop L2ARC entry.
- */
- list_remove(buflist, hdr);
- l2arc_trim(hdr);
- arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
- ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
-
- bytes_dropped += arc_hdr_size(hdr);
- (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
- arc_hdr_size(hdr), hdr);
- }
-
- /*
- * Allow ARC to begin reads and ghost list evictions to
- * this L2ARC entry.
- */
- arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
-
- mutex_exit(hash_lock);
- }
-
- atomic_inc_64(&l2arc_writes_done);
- list_remove(buflist, head);
- ASSERT(!HDR_HAS_L1HDR(head));
- kmem_cache_free(hdr_l2only_cache, head);
- mutex_exit(&dev->l2ad_mtx);
-
- vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
-
- l2arc_do_free_on_write();
-
- kmem_free(cb, sizeof (l2arc_write_callback_t));
-}
-
-/*
- * A read to a cache device completed. Validate buffer contents before
- * handing over to the regular ARC routines.
- */
-static void
-l2arc_read_done(zio_t *zio)
-{
- l2arc_read_callback_t *cb;
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- boolean_t valid_cksum;
-
- ASSERT3P(zio->io_vd, !=, NULL);
- ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
-
- spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
-
- cb = zio->io_private;
- ASSERT3P(cb, !=, NULL);
- hdr = cb->l2rcb_hdr;
- ASSERT3P(hdr, !=, NULL);
-
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-
- /*
- * If the data was read into a temporary buffer,
- * move it and free the buffer.
- */
- if (cb->l2rcb_abd != NULL) {
- ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
- if (zio->io_error == 0) {
- abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
- arc_hdr_size(hdr));
- }
-
- /*
- * The following must be done regardless of whether
- * there was an error:
- * - free the temporary buffer
- * - point zio to the real ARC buffer
- * - set zio size accordingly
- * These are required because zio is either re-used for
- * an I/O of the block in the case of the error
- * or the zio is passed to arc_read_done() and it
- * needs real data.
- */
- abd_free(cb->l2rcb_abd);
- zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
- zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
- }
-
- ASSERT3P(zio->io_abd, !=, NULL);
-
- /*
- * Check this survived the L2ARC journey.
- */
- ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
- zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
- zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
-
- valid_cksum = arc_cksum_is_equal(hdr, zio);
- if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
- mutex_exit(hash_lock);
- zio->io_private = hdr;
- arc_read_done(zio);
- } else {
- /*
- * Buffer didn't survive caching. Increment stats and
- * reissue to the original storage device.
- */
- if (zio->io_error != 0) {
- ARCSTAT_BUMP(arcstat_l2_io_error);
- } else {
- zio->io_error = SET_ERROR(EIO);
- }
- if (!valid_cksum)
- ARCSTAT_BUMP(arcstat_l2_cksum_bad);
-
- /*
- * If there's no waiter, issue an async i/o to the primary
- * storage now. If there *is* a waiter, the caller must
- * issue the i/o in a context where it's OK to block.
- */
- if (zio->io_waiter == NULL) {
- zio_t *pio = zio_unique_parent(zio);
-
- ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
-
- zio = zio_read(pio, zio->io_spa, zio->io_bp,
- hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
- hdr, zio->io_priority, cb->l2rcb_flags,
- &cb->l2rcb_zb);
- for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
- acb != NULL; acb = acb->acb_next)
- acb->acb_zio_head = zio;
- mutex_exit(hash_lock);
- zio_nowait(zio);
- } else
- mutex_exit(hash_lock);
- }
-
- kmem_free(cb, sizeof (l2arc_read_callback_t));
-}
-
-/*
- * This is the list priority from which the L2ARC will search for pages to
- * cache. This is used within loops (0..3) to cycle through lists in the
- * desired order. This order can have a significant effect on cache
- * performance.
- *
- * Currently the metadata lists are hit first, MFU then MRU, followed by
- * the data lists. This function returns a locked list, and also returns
- * the lock pointer.
- */
-static multilist_sublist_t *
-l2arc_sublist_lock(int list_num)
-{
- multilist_t *ml = NULL;
- unsigned int idx;
-
- ASSERT(list_num >= 0 && list_num <= 3);
-
- switch (list_num) {
- case 0:
- ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
- break;
- case 1:
- ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
- break;
- case 2:
- ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
- break;
- case 3:
- ml = arc_mru->arcs_list[ARC_BUFC_DATA];
- break;
- }
-
- /*
- * Return a randomly-selected sublist. This is acceptable
- * because the caller feeds only a little bit of data for each
- * call (8MB). Subsequent calls will result in different
- * sublists being selected.
- */
- idx = multilist_get_random_index(ml);
- return (multilist_sublist_lock(ml, idx));
-}
-
-/*
- * Evict buffers from the device write hand to the distance specified in
- * bytes. This distance may span populated buffers, it may span nothing.
- * This is clearing a region on the L2ARC device ready for writing.
- * If the 'all' boolean is set, every buffer is evicted.
- */
-static void
-l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
-{
- list_t *buflist;
- arc_buf_hdr_t *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- uint64_t taddr;
-
- buflist = &dev->l2ad_buflist;
-
- if (!all && dev->l2ad_first) {
- /*
- * This is the first sweep through the device. There is
- * nothing to evict.
- */
- return;
- }
-
- if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
- /*
- * When nearing the end of the device, evict to the end
- * before the device write hand jumps to the start.
- */
- taddr = dev->l2ad_end;
- } else {
- taddr = dev->l2ad_hand + distance;
- }
- DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
- uint64_t, taddr, boolean_t, all);
-
-top:
- mutex_enter(&dev->l2ad_mtx);
- for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(buflist, hdr);
-
- hash_lock = HDR_LOCK(hdr);
-
- /*
- * We cannot use mutex_enter or else we can deadlock
- * with l2arc_write_buffers (due to swapping the order
- * the hash lock and l2ad_mtx are taken).
- */
- if (!mutex_tryenter(hash_lock)) {
- /*
- * Missed the hash lock. Retry.
- */
- ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
- mutex_exit(&dev->l2ad_mtx);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
-
- /*
- * A header can't be on this list if it doesn't have L2 header.
- */
- ASSERT(HDR_HAS_L2HDR(hdr));
-
- /* Ensure this header has finished being written. */
- ASSERT(!HDR_L2_WRITING(hdr));
- ASSERT(!HDR_L2_WRITE_HEAD(hdr));
-
- if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
- hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
- /*
- * We've evicted to the target address,
- * or the end of the device.
- */
- mutex_exit(hash_lock);
- break;
- }
-
- if (!HDR_HAS_L1HDR(hdr)) {
- ASSERT(!HDR_L2_READING(hdr));
- /*
- * This doesn't exist in the ARC. Destroy.
- * arc_hdr_destroy() will call list_remove()
- * and decrement arcstat_l2_lsize.
- */
- arc_change_state(arc_anon, hdr, hash_lock);
- arc_hdr_destroy(hdr);
- } else {
- ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
- ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
- /*
- * Invalidate issued or about to be issued
- * reads, since we may be about to write
- * over this location.
- */
- if (HDR_L2_READING(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_evict_reading);
- arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
- }
-
- arc_hdr_l2hdr_destroy(hdr);
- }
- mutex_exit(hash_lock);
- }
- mutex_exit(&dev->l2ad_mtx);
-}
-
-/*
- * Find and write ARC buffers to the L2ARC device.
- *
- * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
- * for reading until they have completed writing.
- * The headroom_boost is an in-out parameter used to maintain headroom boost
- * state between calls to this function.
- *
- * Returns the number of bytes actually written (which may be smaller than
- * the delta by which the device hand has changed due to alignment).
- */
-static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
-{
- arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_psize, write_lsize, headroom;
- boolean_t full;
- l2arc_write_callback_t *cb;
- zio_t *pio, *wzio;
- uint64_t guid = spa_load_guid(spa);
- int try;
-
- ASSERT3P(dev->l2ad_vdev, !=, NULL);
-
- pio = NULL;
- write_lsize = write_asize = write_psize = 0;
- full = B_FALSE;
- head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
- arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
-
- ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
- /*
- * Copy buffers for L2ARC writing.
- */
- for (try = 0; try <= 3; try++) {
- multilist_sublist_t *mls = l2arc_sublist_lock(try);
- uint64_t passed_sz = 0;
-
- ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
-
- /*
- * L2ARC fast warmup.
- *
- * Until the ARC is warm and starts to evict, read from the
- * head of the ARC lists rather than the tail.
- */
- if (arc_warm == B_FALSE)
- hdr = multilist_sublist_head(mls);
- else
- hdr = multilist_sublist_tail(mls);
- if (hdr == NULL)
- ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
-
- headroom = target_sz * l2arc_headroom;
- if (zfs_compressed_arc_enabled)
- headroom = (headroom * l2arc_headroom_boost) / 100;
-
- for (; hdr; hdr = hdr_prev) {
- kmutex_t *hash_lock;
-
- if (arc_warm == B_FALSE)
- hdr_prev = multilist_sublist_next(mls, hdr);
- else
- hdr_prev = multilist_sublist_prev(mls, hdr);
- ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
- HDR_GET_LSIZE(hdr));
-
- hash_lock = HDR_LOCK(hdr);
- if (!mutex_tryenter(hash_lock)) {
- ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
- /*
- * Skip this buffer rather than waiting.
- */
- continue;
- }
-
- passed_sz += HDR_GET_LSIZE(hdr);
- if (passed_sz > headroom) {
- /*
- * Searched too far.
- */
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
- break;
- }
-
- if (!l2arc_write_eligible(guid, hdr)) {
- mutex_exit(hash_lock);
- continue;
- }
-
- /*
- * We rely on the L1 portion of the header below, so
- * it's invalid for this header to have been evicted out
- * of the ghost cache, prior to being written out. The
- * ARC_FLAG_L2_WRITING bit ensures this won't happen.
- */
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT3U(arc_hdr_size(hdr), >, 0);
- uint64_t psize = arc_hdr_size(hdr);
- uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
- psize);
-
- if ((write_asize + asize) > target_sz) {
- full = B_TRUE;
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_l2_write_full);
- break;
- }
-
- if (pio == NULL) {
- /*
- * Insert a dummy header on the buflist so
- * l2arc_write_done() can find where the
- * write buffers begin without searching.
- */
- mutex_enter(&dev->l2ad_mtx);
- list_insert_head(&dev->l2ad_buflist, head);
- mutex_exit(&dev->l2ad_mtx);
-
- cb = kmem_alloc(
- sizeof (l2arc_write_callback_t), KM_SLEEP);
- cb->l2wcb_dev = dev;
- cb->l2wcb_head = head;
- pio = zio_root(spa, l2arc_write_done, cb,
- ZIO_FLAG_CANFAIL);
- ARCSTAT_BUMP(arcstat_l2_write_pios);
- }
-
- hdr->b_l2hdr.b_dev = dev;
- hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- arc_hdr_set_flags(hdr,
- ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
-
- mutex_enter(&dev->l2ad_mtx);
- list_insert_head(&dev->l2ad_buflist, hdr);
- mutex_exit(&dev->l2ad_mtx);
-
- (void) zfs_refcount_add_many(&dev->l2ad_alloc, psize,
- hdr);
-
- /*
- * Normally the L2ARC can use the hdr's data, but if
- * we're sharing data between the hdr and one of its
- * bufs, L2ARC needs its own copy of the data so that
- * the ZIO below can't race with the buf consumer.
- * Another case where we need to create a copy of the
- * data is when the buffer size is not device-aligned
- * and we need to pad the block to make it such.
- * That also keeps the clock hand suitably aligned.
- *
- * To ensure that the copy will be available for the
- * lifetime of the ZIO and be cleaned up afterwards, we
- * add it to the l2arc_free_on_write queue.
- */
- abd_t *to_write;
- if (!HDR_SHARED_DATA(hdr) && psize == asize) {
- to_write = hdr->b_l1hdr.b_pabd;
- } else {
- to_write = abd_alloc_for_io(asize,
- HDR_ISTYPE_METADATA(hdr));
- abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
- if (asize != psize) {
- abd_zero_off(to_write, psize,
- asize - psize);
- }
- l2arc_free_abd_on_write(to_write, asize,
- arc_buf_type(hdr));
- }
- wzio = zio_write_phys(pio, dev->l2ad_vdev,
- hdr->b_l2hdr.b_daddr, asize, to_write,
- ZIO_CHECKSUM_OFF, NULL, hdr,
- ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_CANFAIL, B_FALSE);
-
- write_lsize += HDR_GET_LSIZE(hdr);
- DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
- zio_t *, wzio);
-
- write_psize += psize;
- write_asize += asize;
- dev->l2ad_hand += asize;
-
- mutex_exit(hash_lock);
-
- (void) zio_nowait(wzio);
- }
-
- multilist_sublist_unlock(mls);
-
- if (full == B_TRUE)
- break;
- }
-
- /* No buffers selected for writing? */
- if (pio == NULL) {
- ASSERT0(write_lsize);
- ASSERT(!HDR_HAS_L1HDR(head));
- kmem_cache_free(hdr_l2only_cache, head);
- return (0);
- }
-
- ASSERT3U(write_psize, <=, target_sz);
- ARCSTAT_BUMP(arcstat_l2_writes_sent);
- ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
- ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
- ARCSTAT_INCR(arcstat_l2_psize, write_psize);
- vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
-
- /*
- * Bump device hand to the device start if it is approaching the end.
- * l2arc_evict() will already have evicted ahead for this case.
- */
- if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- dev->l2ad_hand = dev->l2ad_start;
- dev->l2ad_first = B_FALSE;
- }
-
- dev->l2ad_writing = B_TRUE;
- (void) zio_wait(pio);
- dev->l2ad_writing = B_FALSE;
-
- return (write_asize);
-}
-
-/*
- * This thread feeds the L2ARC at regular intervals. This is the beating
- * heart of the L2ARC.
- */
-/* ARGSUSED */
-static void
-l2arc_feed_thread(void *unused __unused)
-{
- callb_cpr_t cpr;
- l2arc_dev_t *dev;
- spa_t *spa;
- uint64_t size, wrote;
- clock_t begin, next = ddi_get_lbolt();
-
- CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
-
- mutex_enter(&l2arc_feed_thr_lock);
-
- while (l2arc_thread_exit == 0) {
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- next - ddi_get_lbolt());
- CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
- next = ddi_get_lbolt() + hz;
-
- /*
- * Quick check for L2ARC devices.
- */
- mutex_enter(&l2arc_dev_mtx);
- if (l2arc_ndev == 0) {
- mutex_exit(&l2arc_dev_mtx);
- continue;
- }
- mutex_exit(&l2arc_dev_mtx);
- begin = ddi_get_lbolt();
-
- /*
- * This selects the next l2arc device to write to, and in
- * doing so the next spa to feed from: dev->l2ad_spa. This
- * will return NULL if there are now no l2arc devices or if
- * they are all faulted.
- *
- * If a device is returned, its spa's config lock is also
- * held to prevent device removal. l2arc_dev_get_next()
- * will grab and release l2arc_dev_mtx.
- */
- if ((dev = l2arc_dev_get_next()) == NULL)
- continue;
-
- spa = dev->l2ad_spa;
- ASSERT3P(spa, !=, NULL);
-
- /*
- * If the pool is read-only then force the feed thread to
- * sleep a little longer.
- */
- if (!spa_writeable(spa)) {
- next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
- spa_config_exit(spa, SCL_L2ARC, dev);
- continue;
- }
-
- /*
- * Avoid contributing to memory pressure.
- */
- if (arc_reclaim_needed()) {
- ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
- spa_config_exit(spa, SCL_L2ARC, dev);
- continue;
- }
-
- ARCSTAT_BUMP(arcstat_l2_feeds);
-
- size = l2arc_write_size();
-
- /*
- * Evict L2ARC buffers that will be overwritten.
- */
- l2arc_evict(dev, size, B_FALSE);
-
- /*
- * Write ARC buffers.
- */
- wrote = l2arc_write_buffers(spa, dev, size);
-
- /*
- * Calculate interval between writes.
- */
- next = l2arc_write_interval(begin, size, wrote);
- spa_config_exit(spa, SCL_L2ARC, dev);
- }
-
- l2arc_thread_exit = 0;
- cv_broadcast(&l2arc_feed_thr_cv);
- CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
- thread_exit();
-}
-
-boolean_t
-l2arc_vdev_present(vdev_t *vd)
-{
- l2arc_dev_t *dev;
-
- mutex_enter(&l2arc_dev_mtx);
- for (dev = list_head(l2arc_dev_list); dev != NULL;
- dev = list_next(l2arc_dev_list, dev)) {
- if (dev->l2ad_vdev == vd)
- break;
- }
- mutex_exit(&l2arc_dev_mtx);
-
- return (dev != NULL);
-}
-
-/*
- * Add a vdev for use by the L2ARC. By this point the spa has already
- * validated the vdev and opened it.
- */
-void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd)
-{
- l2arc_dev_t *adddev;
-
- ASSERT(!l2arc_vdev_present(vd));
-
- vdev_ashift_optimize(vd);
-
- /*
- * Create a new l2arc device entry.
- */
- adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
- adddev->l2ad_spa = spa;
- adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
- adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
- adddev->l2ad_hand = adddev->l2ad_start;
- adddev->l2ad_first = B_TRUE;
- adddev->l2ad_writing = B_FALSE;
-
- mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
- /*
- * This is a list of all ARC buffers that are still valid on the
- * device.
- */
- list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
-
- vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
- zfs_refcount_create(&adddev->l2ad_alloc);
-
- /*
- * Add device to global list
- */
- mutex_enter(&l2arc_dev_mtx);
- list_insert_head(l2arc_dev_list, adddev);
- atomic_inc_64(&l2arc_ndev);
- mutex_exit(&l2arc_dev_mtx);
-}
-
-/*
- * Remove a vdev from the L2ARC.
- */
-void
-l2arc_remove_vdev(vdev_t *vd)
-{
- l2arc_dev_t *dev, *nextdev, *remdev = NULL;
-
- /*
- * Find the device by vdev
- */
- mutex_enter(&l2arc_dev_mtx);
- for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
- nextdev = list_next(l2arc_dev_list, dev);
- if (vd == dev->l2ad_vdev) {
- remdev = dev;
- break;
- }
- }
- ASSERT3P(remdev, !=, NULL);
-
- /*
- * Remove device from global list
- */
- list_remove(l2arc_dev_list, remdev);
- l2arc_dev_last = NULL; /* may have been invalidated */
- atomic_dec_64(&l2arc_ndev);
- mutex_exit(&l2arc_dev_mtx);
-
- /*
- * Clear all buflists and ARC references. L2ARC device flush.
- */
- l2arc_evict(remdev, 0, B_TRUE);
- list_destroy(&remdev->l2ad_buflist);
- mutex_destroy(&remdev->l2ad_mtx);
- zfs_refcount_destroy(&remdev->l2ad_alloc);
- kmem_free(remdev, sizeof (l2arc_dev_t));
-}
-
-void
-l2arc_init(void)
-{
- l2arc_thread_exit = 0;
- l2arc_ndev = 0;
- l2arc_writes_sent = 0;
- l2arc_writes_done = 0;
-
- mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- l2arc_dev_list = &L2ARC_dev_list;
- l2arc_free_on_write = &L2ARC_free_on_write;
- list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
- offsetof(l2arc_dev_t, l2ad_node));
- list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
- offsetof(l2arc_data_free_t, l2df_list_node));
-}
-
-void
-l2arc_fini(void)
-{
- /*
- * This is called from dmu_fini(), which is called from spa_fini();
- * Because of this, we can assume that all l2arc devices have
- * already been removed when the pools themselves were removed.
- */
-
- l2arc_do_free_on_write();
-
- mutex_destroy(&l2arc_feed_thr_lock);
- cv_destroy(&l2arc_feed_thr_cv);
- mutex_destroy(&l2arc_dev_mtx);
- mutex_destroy(&l2arc_free_on_write_mtx);
-
- list_destroy(l2arc_dev_list);
- list_destroy(l2arc_free_on_write);
-}
-
-void
-l2arc_start(void)
-{
- if (!(spa_mode_global & FWRITE))
- return;
-
- (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
- TS_RUN, minclsyspri);
-}
-
-void
-l2arc_stop(void)
-{
- if (!(spa_mode_global & FWRITE))
- return;
-
- mutex_enter(&l2arc_feed_thr_lock);
- cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
- l2arc_thread_exit = 1;
- while (l2arc_thread_exit != 0)
- cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
- mutex_exit(&l2arc_feed_thr_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c
@@ -1,152 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/zio_compress.h>
-
-/*
- * Embedded-data Block Pointers
- *
- * Normally, block pointers point (via their DVAs) to a block which holds data.
- * If the data that we need to store is very small, this is an inefficient
- * use of space, because a block must be at minimum 1 sector (typically 512
- * bytes or 4KB). Additionally, reading these small blocks tends to generate
- * more random reads.
- *
- * Embedded-data Block Pointers allow small pieces of data (the "payload",
- * up to 112 bytes) to be stored in the block pointer itself, instead of
- * being pointed to. The "Pointer" part of this name is a bit of a
- * misnomer, as nothing is pointed to.
- *
- * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
- * be embedded in the block pointer. The logic for this is handled in
- * the SPA, by the zio pipeline. Therefore most code outside the zio
- * pipeline doesn't need special-cases to handle these block pointers.
- *
- * See spa.h for details on the exact layout of embedded block pointers.
- */
-
-void
-encode_embedded_bp_compressed(blkptr_t *bp, void *data,
- enum zio_compress comp, int uncompressed_size, int compressed_size)
-{
- uint64_t *bp64 = (uint64_t *)bp;
- uint64_t w = 0;
- uint8_t *data8 = data;
-
- ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
- ASSERT(uncompressed_size == compressed_size ||
- comp != ZIO_COMPRESS_OFF);
- ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
- ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
-
- bzero(bp, sizeof (*bp));
- BP_SET_EMBEDDED(bp, B_TRUE);
- BP_SET_COMPRESS(bp, comp);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
- BPE_SET_LSIZE(bp, uncompressed_size);
- BPE_SET_PSIZE(bp, compressed_size);
-
- /*
- * Encode the byte array into the words of the block pointer.
- * First byte goes into low bits of first word (little endian).
- */
- for (int i = 0; i < compressed_size; i++) {
- BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
- if (i % sizeof (w) == sizeof (w) - 1) {
- /* we've reached the end of a word */
- ASSERT3P(bp64, <, bp + 1);
- *bp64 = w;
- bp64++;
- if (!BPE_IS_PAYLOADWORD(bp, bp64))
- bp64++;
- w = 0;
- }
- }
- /* write last partial word */
- if (bp64 < (uint64_t *)(bp + 1))
- *bp64 = w;
-}
-
-/*
- * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
- * more than BPE_PAYLOAD_SIZE bytes).
- */
-void
-decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
-{
- int psize;
- uint8_t *buf8 = buf;
- uint64_t w = 0;
- const uint64_t *bp64 = (const uint64_t *)bp;
-
- ASSERT(BP_IS_EMBEDDED(bp));
-
- psize = BPE_GET_PSIZE(bp);
-
- /*
- * Decode the words of the block pointer into the byte array.
- * Low bits of first word are the first byte (little endian).
- */
- for (int i = 0; i < psize; i++) {
- if (i % sizeof (w) == 0) {
- /* beginning of a word */
- ASSERT3P(bp64, <, bp + 1);
- w = *bp64;
- bp64++;
- if (!BPE_IS_PAYLOADWORD(bp, bp64))
- bp64++;
- }
- buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
- }
-}
-
-/*
- * Fill in the buffer with the (decompressed) payload of the embedded
- * blkptr_t. Takes into account compression and byteorder (the payload is
- * treated as a stream of bytes).
- * Return 0 on success, or ENOSPC if it won't fit in the buffer.
- */
-int
-decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
-{
- int lsize, psize;
-
- ASSERT(BP_IS_EMBEDDED(bp));
-
- lsize = BPE_GET_LSIZE(bp);
- psize = BPE_GET_PSIZE(bp);
-
- if (lsize > buflen)
- return (ENOSPC);
- ASSERT3U(lsize, ==, buflen);
-
- if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
- uint8_t dstbuf[BPE_PAYLOAD_SIZE];
- decode_embedded_bp_compressed(bp, dstbuf);
- VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
- dstbuf, buf, psize, buflen));
- } else {
- ASSERT3U(lsize, ==, psize);
- decode_embedded_bp_compressed(bp, buf);
- }
-
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#include <sys/bplist.h>
-#include <sys/zfs_context.h>
-
-
-void
-bplist_create(bplist_t *bpl)
-{
- mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
- offsetof(bplist_entry_t, bpe_node));
-}
-
-void
-bplist_destroy(bplist_t *bpl)
-{
- list_destroy(&bpl->bpl_list);
- mutex_destroy(&bpl->bpl_lock);
-}
-
-void
-bplist_append(bplist_t *bpl, const blkptr_t *bp)
-{
- bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
-
- mutex_enter(&bpl->bpl_lock);
- bpe->bpe_blk = *bp;
- list_insert_tail(&bpl->bpl_list, bpe);
- mutex_exit(&bpl->bpl_lock);
-}
-
-/*
- * To aid debugging, we keep the most recently removed entry. This way if
- * we are in the callback, we can easily locate the entry.
- */
-static bplist_entry_t *bplist_iterate_last_removed;
-
-void
-bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
-{
- bplist_entry_t *bpe;
-
- mutex_enter(&bpl->bpl_lock);
- while (bpe = list_head(&bpl->bpl_list)) {
- bplist_iterate_last_removed = bpe;
- list_remove(&bpl->bpl_list, bpe);
- mutex_exit(&bpl->bpl_lock);
- func(arg, &bpe->bpe_blk, tx);
- kmem_free(bpe, sizeof (*bpe));
- mutex_enter(&bpl->bpl_lock);
- }
- mutex_exit(&bpl->bpl_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
@@ -1,606 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2017 Datto Inc.
- */
-
-#include <sys/bpobj.h>
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-#include <sys/dsl_pool.h>
-#include <sys/zfeature.h>
-#include <sys/zap.h>
-
-/*
- * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
- */
-uint64_t
-bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(os);
- dsl_pool_t *dp = dmu_objset_pool(os);
-
- if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
- if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
- ASSERT0(dp->dp_empty_bpobj);
- dp->dp_empty_bpobj =
- bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
- VERIFY(zap_add(os,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
- &dp->dp_empty_bpobj, tx) == 0);
- }
- spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
- ASSERT(dp->dp_empty_bpobj != 0);
- return (dp->dp_empty_bpobj);
- } else {
- return (bpobj_alloc(os, blocksize, tx));
- }
-}
-
-void
-bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dmu_objset_pool(os);
-
- spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
- if (!spa_feature_is_active(dmu_objset_spa(os),
- SPA_FEATURE_EMPTY_BPOBJ)) {
- VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_EMPTY_BPOBJ, tx));
- VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
- dp->dp_empty_bpobj = 0;
- }
-}
-
-uint64_t
-bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
-{
- int size;
-
- if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
- size = BPOBJ_SIZE_V0;
- else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
- size = BPOBJ_SIZE_V1;
- else
- size = sizeof (bpobj_phys_t);
-
- return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
- DMU_OT_BPOBJ_HDR, size, tx));
-}
-
-void
-bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
-{
- int64_t i;
- bpobj_t bpo;
- dmu_object_info_t doi;
- int epb;
- dmu_buf_t *dbuf = NULL;
-
- ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
- VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
-
- mutex_enter(&bpo.bpo_lock);
-
- if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
- goto out;
-
- VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
- epb = doi.doi_data_block_size / sizeof (uint64_t);
-
- for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
- uint64_t *objarray;
- uint64_t offset, blkoff;
-
- offset = i * sizeof (uint64_t);
- blkoff = P2PHASE(i, epb);
-
- if (dbuf == NULL || dbuf->db_offset > offset) {
- if (dbuf)
- dmu_buf_rele(dbuf, FTAG);
- VERIFY3U(0, ==, dmu_buf_hold(os,
- bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
- }
-
- ASSERT3U(offset, >=, dbuf->db_offset);
- ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
-
- objarray = dbuf->db_data;
- bpobj_free(os, objarray[blkoff], tx);
- }
- if (dbuf) {
- dmu_buf_rele(dbuf, FTAG);
- dbuf = NULL;
- }
- VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
-
-out:
- mutex_exit(&bpo.bpo_lock);
- bpobj_close(&bpo);
-
- VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
-}
-
-int
-bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
-{
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(os, object, &doi);
- if (err)
- return (err);
-
- bzero(bpo, sizeof (*bpo));
- mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
-
- ASSERT(bpo->bpo_dbuf == NULL);
- ASSERT(bpo->bpo_phys == NULL);
- ASSERT(object != 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
-
- err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
- if (err)
- return (err);
-
- bpo->bpo_os = os;
- bpo->bpo_object = object;
- bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
- bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
- bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
- bpo->bpo_phys = bpo->bpo_dbuf->db_data;
- return (0);
-}
-
-boolean_t
-bpobj_is_open(const bpobj_t *bpo)
-{
- return (bpo->bpo_object != 0);
-}
-
-void
-bpobj_close(bpobj_t *bpo)
-{
- /* Lame workaround for closing a bpobj that was never opened. */
- if (bpo->bpo_object == 0)
- return;
-
- dmu_buf_rele(bpo->bpo_dbuf, bpo);
- if (bpo->bpo_cached_dbuf != NULL)
- dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
- bpo->bpo_dbuf = NULL;
- bpo->bpo_phys = NULL;
- bpo->bpo_cached_dbuf = NULL;
- bpo->bpo_object = 0;
-
- mutex_destroy(&bpo->bpo_lock);
-}
-
-boolean_t
-bpobj_is_empty(bpobj_t *bpo)
-{
- return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
- (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
-}
-
-static int
-bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
- boolean_t free)
-{
- dmu_object_info_t doi;
- int epb;
- int64_t i;
- int err = 0;
- dmu_buf_t *dbuf = NULL;
-
- ASSERT(bpobj_is_open(bpo));
- mutex_enter(&bpo->bpo_lock);
-
- if (free)
- dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
-
- for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
- blkptr_t *bparray;
- blkptr_t *bp;
- uint64_t offset, blkoff;
-
- offset = i * sizeof (blkptr_t);
- blkoff = P2PHASE(i, bpo->bpo_epb);
-
- if (dbuf == NULL || dbuf->db_offset > offset) {
- if (dbuf)
- dmu_buf_rele(dbuf, FTAG);
- err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
- FTAG, &dbuf, 0);
- if (err)
- break;
- }
-
- ASSERT3U(offset, >=, dbuf->db_offset);
- ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
-
- bparray = dbuf->db_data;
- bp = &bparray[blkoff];
- err = func(arg, bp, tx);
- if (err)
- break;
- if (free) {
- bpo->bpo_phys->bpo_bytes -=
- bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
- ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
- if (bpo->bpo_havecomp) {
- bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
- bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
- }
- bpo->bpo_phys->bpo_num_blkptrs--;
- ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
- }
- }
- if (dbuf) {
- dmu_buf_rele(dbuf, FTAG);
- dbuf = NULL;
- }
- if (free) {
- VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
- (i + 1) * sizeof (blkptr_t), -1ULL, tx));
- }
- if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
- goto out;
-
- ASSERT(bpo->bpo_havecomp);
- err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
- if (err) {
- mutex_exit(&bpo->bpo_lock);
- return (err);
- }
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
- epb = doi.doi_data_block_size / sizeof (uint64_t);
-
- for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
- uint64_t *objarray;
- uint64_t offset, blkoff;
- bpobj_t sublist;
- uint64_t used_before, comp_before, uncomp_before;
- uint64_t used_after, comp_after, uncomp_after;
-
- offset = i * sizeof (uint64_t);
- blkoff = P2PHASE(i, epb);
-
- if (dbuf == NULL || dbuf->db_offset > offset) {
- if (dbuf)
- dmu_buf_rele(dbuf, FTAG);
- err = dmu_buf_hold(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
- if (err)
- break;
- }
-
- ASSERT3U(offset, >=, dbuf->db_offset);
- ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
-
- objarray = dbuf->db_data;
- err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
- if (err)
- break;
- if (free) {
- err = bpobj_space(&sublist,
- &used_before, &comp_before, &uncomp_before);
- if (err != 0) {
- bpobj_close(&sublist);
- break;
- }
- }
- err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
- if (free) {
- VERIFY3U(0, ==, bpobj_space(&sublist,
- &used_after, &comp_after, &uncomp_after));
- bpo->bpo_phys->bpo_bytes -= used_before - used_after;
- ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
- bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
- bpo->bpo_phys->bpo_uncomp -=
- uncomp_before - uncomp_after;
- }
-
- bpobj_close(&sublist);
- if (err)
- break;
- if (free) {
- err = dmu_object_free(bpo->bpo_os,
- objarray[blkoff], tx);
- if (err)
- break;
- bpo->bpo_phys->bpo_num_subobjs--;
- ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
- }
- }
- if (dbuf) {
- dmu_buf_rele(dbuf, FTAG);
- dbuf = NULL;
- }
- if (free) {
- VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs,
- (i + 1) * sizeof (uint64_t), -1ULL, tx));
- }
-
-out:
- /* If there are no entries, there should be no bytes. */
- if (bpobj_is_empty(bpo)) {
- ASSERT0(bpo->bpo_phys->bpo_bytes);
- ASSERT0(bpo->bpo_phys->bpo_comp);
- ASSERT0(bpo->bpo_phys->bpo_uncomp);
- }
-
- mutex_exit(&bpo->bpo_lock);
- return (err);
-}
-
-/*
- * Iterate and remove the entries. If func returns nonzero, iteration
- * will stop and that entry will not be removed.
- */
-int
-bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
-{
- return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
-}
-
-/*
- * Iterate the entries. If func returns nonzero, iteration will stop.
- */
-int
-bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
-{
- return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
-}
-
-void
-bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
-{
- bpobj_t subbpo;
- uint64_t used, comp, uncomp, subsubobjs;
-
- ASSERT(bpobj_is_open(bpo));
- ASSERT(subobj != 0);
- ASSERT(bpo->bpo_havesubobj);
- ASSERT(bpo->bpo_havecomp);
- ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
-
- if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
- bpobj_decr_empty(bpo->bpo_os, tx);
- return;
- }
-
- VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
- VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
-
- if (bpobj_is_empty(&subbpo)) {
- /* No point in having an empty subobj. */
- bpobj_close(&subbpo);
- bpobj_free(bpo->bpo_os, subobj, tx);
- return;
- }
-
- mutex_enter(&bpo->bpo_lock);
- dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
- if (bpo->bpo_phys->bpo_subobjs == 0) {
- bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
- DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
- DMU_OT_NONE, 0, tx);
- }
-
- dmu_object_info_t doi;
- ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
-
- dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
- bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
- sizeof (subobj), &subobj, tx);
- bpo->bpo_phys->bpo_num_subobjs++;
-
- /*
- * If subobj has only one block of subobjs, then move subobj's
- * subobjs to bpo's subobj list directly. This reduces
- * recursion in bpobj_iterate due to nested subobjs.
- */
- subsubobjs = subbpo.bpo_phys->bpo_subobjs;
- if (subsubobjs != 0) {
- dmu_object_info_t doi;
-
- VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
- if (doi.doi_max_offset == doi.doi_data_block_size) {
- dmu_buf_t *subdb;
- uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
-
- VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
- 0, FTAG, &subdb, 0));
- /*
- * Make sure that we are not asking dmu_write()
- * to write more data than we have in our buffer.
- */
- VERIFY3U(subdb->db_size, >=,
- numsubsub * sizeof (subobj));
- dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
- bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
- numsubsub * sizeof (subobj), subdb->db_data, tx);
- dmu_buf_rele(subdb, FTAG);
- bpo->bpo_phys->bpo_num_subobjs += numsubsub;
-
- dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
- subbpo.bpo_phys->bpo_subobjs = 0;
- VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
- subsubobjs, tx));
- }
- }
- bpo->bpo_phys->bpo_bytes += used;
- bpo->bpo_phys->bpo_comp += comp;
- bpo->bpo_phys->bpo_uncomp += uncomp;
- mutex_exit(&bpo->bpo_lock);
-
- bpobj_close(&subbpo);
-}
-
-void
-bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
-{
- blkptr_t stored_bp = *bp;
- uint64_t offset;
- int blkoff;
- blkptr_t *bparray;
-
- ASSERT(bpobj_is_open(bpo));
- ASSERT(!BP_IS_HOLE(bp));
- ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
-
- if (BP_IS_EMBEDDED(bp)) {
- /*
- * The bpobj will compress better without the payload.
- *
- * Note that we store EMBEDDED bp's because they have an
- * uncompressed size, which must be accounted for. An
- * alternative would be to add their size to bpo_uncomp
- * without storing the bp, but that would create additional
- * complications: bpo_uncomp would be inconsistent with the
- * set of BP's stored, and bpobj_iterate() wouldn't visit
- * all the space accounted for in the bpobj.
- */
- bzero(&stored_bp, sizeof (stored_bp));
- stored_bp.blk_prop = bp->blk_prop;
- stored_bp.blk_birth = bp->blk_birth;
- } else if (!BP_GET_DEDUP(bp)) {
- /* The bpobj will compress better without the checksum */
- bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
- }
-
- /* We never need the fill count. */
- stored_bp.blk_fill = 0;
-
- mutex_enter(&bpo->bpo_lock);
-
- offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
- blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
-
- if (bpo->bpo_cached_dbuf == NULL ||
- offset < bpo->bpo_cached_dbuf->db_offset ||
- offset >= bpo->bpo_cached_dbuf->db_offset +
- bpo->bpo_cached_dbuf->db_size) {
- if (bpo->bpo_cached_dbuf)
- dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
- VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
- offset, bpo, &bpo->bpo_cached_dbuf, 0));
- }
-
- dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
- bparray = bpo->bpo_cached_dbuf->db_data;
- bparray[blkoff] = stored_bp;
-
- dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
- bpo->bpo_phys->bpo_num_blkptrs++;
- bpo->bpo_phys->bpo_bytes +=
- bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
- if (bpo->bpo_havecomp) {
- bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
- bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
- }
- mutex_exit(&bpo->bpo_lock);
-}
-
-struct space_range_arg {
- spa_t *spa;
- uint64_t mintxg;
- uint64_t maxtxg;
- uint64_t used;
- uint64_t comp;
- uint64_t uncomp;
-};
-
-/* ARGSUSED */
-static int
-space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- struct space_range_arg *sra = arg;
-
- if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
- if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
- sra->used += bp_get_dsize_sync(sra->spa, bp);
- else
- sra->used += bp_get_dsize(sra->spa, bp);
- sra->comp += BP_GET_PSIZE(bp);
- sra->uncomp += BP_GET_UCSIZE(bp);
- }
- return (0);
-}
-
-int
-bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- ASSERT(bpobj_is_open(bpo));
- mutex_enter(&bpo->bpo_lock);
-
- *usedp = bpo->bpo_phys->bpo_bytes;
- if (bpo->bpo_havecomp) {
- *compp = bpo->bpo_phys->bpo_comp;
- *uncompp = bpo->bpo_phys->bpo_uncomp;
- mutex_exit(&bpo->bpo_lock);
- return (0);
- } else {
- mutex_exit(&bpo->bpo_lock);
- return (bpobj_space_range(bpo, 0, UINT64_MAX,
- usedp, compp, uncompp));
- }
-}
-
-/*
- * Return the amount of space in the bpobj which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- struct space_range_arg sra = { 0 };
- int err;
-
- ASSERT(bpobj_is_open(bpo));
-
- /*
- * As an optimization, if they want the whole txg range, just
- * get bpo_bytes rather than iterating over the bps.
- */
- if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
- return (bpobj_space(bpo, usedp, compp, uncompp));
-
- sra.spa = dmu_objset_spa(bpo->bpo_os);
- sra.mintxg = mintxg;
- sra.maxtxg = maxtxg;
-
- err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
- *usedp = sra.used;
- *compp = sra.comp;
- *uncompp = sra.uncomp;
- return (err);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
@@ -1,301 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/arc.h>
-#include <sys/bptree.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dnode.h>
-#include <sys/refcount.h>
-#include <sys/spa.h>
-
-/*
- * A bptree is a queue of root block pointers from destroyed datasets. When a
- * dataset is destroyed its root block pointer is put on the end of the pool's
- * bptree queue so the dataset's blocks can be freed asynchronously by
- * dsl_scan_sync. This allows the delete operation to finish without traversing
- * all the dataset's blocks.
- *
- * Note that while bt_begin and bt_end are only ever incremented in this code,
- * they are effectively reset to 0 every time the entire bptree is freed because
- * the bptree's object is destroyed and re-created.
- */
-
-struct bptree_args {
- bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
- boolean_t ba_free; /* true if freeing during traversal */
-
- bptree_itor_t *ba_func; /* function to call for each blockpointer */
- void *ba_arg; /* caller supplied argument to ba_func */
- dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
-} bptree_args_t;
-
-uint64_t
-bptree_alloc(objset_t *os, dmu_tx_t *tx)
-{
- uint64_t obj;
- dmu_buf_t *db;
- bptree_phys_t *bt;
-
- obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
- SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
- sizeof (bptree_phys_t), tx);
-
- /*
- * Bonus buffer contents are already initialized to 0, but for
- * readability we make it explicit.
- */
- VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- bt = db->db_data;
- bt->bt_begin = 0;
- bt->bt_end = 0;
- bt->bt_bytes = 0;
- bt->bt_comp = 0;
- bt->bt_uncomp = 0;
- dmu_buf_rele(db, FTAG);
-
- return (obj);
-}
-
-int
-bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
- bptree_phys_t *bt;
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
- ASSERT3U(bt->bt_begin, ==, bt->bt_end);
- ASSERT0(bt->bt_bytes);
- ASSERT0(bt->bt_comp);
- ASSERT0(bt->bt_uncomp);
- dmu_buf_rele(db, FTAG);
-
- return (dmu_object_free(os, obj, tx));
-}
-
-boolean_t
-bptree_is_empty(objset_t *os, uint64_t obj)
-{
- dmu_buf_t *db;
- bptree_phys_t *bt;
- boolean_t rv;
-
- VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
- rv = (bt->bt_begin == bt->bt_end);
- dmu_buf_rele(db, FTAG);
- return (rv);
-}
-
-void
-bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
- uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
- bptree_phys_t *bt;
- bptree_entry_phys_t bte = { 0 };
-
- /*
- * bptree objects are in the pool mos, therefore they can only be
- * modified in syncing context. Furthermore, this is only modified
- * by the sync thread, so no locking is necessary.
- */
- ASSERT(dmu_tx_is_syncing(tx));
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
-
- bte.be_birth_txg = birth_txg;
- bte.be_bp = *bp;
- dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
-
- dmu_buf_will_dirty(db, tx);
- bt->bt_end++;
- bt->bt_bytes += bytes;
- bt->bt_comp += comp;
- bt->bt_uncomp += uncomp;
- dmu_buf_rele(db, FTAG);
-}
-
-/* ARGSUSED */
-static int
-bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- int err;
- struct bptree_args *ba = arg;
-
- if (bp == NULL || BP_IS_HOLE(bp))
- return (0);
-
- err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
- if (err == 0 && ba->ba_free) {
- ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
- ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
- ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
- }
- return (err);
-}
-
-/*
- * If "free" is set:
- * - It is assumed that "func" will be freeing the block pointers.
- * - If "func" returns nonzero, the bookmark will be remembered and
- * iteration will be restarted from this point on next invocation.
- * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
- * bptree_iterate will remember the bookmark, continue traversing
- * any additional entries, and return 0.
- *
- * If "free" is not set, traversal will stop and return an error if
- * an i/o error is encountered.
- *
- * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
- * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
- * traverse_dataset_destroyed()).
- */
-int
-bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
- void *arg, dmu_tx_t *tx)
-{
- boolean_t ioerr = B_FALSE;
- int err;
- uint64_t i;
- dmu_buf_t *db;
- struct bptree_args ba;
-
- ASSERT(!free || dmu_tx_is_syncing(tx));
-
- err = dmu_bonus_hold(os, obj, FTAG, &db);
- if (err != 0)
- return (err);
-
- if (free)
- dmu_buf_will_dirty(db, tx);
-
- ba.ba_phys = db->db_data;
- ba.ba_free = free;
- ba.ba_func = func;
- ba.ba_arg = arg;
- ba.ba_tx = tx;
-
- err = 0;
- for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
- bptree_entry_phys_t bte;
- int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
-
- err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
- &bte, DMU_READ_NO_PREFETCH);
- if (err != 0)
- break;
-
- if (zfs_free_leak_on_eio)
- flags |= TRAVERSE_HARD;
- zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
- "bookmark %lld/%lld/%lld/%lld",
- (longlong_t)i,
- (longlong_t)bte.be_birth_txg,
- (longlong_t)bte.be_zb.zb_objset,
- (longlong_t)bte.be_zb.zb_object,
- (longlong_t)bte.be_zb.zb_level,
- (longlong_t)bte.be_zb.zb_blkid);
- err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
- bte.be_birth_txg, &bte.be_zb, flags,
- bptree_visit_cb, &ba);
- if (free) {
- /*
- * The callback has freed the visited block pointers.
- * Record our traversal progress on disk, either by
- * updating this record's bookmark, or by logically
- * removing this record by advancing bt_begin.
- */
- if (err != 0) {
- /* save bookmark for future resume */
- ASSERT3U(bte.be_zb.zb_objset, ==,
- ZB_DESTROYED_OBJSET);
- ASSERT0(bte.be_zb.zb_level);
- dmu_write(os, obj, i * sizeof (bte),
- sizeof (bte), &bte, tx);
- if (err == EIO || err == ECKSUM ||
- err == ENXIO) {
- /*
- * Skip the rest of this tree and
- * continue on to the next entry.
- */
- err = 0;
- ioerr = B_TRUE;
- } else {
- break;
- }
- } else if (ioerr) {
- /*
- * This entry is finished, but there were
- * i/o errors on previous entries, so we
- * can't adjust bt_begin. Set this entry's
- * be_birth_txg such that it will be
- * treated as a no-op in future traversals.
- */
- bte.be_birth_txg = UINT64_MAX;
- dmu_write(os, obj, i * sizeof (bte),
- sizeof (bte), &bte, tx);
- }
-
- if (!ioerr) {
- ba.ba_phys->bt_begin++;
- (void) dmu_free_range(os, obj,
- i * sizeof (bte), sizeof (bte), tx);
- }
- } else if (err != 0) {
- break;
- }
- }
-
- ASSERT(!free || err != 0 || ioerr ||
- ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
-
- /* if all blocks are free there should be no used space */
- if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
- if (zfs_free_leak_on_eio) {
- ba.ba_phys->bt_bytes = 0;
- ba.ba_phys->bt_comp = 0;
- ba.ba_phys->bt_uncomp = 0;
- }
-
- ASSERT0(ba.ba_phys->bt_bytes);
- ASSERT0(ba.ba_phys->bt_comp);
- ASSERT0(ba.ba_phys->bt_uncomp);
- }
-
- dmu_buf_rele(db, FTAG);
-
- return (err);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
@@ -1,111 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
- */
-
-#include <sys/bqueue.h>
-#include <sys/zfs_context.h>
-
-static inline bqueue_node_t *
-obj2node(bqueue_t *q, void *data)
-{
- return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
-}
-
-/*
- * Initialize a blocking queue The maximum capacity of the queue is set to
- * size. Types that want to be stored in a bqueue must contain a bqueue_node_t,
- * and offset should give its offset from the start of the struct. Return 0 on
- * success, or -1 on failure.
- */
-int
-bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
-{
- list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
- node_offset + offsetof(bqueue_node_t, bqn_node));
- cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
- q->bq_node_offset = node_offset;
- q->bq_size = 0;
- q->bq_maxsize = size;
- return (0);
-}
-
-/*
- * Destroy a blocking queue. This function asserts that there are no
- * elements in the queue, and no one is blocked on the condition
- * variables.
- */
-void
-bqueue_destroy(bqueue_t *q)
-{
- ASSERT0(q->bq_size);
- cv_destroy(&q->bq_add_cv);
- cv_destroy(&q->bq_pop_cv);
- mutex_destroy(&q->bq_lock);
- list_destroy(&q->bq_list);
-}
-
-/*
- * Add data to q, consuming size units of capacity. If there is insufficient
- * capacity to consume size units, block until capacity exists. Asserts size is
- * > 0.
- */
-void
-bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
-{
- ASSERT3U(item_size, >, 0);
- ASSERT3U(item_size, <, q->bq_maxsize);
- mutex_enter(&q->bq_lock);
- obj2node(q, data)->bqn_size = item_size;
- while (q->bq_size + item_size > q->bq_maxsize) {
- cv_wait(&q->bq_add_cv, &q->bq_lock);
- }
- q->bq_size += item_size;
- list_insert_tail(&q->bq_list, data);
- cv_signal(&q->bq_pop_cv);
- mutex_exit(&q->bq_lock);
-}
-/*
- * Take the first element off of q. If there are no elements on the queue, wait
- * until one is put there. Return the removed element.
- */
-void *
-bqueue_dequeue(bqueue_t *q)
-{
- void *ret;
- uint64_t item_size;
- mutex_enter(&q->bq_lock);
- while (q->bq_size == 0) {
- cv_wait(&q->bq_pop_cv, &q->bq_lock);
- }
- ret = list_remove_head(&q->bq_list);
- item_size = obj2node(q, ret)->bqn_size;
- q->bq_size -= item_size;
- mutex_exit(&q->bq_lock);
- cv_signal(&q->bq_add_cv);
- return (ret);
-}
-
-/*
- * Returns true if the space used is 0.
- */
-boolean_t
-bqueue_empty(bqueue_t *q)
-{
- return (q->bq_size == 0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c
@@ -1,63 +0,0 @@
-// Copyright (c) 2011 Google, Inc.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-/*
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/cityhash.h>
-
-#define HASH_K1 0xb492b66fbe98f273ULL
-#define HASH_K2 0x9ae16a3b2f90404fULL
-
-/*
- * Bitwise right rotate. Normally this will compile to a single
- * instruction.
- */
-static inline uint64_t
-rotate(uint64_t val, int shift)
-{
- // Avoid shifting by 64: doing so yields an undefined result.
- return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
-}
-
-static inline uint64_t
-cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
-{
- uint64_t a = (u ^ v) * mul;
- a ^= (a >> 47);
- uint64_t b = (v ^ a) * mul;
- b ^= (b >> 47);
- b *= mul;
- return (b);
-}
-
-uint64_t
-cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
-{
- uint64_t mul = HASH_K2 + 64;
- uint64_t a = w1 * HASH_K1;
- uint64_t b = w2;
- uint64_t c = w4 * mul;
- uint64_t d = w3 * HASH_K2;
- return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
- a + rotate(b + HASH_K2, 18) + c, mul));
-
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -1,4248 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dmu.h>
-#include <sys/dmu_send.h>
-#include <sys/dmu_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/zfeature.h>
-#include <sys/blkptr.h>
-#include <sys/range_tree.h>
-#include <sys/callb.h>
-#include <sys/abd.h>
-#include <sys/vdev.h>
-#include <sys/cityhash.h>
-#include <sys/spa_impl.h>
-
-kstat_t *dbuf_ksp;
-
-typedef struct dbuf_stats {
- /*
- * Various statistics about the size of the dbuf cache.
- */
- kstat_named_t cache_count;
- kstat_named_t cache_size_bytes;
- kstat_named_t cache_size_bytes_max;
- /*
- * Statistics regarding the bounds on the dbuf cache size.
- */
- kstat_named_t cache_target_bytes;
- kstat_named_t cache_lowater_bytes;
- kstat_named_t cache_hiwater_bytes;
- /*
- * Total number of dbuf cache evictions that have occurred.
- */
- kstat_named_t cache_total_evicts;
- /*
- * The distribution of dbuf levels in the dbuf cache and
- * the total size of all dbufs at each level.
- */
- kstat_named_t cache_levels[DN_MAX_LEVELS];
- kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
- /*
- * Statistics about the dbuf hash table.
- */
- kstat_named_t hash_hits;
- kstat_named_t hash_misses;
- kstat_named_t hash_collisions;
- kstat_named_t hash_elements;
- kstat_named_t hash_elements_max;
- /*
- * Number of sublists containing more than one dbuf in the dbuf
- * hash table. Keep track of the longest hash chain.
- */
- kstat_named_t hash_chains;
- kstat_named_t hash_chain_max;
- /*
- * Number of times a dbuf_create() discovers that a dbuf was
- * already created and in the dbuf hash table.
- */
- kstat_named_t hash_insert_race;
- /*
- * Statistics about the size of the metadata dbuf cache.
- */
- kstat_named_t metadata_cache_count;
- kstat_named_t metadata_cache_size_bytes;
- kstat_named_t metadata_cache_size_bytes_max;
- /*
- * For diagnostic purposes, this is incremented whenever we can't add
- * something to the metadata cache because it's full, and instead put
- * the data in the regular dbuf cache.
- */
- kstat_named_t metadata_cache_overflow;
-} dbuf_stats_t;
-
-dbuf_stats_t dbuf_stats = {
- { "cache_count", KSTAT_DATA_UINT64 },
- { "cache_size_bytes", KSTAT_DATA_UINT64 },
- { "cache_size_bytes_max", KSTAT_DATA_UINT64 },
- { "cache_target_bytes", KSTAT_DATA_UINT64 },
- { "cache_lowater_bytes", KSTAT_DATA_UINT64 },
- { "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
- { "cache_total_evicts", KSTAT_DATA_UINT64 },
- { { "cache_levels_N", KSTAT_DATA_UINT64 } },
- { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
- { "hash_hits", KSTAT_DATA_UINT64 },
- { "hash_misses", KSTAT_DATA_UINT64 },
- { "hash_collisions", KSTAT_DATA_UINT64 },
- { "hash_elements", KSTAT_DATA_UINT64 },
- { "hash_elements_max", KSTAT_DATA_UINT64 },
- { "hash_chains", KSTAT_DATA_UINT64 },
- { "hash_chain_max", KSTAT_DATA_UINT64 },
- { "hash_insert_race", KSTAT_DATA_UINT64 },
- { "metadata_cache_count", KSTAT_DATA_UINT64 },
- { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
- { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
- { "metadata_cache_overflow", KSTAT_DATA_UINT64 }
-};
-
-#define DBUF_STAT_INCR(stat, val) \
- atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
-#define DBUF_STAT_DECR(stat, val) \
- DBUF_STAT_INCR(stat, -(val));
-#define DBUF_STAT_BUMP(stat) \
- DBUF_STAT_INCR(stat, 1);
-#define DBUF_STAT_BUMPDOWN(stat) \
- DBUF_STAT_INCR(stat, -1);
-#define DBUF_STAT_MAX(stat, v) { \
- uint64_t _m; \
- while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
- (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
- continue; \
-}
-
-struct dbuf_hold_impl_data {
- /* Function arguments */
- dnode_t *dh_dn;
- uint8_t dh_level;
- uint64_t dh_blkid;
- boolean_t dh_fail_sparse;
- boolean_t dh_fail_uncached;
- void *dh_tag;
- dmu_buf_impl_t **dh_dbp;
- /* Local variables */
- dmu_buf_impl_t *dh_db;
- dmu_buf_impl_t *dh_parent;
- blkptr_t *dh_bp;
- int dh_err;
- dbuf_dirty_record_t *dh_dr;
- int dh_depth;
-};
-
-static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
- dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse,
- boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp, int depth);
-static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
-
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
-
-/*
- * Global data structures and functions for the dbuf cache.
- */
-static kmem_cache_t *dbuf_kmem_cache;
-static taskq_t *dbu_evict_taskq;
-
-static kthread_t *dbuf_cache_evict_thread;
-static kmutex_t dbuf_evict_lock;
-static kcondvar_t dbuf_evict_cv;
-static boolean_t dbuf_evict_thread_exit;
-
-/*
- * There are two dbuf caches; each dbuf can only be in one of them at a time.
- *
- * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
- * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
- * that represent the metadata that describes filesystems/snapshots/
- * bookmarks/properties/etc. We only evict from this cache when we export a
- * pool, to short-circuit as much I/O as possible for all administrative
- * commands that need the metadata. There is no eviction policy for this
- * cache, because we try to only include types in it which would occupy a
- * very small amount of space per object but create a large impact on the
- * performance of these commands. Instead, after it reaches a maximum size
- * (which should only happen on very small memory systems with a very large
- * number of filesystem objects), we stop taking new dbufs into the
- * metadata cache, instead putting them in the normal dbuf cache.
- *
- * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
- *
- * Dbufs are added to these caches once the last hold is released. If a dbuf is
- * later accessed and still exists in the dbuf cache, then it will be removed
- * from the cache and later re-added to the head of the cache.
- *
- * If a given dbuf meets the requirements for the metadata cache, it will go
- * there, otherwise it will be considered for the generic LRU dbuf cache. The
- * caches and the refcounts tracking their sizes are stored in an array indexed
- * by those caches' matching enum values (from dbuf_cached_state_t).
- */
-typedef struct dbuf_cache {
- multilist_t *cache;
- zfs_refcount_t size;
-} dbuf_cache_t;
-dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
-
-/* Size limits for the caches */
-uint64_t dbuf_cache_max_bytes = 0;
-uint64_t dbuf_metadata_cache_max_bytes = 0;
-/* Set the default sizes of the caches to log2 fraction of arc size */
-int dbuf_cache_shift = 5;
-int dbuf_metadata_cache_shift = 6;
-
-/*
- * For diagnostic purposes, this is incremented whenever we can't add
- * something to the metadata cache because it's full, and instead put
- * the data in the regular dbuf cache.
- */
-uint64_t dbuf_metadata_cache_overflow;
-
-/*
- * The LRU dbuf cache uses a three-stage eviction policy:
- * - A low water marker designates when the dbuf eviction thread
- * should stop evicting from the dbuf cache.
- * - When we reach the maximum size (aka mid water mark), we
- * signal the eviction thread to run.
- * - The high water mark indicates when the eviction thread
- * is unable to keep up with the incoming load and eviction must
- * happen in the context of the calling thread.
- *
- * The dbuf cache:
- * (max size)
- * low water mid water hi water
- * +----------------------------------------+----------+----------+
- * | | | |
- * | | | |
- * | | | |
- * | | | |
- * +----------------------------------------+----------+----------+
- * stop signal evict
- * evicting eviction directly
- * thread
- *
- * The high and low water marks indicate the operating range for the eviction
- * thread. The low water mark is, by default, 90% of the total size of the
- * cache and the high water mark is at 110% (both of these percentages can be
- * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
- * respectively). The eviction thread will try to ensure that the cache remains
- * within this range by waking up every second and checking if the cache is
- * above the low water mark. The thread can also be woken up by callers adding
- * elements into the cache if the cache is larger than the mid water (i.e max
- * cache size). Once the eviction thread is woken up and eviction is required,
- * it will continue evicting buffers until it's able to reduce the cache size
- * to the low water mark. If the cache size continues to grow and hits the high
- * water mark, then callers adding elments to the cache will begin to evict
- * directly from the cache until the cache is no longer above the high water
- * mark.
- */
-
-/*
- * The percentage above and below the maximum cache size.
- */
-uint_t dbuf_cache_hiwater_pct = 10;
-uint_t dbuf_cache_lowater_pct = 10;
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN,
- &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes");
-SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN,
- &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN,
- &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN,
- &dbuf_metadata_cache_shift, 0,
- "dbuf metadata cache size as log2 fraction of ARC");
-SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD,
- &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN,
- &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN,
- &dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size");
-
-/* ARGSUSED */
-static int
-dbuf_cons(void *vdb, void *unused, int kmflag)
-{
- dmu_buf_impl_t *db = vdb;
- bzero(db, sizeof (dmu_buf_impl_t));
-
- mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
- multilist_link_init(&db->db_cache_link);
- zfs_refcount_create(&db->db_holds);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dbuf_dest(void *vdb, void *unused)
-{
- dmu_buf_impl_t *db = vdb;
- mutex_destroy(&db->db_mtx);
- cv_destroy(&db->db_changed);
- ASSERT(!multilist_link_active(&db->db_cache_link));
- zfs_refcount_destroy(&db->db_holds);
-}
-
-/*
- * dbuf hash table routines
- */
-static dbuf_hash_table_t dbuf_hash_table;
-
-static uint64_t dbuf_hash_count;
-
-/*
- * We use Cityhash for this. It's fast, and has good hash properties without
- * requiring any large static buffers.
- */
-static uint64_t
-dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
-{
- return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
-}
-
-#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
- ((dbuf)->db.db_object == (obj) && \
- (dbuf)->db_objset == (os) && \
- (dbuf)->db_level == (level) && \
- (dbuf)->db_blkid == (blkid))
-
-dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = dbuf_hash(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *db;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
- if (DBUF_EQUAL(db, os, obj, level, blkid)) {
- mutex_enter(&db->db_mtx);
- if (db->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (db);
- }
- mutex_exit(&db->db_mtx);
- }
- }
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (NULL);
-}
-
-static dmu_buf_impl_t *
-dbuf_find_bonus(objset_t *os, uint64_t object)
-{
- dnode_t *dn;
- dmu_buf_impl_t *db = NULL;
-
- if (dnode_hold(os, object, FTAG, &dn) == 0) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_bonus != NULL) {
- db = dn->dn_bonus;
- mutex_enter(&db->db_mtx);
- }
- rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
- }
- return (db);
-}
-
-/*
- * Insert an entry into the hash table. If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- */
-static dmu_buf_impl_t *
-dbuf_hash_insert(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_t *os = db->db_objset;
- uint64_t obj = db->db.db_object;
- int level = db->db_level;
- uint64_t blkid, hv, idx;
- dmu_buf_impl_t *dbf;
- uint32_t i;
-
- blkid = db->db_blkid;
- hv = dbuf_hash(os, obj, level, blkid);
- idx = hv & h->hash_table_mask;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
- dbf = dbf->db_hash_next, i++) {
- if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
- mutex_enter(&dbf->db_mtx);
- if (dbf->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (dbf);
- }
- mutex_exit(&dbf->db_mtx);
- }
- }
-
- if (i > 0) {
- DBUF_STAT_BUMP(hash_collisions);
- if (i == 1)
- DBUF_STAT_BUMP(hash_chains);
-
- DBUF_STAT_MAX(hash_chain_max, i);
- }
-
- mutex_enter(&db->db_mtx);
- db->db_hash_next = h->hash_table[idx];
- h->hash_table[idx] = db;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_inc_64(&dbuf_hash_count);
- DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);
-
- return (NULL);
-}
-
-/*
- * Remove an entry from the hash table. It must be in the EVICTING state.
- */
-static void
-dbuf_hash_remove(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv, idx;
- dmu_buf_impl_t *dbf, **dbp;
-
- hv = dbuf_hash(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- idx = hv & h->hash_table_mask;
-
- /*
- * We mustn't hold db_mtx to maintain lock ordering:
- * DBUF_HASH_MUTEX > db_mtx.
- */
- ASSERT(zfs_refcount_is_zero(&db->db_holds));
- ASSERT(db->db_state == DB_EVICTING);
- ASSERT(!MUTEX_HELD(&db->db_mtx));
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- dbp = &h->hash_table[idx];
- while ((dbf = *dbp) != db) {
- dbp = &dbf->db_hash_next;
- ASSERT(dbf != NULL);
- }
- *dbp = db->db_hash_next;
- db->db_hash_next = NULL;
- if (h->hash_table[idx] &&
- h->hash_table[idx]->db_hash_next == NULL)
- DBUF_STAT_BUMPDOWN(hash_chains);
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_dec_64(&dbuf_hash_count);
-}
-
-typedef enum {
- DBVU_EVICTING,
- DBVU_NOT_EVICTING
-} dbvu_verify_type_t;
-
-static void
-dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
-{
-#ifdef ZFS_DEBUG
- int64_t holds;
-
- if (db->db_user == NULL)
- return;
-
- /* Only data blocks support the attachment of user data. */
- ASSERT(db->db_level == 0);
-
- /* Clients must resolve a dbuf before attaching user data. */
- ASSERT(db->db.db_data != NULL);
- ASSERT3U(db->db_state, ==, DB_CACHED);
-
- holds = zfs_refcount_count(&db->db_holds);
- if (verify_type == DBVU_EVICTING) {
- /*
- * Immediate eviction occurs when holds == dirtycnt.
- * For normal eviction buffers, holds is zero on
- * eviction, except when dbuf_fix_old_data() calls
- * dbuf_clear_data(). However, the hold count can grow
- * during eviction even though db_mtx is held (see
- * dmu_bonus_hold() for an example), so we can only
- * test the generic invariant that holds >= dirtycnt.
- */
- ASSERT3U(holds, >=, db->db_dirtycnt);
- } else {
- if (db->db_user_immediate_evict == TRUE)
- ASSERT3U(holds, >=, db->db_dirtycnt);
- else
- ASSERT3U(holds, >, 0);
- }
-#endif
-}
-
-static void
-dbuf_evict_user(dmu_buf_impl_t *db)
-{
- dmu_buf_user_t *dbu = db->db_user;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (dbu == NULL)
- return;
-
- dbuf_verify_user(db, DBVU_EVICTING);
- db->db_user = NULL;
-
-#ifdef ZFS_DEBUG
- if (dbu->dbu_clear_on_evict_dbufp != NULL)
- *dbu->dbu_clear_on_evict_dbufp = NULL;
-#endif
-
- /*
- * There are two eviction callbacks - one that we call synchronously
- * and one that we invoke via a taskq. The async one is useful for
- * avoiding lock order reversals and limiting stack depth.
- *
- * Note that if we have a sync callback but no async callback,
- * it's likely that the sync callback will free the structure
- * containing the dbu. In that case we need to take care to not
- * dereference dbu after calling the sync evict func.
- */
- boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
-
- if (dbu->dbu_evict_func_sync != NULL)
- dbu->dbu_evict_func_sync(dbu);
-
- if (has_async) {
- taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
- dbu, 0, &dbu->dbu_tqent);
- }
-}
-
-boolean_t
-dbuf_is_metadata(dmu_buf_impl_t *db)
-{
- if (db->db_level > 0) {
- return (B_TRUE);
- } else {
- boolean_t is_metadata;
-
- DB_DNODE_ENTER(db);
- is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
- DB_DNODE_EXIT(db);
-
- return (is_metadata);
- }
-}
-
-/*
- * This returns whether this dbuf should be stored in the metadata cache, which
- * is based on whether it's from one of the dnode types that store data related
- * to traversing dataset hierarchies.
- */
-static boolean_t
-dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
-{
- DB_DNODE_ENTER(db);
- dmu_object_type_t type = DB_DNODE(db)->dn_type;
- DB_DNODE_EXIT(db);
-
- /* Check if this dbuf is one of the types we care about */
- if (DMU_OT_IS_METADATA_CACHED(type)) {
- /* If we hit this, then we set something up wrong in dmu_ot */
- ASSERT(DMU_OT_IS_METADATA(type));
-
- /*
- * Sanity check for small-memory systems: don't allocate too
- * much memory for this purpose.
- */
- if (zfs_refcount_count(
- &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
- dbuf_metadata_cache_max_bytes) {
- dbuf_metadata_cache_overflow++;
- DTRACE_PROBE1(dbuf__metadata__cache__overflow,
- dmu_buf_impl_t *, db);
- return (B_FALSE);
- }
-
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * This function *must* return indices evenly distributed between all
- * sublists of the multilist. This is needed due to how the dbuf eviction
- * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
- * distributed between all sublists and uses this assumption when
- * deciding which sublist to evict from and how much to evict from it.
- */
-unsigned int
-dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
-{
- dmu_buf_impl_t *db = obj;
-
- /*
- * The assumption here, is the hash value for a given
- * dmu_buf_impl_t will remain constant throughout it's lifetime
- * (i.e. it's objset, object, level and blkid fields don't change).
- * Thus, we don't need to store the dbuf's sublist index
- * on insertion, as this index can be recalculated on removal.
- *
- * Also, the low order bits of the hash value are thought to be
- * distributed evenly. Otherwise, in the case that the multilist
- * has a power of two number of sublists, each sublists' usage
- * would not be evenly distributed.
- */
- return (dbuf_hash(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid) %
- multilist_get_num_sublists(ml));
-}
-
-static inline unsigned long
-dbuf_cache_target_bytes(void)
-{
- return MIN(dbuf_cache_max_bytes,
- arc_max_bytes() >> dbuf_cache_shift);
-}
-
-static inline uint64_t
-dbuf_cache_hiwater_bytes(void)
-{
- uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
- return (dbuf_cache_target +
- (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
-}
-
-static inline uint64_t
-dbuf_cache_lowater_bytes(void)
-{
- uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
- return (dbuf_cache_target -
- (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
-}
-
-static inline boolean_t
-dbuf_cache_above_lowater(void)
-{
- return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
- dbuf_cache_lowater_bytes());
-}
-
-/*
- * Evict the oldest eligible dbuf from the dbuf cache.
- */
-static void
-dbuf_evict_one(void)
-{
- int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
- multilist_sublist_t *mls = multilist_sublist_lock(
- dbuf_caches[DB_DBUF_CACHE].cache, idx);
-
- ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
-
- dmu_buf_impl_t *db = multilist_sublist_tail(mls);
- while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
- db = multilist_sublist_prev(mls, db);
- }
-
- DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
- multilist_sublist_t *, mls);
-
- if (db != NULL) {
- multilist_sublist_remove(mls, db);
- multilist_sublist_unlock(mls);
- (void) zfs_refcount_remove_many(
- &dbuf_caches[DB_DBUF_CACHE].size,
- db->db.db_size, db);
- DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
- DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
- ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
- db->db_caching_status = DB_NO_CACHE;
- dbuf_destroy(db);
- DBUF_STAT_BUMP(cache_total_evicts);
- } else {
- multilist_sublist_unlock(mls);
- }
-}
-
-/*
- * The dbuf evict thread is responsible for aging out dbufs from the
- * cache. Once the cache has reached it's maximum size, dbufs are removed
- * and destroyed. The eviction thread will continue running until the size
- * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
- * out of the cache it is destroyed and becomes eligible for arc eviction.
- */
-/* ARGSUSED */
-static void
-dbuf_evict_thread(void *unused __unused)
-{
- callb_cpr_t cpr;
-
- CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
-
- mutex_enter(&dbuf_evict_lock);
- while (!dbuf_evict_thread_exit) {
- while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_hires(&dbuf_evict_cv,
- &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
- CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
-#ifdef __FreeBSD__
- if (dbuf_ksp != NULL)
- dbuf_ksp->ks_update(dbuf_ksp, KSTAT_READ);
-#endif
- }
- mutex_exit(&dbuf_evict_lock);
-
- /*
- * Keep evicting as long as we're above the low water mark
- * for the cache. We do this without holding the locks to
- * minimize lock contention.
- */
- while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
- dbuf_evict_one();
- }
-
- mutex_enter(&dbuf_evict_lock);
- }
-
- dbuf_evict_thread_exit = B_FALSE;
- cv_broadcast(&dbuf_evict_cv);
- CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
- thread_exit();
-}
-
-/*
- * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
- * If the dbuf cache is at its high water mark, then evict a dbuf from the
- * dbuf cache using the callers context.
- */
-static void
-dbuf_evict_notify(uint64_t size)
-{
- /*
- * We check if we should evict without holding the dbuf_evict_lock,
- * because it's OK to occasionally make the wrong decision here,
- * and grabbing the lock results in massive lock contention.
- */
- if (size > dbuf_cache_max_bytes) {
- if (size > dbuf_cache_hiwater_bytes())
- dbuf_evict_one();
- cv_signal(&dbuf_evict_cv);
- }
-}
-
-static int
-dbuf_kstat_update(kstat_t *ksp, int rw)
-{
- dbuf_stats_t *ds = ksp->ks_data;
-
- if (rw == KSTAT_WRITE) {
- return (SET_ERROR(EACCES));
- } else {
- ds->metadata_cache_size_bytes.value.ui64 =
- zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
- ds->cache_size_bytes.value.ui64 =
- zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
- ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
- ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
- ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
- ds->hash_elements.value.ui64 = dbuf_hash_count;
- }
-
- return (0);
-}
-
-void
-dbuf_init(void)
-{
- uint64_t hsize = 1ULL << 16;
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average 4K block size. The table will take up
- * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
- */
- while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
- hsize <<= 1;
-
-retry:
- h->hash_table_mask = hsize - 1;
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
- if (h->hash_table == NULL) {
- /* XXX - we should really return an error instead of assert */
- ASSERT(hsize > (1ULL << 10));
- hsize >>= 1;
- goto retry;
- }
-
- dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
- sizeof (dmu_buf_impl_t),
- 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
-
- dbuf_stats_init(h);
- /*
- * Setup the parameters for the dbuf caches. We set the sizes of the
- * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
- * of the size of the ARC, respectively. If the values are set in
- * /etc/system and they're not greater than the size of the ARC, then
- * we honor that value.
- */
- if (dbuf_cache_max_bytes == 0 ||
- dbuf_cache_max_bytes >= arc_max_bytes()) {
- dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
- }
- if (dbuf_metadata_cache_max_bytes == 0 ||
- dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
- dbuf_metadata_cache_max_bytes =
- arc_max_bytes() >> dbuf_metadata_cache_shift;
- }
-
- /*
- * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
- * configuration is not required.
- */
- dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
-
- for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
- dbuf_caches[dcs].cache =
- multilist_create(sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_cache_link),
- dbuf_cache_multilist_index_func);
- zfs_refcount_create(&dbuf_caches[dcs].size);
- }
-
- dbuf_evict_thread_exit = B_FALSE;
- mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
- dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
- NULL, 0, &p0, TS_RUN, minclsyspri);
-
- dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
- KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (dbuf_ksp != NULL) {
- for (i = 0; i < DN_MAX_LEVELS; i++) {
- snprintf(dbuf_stats.cache_levels[i].name,
- KSTAT_STRLEN, "cache_level_%d", i);
- dbuf_stats.cache_levels[i].data_type =
- KSTAT_DATA_UINT64;
- snprintf(dbuf_stats.cache_levels_bytes[i].name,
- KSTAT_STRLEN, "cache_level_%d_bytes", i);
- dbuf_stats.cache_levels_bytes[i].data_type =
- KSTAT_DATA_UINT64;
- }
- dbuf_ksp->ks_data = &dbuf_stats;
- dbuf_ksp->ks_update = dbuf_kstat_update;
- kstat_install(dbuf_ksp);
- }
-}
-
-void
-dbuf_fini(void)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- dbuf_stats_destroy();
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_destroy(&h->hash_mutexes[i]);
- kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
- kmem_cache_destroy(dbuf_kmem_cache);
- taskq_destroy(dbu_evict_taskq);
-
- mutex_enter(&dbuf_evict_lock);
- dbuf_evict_thread_exit = B_TRUE;
- while (dbuf_evict_thread_exit) {
- cv_signal(&dbuf_evict_cv);
- cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
- }
- mutex_exit(&dbuf_evict_lock);
-
- mutex_destroy(&dbuf_evict_lock);
- cv_destroy(&dbuf_evict_cv);
-
- for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
- zfs_refcount_destroy(&dbuf_caches[dcs].size);
- multilist_destroy(dbuf_caches[dcs].cache);
- }
-
- if (dbuf_ksp != NULL) {
- kstat_delete(dbuf_ksp);
- dbuf_ksp = NULL;
- }
-}
-
-/*
- * Other stuff.
- */
-
-#ifdef ZFS_DEBUG
-static void
-dbuf_verify(dmu_buf_impl_t *db)
-{
- dnode_t *dn;
- dbuf_dirty_record_t *dr;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
- return;
-
- ASSERT(db->db_objset != NULL);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- if (dn == NULL) {
- ASSERT(db->db_parent == NULL);
- ASSERT(db->db_blkptr == NULL);
- } else {
- ASSERT3U(db->db.db_object, ==, dn->dn_object);
- ASSERT3P(db->db_objset, ==, dn->dn_objset);
- ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
- db->db_blkid == DMU_SPILL_BLKID ||
- !avl_is_empty(&dn->dn_dbufs));
- }
- if (db->db_blkid == DMU_BONUS_BLKID) {
- ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
- } else if (db->db_blkid == DMU_SPILL_BLKID) {
- ASSERT(dn != NULL);
- ASSERT0(db->db.db_offset);
- } else {
- ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
- }
-
- for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
- ASSERT(dr->dr_dbuf == db);
-
- for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
- ASSERT(dr->dr_dbuf == db);
-
- /*
- * We can't assert that db_size matches dn_datablksz because it
- * can be momentarily different when another thread is doing
- * dnode_set_blksz().
- */
- if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
- dr = db->db_data_pending;
- /*
- * It should only be modified in syncing context, so
- * make sure we only have one copy of the data.
- */
- ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
- }
-
- /* verify db->db_blkptr */
- if (db->db_blkptr) {
- if (db->db_parent == dn->dn_dbuf) {
- /* db is pointed to by the dnode */
- /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
- if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
- ASSERT(db->db_parent == NULL);
- else
- ASSERT(db->db_parent != NULL);
- if (db->db_blkid != DMU_SPILL_BLKID)
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- } else {
- /* db is pointed to by an indirect block */
- int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
- ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
- ASSERT3U(db->db_parent->db.db_object, ==,
- db->db.db_object);
- /*
- * dnode_grow_indblksz() can make this fail if we don't
- * have the struct_rwlock. XXX indblksz no longer
- * grows. safe to do this now?
- */
- if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- ASSERT3P(db->db_blkptr, ==,
- ((blkptr_t *)db->db_parent->db.db_data +
- db->db_blkid % epb));
- }
- }
- }
- if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
- (db->db_buf == NULL || db->db_buf->b_data) &&
- db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_state != DB_FILL && !dn->dn_free_txg) {
- /*
- * If the blkptr isn't set but they have nonzero data,
- * it had better be dirty, otherwise we'll lose that
- * data when we evict this buffer.
- *
- * There is an exception to this rule for indirect blocks; in
- * this case, if the indirect block is a hole, we fill in a few
- * fields on each of the child blocks (importantly, birth time)
- * to prevent hole birth times from being lost when you
- * partially fill in a hole.
- */
- if (db->db_dirtycnt == 0) {
- if (db->db_level == 0) {
- uint64_t *buf = db->db.db_data;
- int i;
-
- for (i = 0; i < db->db.db_size >> 3; i++) {
- ASSERT(buf[i] == 0);
- }
- } else {
- blkptr_t *bps = db->db.db_data;
- ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
- db->db.db_size);
- /*
- * We want to verify that all the blkptrs in the
- * indirect block are holes, but we may have
- * automatically set up a few fields for them.
- * We iterate through each blkptr and verify
- * they only have those fields set.
- */
- for (int i = 0;
- i < db->db.db_size / sizeof (blkptr_t);
- i++) {
- blkptr_t *bp = &bps[i];
- ASSERT(ZIO_CHECKSUM_IS_ZERO(
- &bp->blk_cksum));
- ASSERT(
- DVA_IS_EMPTY(&bp->blk_dva[0]) &&
- DVA_IS_EMPTY(&bp->blk_dva[1]) &&
- DVA_IS_EMPTY(&bp->blk_dva[2]));
- ASSERT0(bp->blk_fill);
- ASSERT0(bp->blk_pad[0]);
- ASSERT0(bp->blk_pad[1]);
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT(BP_IS_HOLE(bp));
- ASSERT0(bp->blk_phys_birth);
- }
- }
- }
- }
- DB_DNODE_EXIT(db);
-}
-#endif
-
-static void
-dbuf_clear_data(dmu_buf_impl_t *db)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- dbuf_evict_user(db);
- ASSERT3P(db->db_buf, ==, NULL);
- db->db.db_data = NULL;
- if (db->db_state != DB_NOFILL)
- db->db_state = DB_UNCACHED;
-}
-
-static void
-dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(buf != NULL);
-
- db->db_buf = buf;
- ASSERT(buf->b_data != NULL);
- db->db.db_data = buf->b_data;
-}
-
-/*
- * Loan out an arc_buf for read. Return the loaned arc_buf.
- */
-arc_buf_t *
-dbuf_loan_arcbuf(dmu_buf_impl_t *db)
-{
- arc_buf_t *abuf;
-
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- mutex_enter(&db->db_mtx);
- if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
- int blksz = db->db.db_size;
- spa_t *spa = db->db_objset->os_spa;
-
- mutex_exit(&db->db_mtx);
- abuf = arc_loan_buf(spa, B_FALSE, blksz);
- bcopy(db->db.db_data, abuf->b_data, blksz);
- } else {
- abuf = db->db_buf;
- arc_loan_inuse_buf(abuf, db);
- db->db_buf = NULL;
- dbuf_clear_data(db);
- mutex_exit(&db->db_mtx);
- }
- return (abuf);
-}
-
-/*
- * Calculate which level n block references the data at the level 0 offset
- * provided.
- */
-uint64_t
-dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
-{
- if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
- /*
- * The level n blkid is equal to the level 0 blkid divided by
- * the number of level 0s in a level n block.
- *
- * The level 0 blkid is offset >> datablkshift =
- * offset / 2^datablkshift.
- *
- * The number of level 0s in a level n is the number of block
- * pointers in an indirect block, raised to the power of level.
- * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
- * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
- *
- * Thus, the level n blkid is: offset /
- * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
- * = offset / 2^(datablkshift + level *
- * (indblkshift - SPA_BLKPTRSHIFT))
- * = offset >> (datablkshift + level *
- * (indblkshift - SPA_BLKPTRSHIFT))
- */
- return (offset >> (dn->dn_datablkshift + level *
- (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
- } else {
- ASSERT3U(offset, <, dn->dn_datablksz);
- return (0);
- }
-}
-
-static void
-dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
- arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
-
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db_state, ==, DB_READ);
- /*
- * All reads are synchronous, so we must have a hold on the dbuf
- */
- ASSERT(zfs_refcount_count(&db->db_holds) > 0);
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- if (buf == NULL) {
- /* i/o error */
- ASSERT(zio == NULL || zio->io_error != 0);
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT3P(db->db_buf, ==, NULL);
- db->db_state = DB_UNCACHED;
- } else if (db->db_level == 0 && db->db_freed_in_flight) {
- /* freed in flight */
- ASSERT(zio == NULL || zio->io_error == 0);
- if (buf == NULL) {
- buf = arc_alloc_buf(db->db_objset->os_spa,
- db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
- }
- arc_release(buf, db);
- bzero(buf->b_data, db->db.db_size);
- arc_buf_freeze(buf);
- db->db_freed_in_flight = FALSE;
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- } else {
- /* success */
- ASSERT(zio == NULL || zio->io_error == 0);
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- }
- cv_broadcast(&db->db_changed);
- dbuf_rele_and_unlock(db, NULL, B_FALSE);
-}
-
-static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
-{
- dnode_t *dn;
- zbookmark_phys_t zb;
- arc_flags_t aflags = ARC_FLAG_NOWAIT;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- /* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
- ASSERT(db->db_buf == NULL);
-
- if (db->db_blkid == DMU_BONUS_BLKID) {
- /*
- * The bonus length stored in the dnode may be less than
- * the maximum available space in the bonus buffer.
- */
- int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
- int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
-
- ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(max_bonuslen);
- arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
- if (bonuslen < max_bonuslen)
- bzero(db->db.db_data, max_bonuslen);
- if (bonuslen)
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
- DB_DNODE_EXIT(db);
- db->db_state = DB_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- /*
- * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
- * processes the delete record and clears the bp while we are waiting
- * for the dn_mtx (resulting in a "no" from block_freed).
- */
- if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
- (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
- BP_IS_HOLE(db->db_blkptr)))) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
- db->db.db_size));
- bzero(db->db.db_data, db->db.db_size);
-
- if (db->db_blkptr != NULL && db->db_level > 0 &&
- BP_IS_HOLE(db->db_blkptr) &&
- db->db_blkptr->blk_birth != 0) {
- blkptr_t *bps = db->db.db_data;
- for (int i = 0; i < ((1 <<
- DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
- i++) {
- blkptr_t *bp = &bps[i];
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- 1 << dn->dn_indblkshift);
- BP_SET_LSIZE(bp,
- BP_GET_LEVEL(db->db_blkptr) == 1 ?
- dn->dn_datablksz :
- BP_GET_LSIZE(db->db_blkptr));
- BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
- BP_SET_LEVEL(bp,
- BP_GET_LEVEL(db->db_blkptr) - 1);
- BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
- }
- }
- DB_DNODE_EXIT(db);
- db->db_state = DB_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- DB_DNODE_EXIT(db);
-
- db->db_state = DB_READ;
- mutex_exit(&db->db_mtx);
-
- if (DBUF_IS_L2CACHEABLE(db))
- aflags |= ARC_FLAG_L2CACHE;
-
- SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
- db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
- db->db.db_object, db->db_level, db->db_blkid);
-
- dbuf_add_ref(db, NULL);
-
- (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
- dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
-}
-
-/*
- * This is our just-in-time copy function. It makes a copy of buffers that
- * have been modified in a previous transaction group before we access them in
- * the current active group.
- *
- * This function is used in three places: when we are dirtying a buffer for the
- * first time in a txg, when we are freeing a range in a dnode that includes
- * this buffer, and when we are accessing a buffer which was received compressed
- * and later referenced in a WRITE_BYREF record.
- *
- * Note that when we are called from dbuf_free_range() we do not put a hold on
- * the buffer, we just traverse the active dbuf list for the dnode.
- */
-static void
-dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
-{
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_level == 0);
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
-
- if (dr == NULL ||
- (dr->dt.dl.dr_data !=
- ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
- return;
-
- /*
- * If the last dirty record for this dbuf has not yet synced
- * and its referencing the dbuf data, either:
- * reset the reference to point to a new copy,
- * or (if there a no active holders)
- * just null out the current db_data pointer.
- */
- ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DMU_BONUS_BLKID) {
- /* Note that the data bufs here are zio_bufs */
- dnode_t *dn = DB_DNODE(db);
- int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
- dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
- arc_space_consume(bonuslen, ARC_SPACE_BONUS);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
- } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
- int size = arc_buf_size(db->db_buf);
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa = db->db_objset->os_spa;
- enum zio_compress compress_type =
- arc_get_compression(db->db_buf);
-
- if (compress_type == ZIO_COMPRESS_OFF) {
- dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
- } else {
- ASSERT3U(type, ==, ARC_BUFC_DATA);
- dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
- size, arc_buf_lsize(db->db_buf), compress_type);
- }
- bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
- } else {
- db->db_buf = NULL;
- dbuf_clear_data(db);
- }
-}
-
-int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
-{
- int err = 0;
- boolean_t prefetch;
- dnode_t *dn;
-
- /*
- * We don't have to hold the mutex to check db_state because it
- * can't be freed while we have a hold on the buffer.
- */
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
-
- if (db->db_state == DB_NOFILL)
- return (SET_ERROR(EIO));
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
-
- prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
- DBUF_IS_CACHEABLE(db);
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_CACHED) {
- /*
- * If the arc buf is compressed, we need to decompress it to
- * read the data. This could happen during the "zfs receive" of
- * a stream which is compressed and deduplicated.
- */
- if (db->db_buf != NULL &&
- arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
- dbuf_fix_old_data(db,
- spa_syncing_txg(dmu_objset_spa(db->db_objset)));
- err = arc_decompress(db->db_buf);
- dbuf_set_data(db, db->db_buf);
- }
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_hits);
- } else if (db->db_state == DB_UNCACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
- boolean_t need_wait = B_FALSE;
-
- if (zio == NULL &&
- db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- need_wait = B_TRUE;
- }
- dbuf_read_impl(db, zio, flags);
-
- /* dbuf_read_impl has dropped db_mtx for us */
-
- if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
-
- if (need_wait)
- err = zio_wait(zio);
- } else {
- /*
- * Another reader came in while the dbuf was in flight
- * between UNCACHED and CACHED. Either a writer will finish
- * writing the buffer (sending the dbuf to CACHED) or the
- * first reader's request will reach the read_done callback
- * and send the dbuf to CACHED. Otherwise, a failure
- * occurred and the dbuf went to UNCACHED.
- */
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
-
- /* Skip the wait per the caller's request. */
- mutex_enter(&db->db_mtx);
- if ((flags & DB_RF_NEVERWAIT) == 0) {
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
- db, zio_t *, zio);
- cv_wait(&db->db_changed, &db->db_mtx);
- }
- if (db->db_state == DB_UNCACHED)
- err = SET_ERROR(EIO);
- }
- mutex_exit(&db->db_mtx);
- }
-
- return (err);
-}
-
-static void
-dbuf_noread(dmu_buf_impl_t *db)
-{
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ || db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
- if (db->db_state == DB_UNCACHED) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa = db->db_objset->os_spa;
-
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
- db->db_state = DB_FILL;
- } else if (db->db_state == DB_NOFILL) {
- dbuf_clear_data(db);
- } else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
- }
- mutex_exit(&db->db_mtx);
-}
-
-void
-dbuf_unoverride(dbuf_dirty_record_t *dr)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
- uint64_t txg = dr->dr_txg;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- /*
- * This assert is valid because dmu_sync() expects to be called by
- * a zilog's get_data while holding a range lock. This call only
- * comes from dbuf_dirty() callers who must also hold a range lock.
- */
- ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
- ASSERT(db->db_level == 0);
-
- if (db->db_blkid == DMU_BONUS_BLKID ||
- dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
- return;
-
- ASSERT(db->db_data_pending != dr);
-
- /* free this block */
- if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
- zio_free(db->db_objset->os_spa, txg, bp);
-
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- dr->dt.dl.dr_nopwrite = B_FALSE;
-
- /*
- * Release the already-written buffer, so we leave it in
- * a consistent dirty state. Note that all callers are
- * modifying the buffer, so they will immediately do
- * another (redundant) arc_release(). Therefore, leave
- * the buf thawed to save the effort of freezing &
- * immediately re-thawing it.
- */
- arc_release(dr->dt.dl.dr_data, db);
-}
-
-/*
- * Evict (if its unreferenced) or clear (if its referenced) any level-0
- * data blocks in the free range, so that any future readers will find
- * empty blocks.
- */
-void
-dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t db_search;
- dmu_buf_impl_t *db, *db_next;
- uint64_t txg = tx->tx_txg;
- avl_index_t where;
-
- if (end_blkid > dn->dn_maxblkid &&
- !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
- end_blkid = dn->dn_maxblkid;
- dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
-
- db_search.db_level = 0;
- db_search.db_blkid = start_blkid;
- db_search.db_state = DB_SEARCH;
-
- mutex_enter(&dn->dn_dbufs_mtx);
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
- ASSERT3P(db, ==, NULL);
-
- db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
-
- for (; db != NULL; db = db_next) {
- db_next = AVL_NEXT(&dn->dn_dbufs, db);
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-
- if (db->db_level != 0 || db->db_blkid > end_blkid) {
- break;
- }
- ASSERT3U(db->db_blkid, >=, start_blkid);
-
- /* found a level 0 buffer in the range */
- mutex_enter(&db->db_mtx);
- if (dbuf_undirty(db, tx)) {
- /* mutex has been dropped and dbuf destroyed */
- continue;
- }
-
- if (db->db_state == DB_UNCACHED ||
- db->db_state == DB_NOFILL ||
- db->db_state == DB_EVICTING) {
- ASSERT(db->db.db_data == NULL);
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (db->db_state == DB_READ || db->db_state == DB_FILL) {
- /* will be handled in dbuf_read_done or dbuf_rele */
- db->db_freed_in_flight = TRUE;
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (zfs_refcount_count(&db->db_holds) == 0) {
- ASSERT(db->db_buf);
- dbuf_destroy(db);
- continue;
- }
- /* The dbuf is referenced */
-
- if (db->db_last_dirty != NULL) {
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- if (dr->dr_txg == txg) {
- /*
- * This buffer is "in-use", re-adjust the file
- * size to reflect that this buffer may
- * contain new data when we sync.
- */
- if (db->db_blkid != DMU_SPILL_BLKID &&
- db->db_blkid > dn->dn_maxblkid)
- dn->dn_maxblkid = db->db_blkid;
- dbuf_unoverride(dr);
- } else {
- /*
- * This dbuf is not dirty in the open context.
- * Either uncache it (if its not referenced in
- * the open context) or reset its contents to
- * empty.
- */
- dbuf_fix_old_data(db, txg);
- }
- }
- /* clear the contents if its cached */
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- arc_release(db->db_buf, db);
- bzero(db->db.db_data, db->db.db_size);
- arc_buf_freeze(db->db_buf);
- }
-
- mutex_exit(&db->db_mtx);
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-}
-
-void
-dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
-{
- arc_buf_t *buf, *obuf;
- int osize = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dnode_t *dn;
-
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- /* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- /*
- * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
- * is OK, because there can be no other references to the db
- * when we are changing its size, so no concurrent DB_FILL can
- * be happening.
- */
- /*
- * XXX we should be doing a dbuf_read, checking the return
- * value and returning that up to our callers
- */
- dmu_buf_will_dirty(&db->db, tx);
-
- /* create the data buffer for the new block */
- buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
-
- /* copy old block data to the new block */
- obuf = db->db_buf;
- bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
- /* zero the remainder */
- if (size > osize)
- bzero((uint8_t *)buf->b_data + osize, size - osize);
-
- mutex_enter(&db->db_mtx);
- dbuf_set_data(db, buf);
- arc_buf_destroy(obuf, db);
- db->db.db_size = size;
-
- if (db->db_level == 0) {
- ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
- db->db_last_dirty->dt.dl.dr_data = buf;
- }
- mutex_exit(&db->db_mtx);
-
- dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
- DB_DNODE_EXIT(db);
-}
-
-void
-dbuf_release_bp(dmu_buf_impl_t *db)
-{
- objset_t *os = db->db_objset;
-
- ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
- ASSERT(arc_released(os->os_phys_buf) ||
- list_link_active(&os->os_dsl_dataset->ds_synced_link));
- ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
- (void) arc_release(db->db_buf, db);
-}
-
-/*
- * We already have a dirty record for this TXG, and we are being
- * dirtied again.
- */
-static void
-dbuf_redirty(dbuf_dirty_record_t *dr)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
- /*
- * If this buffer has already been written out,
- * we now need to reset its state.
- */
- dbuf_unoverride(dr);
- if (db->db.db_object != DMU_META_DNODE_OBJECT &&
- db->db_state != DB_NOFILL) {
- /* Already released on initial dirty, so just thaw. */
- ASSERT(arc_released(db->db_buf));
- arc_buf_thaw(db->db_buf);
- }
- }
-}
-
-dbuf_dirty_record_t *
-dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn;
- objset_t *os;
- dbuf_dirty_record_t **drp, *dr;
- int drop_struct_lock = FALSE;
- int txgoff = tx->tx_txg & TXG_MASK;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- DMU_TX_DIRTY_BUF(tx, db);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- /*
- * Shouldn't dirty a regular buffer in syncing context. Private
- * objects may be dirtied in syncing context, but only if they
- * were already pre-dirtied in open context.
- */
-#ifdef DEBUG
- if (dn->dn_objset->os_dsl_dataset != NULL) {
- rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
- RW_READER, FTAG);
- }
- ASSERT(!dmu_tx_is_syncing(tx) ||
- BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
- DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
- dn->dn_objset->os_dsl_dataset == NULL);
- if (dn->dn_objset->os_dsl_dataset != NULL)
- rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
-#endif
- /*
- * We make this assert for private objects as well, but after we
- * check if we're already dirty. They are allowed to re-dirty
- * in syncing context.
- */
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- mutex_enter(&db->db_mtx);
- /*
- * XXX make this true for indirects too? The problem is that
- * transactions created with dmu_tx_create_assigned() from
- * syncing context don't bother holding ahead.
- */
- ASSERT(db->db_level != 0 ||
- db->db_state == DB_CACHED || db->db_state == DB_FILL ||
- db->db_state == DB_NOFILL);
-
- mutex_enter(&dn->dn_mtx);
- /*
- * Don't set dirtyctx to SYNC if we're just modifying this as we
- * initialize the objset.
- */
- if (dn->dn_dirtyctx == DN_UNDIRTIED) {
- if (dn->dn_objset->os_dsl_dataset != NULL) {
- rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
- RW_READER, FTAG);
- }
- if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
- dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
- DN_DIRTY_SYNC : DN_DIRTY_OPEN);
- ASSERT(dn->dn_dirtyctx_firstset == NULL);
- dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
- }
- if (dn->dn_objset->os_dsl_dataset != NULL) {
- rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
- FTAG);
- }
- }
-
- if (tx->tx_txg > dn->dn_dirty_txg)
- dn->dn_dirty_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-
- if (db->db_blkid == DMU_SPILL_BLKID)
- dn->dn_have_spill = B_TRUE;
-
- /*
- * If this buffer is already dirty, we're done.
- */
- drp = &db->db_last_dirty;
- ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
- db->db.db_object == DMU_META_DNODE_OBJECT);
- while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
- drp = &dr->dr_next;
- if (dr && dr->dr_txg == tx->tx_txg) {
- DB_DNODE_EXIT(db);
-
- dbuf_redirty(dr);
- mutex_exit(&db->db_mtx);
- return (dr);
- }
-
- /*
- * Only valid if not already dirty.
- */
- ASSERT(dn->dn_object == 0 ||
- dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- ASSERT3U(dn->dn_nlevels, >, db->db_level);
-
- /*
- * We should only be dirtying in syncing context if it's the
- * mos or we're initializing the os or it's a special object.
- * However, we are allowed to dirty in syncing context provided
- * we already dirtied it in open context. Hence we must make
- * this assertion only if we're not already dirty.
- */
- os = dn->dn_objset;
- VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
-#ifdef DEBUG
- if (dn->dn_objset->os_dsl_dataset != NULL)
- rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
- os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
- if (dn->dn_objset->os_dsl_dataset != NULL)
- rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
-#endif
- ASSERT(db->db.db_size != 0);
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- if (db->db_blkid != DMU_BONUS_BLKID) {
- dmu_objset_willuse_space(os, db->db.db_size, tx);
- }
-
- /*
- * If this buffer is dirty in an old transaction group we need
- * to make a copy of it so that the changes we make in this
- * transaction group won't leak out when we sync the older txg.
- */
- dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
- list_link_init(&dr->dr_dirty_node);
- if (db->db_level == 0) {
- void *data_old = db->db_buf;
-
- if (db->db_state != DB_NOFILL) {
- if (db->db_blkid == DMU_BONUS_BLKID) {
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db.db_data;
- } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
- /*
- * Release the data buffer from the cache so
- * that we can modify it without impacting
- * possible other users of this cached data
- * block. Note that indirect blocks and
- * private objects are not released until the
- * syncing state (since they are only modified
- * then).
- */
- arc_release(db->db_buf, db);
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db_buf;
- }
- ASSERT(data_old != NULL);
- }
- dr->dt.dl.dr_data = data_old;
- } else {
- mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
- list_create(&dr->dt.di.dr_children,
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- }
- if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
- dr->dr_accounted = db->db.db_size;
- dr->dr_dbuf = db;
- dr->dr_txg = tx->tx_txg;
- dr->dr_next = *drp;
- *drp = dr;
-
- /*
- * We could have been freed_in_flight between the dbuf_noread
- * and dbuf_dirty. We win, as though the dbuf_noread() had
- * happened after the free.
- */
- if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_blkid != DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_free_ranges[txgoff] != NULL) {
- range_tree_clear(dn->dn_free_ranges[txgoff],
- db->db_blkid, 1);
- }
- mutex_exit(&dn->dn_mtx);
- db->db_freed_in_flight = FALSE;
- }
-
- /*
- * This buffer is now part of this txg
- */
- dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
- db->db_dirtycnt += 1;
- ASSERT3U(db->db_dirtycnt, <=, 3);
-
- mutex_exit(&db->db_mtx);
-
- if (db->db_blkid == DMU_BONUS_BLKID ||
- db->db_blkid == DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- dnode_setdirty(dn, tx);
- DB_DNODE_EXIT(db);
- return (dr);
- }
-
- /*
- * The dn_struct_rwlock prevents db_blkptr from changing
- * due to a write from syncing context completing
- * while we are running, so we want to acquire it before
- * looking at db_blkptr.
- */
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
- /*
- * We need to hold the dn_struct_rwlock to make this assertion,
- * because it protects dn_phys / dn_next_nlevels from changing.
- */
- ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
- dn->dn_phys->dn_nlevels > db->db_level ||
- dn->dn_next_nlevels[txgoff] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
-
- /*
- * If we are overwriting a dedup BP, then unless it is snapshotted,
- * when we get to syncing context we will need to decrement its
- * refcount in the DDT. Prefetch the relevant DDT block so that
- * syncing context won't have to wait for the i/o.
- */
- ddt_prefetch(os->os_spa, db->db_blkptr);
-
- if (db->db_level == 0) {
- dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
- ASSERT(dn->dn_maxblkid >= db->db_blkid);
- }
-
- if (db->db_level+1 < dn->dn_nlevels) {
- dmu_buf_impl_t *parent = db->db_parent;
- dbuf_dirty_record_t *di;
- int parent_held = FALSE;
-
- if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- parent = dbuf_hold_level(dn, db->db_level+1,
- db->db_blkid >> epbs, FTAG);
- ASSERT(parent != NULL);
- parent_held = TRUE;
- }
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- ASSERT3U(db->db_level+1, ==, parent->db_level);
- di = dbuf_dirty(parent, tx);
- if (parent_held)
- dbuf_rele(parent, FTAG);
-
- mutex_enter(&db->db_mtx);
- /*
- * Since we've dropped the mutex, it's possible that
- * dbuf_undirty() might have changed this out from under us.
- */
- if (db->db_last_dirty == dr ||
- dn->dn_object == DMU_META_DNODE_OBJECT) {
- mutex_enter(&di->dt.di.dr_mtx);
- ASSERT3U(di->dr_txg, ==, tx->tx_txg);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&di->dt.di.dr_children, dr);
- mutex_exit(&di->dt.di.dr_mtx);
- dr->dr_parent = di;
- }
- mutex_exit(&db->db_mtx);
- } else {
- ASSERT(db->db_level+1 == dn->dn_nlevels);
- ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- }
-
- dnode_setdirty(dn, tx);
- DB_DNODE_EXIT(db);
- return (dr);
-}
-
-/*
- * Undirty a buffer in the transaction group referenced by the given
- * transaction. Return whether this evicted the dbuf.
- */
-static boolean_t
-dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn;
- uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr, **drp;
-
- ASSERT(txg != 0);
-
- /*
- * Due to our use of dn_nlevels below, this can only be called
- * in open context, unless we are operating on the MOS.
- * From syncing context, dn_nlevels may be different from the
- * dn_nlevels used when dbuf was dirtied.
- */
- ASSERT(db->db_objset ==
- dmu_objset_pool(db->db_objset)->dp_meta_objset ||
- txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT0(db->db_level);
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- /*
- * If this buffer is not dirty, we're done.
- */
- for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
- if (dr->dr_txg <= txg)
- break;
- if (dr == NULL || dr->dr_txg < txg)
- return (B_FALSE);
- ASSERT(dr->dr_txg == txg);
- ASSERT(dr->dr_dbuf == db);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- ASSERT(db->db.db_size != 0);
-
- dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
- dr->dr_accounted, txg);
-
- *drp = dr->dr_next;
-
- /*
- * Note that there are three places in dbuf_dirty()
- * where this dirty record may be put on a list.
- * Make sure to do a list_remove corresponding to
- * every one of those list_insert calls.
- */
- if (dr->dr_parent) {
- mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
- list_remove(&dr->dr_parent->dt.di.dr_children, dr);
- mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
- } else if (db->db_blkid == DMU_SPILL_BLKID ||
- db->db_level + 1 == dn->dn_nlevels) {
- ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
- mutex_exit(&dn->dn_mtx);
- }
- DB_DNODE_EXIT(db);
-
- if (db->db_state != DB_NOFILL) {
- dbuf_unoverride(dr);
-
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
- arc_buf_destroy(dr->dt.dl.dr_data, db);
- }
-
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
-
- if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
- dbuf_destroy(db);
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-void
-dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
-
- /*
- * Quick check for dirtyness. For already dirty blocks, this
- * reduces runtime of this function by >90%, and overall performance
- * by 50% for some workloads (e.g. file deletion with indirect blocks
- * cached).
- */
- mutex_enter(&db->db_mtx);
- dbuf_dirty_record_t *dr;
- for (dr = db->db_last_dirty;
- dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
- /*
- * It's possible that it is already dirty but not cached,
- * because there are some calls to dbuf_dirty() that don't
- * go through dmu_buf_will_dirty().
- */
- if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
- /* This dbuf is already dirty and cached. */
- dbuf_redirty(dr);
- mutex_exit(&db->db_mtx);
- return;
- }
- }
- mutex_exit(&db->db_mtx);
-
- DB_DNODE_ENTER(db);
- if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
- rf |= DB_RF_HAVESTRUCT;
- DB_DNODE_EXIT(db);
- (void) dbuf_read(db, NULL, rf);
- (void) dbuf_dirty(db, tx);
-}
-
-void
-dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- db->db_state = DB_NOFILL;
-
- dmu_buf_will_fill(db_fake, tx);
-}
-
-void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(tx->tx_txg != 0);
- ASSERT(db->db_level == 0);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
-
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
- dmu_tx_private_ok(tx));
-
- dbuf_noread(db);
- (void) dbuf_dirty(db, tx);
-}
-
-#pragma weak dmu_buf_fill_done = dbuf_fill_done
-/* ARGSUSED */
-void
-dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- mutex_enter(&db->db_mtx);
- DBUF_VERIFY(db);
-
- if (db->db_state == DB_FILL) {
- if (db->db_level == 0 && db->db_freed_in_flight) {
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- /* we were freed while filling */
- /* XXX dbuf_undirty? */
- bzero(db->db.db_data, db->db.db_size);
- db->db_freed_in_flight = FALSE;
- }
- db->db_state = DB_CACHED;
- cv_broadcast(&db->db_changed);
- }
- mutex_exit(&db->db_mtx);
-}
-
-void
-dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
- bp_embedded_type_t etype, enum zio_compress comp,
- int uncompressed_size, int compressed_size, int byteorder,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- struct dirty_leaf *dl;
- dmu_object_type_t type;
-
- if (etype == BP_EMBEDDED_TYPE_DATA) {
- ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
- SPA_FEATURE_EMBEDDED_DATA));
- }
-
- DB_DNODE_ENTER(db);
- type = DB_DNODE(db)->dn_type;
- DB_DNODE_EXIT(db);
-
- ASSERT0(db->db_level);
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-
- dmu_buf_will_not_fill(dbuf, tx);
-
- ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
- dl = &db->db_last_dirty->dt.dl;
- encode_embedded_bp_compressed(&dl->dr_overridden_by,
- data, comp, uncompressed_size, compressed_size);
- BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
- BP_SET_TYPE(&dl->dr_overridden_by, type);
- BP_SET_LEVEL(&dl->dr_overridden_by, 0);
- BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
-
- dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
-}
-
-/*
- * Directly assign a provided arc buf to a given dbuf if it's not referenced
- * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
- */
-void
-dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
-{
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(db->db_level == 0);
- ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
- ASSERT(buf != NULL);
- ASSERT(arc_buf_lsize(buf) == db->db.db_size);
- ASSERT(tx->tx_txg != 0);
-
- arc_return_buf(buf, db);
- ASSERT(arc_released(buf));
-
- mutex_enter(&db->db_mtx);
-
- while (db->db_state == DB_READ || db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
-
- ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
-
- if (db->db_state == DB_CACHED &&
- zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- bcopy(buf->b_data, db->db.db_data, db->db.db_size);
- arc_buf_destroy(buf, db);
- xuio_stat_wbuf_copied();
- return;
- }
-
- xuio_stat_wbuf_nocopy();
- if (db->db_state == DB_CACHED) {
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(db->db_buf != NULL);
- if (dr != NULL && dr->dr_txg == tx->tx_txg) {
- ASSERT(dr->dt.dl.dr_data == db->db_buf);
- if (!arc_released(db->db_buf)) {
- ASSERT(dr->dt.dl.dr_override_state ==
- DR_OVERRIDDEN);
- arc_release(db->db_buf, db);
- }
- dr->dt.dl.dr_data = buf;
- arc_buf_destroy(db->db_buf, db);
- } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
- arc_release(db->db_buf, db);
- arc_buf_destroy(db->db_buf, db);
- }
- db->db_buf = NULL;
- }
- ASSERT(db->db_buf == NULL);
- dbuf_set_data(db, buf);
- db->db_state = DB_FILL;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- dmu_buf_fill_done(&db->db, tx);
-}
-
-void
-dbuf_destroy(dmu_buf_impl_t *db)
-{
- dnode_t *dn;
- dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(zfs_refcount_is_zero(&db->db_holds));
-
- if (db->db_buf != NULL) {
- arc_buf_destroy(db->db_buf, db);
- db->db_buf = NULL;
- }
-
- if (db->db_blkid == DMU_BONUS_BLKID) {
- int slots = DB_DNODE(db)->dn_num_slots;
- int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- if (db->db.db_data != NULL) {
- zio_buf_free(db->db.db_data, bonuslen);
- arc_space_return(bonuslen, ARC_SPACE_BONUS);
- db->db_state = DB_UNCACHED;
- }
- }
-
- dbuf_clear_data(db);
-
- if (multilist_link_active(&db->db_cache_link)) {
- ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
- db->db_caching_status == DB_DBUF_METADATA_CACHE);
-
- multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
- (void) zfs_refcount_remove_many(
- &dbuf_caches[db->db_caching_status].size,
- db->db.db_size, db);
-
- if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMPDOWN(metadata_cache_count);
- } else {
- DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
- DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
- }
- db->db_caching_status = DB_NO_CACHE;
- }
-
- ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
- ASSERT(db->db_data_pending == NULL);
-
- db->db_state = DB_EVICTING;
- db->db_blkptr = NULL;
-
- /*
- * Now that db_state is DB_EVICTING, nobody else can find this via
- * the hash table. We can now drop db_mtx, which allows us to
- * acquire the dn_dbufs_mtx.
- */
- mutex_exit(&db->db_mtx);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- dndb = dn->dn_dbuf;
- if (db->db_blkid != DMU_BONUS_BLKID) {
- boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
- if (needlock)
- mutex_enter(&dn->dn_dbufs_mtx);
- avl_remove(&dn->dn_dbufs, db);
- membar_producer();
- DB_DNODE_EXIT(db);
- if (needlock)
- mutex_exit(&dn->dn_dbufs_mtx);
- /*
- * Decrementing the dbuf count means that the hold corresponding
- * to the removed dbuf is no longer discounted in dnode_move(),
- * so the dnode cannot be moved until after we release the hold.
- * The membar_producer() ensures visibility of the decremented
- * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
- * release any lock.
- */
- mutex_enter(&dn->dn_mtx);
- dnode_rele_and_unlock(dn, db, B_TRUE);
- db->db_dnode_handle = NULL;
-
- dbuf_hash_remove(db);
- } else {
- DB_DNODE_EXIT(db);
- }
-
- ASSERT(zfs_refcount_is_zero(&db->db_holds));
-
- db->db_parent = NULL;
-
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_hash_next == NULL);
- ASSERT(db->db_blkptr == NULL);
- ASSERT(db->db_data_pending == NULL);
- ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
- ASSERT(!multilist_link_active(&db->db_cache_link));
-
- kmem_cache_free(dbuf_kmem_cache, db);
- arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
-
- /*
- * If this dbuf is referenced from an indirect dbuf,
- * decrement the ref count on the indirect dbuf.
- */
- if (parent && parent != dndb) {
- mutex_enter(&parent->db_mtx);
- dbuf_rele_and_unlock(parent, db, B_TRUE);
- }
-}
-
-/*
- * Note: While bpp will always be updated if the function returns success,
- * parentp will not be updated if the dnode does not have dn_dbuf filled in;
- * this happens when the dnode is the meta-dnode, or a userused or groupused
- * object.
- */
-__attribute__((always_inline))
-static inline int
-dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
- dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
-{
- *parentp = NULL;
- *bpp = NULL;
-
- ASSERT(blkid != DMU_BONUS_BLKID);
-
- if (blkid == DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_have_spill &&
- (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
- *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
- else
- *bpp = NULL;
- dbuf_add_ref(dn->dn_dbuf, NULL);
- *parentp = dn->dn_dbuf;
- mutex_exit(&dn->dn_mtx);
- return (0);
- }
-
- int nlevels =
- (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- ASSERT3U(level * epbs, <, 64);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- /*
- * This assertion shouldn't trip as long as the max indirect block size
- * is less than 1M. The reason for this is that up to that point,
- * the number of levels required to address an entire object with blocks
- * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
- * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
- * (i.e. we can address the entire object), objects will all use at most
- * N-1 levels and the assertion won't overflow. However, once epbs is
- * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
- * enough to address an entire object, so objects will have 5 levels,
- * but then this assertion will overflow.
- *
- * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
- * need to redo this logic to handle overflows.
- */
- ASSERT(level >= nlevels ||
- ((nlevels - level - 1) * epbs) +
- highbit64(dn->dn_phys->dn_nblkptr) <= 64);
- if (level >= nlevels ||
- blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
- ((nlevels - level - 1) * epbs)) ||
- (fail_sparse &&
- blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
- /* the buffer has no parent yet */
- return (SET_ERROR(ENOENT));
- } else if (level < nlevels-1) {
- /* this block is referenced from an indirect block */
- int err;
- if (dh == NULL) {
- err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
- } else {
- __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
- blkid >> epbs, fail_sparse, FALSE, NULL,
- parentp, dh->dh_depth + 1);
- err = __dbuf_hold_impl(dh + 1);
- }
- if (err)
- return (err);
- err = dbuf_read(*parentp, NULL,
- (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
- if (err) {
- dbuf_rele(*parentp, NULL);
- *parentp = NULL;
- return (err);
- }
- *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
- (blkid & ((1ULL << epbs) - 1));
- if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
- ASSERT(BP_IS_HOLE(*bpp));
- return (0);
- } else {
- /* the block is referenced from the dnode */
- ASSERT3U(level, ==, nlevels-1);
- ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
- blkid < dn->dn_phys->dn_nblkptr);
- if (dn->dn_dbuf) {
- dbuf_add_ref(dn->dn_dbuf, NULL);
- *parentp = dn->dn_dbuf;
- }
- *bpp = &dn->dn_phys->dn_blkptr[blkid];
- return (0);
- }
-}
-
-static dmu_buf_impl_t *
-dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
- dmu_buf_impl_t *parent, blkptr_t *blkptr)
-{
- objset_t *os = dn->dn_objset;
- dmu_buf_impl_t *db, *odb;
-
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT(dn->dn_type != DMU_OT_NONE);
-
- db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
-
- db->db_objset = os;
- db->db.db_object = dn->dn_object;
- db->db_level = level;
- db->db_blkid = blkid;
- db->db_last_dirty = NULL;
- db->db_dirtycnt = 0;
- db->db_dnode_handle = dn->dn_handle;
- db->db_parent = parent;
- db->db_blkptr = blkptr;
-
- db->db_user = NULL;
- db->db_user_immediate_evict = FALSE;
- db->db_freed_in_flight = FALSE;
- db->db_pending_evict = FALSE;
-
- if (blkid == DMU_BONUS_BLKID) {
- ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
- (dn->dn_nblkptr-1) * sizeof (blkptr_t);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- db->db.db_offset = DMU_BONUS_BLKID;
- db->db_state = DB_UNCACHED;
- db->db_caching_status = DB_NO_CACHE;
- /* the bonus dbuf is not placed in the hash table */
- arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
- return (db);
- } else if (blkid == DMU_SPILL_BLKID) {
- db->db.db_size = (blkptr != NULL) ?
- BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
- db->db.db_offset = 0;
- } else {
- int blocksize =
- db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
- db->db.db_size = blocksize;
- db->db.db_offset = db->db_blkid * blocksize;
- }
-
- /*
- * Hold the dn_dbufs_mtx while we get the new dbuf
- * in the hash table *and* added to the dbufs list.
- * This prevents a possible deadlock with someone
- * trying to look up this dbuf before its added to the
- * dn_dbufs list.
- */
- mutex_enter(&dn->dn_dbufs_mtx);
- db->db_state = DB_EVICTING;
- if ((odb = dbuf_hash_insert(db)) != NULL) {
- /* someone else inserted it first */
- kmem_cache_free(dbuf_kmem_cache, db);
- mutex_exit(&dn->dn_dbufs_mtx);
- DBUF_STAT_BUMP(hash_insert_race);
- return (odb);
- }
- avl_add(&dn->dn_dbufs, db);
-
- db->db_state = DB_UNCACHED;
- db->db_caching_status = DB_NO_CACHE;
- mutex_exit(&dn->dn_dbufs_mtx);
- arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
-
- if (parent && parent != dn->dn_dbuf)
- dbuf_add_ref(parent, db);
-
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- zfs_refcount_count(&dn->dn_holds) > 0);
- (void) zfs_refcount_add(&dn->dn_holds, db);
-
- dprintf_dbuf(db, "db=%p\n", db);
-
- return (db);
-}
-
-typedef struct dbuf_prefetch_arg {
- spa_t *dpa_spa; /* The spa to issue the prefetch in. */
- zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
- int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
- int dpa_curlevel; /* The current level that we're reading */
- dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
- zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
- zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
- arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
-} dbuf_prefetch_arg_t;
-
-/*
- * Actually issue the prefetch read for the block given.
- */
-static void
-dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
-{
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
- return;
-
- arc_flags_t aflags =
- dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-
- ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
- ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
- ASSERT(dpa->dpa_zio != NULL);
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
- dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &dpa->dpa_zb);
-}
-
-/*
- * Called when an indirect block above our prefetch target is read in. This
- * will either read in the next indirect block down the tree or issue the actual
- * prefetch if the next block down is our target.
- */
-static void
-dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
- const blkptr_t *iobp, arc_buf_t *abuf, void *private)
-{
- dbuf_prefetch_arg_t *dpa = private;
-
- ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
- ASSERT3S(dpa->dpa_curlevel, >, 0);
-
- if (abuf == NULL) {
- ASSERT(zio == NULL || zio->io_error != 0);
- kmem_free(dpa, sizeof (*dpa));
- return;
- }
- ASSERT(zio == NULL || zio->io_error == 0);
-
- /*
- * The dpa_dnode is only valid if we are called with a NULL
- * zio. This indicates that the arc_read() returned without
- * first calling zio_read() to issue a physical read. Once
- * a physical read is made the dpa_dnode must be invalidated
- * as the locks guarding it may have been dropped. If the
- * dpa_dnode is still valid, then we want to add it to the dbuf
- * cache. To do so, we must hold the dbuf associated with the block
- * we just prefetched, read its contents so that we associate it
- * with an arc_buf_t, and then release it.
- */
- if (zio != NULL) {
- ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
- if (zio->io_flags & ZIO_FLAG_RAW) {
- ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
- } else {
- ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
- }
- ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
-
- dpa->dpa_dnode = NULL;
- } else if (dpa->dpa_dnode != NULL) {
- uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
- (dpa->dpa_epbs * (dpa->dpa_curlevel -
- dpa->dpa_zb.zb_level));
- dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
- dpa->dpa_curlevel, curblkid, FTAG);
- (void) dbuf_read(db, NULL,
- DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
- dbuf_rele(db, FTAG);
- }
-
- if (abuf == NULL) {
- kmem_free(dpa, sizeof(*dpa));
- return;
- }
-
- dpa->dpa_curlevel--;
-
- uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
- (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
- blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
- P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp)) {
- kmem_free(dpa, sizeof (*dpa));
- } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
- ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
- dbuf_issue_final_prefetch(dpa, bp);
- kmem_free(dpa, sizeof (*dpa));
- } else {
- arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
- zbookmark_phys_t zb;
-
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
- iter_aflags |= ARC_FLAG_L2CACHE;
-
- ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
-
- SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
- dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
-
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &iter_aflags, &zb);
- }
-
- arc_buf_destroy(abuf, private);
-}
-
-/*
- * Issue prefetch reads for the given block on the given level. If the indirect
- * blocks above that block are not in memory, we will read them in
- * asynchronously. As a result, this call never blocks waiting for a read to
- * complete.
- */
-void
-dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
- arc_flags_t aflags)
-{
- blkptr_t bp;
- int epbs, nlevels, curlevel;
- uint64_t curblkid;
-
- ASSERT(blkid != DMU_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
-
- if (blkid > dn->dn_maxblkid)
- return;
-
- if (dnode_block_freed(dn, blkid))
- return;
-
- /*
- * This dnode hasn't been written to disk yet, so there's nothing to
- * prefetch.
- */
- nlevels = dn->dn_phys->dn_nlevels;
- if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
- return;
-
- epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
- return;
-
- dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
- level, blkid);
- if (db != NULL) {
- mutex_exit(&db->db_mtx);
- /*
- * This dbuf already exists. It is either CACHED, or
- * (we assume) about to be read or filled.
- */
- return;
- }
-
- /*
- * Find the closest ancestor (indirect block) of the target block
- * that is present in the cache. In this indirect block, we will
- * find the bp that is at curlevel, curblkid.
- */
- curlevel = level;
- curblkid = blkid;
- while (curlevel < nlevels - 1) {
- int parent_level = curlevel + 1;
- uint64_t parent_blkid = curblkid >> epbs;
- dmu_buf_impl_t *db;
-
- if (dbuf_hold_impl(dn, parent_level, parent_blkid,
- FALSE, TRUE, FTAG, &db) == 0) {
- blkptr_t *bpp = db->db_buf->b_data;
- bp = bpp[P2PHASE(curblkid, 1 << epbs)];
- dbuf_rele(db, FTAG);
- break;
- }
-
- curlevel = parent_level;
- curblkid = parent_blkid;
- }
-
- if (curlevel == nlevels - 1) {
- /* No cached indirect blocks found. */
- ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
- bp = dn->dn_phys->dn_blkptr[curblkid];
- }
- if (BP_IS_HOLE(&bp))
- return;
-
- ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
-
- zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
- ZIO_FLAG_CANFAIL);
-
- dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
- dn->dn_object, level, blkid);
- dpa->dpa_curlevel = curlevel;
- dpa->dpa_prio = prio;
- dpa->dpa_aflags = aflags;
- dpa->dpa_spa = dn->dn_objset->os_spa;
- dpa->dpa_dnode = dn;
- dpa->dpa_epbs = epbs;
- dpa->dpa_zio = pio;
-
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
- dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
-
- /*
- * If we have the indirect just above us, no need to do the asynchronous
- * prefetch chain; we'll just run the last step ourselves. If we're at
- * a higher level, though, we want to issue the prefetches for all the
- * indirect blocks asynchronously, so we can go on with whatever we were
- * doing.
- */
- if (curlevel == level) {
- ASSERT3U(curblkid, ==, blkid);
- dbuf_issue_final_prefetch(dpa, &bp);
- kmem_free(dpa, sizeof (*dpa));
- } else {
- arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
- zbookmark_phys_t zb;
-
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
- iter_aflags |= ARC_FLAG_L2CACHE;
-
- SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
- dn->dn_object, curlevel, curblkid);
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- &bp, dbuf_prefetch_indirect_done, dpa, prio,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &iter_aflags, &zb);
- }
- /*
- * We use pio here instead of dpa_zio since it's possible that
- * dpa may have already been freed.
- */
- zio_nowait(pio);
-}
-
-#define DBUF_HOLD_IMPL_MAX_DEPTH 20
-
-/*
- * Helper function for __dbuf_hold_impl() to copy a buffer. Handles
- * the case of encrypted, compressed and uncompressed buffers by
- * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
- * arc_alloc_compressed_buf() or arc_alloc_buf().*
- *
- * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl().
- */
-noinline static void
-dbuf_hold_copy(struct dbuf_hold_impl_data *dh)
-{
- dnode_t *dn = dh->dh_dn;
- dmu_buf_impl_t *db = dh->dh_db;
- dbuf_dirty_record_t *dr = dh->dh_dr;
- arc_buf_t *data = dr->dt.dl.dr_data;
-
- enum zio_compress compress_type = arc_get_compression(data);
-
- if (compress_type != ZIO_COMPRESS_OFF) {
- dbuf_set_data(db, arc_alloc_compressed_buf(
- dn->dn_objset->os_spa, db, arc_buf_size(data),
- arc_buf_lsize(data), compress_type));
- } else {
- dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
- DBUF_GET_BUFC_TYPE(db), db->db.db_size));
- }
-
- bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
-}
-
-/*
- * Returns with db_holds incremented, and db_mtx not held.
- * Note: dn_struct_rwlock must be held.
- */
-static int
-__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
-{
- ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
- dh->dh_parent = NULL;
-
- ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
- ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
-
- *(dh->dh_dbp) = NULL;
-
- /* dbuf_find() returns with db_mtx held */
- dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
- dh->dh_level, dh->dh_blkid);
-
- if (dh->dh_db == NULL) {
- dh->dh_bp = NULL;
-
- if (dh->dh_fail_uncached)
- return (SET_ERROR(ENOENT));
-
- ASSERT3P(dh->dh_parent, ==, NULL);
- dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
- dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh);
- if (dh->dh_fail_sparse) {
- if (dh->dh_err == 0 &&
- dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
- dh->dh_err = SET_ERROR(ENOENT);
- if (dh->dh_err) {
- if (dh->dh_parent)
- dbuf_rele(dh->dh_parent, NULL);
- return (dh->dh_err);
- }
- }
- if (dh->dh_err && dh->dh_err != ENOENT)
- return (dh->dh_err);
- dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
- dh->dh_parent, dh->dh_bp);
- }
-
- if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
- mutex_exit(&dh->dh_db->db_mtx);
- return (SET_ERROR(ENOENT));
- }
-
- if (dh->dh_db->db_buf != NULL) {
- arc_buf_access(dh->dh_db->db_buf);
- ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
- }
-
- ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
-
- /*
- * If this buffer is currently syncing out, and we are are
- * still referencing it from db_data, we need to make a copy
- * of it in case we decide we want to dirty it again in this txg.
- */
- if (dh->dh_db->db_level == 0 &&
- dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
- dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
- dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
- dh->dh_dr = dh->dh_db->db_data_pending;
- if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf)
- dbuf_hold_copy(dh);
- }
-
- if (multilist_link_active(&dh->dh_db->db_cache_link)) {
- ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds));
- ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE ||
- dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE);
-
- multilist_remove(
- dbuf_caches[dh->dh_db->db_caching_status].cache,
- dh->dh_db);
- (void) zfs_refcount_remove_many(
- &dbuf_caches[dh->dh_db->db_caching_status].size,
- dh->dh_db->db.db_size, dh->dh_db);
-
- if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMPDOWN(metadata_cache_count);
- } else {
- DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
- DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
- dh->dh_db->db.db_size);
- }
- dh->dh_db->db_caching_status = DB_NO_CACHE;
- }
- (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
- DBUF_VERIFY(dh->dh_db);
- mutex_exit(&dh->dh_db->db_mtx);
-
- /* NOTE: we can't rele the parent until after we drop the db_mtx */
- if (dh->dh_parent)
- dbuf_rele(dh->dh_parent, NULL);
-
- ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
- ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
- ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
- *(dh->dh_dbp) = dh->dh_db;
-
- return (0);
-}
-
-/*
- * The following code preserves the recursive function dbuf_hold_impl()
- * but moves the local variables AND function arguments to the heap to
- * minimize the stack frame size. Enough space is initially allocated
- * on the stack for 20 levels of recursion.
- */
-int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
- boolean_t fail_sparse, boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp)
-{
- struct dbuf_hold_impl_data *dh;
- int error;
-
- dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) *
- DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
- __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse,
- fail_uncached, tag, dbp, 0);
-
- error = __dbuf_hold_impl(dh);
-
- kmem_free(dh, sizeof (struct dbuf_hold_impl_data) *
- DBUF_HOLD_IMPL_MAX_DEPTH);
-
- return (error);
-}
-
-static void
-__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
- dnode_t *dn, uint8_t level, uint64_t blkid,
- boolean_t fail_sparse, boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp, int depth)
-{
- dh->dh_dn = dn;
- dh->dh_level = level;
- dh->dh_blkid = blkid;
-
- dh->dh_fail_sparse = fail_sparse;
- dh->dh_fail_uncached = fail_uncached;
-
- dh->dh_tag = tag;
- dh->dh_dbp = dbp;
-
- dh->dh_db = NULL;
- dh->dh_parent = NULL;
- dh->dh_bp = NULL;
- dh->dh_err = 0;
- dh->dh_dr = NULL;
-
- dh->dh_depth = depth;
-}
-
-dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
-{
- return (dbuf_hold_level(dn, 0, blkid, tag));
-}
-
-dmu_buf_impl_t *
-dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
-{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
- return (err ? NULL : db);
-}
-
-void
-dbuf_create_bonus(dnode_t *dn)
-{
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
-}
-
-int
-dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
-
- if (db->db_blkid != DMU_SPILL_BLKID)
- return (SET_ERROR(ENOTSUP));
- if (blksz == 0)
- blksz = SPA_MINBLOCKSIZE;
- ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
- blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- dbuf_new_size(db, blksz, tx);
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
-
- return (0);
-}
-
-void
-dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
-{
- dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
-}
-
-#pragma weak dmu_buf_add_ref = dbuf_add_ref
-void
-dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
-{
- int64_t holds = zfs_refcount_add(&db->db_holds, tag);
- ASSERT3S(holds, >, 1);
-}
-
-#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
-boolean_t
-dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
- void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dmu_buf_impl_t *found_db;
- boolean_t result = B_FALSE;
-
- if (db->db_blkid == DMU_BONUS_BLKID)
- found_db = dbuf_find_bonus(os, obj);
- else
- found_db = dbuf_find(os, obj, 0, blkid);
-
- if (found_db != NULL) {
- if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
- (void) zfs_refcount_add(&db->db_holds, tag);
- result = B_TRUE;
- }
- mutex_exit(&db->db_mtx);
- }
- return (result);
-}
-
-/*
- * If you call dbuf_rele() you had better not be referencing the dnode handle
- * unless you have some other direct or indirect hold on the dnode. (An indirect
- * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
- * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
- * dnode's parent dbuf evicting its dnode handles.
- */
-void
-dbuf_rele(dmu_buf_impl_t *db, void *tag)
-{
- mutex_enter(&db->db_mtx);
- dbuf_rele_and_unlock(db, tag, B_FALSE);
-}
-
-void
-dmu_buf_rele(dmu_buf_t *db, void *tag)
-{
- dbuf_rele((dmu_buf_impl_t *)db, tag);
-}
-
-/*
- * dbuf_rele() for an already-locked dbuf. This is necessary to allow
- * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
- * argument should be set if we are already in the dbuf-evicting code
- * path, in which case we don't want to recursively evict. This allows us to
- * avoid deeply nested stacks that would have a call flow similar to this:
- *
- * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
- * ^ |
- * | |
- * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
- *
- */
-void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
-{
- int64_t holds;
- uint64_t size;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- DBUF_VERIFY(db);
-
- /*
- * Remove the reference to the dbuf before removing its hold on the
- * dnode so we can guarantee in dnode_move() that a referenced bonus
- * buffer has a corresponding dnode hold.
- */
- holds = zfs_refcount_remove(&db->db_holds, tag);
- ASSERT(holds >= 0);
-
- /*
- * We can't freeze indirects if there is a possibility that they
- * may be modified in the current syncing context.
- */
- if (db->db_buf != NULL &&
- holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
- arc_buf_freeze(db->db_buf);
- }
-
- if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_user_immediate_evict)
- dbuf_evict_user(db);
-
- if (holds == 0) {
- if (db->db_blkid == DMU_BONUS_BLKID) {
- dnode_t *dn;
- boolean_t evict_dbuf = db->db_pending_evict;
-
- /*
- * If the dnode moves here, we cannot cross this
- * barrier until the move completes.
- */
- DB_DNODE_ENTER(db);
-
- dn = DB_DNODE(db);
- atomic_dec_32(&dn->dn_dbufs_count);
-
- /*
- * Decrementing the dbuf count means that the bonus
- * buffer's dnode hold is no longer discounted in
- * dnode_move(). The dnode cannot move until after
- * the dnode_rele() below.
- */
- DB_DNODE_EXIT(db);
-
- /*
- * Do not reference db after its lock is dropped.
- * Another thread may evict it.
- */
- mutex_exit(&db->db_mtx);
-
- if (evict_dbuf)
- dnode_evict_bonus(dn);
-
- dnode_rele(dn, db);
- } else if (db->db_buf == NULL) {
- /*
- * This is a special case: we never associated this
- * dbuf with any data allocated from the ARC.
- */
- ASSERT(db->db_state == DB_UNCACHED ||
- db->db_state == DB_NOFILL);
- dbuf_destroy(db);
- } else if (arc_released(db->db_buf)) {
- /*
- * This dbuf has anonymous data associated with it.
- */
- dbuf_destroy(db);
- } else {
- boolean_t do_arc_evict = B_FALSE;
- blkptr_t bp;
- spa_t *spa = dmu_objset_spa(db->db_objset);
-
- if (!DBUF_IS_CACHEABLE(db) &&
- db->db_blkptr != NULL &&
- !BP_IS_HOLE(db->db_blkptr) &&
- !BP_IS_EMBEDDED(db->db_blkptr)) {
- do_arc_evict = B_TRUE;
- bp = *db->db_blkptr;
- }
-
- if (!DBUF_IS_CACHEABLE(db) ||
- db->db_pending_evict) {
- dbuf_destroy(db);
- } else if (!multilist_link_active(&db->db_cache_link)) {
- ASSERT3U(db->db_caching_status, ==,
- DB_NO_CACHE);
-
- dbuf_cached_state_t dcs =
- dbuf_include_in_metadata_cache(db) ?
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
- db->db_caching_status = dcs;
-
- multilist_insert(dbuf_caches[dcs].cache, db);
- size = zfs_refcount_add_many(
- &dbuf_caches[dcs].size, db->db.db_size, db);
-
- if (dcs == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMP(metadata_cache_count);
- DBUF_STAT_MAX(
- metadata_cache_size_bytes_max,
- size);
- } else {
- DBUF_STAT_BUMP(
- cache_levels[db->db_level]);
- DBUF_STAT_BUMP(cache_count);
- DBUF_STAT_INCR(
- cache_levels_bytes[db->db_level],
- db->db.db_size);
- DBUF_STAT_MAX(cache_size_bytes_max,
- size);
- }
- mutex_exit(&db->db_mtx);
-
- if (dcs == DB_DBUF_CACHE && !evicting)
- dbuf_evict_notify(size);
- }
-
- if (do_arc_evict)
- arc_freed(spa, &bp);
- }
- } else {
- mutex_exit(&db->db_mtx);
- }
-
-}
-
-#pragma weak dmu_buf_refcount = dbuf_refcount
-uint64_t
-dbuf_refcount(dmu_buf_impl_t *db)
-{
- return (zfs_refcount_count(&db->db_holds));
-}
-
-void *
-dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
- dmu_buf_user_t *new_user)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- mutex_enter(&db->db_mtx);
- dbuf_verify_user(db, DBVU_NOT_EVICTING);
- if (db->db_user == old_user)
- db->db_user = new_user;
- else
- old_user = db->db_user;
- dbuf_verify_user(db, DBVU_NOT_EVICTING);
- mutex_exit(&db->db_mtx);
-
- return (old_user);
-}
-
-void *
-dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
-{
- return (dmu_buf_replace_user(db_fake, NULL, user));
-}
-
-void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- db->db_user_immediate_evict = TRUE;
- return (dmu_buf_set_user(db_fake, user));
-}
-
-void *
-dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
-{
- return (dmu_buf_replace_user(db_fake, user, NULL));
-}
-
-void *
-dmu_buf_get_user(dmu_buf_t *db_fake)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- dbuf_verify_user(db, DBVU_NOT_EVICTING);
- return (db->db_user);
-}
-
-void
-dmu_buf_user_evict_wait()
-{
- taskq_wait(dbu_evict_taskq);
-}
-
-blkptr_t *
-dmu_buf_get_blkptr(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- return (dbi->db_blkptr);
-}
-
-objset_t *
-dmu_buf_get_objset(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- return (dbi->db_objset);
-}
-
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_ENTER(dbi);
- return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_EXIT(dbi);
-}
-
-static void
-dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
-{
- /* ASSERT(dmu_tx_is_syncing(tx) */
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_blkptr != NULL)
- return;
-
- if (db->db_blkid == DMU_SPILL_BLKID) {
- db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
- BP_ZERO(db->db_blkptr);
- return;
- }
- if (db->db_level == dn->dn_phys->dn_nlevels-1) {
- /*
- * This buffer was allocated at a time when there was
- * no available blkptrs from the dnode, or it was
- * inappropriate to hook it in (i.e., nlevels mis-match).
- */
- ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
- ASSERT(db->db_parent == NULL);
- db->db_parent = dn->dn_dbuf;
- db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
- DBUF_VERIFY(db);
- } else {
- dmu_buf_impl_t *parent = db->db_parent;
- int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- ASSERT(dn->dn_phys->dn_nlevels > 1);
- if (parent == NULL) {
- mutex_exit(&db->db_mtx);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- parent = dbuf_hold_level(dn, db->db_level + 1,
- db->db_blkid >> epbs, db);
- rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- db->db_parent = parent;
- }
- db->db_blkptr = (blkptr_t *)parent->db.db_data +
- (db->db_blkid & ((1ULL << epbs) - 1));
- DBUF_VERIFY(db);
- }
-}
-
-/*
- * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
- * is critical the we not allow the compiler to inline this function in to
- * dbuf_sync_list() thereby drastically bloating the stack usage.
- */
-noinline static void
-dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- zio_t *zio;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
-
- ASSERT(db->db_level > 0);
- DBUF_VERIFY(db);
-
- /* Read the block if it hasn't been read yet. */
- if (db->db_buf == NULL) {
- mutex_exit(&db->db_mtx);
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- mutex_enter(&db->db_mtx);
- }
- ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT(db->db_buf != NULL);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- /* Indirect block size must match what the dnode thinks it is. */
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- dbuf_check_blkptr(dn, db);
- DB_DNODE_EXIT(db);
-
- /* Provide the pending dirty record to child dbufs */
- db->db_data_pending = dr;
-
- mutex_exit(&db->db_mtx);
-
- dbuf_write(dr, db->db_buf, tx);
-
- zio = dr->dr_zio;
- mutex_enter(&dr->dt.di.dr_mtx);
- dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- mutex_exit(&dr->dt.di.dr_mtx);
- zio_nowait(zio);
-}
-
-/*
- * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
- * critical the we not allow the compiler to inline this function in to
- * dbuf_sync_list() thereby drastically bloating the stack usage.
- */
-noinline static void
-dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- arc_buf_t **datap = &dr->dt.dl.dr_data;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- objset_t *os;
- uint64_t txg = tx->tx_txg;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
- /*
- * To be synced, we must be dirtied. But we
- * might have been freed after the dirty.
- */
- if (db->db_state == DB_UNCACHED) {
- /* This buffer has been freed since it was dirtied */
- ASSERT(db->db.db_data == NULL);
- } else if (db->db_state == DB_FILL) {
- /* This buffer was freed and is now being re-filled */
- ASSERT(db->db.db_data != dr->dt.dl.dr_data);
- } else {
- ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
- }
- DBUF_VERIFY(db);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- if (db->db_blkid == DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
- /*
- * In the previous transaction group, the bonus buffer
- * was entirely used to store the attributes for the
- * dnode which overrode the dn_spill field. However,
- * when adding more attributes to the file a spill
- * block was required to hold the extra attributes.
- *
- * Make sure to clear the garbage left in the dn_spill
- * field from the previous attributes in the bonus
- * buffer. Otherwise, after writing out the spill
- * block to the new allocated dva, it will free
- * the old block pointed to by the invalid dn_spill.
- */
- db->db_blkptr = NULL;
- }
- dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
- mutex_exit(&dn->dn_mtx);
- }
-
- /*
- * If this is a bonus buffer, simply copy the bonus data into the
- * dnode. It will be written out when the dnode is synced (and it
- * will be synced, since it must have been dirty for dbuf_sync to
- * be called).
- */
- if (db->db_blkid == DMU_BONUS_BLKID) {
- dbuf_dirty_record_t **drp;
-
- ASSERT(*datap != NULL);
- ASSERT0(db->db_level);
- ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
- DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
- bcopy(*datap, DN_BONUS(dn->dn_phys),
- DN_MAX_BONUS_LEN(dn->dn_phys));
- DB_DNODE_EXIT(db);
-
- if (*datap != db->db.db_data) {
- int slots = DB_DNODE(db)->dn_num_slots;
- int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- zio_buf_free(*datap, bonuslen);
- arc_space_return(bonuslen, ARC_SPACE_BONUS);
- }
- db->db_data_pending = NULL;
- drp = &db->db_last_dirty;
- while (*drp != dr)
- drp = &(*drp)->dr_next;
- ASSERT(dr->dr_next == NULL);
- ASSERT(dr->dr_dbuf == db);
- *drp = dr->dr_next;
- if (dr->dr_dbuf->db_level != 0) {
- mutex_destroy(&dr->dt.di.dr_mtx);
- list_destroy(&dr->dt.di.dr_children);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
- return;
- }
-
- os = dn->dn_objset;
-
- /*
- * This function may have dropped the db_mtx lock allowing a dmu_sync
- * operation to sneak in. As a result, we need to ensure that we
- * don't check the dr_override_state until we have returned from
- * dbuf_check_blkptr.
- */
- dbuf_check_blkptr(dn, db);
-
- /*
- * If this buffer is in the middle of an immediate write,
- * wait for the synchronous IO to complete.
- */
- while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
- cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
- }
-
- if (db->db_state != DB_NOFILL &&
- dn->dn_object != DMU_META_DNODE_OBJECT &&
- zfs_refcount_count(&db->db_holds) > 1 &&
- dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
- *datap == db->db_buf) {
- /*
- * If this buffer is currently "in use" (i.e., there
- * are active holds and db_data still references it),
- * then make a copy before we start the write so that
- * any modifications from the open txg will not leak
- * into this write.
- *
- * NOTE: this copy does not need to be made for
- * objects only modified in the syncing context (e.g.
- * DNONE_DNODE blocks).
- */
- int psize = arc_buf_size(*datap);
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- enum zio_compress compress_type = arc_get_compression(*datap);
-
- if (compress_type == ZIO_COMPRESS_OFF) {
- *datap = arc_alloc_buf(os->os_spa, db, type, psize);
- } else {
- ASSERT3U(type, ==, ARC_BUFC_DATA);
- int lsize = arc_buf_lsize(*datap);
- *datap = arc_alloc_compressed_buf(os->os_spa, db,
- psize, lsize, compress_type);
- }
- bcopy(db->db.db_data, (*datap)->b_data, psize);
- }
- db->db_data_pending = dr;
-
- mutex_exit(&db->db_mtx);
-
- dbuf_write(dr, *datap, tx);
-
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT) {
- list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- DB_DNODE_EXIT(db);
- } else {
- /*
- * Although zio_nowait() does not "wait for an IO", it does
- * initiate the IO. If this is an empty write it seems plausible
- * that the IO could actually be completed before the nowait
- * returns. We need to DB_DNODE_EXIT() first in case
- * zio_nowait() invalidates the dbuf.
- */
- DB_DNODE_EXIT(db);
- zio_nowait(dr->dr_zio);
- }
-}
-
-void
-dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
-{
- dbuf_dirty_record_t *dr;
-
- while (dr = list_head(list)) {
- if (dr->dr_zio != NULL) {
- /*
- * If we find an already initialized zio then we
- * are processing the meta-dnode, and we have finished.
- * The dbufs for all dnodes are put back on the list
- * during processing, so that we can zio_wait()
- * these IOs after initiating all child IOs.
- */
- ASSERT3U(dr->dr_dbuf->db.db_object, ==,
- DMU_META_DNODE_OBJECT);
- break;
- }
- if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- VERIFY3U(dr->dr_dbuf->db_level, ==, level);
- }
- list_remove(list, dr);
- if (dr->dr_dbuf->db_level > 0)
- dbuf_sync_indirect(dr, tx);
- else
- dbuf_sync_leaf(dr, tx);
- }
-}
-
-/* ARGSUSED */
-static void
-dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- dnode_t *dn;
- blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- spa_t *spa = zio->io_spa;
- int64_t delta;
- uint64_t fill = 0;
- int i;
-
- ASSERT3P(db->db_blkptr, !=, NULL);
- ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
- dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
- zio->io_prev_space_delta = delta;
-
- if (bp->blk_birth != 0) {
- ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_type) ||
- (db->db_blkid == DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_bonustype) ||
- BP_IS_EMBEDDED(bp));
- ASSERT(BP_GET_LEVEL(bp) == db->db_level);
- }
-
- mutex_enter(&db->db_mtx);
-
-#ifdef ZFS_DEBUG
- if (db->db_blkid == DMU_SPILL_BLKID) {
- ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
- ASSERT(!(BP_IS_HOLE(bp)) &&
- db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
- }
-#endif
-
- if (db->db_level == 0) {
- mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
- db->db_blkid != DMU_SPILL_BLKID)
- dn->dn_phys->dn_maxblkid = db->db_blkid;
- mutex_exit(&dn->dn_mtx);
-
- if (dn->dn_type == DMU_OT_DNODE) {
- i = 0;
- while (i < db->db.db_size) {
- dnode_phys_t *dnp =
- (void *)(((char *)db->db.db_data) + i);
-
- i += DNODE_MIN_SIZE;
- if (dnp->dn_type != DMU_OT_NONE) {
- fill++;
- i += dnp->dn_extra_slots *
- DNODE_MIN_SIZE;
- }
- }
- } else {
- if (BP_IS_HOLE(bp)) {
- fill = 0;
- } else {
- fill = 1;
- }
- }
- } else {
- blkptr_t *ibp = db->db.db_data;
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
- if (BP_IS_HOLE(ibp))
- continue;
- fill += BP_GET_FILL(ibp);
- }
- }
- DB_DNODE_EXIT(db);
-
- if (!BP_IS_EMBEDDED(bp))
- bp->blk_fill = fill;
-
- mutex_exit(&db->db_mtx);
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- *db->db_blkptr = *bp;
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-/* ARGSUSED */
-/*
- * This function gets called just prior to running through the compression
- * stage of the zio pipeline. If we're an indirect block comprised of only
- * holes, then we want this indirect to be compressed away to a hole. In
- * order to do that we must zero out any information about the holes that
- * this indirect points to prior to before we try to compress it.
- */
-static void
-dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- dnode_t *dn;
- blkptr_t *bp;
- unsigned int epbs, i;
-
- ASSERT3U(db->db_level, >, 0);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(epbs, <, 31);
-
- /* Determine if all our children are holes */
- for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
- if (!BP_IS_HOLE(bp))
- break;
- }
-
- /*
- * If all the children are holes, then zero them all out so that
- * we may get compressed away.
- */
- if (i == 1 << epbs) {
- /*
- * We only found holes. Grab the rwlock to prevent
- * anybody from reading the blocks we're about to
- * zero out.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- bzero(db->db.db_data, db->db.db_size);
- rw_exit(&dn->dn_struct_rwlock);
- }
- DB_DNODE_EXIT(db);
-}
-
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times). This
- * allows the DMU to monitor the progress of each logical i/o. For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block. There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-/* ARGSUSED */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
- dmu_buf_impl_t *db = arg;
- objset_t *os = db->db_objset;
- dsl_pool_t *dp = dmu_objset_pool(os);
- dbuf_dirty_record_t *dr;
- int delta = 0;
-
- dr = db->db_data_pending;
- ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
- /*
- * The callback will be called io_phys_children times. Retire one
- * portion of our dirty space each time we are called. Any rounding
- * error will be cleaned up by dsl_pool_sync()'s call to
- * dsl_pool_undirty_space().
- */
- delta = dr->dr_accounted / zio->io_phys_children;
- dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
-/* ARGSUSED */
-static void
-dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- blkptr_t *bp = db->db_blkptr;
- objset_t *os = db->db_objset;
- dmu_tx_t *tx = os->os_synctx;
- dbuf_dirty_record_t **drp, *dr;
-
- ASSERT0(zio->io_error);
- ASSERT(db->db_blkptr == bp);
-
- /*
- * For nopwrites and rewrites we ensure that the bp matches our
- * original and bypass all the accounting.
- */
- if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
- ASSERT(BP_EQUAL(bp, bp_orig));
- } else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
- dsl_dataset_block_born(ds, bp, tx);
- }
-
- mutex_enter(&db->db_mtx);
-
- DBUF_VERIFY(db);
-
- drp = &db->db_last_dirty;
- while ((dr = *drp) != db->db_data_pending)
- drp = &dr->dr_next;
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- ASSERT(dr->dr_dbuf == db);
- ASSERT(dr->dr_next == NULL);
- *drp = dr->dr_next;
-
-#ifdef ZFS_DEBUG
- if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
- ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
- db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
- DB_DNODE_EXIT(db);
- }
-#endif
-
- if (db->db_level == 0) {
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
- if (db->db_state != DB_NOFILL) {
- if (dr->dt.dl.dr_data != db->db_buf)
- arc_buf_destroy(dr->dt.dl.dr_data, db);
- }
- } else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
- if (!BP_IS_HOLE(db->db_blkptr)) {
- int epbs =
- dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(db->db_blkid, <=,
- dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- db->db.db_size);
- }
- DB_DNODE_EXIT(db);
- mutex_destroy(&dr->dt.di.dr_mtx);
- list_destroy(&dr->dt.di.dr_children);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- cv_broadcast(&db->db_changed);
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- db->db_data_pending = NULL;
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
-}
-
-static void
-dbuf_write_nofill_ready(zio_t *zio)
-{
- dbuf_write_ready(zio, NULL, zio->io_private);
-}
-
-static void
-dbuf_write_nofill_done(zio_t *zio)
-{
- dbuf_write_done(zio, NULL, zio->io_private);
-}
-
-static void
-dbuf_write_override_ready(zio_t *zio)
-{
- dbuf_dirty_record_t *dr = zio->io_private;
- dmu_buf_impl_t *db = dr->dr_dbuf;
-
- dbuf_write_ready(zio, NULL, db);
-}
-
-static void
-dbuf_write_override_done(zio_t *zio)
-{
- dbuf_dirty_record_t *dr = zio->io_private;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
-
- mutex_enter(&db->db_mtx);
- if (!BP_EQUAL(zio->io_bp, obp)) {
- if (!BP_IS_HOLE(obp))
- dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
- arc_release(dr->dt.dl.dr_data, db);
- }
- mutex_exit(&db->db_mtx);
- dbuf_write_done(zio, NULL, db);
-
- if (zio->io_abd != NULL)
- abd_put(zio->io_abd);
-}
-
-typedef struct dbuf_remap_impl_callback_arg {
- objset_t *drica_os;
- uint64_t drica_blk_birth;
- dmu_tx_t *drica_tx;
-} dbuf_remap_impl_callback_arg_t;
-
-static void
-dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
- void *arg)
-{
- dbuf_remap_impl_callback_arg_t *drica = arg;
- objset_t *os = drica->drica_os;
- spa_t *spa = dmu_objset_spa(os);
- dmu_tx_t *tx = drica->drica_tx;
-
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
-
- if (os == spa_meta_objset(spa)) {
- spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
- } else {
- dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
- size, drica->drica_blk_birth, tx);
- }
-}
-
-static void
-dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
-{
- blkptr_t bp_copy = *bp;
- spa_t *spa = dmu_objset_spa(dn->dn_objset);
- dbuf_remap_impl_callback_arg_t drica;
-
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
-
- drica.drica_os = dn->dn_objset;
- drica.drica_blk_birth = bp->blk_birth;
- drica.drica_tx = tx;
- if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
- &drica)) {
- /*
- * The struct_rwlock prevents dbuf_read_impl() from
- * dereferencing the BP while we are changing it. To
- * avoid lock contention, only grab it when we are actually
- * changing the BP.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- *bp = bp_copy;
- rw_exit(&dn->dn_struct_rwlock);
- }
-}
-
-/*
- * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
- * to remap a copy of every bp in the dbuf.
- */
-boolean_t
-dbuf_can_remap(const dmu_buf_impl_t *db)
-{
- spa_t *spa = dmu_objset_spa(db->db_objset);
- blkptr_t *bp = db->db.db_data;
- boolean_t ret = B_FALSE;
-
- ASSERT3U(db->db_level, >, 0);
- ASSERT3S(db->db_state, ==, DB_CACHED);
-
- ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
- blkptr_t bp_copy = bp[i];
- if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
- ret = B_TRUE;
- break;
- }
- }
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- return (ret);
-}
-
-boolean_t
-dnode_needs_remap(const dnode_t *dn)
-{
- spa_t *spa = dmu_objset_spa(dn->dn_objset);
- boolean_t ret = B_FALSE;
-
- if (dn->dn_phys->dn_nlevels == 0) {
- return (B_FALSE);
- }
-
- ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
- blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
- if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
- ret = B_TRUE;
- break;
- }
- }
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- return (ret);
-}
-
-/*
- * Remap any existing BP's to concrete vdevs, if possible.
- */
-static void
-dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(db->db_objset);
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
- return;
-
- if (db->db_level > 0) {
- blkptr_t *bp = db->db.db_data;
- for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
- dbuf_remap_impl(dn, &bp[i], tx);
- }
- } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
- dnode_phys_t *dnp = db->db.db_data;
- ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
- DMU_OT_DNODE);
- for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
- i += dnp[i].dn_extra_slots + 1) {
- for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
- dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
- }
- }
- }
-}
-
-
-/* Issue I/O to commit a dirty buffer to disk. */
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- objset_t *os;
- dmu_buf_impl_t *parent = db->db_parent;
- uint64_t txg = tx->tx_txg;
- zbookmark_phys_t zb;
- zio_prop_t zp;
- zio_t *zio;
- int wp_flag = 0;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- os = dn->dn_objset;
-
- if (db->db_state != DB_NOFILL) {
- if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- if (BP_IS_HOLE(db->db_blkptr)) {
- arc_buf_thaw(data);
- } else {
- dbuf_release_bp(db);
- }
- dbuf_remap(dn, db, tx);
- }
- }
-
- if (parent != dn->dn_dbuf) {
- /* Our parent is an indirect block. */
- /* We have a dirty parent that has been scheduled for write. */
- ASSERT(parent && parent->db_data_pending);
- /* Our parent's buffer is one level closer to the dnode. */
- ASSERT(db->db_level == parent->db_level-1);
- /*
- * We're about to modify our parent's db_data by modifying
- * our block pointer, so the parent must be released.
- */
- ASSERT(arc_released(parent->db_buf));
- zio = parent->db_data_pending->dr_zio;
- } else {
- /* Our parent is the dnode itself. */
- ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
- db->db_blkid != DMU_SPILL_BLKID) ||
- (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
- if (db->db_blkid != DMU_SPILL_BLKID)
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- zio = dn->dn_zio;
- }
-
- ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
- ASSERT(zio);
-
- SET_BOOKMARK(&zb, os->os_dsl_dataset ?
- os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
- db->db.db_object, db->db_level, db->db_blkid);
-
- if (db->db_blkid == DMU_SPILL_BLKID)
- wp_flag = WP_SPILL;
- wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
-
- dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
- DB_DNODE_EXIT(db);
-
- /*
- * We copy the blkptr now (rather than when we instantiate the dirty
- * record), because its value can change between open context and
- * syncing context. We do not need to hold dn_struct_rwlock to read
- * db_blkptr because we are in syncing context.
- */
- dr->dr_bp_copy = *db->db_blkptr;
-
- if (db->db_level == 0 &&
- dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * The BP for this block has been provided by open context
- * (by dmu_sync() or dmu_buf_write_embedded()).
- */
- abd_t *contents = (data != NULL) ?
- abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
-
- dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
- contents, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_override_ready, NULL, NULL,
- dbuf_write_override_done,
- dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- mutex_enter(&db->db_mtx);
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
- dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
- mutex_exit(&db->db_mtx);
- } else if (db->db_state == DB_NOFILL) {
- ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
- zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
- dr->dr_zio = zio_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_nofill_ready, NULL, NULL,
- dbuf_write_nofill_done, db,
- ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
- } else {
- ASSERT(arc_released(data));
-
- /*
- * For indirect blocks, we want to setup the children
- * ready callback so that we can properly handle an indirect
- * block that only contains holes.
- */
- arc_write_done_func_t *children_ready_cb = NULL;
- if (db->db_level != 0)
- children_ready_cb = dbuf_write_children_ready;
-
- dr->dr_zio = arc_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
- &zp, dbuf_write_ready, children_ready_cb,
- dbuf_write_physdone, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c
@@ -1,242 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_objset.h>
-
-/*
- * Calculate the index of the arc header for the state, disabled by default.
- */
-int zfs_dbuf_state_index = 0;
-
-/*
- * ==========================================================================
- * Dbuf Hash Read Routines
- * ==========================================================================
- */
-typedef struct dbuf_stats_t {
- kmutex_t lock;
- kstat_t *kstat;
- dbuf_hash_table_t *hash;
- int idx;
-} dbuf_stats_t;
-
-static dbuf_stats_t dbuf_stats_hash_table;
-
-static int
-dbuf_stats_hash_table_headers(char *buf, size_t size)
-{
- size = snprintf(buf, size - 1,
- "%-88s | %-124s | %s\n"
- "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
- "%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
- "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
- "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
- "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
- "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
- "atype", "index", "flags", "count", "asize", "access", "mru", "gmru",
- "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds",
- "dtype", "btype", "data_bs", "meta_bs", "bsize",
- "lvls", "dholds", "blocks", "dsize");
- buf[size] = '\0';
-
- return (0);
-}
-
-int
-__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
-{
- arc_buf_info_t abi = { 0 };
- dmu_object_info_t doi = { 0 };
- dnode_t *dn = DB_DNODE(db);
-
- if (db->db_buf)
- arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
-
- if (dn)
- __dmu_object_info_from_dnode(dn, &doi);
-
- size = snprintf(buf, size - 1,
- "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
- "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
- "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
- "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
- /* dmu_buf_impl_t */
- spa_name(dn->dn_objset->os_spa),
- (u_longlong_t)dmu_objset_id(db->db_objset),
- (longlong_t)db->db.db_object,
- (longlong_t)db->db_level,
- (longlong_t)db->db_blkid,
- (u_longlong_t)db->db.db_offset,
- (u_longlong_t)db->db.db_size,
- !!dbuf_is_metadata(db),
- db->db_state,
- (ulong_t)zfs_refcount_count(&db->db_holds),
- /* arc_buf_info_t */
- abi.abi_state_type,
- abi.abi_state_contents,
- (longlong_t)abi.abi_state_index,
- abi.abi_flags,
- (ulong_t)abi.abi_bufcnt,
- (u_longlong_t)abi.abi_size,
- (u_longlong_t)abi.abi_access,
- (ulong_t)abi.abi_mru_hits,
- (ulong_t)abi.abi_mru_ghost_hits,
- (ulong_t)abi.abi_mfu_hits,
- (ulong_t)abi.abi_mfu_ghost_hits,
- (ulong_t)abi.abi_l2arc_hits,
- (u_longlong_t)abi.abi_l2arc_dattr,
- (u_longlong_t)abi.abi_l2arc_asize,
- abi.abi_l2arc_compress,
- (ulong_t)abi.abi_holds,
- /* dmu_object_info_t */
- doi.doi_type,
- doi.doi_bonus_type,
- (ulong_t)doi.doi_data_block_size,
- (ulong_t)doi.doi_metadata_block_size,
- (u_longlong_t)doi.doi_bonus_size,
- (ulong_t)doi.doi_indirection,
- (ulong_t)zfs_refcount_count(&dn->dn_holds),
- (u_longlong_t)doi.doi_fill_count,
- (u_longlong_t)doi.doi_max_offset);
- buf[size] = '\0';
-
- return (size);
-}
-
-static int
-dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
-{
- dbuf_stats_t *dsh = (dbuf_stats_t *)data;
- dbuf_hash_table_t *h = dsh->hash;
- dmu_buf_impl_t *db;
- int length, error = 0;
-
- ASSERT3S(dsh->idx, >=, 0);
- ASSERT3S(dsh->idx, <=, h->hash_table_mask);
- memset(buf, 0, size);
-
- mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
- for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
- /*
- * Returning ENOMEM will cause the data and header functions
- * to be called with a larger scratch buffers.
- */
- if (size < 512) {
- error = ENOMEM;
- break;
- }
-
- mutex_enter(&db->db_mtx);
- mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
-
- length = __dbuf_stats_hash_table_data(buf, size, db);
- buf += length;
- size -= length;
-
- mutex_exit(&db->db_mtx);
- mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
- }
- mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
-
- return (error);
-}
-
-static void *
-dbuf_stats_hash_table_addr(kstat_t *ksp, off_t n)
-{
- dbuf_stats_t *dsh = ksp->ks_private;
-
- ASSERT(MUTEX_HELD(&dsh->lock));
-
- if (n <= dsh->hash->hash_table_mask) {
- dsh->idx = n;
- return (dsh);
- }
-
- return (NULL);
-}
-
-#ifndef __FreeBSD__
-/*
- * XXX The FreeBSD SPL is missing support for KSTAT_TYPE_RAW
- * we can enable this as soon as that's implemented. See the
- * lindebugfs module for similar callback semantics.
- */
-static void
-dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
-{
- dbuf_stats_t *dsh = &dbuf_stats_hash_table;
- kstat_t *ksp;
-
- mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
- dsh->hash = hash;
-
- ksp = kstat_create("zfs", 0, "dbufs", "misc",
- KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
- dsh->kstat = ksp;
-
- if (ksp) {
- ksp->ks_lock = &dsh->lock;
- ksp->ks_ndata = UINT32_MAX;
- ksp->ks_private = dsh;
- kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
- dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
- kstat_install(ksp);
- }
-}
-
-static void
-dbuf_stats_hash_table_destroy(void)
-{
- dbuf_stats_t *dsh = &dbuf_stats_hash_table;
- kstat_t *ksp;
-
- ksp = dsh->kstat;
- if (ksp)
- kstat_delete(ksp);
-
- mutex_destroy(&dsh->lock);
-}
-#else
-static void
-dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
-{
-}
-
-static void
-dbuf_stats_hash_table_destroy(void)
-{
-}
-#endif
-
-void
-dbuf_stats_init(dbuf_hash_table_t *hash)
-{
- dbuf_stats_hash_table_init(hash);
-}
-
-void
-dbuf_stats_destroy(void)
-{
- dbuf_stats_hash_table_destroy();
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
@@ -1,1189 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/ddt.h>
-#include <sys/zap.h>
-#include <sys/dmu_tx.h>
-#include <sys/arc.h>
-#include <sys/dsl_pool.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/dsl_scan.h>
-#include <sys/abd.h>
-
-/*
- * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
- */
-int zfs_dedup_prefetch = 1;
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS DEDUP");
-SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch,
- 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
-
-static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
- &ddt_zap_ops,
-};
-
-static const char *ddt_class_name[DDT_CLASSES] = {
- "ditto",
- "duplicate",
- "unique",
-};
-
-static void
-ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_tx_t *tx)
-{
- spa_t *spa = ddt->ddt_spa;
- objset_t *os = ddt->ddt_os;
- uint64_t *objectp = &ddt->ddt_object[type][class];
- boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
- ZCHECKSUM_FLAG_DEDUP;
- char name[DDT_NAMELEN];
-
- ddt_object_name(ddt, type, class, name);
-
- ASSERT(*objectp == 0);
- VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
- ASSERT(*objectp != 0);
-
- VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
- sizeof (uint64_t), 1, objectp, tx) == 0);
-
- VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
- sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class], tx) == 0);
-}
-
-static void
-ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_tx_t *tx)
-{
- spa_t *spa = ddt->ddt_spa;
- objset_t *os = ddt->ddt_os;
- uint64_t *objectp = &ddt->ddt_object[type][class];
- uint64_t count;
- char name[DDT_NAMELEN];
-
- ddt_object_name(ddt, type, class, name);
-
- ASSERT(*objectp != 0);
- VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
- ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
- VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
- VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
- VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
- bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
-
- *objectp = 0;
-}
-
-static int
-ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
-{
- ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
- dmu_object_info_t doi;
- uint64_t count;
- char name[DDT_NAMELEN];
- int error;
-
- ddt_object_name(ddt, type, class, name);
-
- error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
- sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
-
- if (error != 0)
- return (error);
-
- VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
- sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class]));
-
- /*
- * Seed the cached statistics.
- */
- VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
-
- error = ddt_object_count(ddt, type, class, &count);
- if (error)
- return error;
-
- ddo->ddo_count = count;
- ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
- ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
-
- return (0);
-}
-
-static void
-ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_tx_t *tx)
-{
- ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
- dmu_object_info_t doi;
- uint64_t count;
- char name[DDT_NAMELEN];
-
- ddt_object_name(ddt, type, class, name);
-
- VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
- sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class], tx) == 0);
-
- /*
- * Cache DDT statistics; this is the only time they'll change.
- */
- VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
- VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
-
- ddo->ddo_count = count;
- ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
- ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
-}
-
-static int
-ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde)
-{
- if (!ddt_object_exists(ddt, type, class))
- return (SET_ERROR(ENOENT));
-
- return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
- ddt->ddt_object[type][class], dde));
-}
-
-static void
-ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde)
-{
- if (!ddt_object_exists(ddt, type, class))
- return;
-
- ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
- ddt->ddt_object[type][class], dde);
-}
-
-int
-ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde, dmu_tx_t *tx)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, tx));
-}
-
-static int
-ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde, dmu_tx_t *tx)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, tx));
-}
-
-int
-ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- uint64_t *walk, ddt_entry_t *dde)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, walk));
-}
-
-int
-ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
- ddt->ddt_object[type][class], count));
-}
-
-int
-ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_object_info_t *doi)
-{
- if (!ddt_object_exists(ddt, type, class))
- return (SET_ERROR(ENOENT));
-
- return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
- doi));
-}
-
-boolean_t
-ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
-{
- return (!!ddt->ddt_object[type][class]);
-}
-
-void
-ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- char *name)
-{
- (void) sprintf(name, DMU_POOL_DDT,
- zio_checksum_table[ddt->ddt_checksum].ci_name,
- ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
-}
-
-void
-ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
-{
- ASSERT(txg != 0);
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- bp->blk_dva[d] = ddp->ddp_dva[d];
- BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
-}
-
-void
-ddt_bp_create(enum zio_checksum checksum,
- const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
-{
- BP_ZERO(bp);
-
- if (ddp != NULL)
- ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
-
- bp->blk_cksum = ddk->ddk_cksum;
- bp->blk_fill = 1;
-
- BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
- BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
- BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
- BP_SET_CHECKSUM(bp, checksum);
- BP_SET_TYPE(bp, DMU_OT_DEDUP);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-}
-
-void
-ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
-{
- ddk->ddk_cksum = bp->blk_cksum;
- ddk->ddk_prop = 0;
-
- DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
- DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
- DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
-}
-
-void
-ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
-{
- ASSERT(ddp->ddp_phys_birth == 0);
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- ddp->ddp_dva[d] = bp->blk_dva[d];
- ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
-}
-
-void
-ddt_phys_clear(ddt_phys_t *ddp)
-{
- bzero(ddp, sizeof (*ddp));
-}
-
-void
-ddt_phys_addref(ddt_phys_t *ddp)
-{
- ddp->ddp_refcnt++;
-}
-
-void
-ddt_phys_decref(ddt_phys_t *ddp)
-{
- if (ddp) {
- ASSERT((int64_t)ddp->ddp_refcnt > 0);
- ddp->ddp_refcnt--;
- }
-}
-
-void
-ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
-{
- blkptr_t blk;
-
- ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
- ddt_phys_clear(ddp);
- zio_free(ddt->ddt_spa, txg, &blk);
-}
-
-ddt_phys_t *
-ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
-{
- ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
- BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
- return (ddp);
- }
- return (NULL);
-}
-
-uint64_t
-ddt_phys_total_refcnt(const ddt_entry_t *dde)
-{
- uint64_t refcnt = 0;
-
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
- refcnt += dde->dde_phys[p].ddp_refcnt;
-
- return (refcnt);
-}
-
-static void
-ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
-{
- spa_t *spa = ddt->ddt_spa;
- ddt_phys_t *ddp = dde->dde_phys;
- ddt_key_t *ddk = &dde->dde_key;
- uint64_t lsize = DDK_GET_LSIZE(ddk);
- uint64_t psize = DDK_GET_PSIZE(ddk);
-
- bzero(dds, sizeof (*dds));
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- uint64_t dsize = 0;
- uint64_t refcnt = ddp->ddp_refcnt;
-
- if (ddp->ddp_phys_birth == 0)
- continue;
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
-
- dds->dds_blocks += 1;
- dds->dds_lsize += lsize;
- dds->dds_psize += psize;
- dds->dds_dsize += dsize;
-
- dds->dds_ref_blocks += refcnt;
- dds->dds_ref_lsize += lsize * refcnt;
- dds->dds_ref_psize += psize * refcnt;
- dds->dds_ref_dsize += dsize * refcnt;
- }
-}
-
-void
-ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
-{
- const uint64_t *s = (const uint64_t *)src;
- uint64_t *d = (uint64_t *)dst;
- uint64_t *d_end = (uint64_t *)(dst + 1);
-
- ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
-
- while (d < d_end)
- *d++ += (*s++ ^ neg) - neg;
-}
-
-static void
-ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
-{
- ddt_stat_t dds;
- ddt_histogram_t *ddh;
- int bucket;
-
- ddt_stat_generate(ddt, dde, &dds);
-
- bucket = highbit64(dds.dds_ref_blocks) - 1;
- ASSERT(bucket >= 0);
-
- ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
-
- ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
-}
-
-void
-ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
-{
- for (int h = 0; h < 64; h++)
- ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
-}
-
-void
-ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
-{
- bzero(dds, sizeof (*dds));
-
- for (int h = 0; h < 64; h++)
- ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
-}
-
-boolean_t
-ddt_histogram_empty(const ddt_histogram_t *ddh)
-{
- const uint64_t *s = (const uint64_t *)ddh;
- const uint64_t *s_end = (const uint64_t *)(ddh + 1);
-
- while (s < s_end)
- if (*s++ != 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-void
-ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
-{
- /* Sum the statistics we cached in ddt_object_sync(). */
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- ddt_object_t *ddo =
- &ddt->ddt_object_stats[type][class];
- ddo_total->ddo_count += ddo->ddo_count;
- ddo_total->ddo_dspace += ddo->ddo_dspace;
- ddo_total->ddo_mspace += ddo->ddo_mspace;
- }
- }
- }
-
- /* ... and compute the averages. */
- if (ddo_total->ddo_count != 0) {
- ddo_total->ddo_dspace /= ddo_total->ddo_count;
- ddo_total->ddo_mspace /= ddo_total->ddo_count;
- }
-}
-
-void
-ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
-{
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- ddt_histogram_add(ddh,
- &ddt->ddt_histogram_cache[type][class]);
- }
- }
- }
-}
-
-void
-ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
-{
- ddt_histogram_t *ddh_total;
-
- ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
- ddt_get_dedup_histogram(spa, ddh_total);
- ddt_histogram_stat(dds_total, ddh_total);
- kmem_free(ddh_total, sizeof (ddt_histogram_t));
-}
-
-uint64_t
-ddt_get_dedup_dspace(spa_t *spa)
-{
- ddt_stat_t dds_total = { 0 };
-
- ddt_get_dedup_stats(spa, &dds_total);
- return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
-}
-
-uint64_t
-ddt_get_pool_dedup_ratio(spa_t *spa)
-{
- ddt_stat_t dds_total = { 0 };
-
- ddt_get_dedup_stats(spa, &dds_total);
- if (dds_total.dds_dsize == 0)
- return (100);
-
- return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
-}
-
-int
-ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
-{
- spa_t *spa = ddt->ddt_spa;
- uint64_t total_refcnt = 0;
- uint64_t ditto = spa->spa_dedup_ditto;
- int total_copies = 0;
- int desired_copies = 0;
-
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
- ddt_phys_t *ddp = &dde->dde_phys[p];
- zio_t *zio = dde->dde_lead_zio[p];
- uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
- if (zio != NULL)
- refcnt += zio->io_parent_count; /* pending refs */
- if (ddp == ddp_willref)
- refcnt++; /* caller's ref */
- if (refcnt != 0) {
- total_refcnt += refcnt;
- total_copies += p;
- }
- }
-
- if (ditto == 0 || ditto > UINT32_MAX)
- ditto = UINT32_MAX;
-
- if (total_refcnt >= 1)
- desired_copies++;
- if (total_refcnt >= ditto)
- desired_copies++;
- if (total_refcnt >= ditto * ditto)
- desired_copies++;
-
- return (MAX(desired_copies, total_copies) - total_copies);
-}
-
-int
-ddt_ditto_copies_present(ddt_entry_t *dde)
-{
- ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
- dva_t *dva = ddp->ddp_dva;
- int copies = 0 - DVA_GET_GANG(dva);
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
- if (DVA_IS_VALID(dva))
- copies++;
-
- ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
-
- return (copies);
-}
-
-size_t
-ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
-{
- uchar_t *version = dst++;
- int cpfunc = ZIO_COMPRESS_ZLE;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- size_t c_len;
-
- ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
-
- c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
-
- if (c_len == s_len) {
- cpfunc = ZIO_COMPRESS_OFF;
- bcopy(src, dst, s_len);
- }
-
- *version = cpfunc;
- /* CONSTCOND */
- if (ZFS_HOST_BYTEORDER)
- *version |= DDT_COMPRESS_BYTEORDER_MASK;
-
- return (c_len + 1);
-}
-
-void
-ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
-{
- uchar_t version = *src++;
- int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
-
- if (ci->ci_decompress != NULL)
- (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
- else
- bcopy(src, dst, d_len);
-
- if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
- (ZFS_HOST_BYTEORDER != 0))
- byteswap_uint64_array(dst, d_len);
-}
-
-ddt_t *
-ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
-{
- return (spa->spa_ddt[c]);
-}
-
-ddt_t *
-ddt_select(spa_t *spa, const blkptr_t *bp)
-{
- return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
-}
-
-void
-ddt_enter(ddt_t *ddt)
-{
- mutex_enter(&ddt->ddt_lock);
-}
-
-void
-ddt_exit(ddt_t *ddt)
-{
- mutex_exit(&ddt->ddt_lock);
-}
-
-static ddt_entry_t *
-ddt_alloc(const ddt_key_t *ddk)
-{
- ddt_entry_t *dde;
-
- dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
- cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
-
- dde->dde_key = *ddk;
-
- return (dde);
-}
-
-static void
-ddt_free(ddt_entry_t *dde)
-{
- ASSERT(!dde->dde_loading);
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++)
- ASSERT(dde->dde_lead_zio[p] == NULL);
-
- if (dde->dde_repair_abd != NULL)
- abd_free(dde->dde_repair_abd);
-
- cv_destroy(&dde->dde_cv);
- kmem_free(dde, sizeof (*dde));
-}
-
-void
-ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
-{
- ASSERT(MUTEX_HELD(&ddt->ddt_lock));
-
- avl_remove(&ddt->ddt_tree, dde);
- ddt_free(dde);
-}
-
-ddt_entry_t *
-ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
-{
- ddt_entry_t *dde, dde_search;
- enum ddt_type type;
- enum ddt_class class;
- avl_index_t where;
- int error;
-
- ASSERT(MUTEX_HELD(&ddt->ddt_lock));
-
- ddt_key_fill(&dde_search.dde_key, bp);
-
- dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
- if (dde == NULL) {
- if (!add)
- return (NULL);
- dde = ddt_alloc(&dde_search.dde_key);
- avl_insert(&ddt->ddt_tree, dde, where);
- }
-
- while (dde->dde_loading)
- cv_wait(&dde->dde_cv, &ddt->ddt_lock);
-
- if (dde->dde_loaded)
- return (dde);
-
- dde->dde_loading = B_TRUE;
-
- ddt_exit(ddt);
-
- error = ENOENT;
-
- for (type = 0; type < DDT_TYPES; type++) {
- for (class = 0; class < DDT_CLASSES; class++) {
- error = ddt_object_lookup(ddt, type, class, dde);
- if (error != ENOENT) {
- ASSERT0(error);
- break;
- }
- }
- if (error != ENOENT)
- break;
- }
-
- ddt_enter(ddt);
-
- ASSERT(dde->dde_loaded == B_FALSE);
- ASSERT(dde->dde_loading == B_TRUE);
-
- dde->dde_type = type; /* will be DDT_TYPES if no entry found */
- dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
- dde->dde_loaded = B_TRUE;
- dde->dde_loading = B_FALSE;
-
- if (error == 0)
- ddt_stat_update(ddt, dde, -1ULL);
-
- cv_broadcast(&dde->dde_cv);
-
- return (dde);
-}
-
-void
-ddt_prefetch(spa_t *spa, const blkptr_t *bp)
-{
- ddt_t *ddt;
- ddt_entry_t dde;
-
- if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
- return;
-
- /*
- * We only remove the DDT once all tables are empty and only
- * prefetch dedup blocks when there are entries in the DDT.
- * Thus no locking is required as the DDT can't disappear on us.
- */
- ddt = ddt_select(spa, bp);
- ddt_key_fill(&dde.dde_key, bp);
-
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- ddt_object_prefetch(ddt, type, class, &dde);
- }
- }
-}
-
-/*
- * Opaque struct used for ddt_key comparison
- */
-#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t))
-
-typedef struct ddt_key_cmp {
- uint16_t u16[DDT_KEY_CMP_LEN];
-} ddt_key_cmp_t;
-
-int
-ddt_entry_compare(const void *x1, const void *x2)
-{
- const ddt_entry_t *dde1 = x1;
- const ddt_entry_t *dde2 = x2;
- const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key;
- const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key;
- int32_t cmp = 0;
-
- for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
- cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
- if (likely(cmp))
- break;
- }
-
- return (AVL_ISIGN(cmp));
-}
-
-static ddt_t *
-ddt_table_alloc(spa_t *spa, enum zio_checksum c)
-{
- ddt_t *ddt;
-
- ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
-
- mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&ddt->ddt_tree, ddt_entry_compare,
- sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
- avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
- sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
- ddt->ddt_checksum = c;
- ddt->ddt_spa = spa;
- ddt->ddt_os = spa->spa_meta_objset;
-
- return (ddt);
-}
-
-static void
-ddt_table_free(ddt_t *ddt)
-{
- ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
- ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
- avl_destroy(&ddt->ddt_tree);
- avl_destroy(&ddt->ddt_repair_tree);
- mutex_destroy(&ddt->ddt_lock);
- kmem_free(ddt, sizeof (*ddt));
-}
-
-void
-ddt_create(spa_t *spa)
-{
- spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
-
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
- spa->spa_ddt[c] = ddt_table_alloc(spa, c);
-}
-
-int
-ddt_load(spa_t *spa)
-{
- int error;
-
- ddt_create(spa);
-
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
- &spa->spa_ddt_stat_object);
-
- if (error)
- return (error == ENOENT ? 0 : error);
-
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- error = ddt_object_load(ddt, type, class);
- if (error != 0 && error != ENOENT)
- return (error);
- }
- }
-
- /*
- * Seed the cached histograms.
- */
- bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
- sizeof (ddt->ddt_histogram));
- }
-
- return (0);
-}
-
-void
-ddt_unload(spa_t *spa)
-{
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- if (spa->spa_ddt[c]) {
- ddt_table_free(spa->spa_ddt[c]);
- spa->spa_ddt[c] = NULL;
- }
- }
-}
-
-boolean_t
-ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
-{
- ddt_t *ddt;
- ddt_entry_t dde;
-
- if (!BP_GET_DEDUP(bp))
- return (B_FALSE);
-
- if (max_class == DDT_CLASS_UNIQUE)
- return (B_TRUE);
-
- ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
-
- ddt_key_fill(&dde.dde_key, bp);
-
- for (enum ddt_type type = 0; type < DDT_TYPES; type++)
- for (enum ddt_class class = 0; class <= max_class; class++)
- if (ddt_object_lookup(ddt, type, class, &dde) == 0)
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-ddt_entry_t *
-ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
-{
- ddt_key_t ddk;
- ddt_entry_t *dde;
-
- ddt_key_fill(&ddk, bp);
-
- dde = ddt_alloc(&ddk);
-
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- /*
- * We can only do repair if there are multiple copies
- * of the block. For anything in the UNIQUE class,
- * there's definitely only one copy, so don't even try.
- */
- if (class != DDT_CLASS_UNIQUE &&
- ddt_object_lookup(ddt, type, class, dde) == 0)
- return (dde);
- }
- }
-
- bzero(dde->dde_phys, sizeof (dde->dde_phys));
-
- return (dde);
-}
-
-void
-ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
-{
- avl_index_t where;
-
- ddt_enter(ddt);
-
- if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
- avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
- avl_insert(&ddt->ddt_repair_tree, dde, where);
- else
- ddt_free(dde);
-
- ddt_exit(ddt);
-}
-
-static void
-ddt_repair_entry_done(zio_t *zio)
-{
- ddt_entry_t *rdde = zio->io_private;
-
- ddt_free(rdde);
-}
-
-static void
-ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
-{
- ddt_phys_t *ddp = dde->dde_phys;
- ddt_phys_t *rddp = rdde->dde_phys;
- ddt_key_t *ddk = &dde->dde_key;
- ddt_key_t *rddk = &rdde->dde_key;
- zio_t *zio;
- blkptr_t blk;
-
- zio = zio_null(rio, rio->io_spa, NULL,
- ddt_repair_entry_done, rdde, rio->io_flags);
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
- if (ddp->ddp_phys_birth == 0 ||
- ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
- bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
- continue;
- ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
- zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
- rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
- }
-
- zio_nowait(zio);
-}
-
-static void
-ddt_repair_table(ddt_t *ddt, zio_t *rio)
-{
- spa_t *spa = ddt->ddt_spa;
- ddt_entry_t *dde, *rdde_next, *rdde;
- avl_tree_t *t = &ddt->ddt_repair_tree;
- blkptr_t blk;
-
- if (spa_sync_pass(spa) > 1)
- return;
-
- ddt_enter(ddt);
- for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
- rdde_next = AVL_NEXT(t, rdde);
- avl_remove(&ddt->ddt_repair_tree, rdde);
- ddt_exit(ddt);
- ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
- dde = ddt_repair_start(ddt, &blk);
- ddt_repair_entry(ddt, dde, rdde, rio);
- ddt_repair_done(ddt, dde);
- ddt_enter(ddt);
- }
- ddt_exit(ddt);
-}
-
-static void
-ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
-{
- dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
- ddt_phys_t *ddp = dde->dde_phys;
- ddt_key_t *ddk = &dde->dde_key;
- enum ddt_type otype = dde->dde_type;
- enum ddt_type ntype = DDT_TYPE_CURRENT;
- enum ddt_class oclass = dde->dde_class;
- enum ddt_class nclass;
- uint64_t total_refcnt = 0;
-
- ASSERT(dde->dde_loaded);
- ASSERT(!dde->dde_loading);
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- ASSERT(dde->dde_lead_zio[p] == NULL);
- ASSERT((int64_t)ddp->ddp_refcnt >= 0);
- if (ddp->ddp_phys_birth == 0) {
- ASSERT(ddp->ddp_refcnt == 0);
- continue;
- }
- if (p == DDT_PHYS_DITTO) {
- if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
- ddt_phys_free(ddt, ddk, ddp, txg);
- continue;
- }
- if (ddp->ddp_refcnt == 0)
- ddt_phys_free(ddt, ddk, ddp, txg);
- total_refcnt += ddp->ddp_refcnt;
- }
-
- if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
- nclass = DDT_CLASS_DITTO;
- else if (total_refcnt > 1)
- nclass = DDT_CLASS_DUPLICATE;
- else
- nclass = DDT_CLASS_UNIQUE;
-
- if (otype != DDT_TYPES &&
- (otype != ntype || oclass != nclass || total_refcnt == 0)) {
- VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
- ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
- }
-
- if (total_refcnt != 0) {
- dde->dde_type = ntype;
- dde->dde_class = nclass;
- ddt_stat_update(ddt, dde, 0);
- if (!ddt_object_exists(ddt, ntype, nclass))
- ddt_object_create(ddt, ntype, nclass, tx);
- VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
-
- /*
- * If the class changes, the order that we scan this bp
- * changes. If it decreases, we could miss it, so
- * scan it right now. (This covers both class changing
- * while we are doing ddt_walk(), and when we are
- * traversing.)
- */
- if (nclass < oclass) {
- dsl_scan_ddt_entry(dp->dp_scan,
- ddt->ddt_checksum, dde, tx);
- }
- }
-}
-
-static void
-ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
-{
- spa_t *spa = ddt->ddt_spa;
- ddt_entry_t *dde;
- void *cookie = NULL;
-
- if (avl_numnodes(&ddt->ddt_tree) == 0)
- return;
-
- ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
-
- if (spa->spa_ddt_stat_object == 0) {
- spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
- DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_DDT_STATS, tx);
- }
-
- while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
- ddt_sync_entry(ddt, dde, tx, txg);
- ddt_free(dde);
- }
-
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- uint64_t add, count = 0;
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- if (ddt_object_exists(ddt, type, class)) {
- ddt_object_sync(ddt, type, class, tx);
- VERIFY(ddt_object_count(ddt, type, class,
- &add) == 0);
- count += add;
- }
- }
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- if (count == 0 && ddt_object_exists(ddt, type, class))
- ddt_object_destroy(ddt, type, class, tx);
- }
- }
-
- bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
- sizeof (ddt->ddt_histogram));
-}
-
-void
-ddt_sync(spa_t *spa, uint64_t txg)
-{
- dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
- dmu_tx_t *tx;
- zio_t *rio;
-
- ASSERT(spa_syncing_txg(spa) == txg);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- rio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
-
- /*
- * This function may cause an immediate scan of ddt blocks (see
- * the comment above dsl_scan_ddt() for details). We set the
- * scan's root zio here so that we can wait for any scan IOs in
- * addition to the regular ddt IOs.
- */
- ASSERT3P(scn->scn_zio_root, ==, NULL);
- scn->scn_zio_root = rio;
-
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- if (ddt == NULL)
- continue;
- ddt_sync_table(ddt, tx, txg);
- ddt_repair_table(ddt, rio);
- }
-
- (void) zio_wait(rio);
- scn->scn_zio_root = NULL;
-
- dmu_tx_commit(tx);
-}
-
-int
-ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
-{
- do {
- do {
- do {
- ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
- int error = ENOENT;
- if (ddt_object_exists(ddt, ddb->ddb_type,
- ddb->ddb_class)) {
- error = ddt_object_walk(ddt,
- ddb->ddb_type, ddb->ddb_class,
- &ddb->ddb_cursor, dde);
- }
- dde->dde_type = ddb->ddb_type;
- dde->dde_class = ddb->ddb_class;
- if (error == 0)
- return (0);
- if (error != ENOENT)
- return (error);
- ddb->ddb_cursor = 0;
- } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
- ddb->ddb_checksum = 0;
- } while (++ddb->ddb_type < DDT_TYPES);
- ddb->ddb_type = 0;
- } while (++ddb->ddb_class < DDT_CLASSES);
-
- return (SET_ERROR(ENOENT));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
@@ -1,165 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/ddt.h>
-#include <sys/zap.h>
-#include <sys/dmu_tx.h>
-
-int ddt_zap_leaf_blockshift = 12;
-int ddt_zap_indirect_blockshift = 12;
-
-static int
-ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
-{
- zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
-
- if (prehash)
- flags |= ZAP_FLAG_PRE_HASHED_KEY;
-
- *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
- ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
- DMU_OT_NONE, 0, tx);
-
- return (*objectp == 0 ? ENOTSUP : 0);
-}
-
-static int
-ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
-{
- return (zap_destroy(os, object, tx));
-}
-
-static int
-ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
-{
- uchar_t cbuf[sizeof (dde->dde_phys) + 1];
- uint64_t one, csize;
- int error;
-
- error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
- DDT_KEY_WORDS, &one, &csize);
- if (error)
- return (error);
-
- ASSERT(one == 1);
- ASSERT(csize <= sizeof (cbuf));
-
- error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
- DDT_KEY_WORDS, 1, csize, cbuf);
- if (error)
- return (error);
-
- ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
-
- return (0);
-}
-
-static void
-ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
-{
- (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
- DDT_KEY_WORDS);
-}
-
-static int
-ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
-{
- uchar_t cbuf[sizeof (dde->dde_phys) + 1];
- uint64_t csize;
-
- csize = ddt_compress(dde->dde_phys, cbuf,
- sizeof (dde->dde_phys), sizeof (cbuf));
-
- return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
- DDT_KEY_WORDS, 1, csize, cbuf, tx));
-}
-
-static int
-ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
-{
- return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
- DDT_KEY_WORDS, tx));
-}
-
-static int
-ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- int error;
-
- if (*walk == 0) {
- /*
- * We don't want to prefetch the entire ZAP object, because
- * it can be enormous. Also the primary use of DDT iteration
- * is for scrubbing, in which case we will be issuing many
- * scrub i/os for each ZAP block that we read in, so
- * reading the ZAP is unlikely to be the bottleneck.
- */
- zap_cursor_init_noprefetch(&zc, os, object);
- } else {
- zap_cursor_init_serialized(&zc, os, object, *walk);
- }
- if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
- uchar_t cbuf[sizeof (dde->dde_phys) + 1];
- uint64_t csize = za.za_num_integers;
- ASSERT(za.za_integer_length == 1);
- error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
- DDT_KEY_WORDS, 1, csize, cbuf);
- ASSERT(error == 0);
- if (error == 0) {
- ddt_decompress(cbuf, dde->dde_phys, csize,
- sizeof (dde->dde_phys));
- dde->dde_key = *(ddt_key_t *)za.za_name;
- }
- zap_cursor_advance(&zc);
- *walk = zap_cursor_serialize(&zc);
- }
- zap_cursor_fini(&zc);
- return (error);
-}
-
-static int
-ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count)
-{
-
- return (zap_count(os, object, count));
-}
-
-const ddt_ops_t ddt_zap_ops = {
- "zap",
- ddt_zap_create,
- ddt_zap_destroy,
- ddt_zap_lookup,
- ddt_zap_prefetch,
- ddt_zap_update,
- ddt_zap_remove,
- ddt_zap_walk,
- ddt_zap_count,
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -1,2748 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2019 Datto Inc.
- */
-/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
-/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
-/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_prop.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/sa.h>
-#include <sys/zfeature.h>
-#include <sys/abd.h>
-#ifdef _KERNEL
-#include <sys/racct.h>
-#include <sys/vm.h>
-#include <sys/zfs_znode.h>
-#endif
-
-/*
- * Enable/disable nopwrite feature.
- */
-int zfs_nopwrite_enabled = 1;
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
- &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
-
-/*
- * Tunable to control percentage of dirtied L1 blocks from frees allowed into
- * one TXG. After this threshold is crossed, additional dirty blocks from frees
- * will wait until the next TXG.
- * A value of zero will disable this throttle.
- */
-uint32_t zfs_per_txg_dirty_frees_percent = 5;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
- &zfs_per_txg_dirty_frees_percent, 0,
- "Percentage of dirtied indirect blocks from frees allowed in one txg");
-
-/*
- * This can be used for testing, to ensure that certain actions happen
- * while in the middle of a remap (which might otherwise complete too
- * quickly).
- */
-int zfs_object_remap_one_indirect_delay_ticks = 0;
-
-/*
- * Limit the amount we can prefetch with one call to this amount. This
- * helps to limit the amount of memory that can be used by prefetching.
- * Larger objects should be prefetched a bit at a time.
- */
-uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
-
-const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
- { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" },
- { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" },
- { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" },
- { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" },
- { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" },
- { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" },
- { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" },
- { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
- { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" },
- { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" },
- { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
- { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" },
- { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
- { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" },
- { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" },
- { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" },
- { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
- { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" },
- { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
- { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" },
- { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" },
- { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" },
- { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
-};
-
-const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
- { byteswap_uint8_array, "uint8" },
- { byteswap_uint16_array, "uint16" },
- { byteswap_uint32_array, "uint32" },
- { byteswap_uint64_array, "uint64" },
- { zap_byteswap, "zap" },
- { dnode_buf_byteswap, "dnode" },
- { dmu_objset_byteswap, "objset" },
- { zfs_znode_byteswap, "znode" },
- { zfs_oldacl_byteswap, "oldacl" },
- { zfs_acl_byteswap, "acl" }
-};
-
-int
-dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
-{
- uint64_t blkid;
- dmu_buf_impl_t *db;
-
- blkid = dbuf_whichblock(dn, 0, offset);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold(dn, blkid, tag);
- rw_exit(&dn->dn_struct_rwlock);
-
- if (db == NULL) {
- *dbp = NULL;
- return (SET_ERROR(EIO));
- }
-
- *dbp = &db->db;
- return (0);
-}
-int
-dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
-{
- dnode_t *dn;
- uint64_t blkid;
- dmu_buf_impl_t *db;
- int err;
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
- blkid = dbuf_whichblock(dn, 0, offset);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold(dn, blkid, tag);
- rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
-
- if (db == NULL) {
- *dbp = NULL;
- return (SET_ERROR(EIO));
- }
-
- *dbp = &db->db;
- return (err);
-}
-
-int
-dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags)
-{
- int err;
- int db_flags = DB_RF_CANFAIL;
-
- if (flags & DMU_READ_NO_PREFETCH)
- db_flags |= DB_RF_NOPREFETCH;
-
- err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
- if (err == 0) {
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
- err = dbuf_read(db, NULL, db_flags);
- if (err != 0) {
- dbuf_rele(db, tag);
- *dbp = NULL;
- }
- }
-
- return (err);
-}
-
-int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags)
-{
- int err;
- int db_flags = DB_RF_CANFAIL;
-
- if (flags & DMU_READ_NO_PREFETCH)
- db_flags |= DB_RF_NOPREFETCH;
-
- err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
- if (err == 0) {
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
- err = dbuf_read(db, NULL, db_flags);
- if (err != 0) {
- dbuf_rele(db, tag);
- *dbp = NULL;
- }
- }
-
- return (err);
-}
-
-int
-dmu_bonus_max(void)
-{
- return (DN_OLD_MAX_BONUSLEN);
-}
-
-int
-dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
- int error;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- if (dn->dn_bonus != db) {
- error = SET_ERROR(EINVAL);
- } else if (newsize < 0 || newsize > db_fake->db_size) {
- error = SET_ERROR(EINVAL);
- } else {
- dnode_setbonuslen(dn, newsize, tx);
- error = 0;
- }
-
- DB_DNODE_EXIT(db);
- return (error);
-}
-
-int
-dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
- int error;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- if (!DMU_OT_IS_VALID(type)) {
- error = SET_ERROR(EINVAL);
- } else if (dn->dn_bonus != db) {
- error = SET_ERROR(EINVAL);
- } else {
- dnode_setbonus_type(dn, type, tx);
- error = 0;
- }
-
- DB_DNODE_EXIT(db);
- return (error);
-}
-
-dmu_object_type_t
-dmu_get_bonustype(dmu_buf_t *db_fake)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
- dmu_object_type_t type;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- type = dn->dn_bonustype;
- DB_DNODE_EXIT(db);
-
- return (type);
-}
-
-int
-dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int error;
-
- error = dnode_hold(os, object, FTAG, &dn);
- dbuf_rm_spill(dn, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- dnode_rm_spill(dn, tx);
- rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
- return (error);
-}
-
-/*
- * returns ENOENT, EIO, or 0.
- */
-int
-dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
-{
- dnode_t *dn;
- dmu_buf_impl_t *db;
- int error;
-
- error = dnode_hold(os, object, FTAG, &dn);
- if (error)
- return (error);
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_bonus == NULL) {
- rw_exit(&dn->dn_struct_rwlock);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dbuf_create_bonus(dn);
- }
- db = dn->dn_bonus;
-
- /* as long as the bonus buf is held, the dnode will be held */
- if (zfs_refcount_add(&db->db_holds, tag) == 1) {
- VERIFY(dnode_add_ref(dn, db));
- atomic_inc_32(&dn->dn_dbufs_count);
- }
-
- /*
- * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
- * hold and incrementing the dbuf count to ensure that dnode_move() sees
- * a dnode hold for every dbuf.
- */
- rw_exit(&dn->dn_struct_rwlock);
-
- dnode_rele(dn, FTAG);
-
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
-
- *dbp = &db->db;
- return (0);
-}
-
-/*
- * returns ENOENT, EIO, or 0.
- *
- * This interface will allocate a blank spill dbuf when a spill blk
- * doesn't already exist on the dnode.
- *
- * if you only want to find an already existing spill db, then
- * dmu_spill_hold_existing() should be used.
- */
-int
-dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
-{
- dmu_buf_impl_t *db = NULL;
- int err;
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
-
- db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
-
- ASSERT(db != NULL);
- err = dbuf_read(db, NULL, flags);
- if (err == 0)
- *dbp = &db->db;
- else
- dbuf_rele(db, tag);
- return (err);
-}
-
-int
-dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
- dnode_t *dn;
- int err;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
- err = SET_ERROR(EINVAL);
- } else {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
-
- if (!dn->dn_have_spill) {
- err = SET_ERROR(ENOENT);
- } else {
- err = dmu_spill_hold_by_dnode(dn,
- DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
- }
-
- rw_exit(&dn->dn_struct_rwlock);
- }
-
- DB_DNODE_EXIT(db);
- return (err);
-}
-
-int
-dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
- dnode_t *dn;
- int err;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
- DB_DNODE_EXIT(db);
-
- return (err);
-}
-
-/*
- * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
- * to take a held dnode rather than <os, object> -- the lookup is wasteful,
- * and can induce severe lock contention when writing to several files
- * whose dnodes are in the same block.
- */
-int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
- boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
-{
- dmu_buf_t **dbp;
- uint64_t blkid, nblks, i;
- uint32_t dbuf_flags;
- int err;
- zio_t *zio;
-
- ASSERT(length <= DMU_MAX_ACCESS);
-
- /*
- * Note: We directly notify the prefetch code of this read, so that
- * we can tell it about the multi-block read. dbuf_read() only knows
- * about the one block it is accessing.
- */
- dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
- DB_RF_NOPREFETCH;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
- P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
- } else {
- if (offset + length > dn->dn_datablksz) {
- zfs_panic_recover("zfs: accessing past end of object "
- "%llx/%llx (size=%u access=%llu+%llu)",
- (longlong_t)dn->dn_objset->
- os_dsl_dataset->ds_object,
- (longlong_t)dn->dn_object, dn->dn_datablksz,
- (longlong_t)offset, (longlong_t)length);
- rw_exit(&dn->dn_struct_rwlock);
- return (SET_ERROR(EIO));
- }
- nblks = 1;
- }
- dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
-
-#if defined(_KERNEL) && defined(RACCT)
- if (racct_enable && !read) {
- PROC_LOCK(curproc);
- racct_add_force(curproc, RACCT_WRITEBPS, length);
- racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
- PROC_UNLOCK(curproc);
- }
-#endif
-
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- blkid = dbuf_whichblock(dn, 0, offset);
- for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
- if (db == NULL) {
- rw_exit(&dn->dn_struct_rwlock);
- dmu_buf_rele_array(dbp, nblks, tag);
- zio_nowait(zio);
- return (SET_ERROR(EIO));
- }
-
- /* initiate async i/o */
- if (read)
- (void) dbuf_read(db, zio, dbuf_flags);
-#ifdef _KERNEL
- else
- curthread->td_ru.ru_oublock++;
-#endif
- dbp[i] = &db->db;
- }
-
- if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
- DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
- dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
- read && DNODE_IS_CACHEABLE(dn));
- }
- rw_exit(&dn->dn_struct_rwlock);
-
- /* wait for async i/o */
- err = zio_wait(zio);
- if (err) {
- dmu_buf_rele_array(dbp, nblks, tag);
- return (err);
- }
-
- /* wait for other io to complete */
- if (read) {
- for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
- if (db->db_state == DB_UNCACHED)
- err = SET_ERROR(EIO);
- mutex_exit(&db->db_mtx);
- if (err) {
- dmu_buf_rele_array(dbp, nblks, tag);
- return (err);
- }
- }
- }
-
- *numbufsp = nblks;
- *dbpp = dbp;
- return (0);
-}
-
-static int
-dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
-
- err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp, DMU_READ_PREFETCH);
-
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
- uint64_t length, boolean_t read, void *tag, int *numbufsp,
- dmu_buf_t ***dbpp)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
- int err;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp, DMU_READ_PREFETCH);
- DB_DNODE_EXIT(db);
-
- return (err);
-}
-
-void
-dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
-{
- int i;
- dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-
- if (numbufs == 0)
- return;
-
- for (i = 0; i < numbufs; i++) {
- if (dbp[i])
- dbuf_rele(dbp[i], tag);
- }
-
- kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
-}
-
-/*
- * Issue prefetch i/os for the given blocks. If level is greater than 0, the
- * indirect blocks prefeteched will be those that point to the blocks containing
- * the data starting at offset, and continuing to offset + len.
- *
- * Note that if the indirect blocks above the blocks being prefetched are not in
- * cache, they will be asychronously read in.
- */
-void
-dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
- uint64_t len, zio_priority_t pri)
-{
- dnode_t *dn;
- uint64_t blkid;
- int nblks, err;
-
- if (len == 0) { /* they're interested in the bonus buffer */
- dn = DMU_META_DNODE(os);
-
- if (object == 0 || object >= DN_MAX_OBJECT)
- return;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, level,
- object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, level, blkid, pri, 0);
- rw_exit(&dn->dn_struct_rwlock);
- return;
- }
-
- /*
- * See comment before the definition of dmu_prefetch_max.
- */
- len = MIN(len, dmu_prefetch_max);
-
- /*
- * XXX - Note, if the dnode for the requested object is not
- * already cached, we will do a *synchronous* read in the
- * dnode_hold() call. The same is true for any indirects.
- */
- err = dnode_hold(os, object, FTAG, &dn);
- if (err != 0)
- return;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- /*
- * offset + len - 1 is the last byte we want to prefetch for, and offset
- * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
- * last block we want to prefetch, and dbuf_whichblock(dn, level,
- * offset) is the first. Then the number we need to prefetch is the
- * last - first + 1.
- */
- if (level > 0 || dn->dn_datablkshift != 0) {
- nblks = dbuf_whichblock(dn, level, offset + len - 1) -
- dbuf_whichblock(dn, level, offset) + 1;
- } else {
- nblks = (offset < dn->dn_datablksz);
- }
-
- if (nblks != 0) {
- blkid = dbuf_whichblock(dn, level, offset);
- for (int i = 0; i < nblks; i++)
- dbuf_prefetch(dn, level, blkid + i, pri, 0);
- }
-
- rw_exit(&dn->dn_struct_rwlock);
-
- dnode_rele(dn, FTAG);
-}
-
-/*
- * Get the next "chunk" of file data to free. We traverse the file from
- * the end so that the file gets shorter over time (if we crashes in the
- * middle, this will leave us in a better state). We find allocated file
- * data by simply searching the allocated level 1 indirects.
- *
- * On input, *start should be the first offset that does not need to be
- * freed (e.g. "offset + length"). On return, *start will be the first
- * offset that should be freed and l1blks is set to the number of level 1
- * indirect blocks found within the chunk.
- */
-static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
-{
- uint64_t blks;
- uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
- /* bytes of data covered by a level-1 indirect block */
- uint64_t iblkrange =
- dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
-
- ASSERT3U(minimum, <=, *start);
-
- /*
- * Check if we can free the entire range assuming that all of the
- * L1 blocks in this range have data. If we can, we use this
- * worst case value as an estimate so we can avoid having to look
- * at the object's actual data.
- */
- uint64_t total_l1blks =
- (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
- iblkrange;
- if (total_l1blks <= maxblks) {
- *l1blks = total_l1blks;
- *start = minimum;
- return (0);
- }
- ASSERT(ISP2(iblkrange));
-
- for (blks = 0; *start > minimum && blks < maxblks; blks++) {
- int err;
-
- /*
- * dnode_next_offset(BACKWARDS) will find an allocated L1
- * indirect block at or before the input offset. We must
- * decrement *start so that it is at the end of the region
- * to search.
- */
- (*start)--;
-
- err = dnode_next_offset(dn,
- DNODE_FIND_BACKWARDS, start, 2, 1, 0);
-
- /* if there are no indirect blocks before start, we are done */
- if (err == ESRCH) {
- *start = minimum;
- break;
- } else if (err != 0) {
- *l1blks = blks;
- return (err);
- }
-
- /* set start to the beginning of this L1 indirect */
- *start = P2ALIGN(*start, iblkrange);
- }
- if (*start < minimum)
- *start = minimum;
- *l1blks = blks;
-
- return (0);
-}
-
-/*
- * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
- * otherwise return false.
- * Used below in dmu_free_long_range_impl() to enable abort when unmounting
- */
-/*ARGSUSED*/
-static boolean_t
-dmu_objset_zfs_unmounting(objset_t *os)
-{
-#ifdef _KERNEL
- if (dmu_objset_type(os) == DMU_OST_ZFS)
- return (zfs_get_vfs_flag_unmounted(os));
-#endif
- return (B_FALSE);
-}
-
-static int
-dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
- uint64_t length)
-{
- uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
- int err;
- uint64_t dirty_frees_threshold;
- dsl_pool_t *dp = dmu_objset_pool(os);
-
- if (offset >= object_size)
- return (0);
-
- if (zfs_per_txg_dirty_frees_percent <= 100)
- dirty_frees_threshold =
- zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
- else
- dirty_frees_threshold = zfs_dirty_data_max / 20;
-
- if (length == DMU_OBJECT_END || offset + length > object_size)
- length = object_size - offset;
-
- while (length != 0) {
- uint64_t chunk_end, chunk_begin, chunk_len;
- uint64_t l1blks;
- dmu_tx_t *tx;
-
- if (dmu_objset_zfs_unmounting(dn->dn_objset))
- return (SET_ERROR(EINTR));
-
- chunk_end = chunk_begin = offset + length;
-
- /* move chunk_begin backwards to the beginning of this chunk */
- err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
- if (err)
- return (err);
- ASSERT3U(chunk_begin, >=, offset);
- ASSERT3U(chunk_begin, <=, chunk_end);
-
- chunk_len = chunk_end - chunk_begin;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
-
- /*
- * Mark this transaction as typically resulting in a net
- * reduction in space used.
- */
- dmu_tx_mark_netfree(tx);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- uint64_t txg = dmu_tx_get_txg(tx);
-
- mutex_enter(&dp->dp_lock);
- uint64_t long_free_dirty =
- dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
- mutex_exit(&dp->dp_lock);
-
- /*
- * To avoid filling up a TXG with just frees, wait for
- * the next TXG to open before freeing more chunks if
- * we have reached the threshold of frees.
- */
- if (dirty_frees_threshold != 0 &&
- long_free_dirty >= dirty_frees_threshold) {
- dmu_tx_commit(tx);
- txg_wait_open(dp, 0);
- continue;
- }
-
- /*
- * In order to prevent unnecessary write throttling, for each
- * TXG, we track the cumulative size of L1 blocks being dirtied
- * in dnode_free_range() below. We compare this number to a
- * tunable threshold, past which we prevent new L1 dirty freeing
- * blocks from being added into the open TXG. See
- * dmu_free_long_range_impl() for details. The threshold
- * prevents write throttle activation due to dirty freeing L1
- * blocks taking up a large percentage of zfs_dirty_data_max.
- */
- mutex_enter(&dp->dp_lock);
- dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
- l1blks << dn->dn_indblkshift;
- mutex_exit(&dp->dp_lock);
- DTRACE_PROBE3(free__long__range,
- uint64_t, long_free_dirty, uint64_t, chunk_len,
- uint64_t, txg);
- dnode_free_range(dn, chunk_begin, chunk_len, tx);
- dmu_tx_commit(tx);
-
- length -= chunk_len;
- }
- return (0);
-}
-
-int
-dmu_free_long_range(objset_t *os, uint64_t object,
- uint64_t offset, uint64_t length)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err != 0)
- return (err);
- err = dmu_free_long_range_impl(os, dn, offset, length);
-
- /*
- * It is important to zero out the maxblkid when freeing the entire
- * file, so that (a) subsequent calls to dmu_free_long_range_impl()
- * will take the fast path, and (b) dnode_reallocate() can verify
- * that the entire file has been freed.
- */
- if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
- dn->dn_maxblkid = 0;
-
- dnode_rele(dn, FTAG);
- return (err);
-}
-
-int
-dmu_free_long_object(objset_t *os, uint64_t object)
-{
- dmu_tx_t *tx;
- int err;
-
- err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
- if (err != 0)
- return (err);
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, object);
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- dmu_tx_mark_netfree(tx);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err == 0) {
- err = dmu_object_free(os, object, tx);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
-
- return (err);
-}
-
-int
-dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
- ASSERT(offset < UINT64_MAX);
- ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
- dnode_free_range(dn, offset, size, tx);
- dnode_rele(dn, FTAG);
- return (0);
-}
-
-static int
-dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
- void *buf, uint32_t flags)
-{
- dmu_buf_t **dbp;
- int numbufs, err = 0;
-
- /*
- * Deal with odd block sizes, where there can't be data past the first
- * block. If we ever do the tail block optimization, we will need to
- * handle that here as well.
- */
- if (dn->dn_maxblkid == 0) {
- int newsz = offset > dn->dn_datablksz ? 0 :
- MIN(size, dn->dn_datablksz - offset);
- bzero((char *)buf + newsz, size - newsz);
- size = newsz;
- }
-
- while (size > 0) {
- uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int i;
-
- /*
- * NB: we could do this block-at-a-time, but it's nice
- * to be reading in parallel.
- */
- err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
- TRUE, FTAG, &numbufs, &dbp, flags);
- if (err)
- break;
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- bcopy((char *)db->db_data + bufoff, buf, tocpy);
-
- offset += tocpy;
- size -= tocpy;
- buf = (char *)buf + tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- }
- return (err);
-}
-
-int
-dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf, uint32_t flags)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err != 0)
- return (err);
-
- err = dmu_read_impl(dn, offset, size, buf, flags);
- dnode_rele(dn, FTAG);
- return (err);
-}
-
-int
-dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
- uint32_t flags)
-{
- return (dmu_read_impl(dn, offset, size, buf, flags));
-}
-
-static void
-dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
-{
- int i;
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- bcopy(buf, (char *)db->db_data + bufoff, tocpy);
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- offset += tocpy;
- size -= tocpy;
- buf = (char *)buf + tocpy;
- }
-}
-
-void
-dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs;
-
- if (size == 0)
- return;
-
- VERIFY0(dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp));
- dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-}
-
-void
-dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs;
-
- if (size == 0)
- return;
-
- VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
- FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
- dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-}
-
-static int
-dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
- uint64_t last_removal_txg, uint64_t offset)
-{
- uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
- int err = 0;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
- ASSERT3P(dbuf, !=, NULL);
-
- /*
- * If the block hasn't been written yet, this default will ensure
- * we don't try to remap it.
- */
- uint64_t birth = UINT64_MAX;
- ASSERT3U(last_removal_txg, !=, UINT64_MAX);
- if (dbuf->db_blkptr != NULL)
- birth = dbuf->db_blkptr->blk_birth;
- rw_exit(&dn->dn_struct_rwlock);
-
- /*
- * If this L1 was already written after the last removal, then we've
- * already tried to remap it.
- */
- if (birth <= last_removal_txg &&
- dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
- dbuf_can_remap(dbuf)) {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err == 0) {
- (void) dbuf_dirty(dbuf, tx);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
- }
-
- dbuf_rele(dbuf, FTAG);
-
- delay(zfs_object_remap_one_indirect_delay_ticks);
-
- return (err);
-}
-
-/*
- * Remap all blockpointers in the object, if possible, so that they reference
- * only concrete vdevs.
- *
- * To do this, iterate over the L0 blockpointers and remap any that reference
- * an indirect vdev. Note that we only examine L0 blockpointers; since we
- * cannot guarantee that we can remap all blockpointer anyways (due to split
- * blocks), we do not want to make the code unnecessarily complicated to
- * catch the unlikely case that there is an L1 block on an indirect vdev that
- * contains no indirect blockpointers.
- */
-int
-dmu_object_remap_indirects(objset_t *os, uint64_t object,
- uint64_t last_removal_txg)
-{
- uint64_t offset, l1span;
- int err;
- dnode_t *dn;
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err != 0) {
- return (err);
- }
-
- if (dn->dn_nlevels <= 1) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
- err = SET_ERROR(EINTR);
- }
-
- /*
- * If the dnode has no indirect blocks, we cannot dirty them.
- * We still want to remap the blkptr(s) in the dnode if
- * appropriate, so mark it as dirty.
- */
- if (err == 0 && dnode_needs_remap(dn)) {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, dn->dn_object);
- if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
- dnode_setdirty(dn, tx);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
- }
-
- dnode_rele(dn, FTAG);
- return (err);
- }
-
- offset = 0;
- l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
- dn->dn_datablkshift);
- /*
- * Find the next L1 indirect that is not a hole.
- */
- while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
- err = SET_ERROR(EINTR);
- break;
- }
- if ((err = dmu_object_remap_one_indirect(os, dn,
- last_removal_txg, offset)) != 0) {
- break;
- }
- offset += l1span;
- }
-
- dnode_rele(dn, FTAG);
- return (err);
-}
-
-void
-dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
-
- if (size == 0)
- return;
-
- VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp));
-
- for (i = 0; i < numbufs; i++) {
- dmu_buf_t *db = dbp[i];
-
- dmu_buf_will_not_fill(db, tx);
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-}
-
-void
-dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
- void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
- int compressed_size, int byteorder, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
-
- ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
- ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
- VERIFY0(dmu_buf_hold_noread(os, object, offset,
- FTAG, &db));
-
- dmu_buf_write_embedded(db,
- data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
- uncompressed_size, compressed_size, byteorder, tx);
-
- dmu_buf_rele(db, FTAG);
-}
-
-/*
- * DMU support for xuio
- */
-kstat_t *xuio_ksp = NULL;
-
-int
-dmu_xuio_init(xuio_t *xuio, int nblk)
-{
- dmu_xuio_t *priv;
- uio_t *uio = &xuio->xu_uio;
-
- uio->uio_iovcnt = nblk;
- uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
-
- priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
- priv->cnt = nblk;
- priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
- priv->iovp = uio->uio_iov;
- XUIO_XUZC_PRIV(xuio) = priv;
-
- if (XUIO_XUZC_RW(xuio) == UIO_READ)
- XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
- else
- XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
-
- return (0);
-}
-
-void
-dmu_xuio_fini(xuio_t *xuio)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
- int nblk = priv->cnt;
-
- kmem_free(priv->iovp, nblk * sizeof (iovec_t));
- kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
- kmem_free(priv, sizeof (dmu_xuio_t));
-
- if (XUIO_XUZC_RW(xuio) == UIO_READ)
- XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
- else
- XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
-}
-
-/*
- * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
- * and increase priv->next by 1.
- */
-int
-dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
-{
- struct iovec *iov;
- uio_t *uio = &xuio->xu_uio;
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
- int i = priv->next++;
-
- ASSERT(i < priv->cnt);
- ASSERT(off + n <= arc_buf_lsize(abuf));
- iov = uio->uio_iov + i;
- iov->iov_base = (char *)abuf->b_data + off;
- iov->iov_len = n;
- priv->bufs[i] = abuf;
- return (0);
-}
-
-int
-dmu_xuio_cnt(xuio_t *xuio)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
- return (priv->cnt);
-}
-
-arc_buf_t *
-dmu_xuio_arcbuf(xuio_t *xuio, int i)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
-
- ASSERT(i < priv->cnt);
- return (priv->bufs[i]);
-}
-
-void
-dmu_xuio_clear(xuio_t *xuio, int i)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
-
- ASSERT(i < priv->cnt);
- priv->bufs[i] = NULL;
-}
-
-static void
-xuio_stat_init(void)
-{
- xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
- KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (xuio_ksp != NULL) {
- xuio_ksp->ks_data = &xuio_stats;
- kstat_install(xuio_ksp);
- }
-}
-
-static void
-xuio_stat_fini(void)
-{
- if (xuio_ksp != NULL) {
- kstat_delete(xuio_ksp);
- xuio_ksp = NULL;
- }
-}
-
-void
-xuio_stat_wbuf_copied(void)
-{
- XUIOSTAT_BUMP(xuiostat_wbuf_copied);
-}
-
-void
-xuio_stat_wbuf_nocopy(void)
-{
- XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
-}
-
-#ifdef _KERNEL
-int
-dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
-{
- dmu_buf_t **dbp;
- int numbufs, i, err;
- xuio_t *xuio = NULL;
-
- /*
- * NB: we could do this block-at-a-time, but it's nice
- * to be reading in parallel.
- */
- err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
- TRUE, FTAG, &numbufs, &dbp, 0);
- if (err)
- return (err);
-
-#ifdef UIO_XUIO
- if (uio->uio_extflg == UIO_XUIO)
- xuio = (xuio_t *)uio;
-#endif
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = uio->uio_loffset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- if (xuio) {
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- arc_buf_t *dbuf_abuf = dbi->db_buf;
- arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
- err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
- if (!err) {
- uio->uio_resid -= tocpy;
- uio->uio_loffset += tocpy;
- }
-
- if (abuf == dbuf_abuf)
- XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
- else
- XUIOSTAT_BUMP(xuiostat_rbuf_copied);
- } else {
-#ifdef illumos
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_READ, uio);
-#else
- err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
- tocpy, uio);
-#endif
- }
- if (err)
- break;
-
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-
- return (err);
-}
-
-/*
- * Read 'size' bytes into the uio buffer.
- * From object zdb->db_object.
- * Starting at offset uio->uio_loffset.
- *
- * If the caller already has a dbuf in the target object
- * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
- * because we don't have to find the dnode_t for the object.
- */
-int
-dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
- dnode_t *dn;
- int err;
-
- if (size == 0)
- return (0);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- err = dmu_read_uio_dnode(dn, uio, size);
- DB_DNODE_EXIT(db);
-
- return (err);
-}
-
-/*
- * Read 'size' bytes into the uio buffer.
- * From the specified object
- * Starting at offset uio->uio_loffset.
- */
-int
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
-{
- dnode_t *dn;
- int err;
-
- if (size == 0)
- return (0);
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
-
- err = dmu_read_uio_dnode(dn, uio, size);
-
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-int
-dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs;
- int err = 0;
- int i;
-
- err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
- FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = uio->uio_loffset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
-#ifdef illumos
- /*
- * XXX uiomove could block forever (eg. nfs-backed
- * pages). There needs to be a uiolockdown() function
- * to lock the pages in memory, so that uiomove won't
- * block.
- */
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_WRITE, uio);
-#else
- err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
- uio);
-#endif
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- if (err)
- break;
-
- size -= tocpy;
- }
-
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (err);
-}
-
-/*
- * Write 'size' bytes from the uio buffer.
- * To object zdb->db_object.
- * Starting at offset uio->uio_loffset.
- *
- * If the caller already has a dbuf in the target object
- * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
- * because we don't have to find the dnode_t for the object.
- */
-int
-dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
- dnode_t *dn;
- int err;
-
- if (size == 0)
- return (0);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- err = dmu_write_uio_dnode(dn, uio, size, tx);
- DB_DNODE_EXIT(db);
-
- return (err);
-}
-
-/*
- * Write 'size' bytes from the uio buffer.
- * To the specified object.
- * Starting at offset uio->uio_loffset.
- */
-int
-dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- if (size == 0)
- return (0);
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
-
- err = dmu_write_uio_dnode(dn, uio, size, tx);
-
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-#ifdef illumos
-int
-dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- page_t *pp, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
- int err;
-
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy, copied, thiscpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
- caddr_t va;
-
- ASSERT(size > 0);
- ASSERT3U(db->db_size, >=, PAGESIZE);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- for (copied = 0; copied < tocpy; copied += PAGESIZE) {
- ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
- thiscpy = MIN(PAGESIZE, tocpy - copied);
- va = zfs_map_page(pp, S_READ);
- bcopy(va, (char *)db->db_data + bufoff, thiscpy);
- zfs_unmap_page(pp, va);
- pp = pp->p_next;
- bufoff += PAGESIZE;
- }
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- offset += tocpy;
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (err);
-}
-
-#else /* !illumos */
-
-int
-dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- vm_page_t *ma, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- struct sf_buf *sf;
- int numbufs, i;
- int err;
-
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy, copied, thiscpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
- caddr_t va;
-
- ASSERT(size > 0);
- ASSERT3U(db->db_size, >=, PAGESIZE);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- for (copied = 0; copied < tocpy; copied += PAGESIZE) {
- ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
- thiscpy = MIN(PAGESIZE, tocpy - copied);
- va = zfs_map_page(*ma, &sf);
- bcopy(va, (char *)db->db_data + bufoff, thiscpy);
- zfs_unmap_page(sf);
- ma += 1;
- bufoff += PAGESIZE;
- }
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- offset += tocpy;
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (err);
-}
-
-int
-dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
- int *rbehind, int *rahead, int last_size)
-{
- struct sf_buf *sf;
- vm_object_t vmobj;
- vm_page_t m;
- dmu_buf_t **dbp;
- dmu_buf_t *db;
- caddr_t va;
- int numbufs, i;
- int bufoff, pgoff, tocpy;
- int mi, di;
- int err;
-
- ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
- ASSERT(last_size <= PAGE_SIZE);
-
- err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
- IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
- if (err != 0)
- return (err);
-
-#ifdef DEBUG
- IMPLY(last_size < PAGE_SIZE, *rahead == 0);
- if (dbp[0]->db_offset != 0 || numbufs > 1) {
- for (i = 0; i < numbufs; i++) {
- ASSERT(ISP2(dbp[i]->db_size));
- ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
- ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
- }
- }
-#endif
-
- vmobj = ma[0]->object;
-
- db = dbp[0];
- for (i = 0; i < *rbehind; i++) {
- m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i,
- VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT |
- VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
- if (m == NULL)
- break;
- if (!vm_page_none_valid(m)) {
- ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
- vm_page_sunbusy(m);
- break;
- }
- ASSERT(m->dirty == 0);
- ASSERT(!pmap_page_is_write_mapped(m));
-
- ASSERT(db->db_size > PAGE_SIZE);
- bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
- va = zfs_map_page(m, &sf);
- bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
- zfs_unmap_page(sf);
- vm_page_valid(m);
- if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
- vm_page_activate(m);
- else
- vm_page_deactivate(m);
- vm_page_sunbusy(m);
- }
- *rbehind = i;
-
- bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
- pgoff = 0;
- for (mi = 0, di = 0; mi < count && di < numbufs; ) {
- if (pgoff == 0) {
- m = ma[mi];
- if (m != bogus_page) {
- vm_page_assert_xbusied(m);
- ASSERT(vm_page_none_valid(m));
- ASSERT(m->dirty == 0);
- ASSERT(!pmap_page_is_mapped(m));
- va = zfs_map_page(m, &sf);
- }
- }
- if (bufoff == 0)
- db = dbp[di];
-
- if (m != bogus_page) {
- ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
- db->db_offset + bufoff);
- }
-
- /*
- * We do not need to clamp the copy size by the file
- * size as the last block is zero-filled beyond the
- * end of file anyway.
- */
- tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
- if (m != bogus_page)
- bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
-
- pgoff += tocpy;
- ASSERT(pgoff <= PAGESIZE);
- if (pgoff == PAGESIZE) {
- if (m != bogus_page) {
- zfs_unmap_page(sf);
- vm_page_valid(m);
- }
- ASSERT(mi < count);
- mi++;
- pgoff = 0;
- }
-
- bufoff += tocpy;
- ASSERT(bufoff <= db->db_size);
- if (bufoff == db->db_size) {
- ASSERT(di < numbufs);
- di++;
- bufoff = 0;
- }
- }
-
-#ifdef DEBUG
- /*
- * Three possibilities:
- * - last requested page ends at a buffer boundary and , thus,
- * all pages and buffers have been iterated;
- * - all requested pages are filled, but the last buffer
- * has not been exhausted;
- * the read-ahead is possible only in this case;
- * - all buffers have been read, but the last page has not been
- * fully filled;
- * this is only possible if the file has only a single buffer
- * with a size that is not a multiple of the page size.
- */
- if (mi == count) {
- ASSERT(di >= numbufs - 1);
- IMPLY(*rahead != 0, di == numbufs - 1);
- IMPLY(*rahead != 0, bufoff != 0);
- ASSERT(pgoff == 0);
- }
- if (di == numbufs) {
- ASSERT(mi >= count - 1);
- ASSERT(*rahead == 0);
- IMPLY(pgoff == 0, mi == count);
- if (pgoff != 0) {
- ASSERT(mi == count - 1);
- ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
- }
- }
-#endif
- if (pgoff != 0) {
- ASSERT(m != bogus_page);
- bzero(va + pgoff, PAGESIZE - pgoff);
- zfs_unmap_page(sf);
- vm_page_valid(m);
- }
-
- for (i = 0; i < *rahead; i++) {
- m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i,
- VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT |
- VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
- if (m == NULL)
- break;
- if (!vm_page_none_valid(m)) {
- ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
- vm_page_sunbusy(m);
- break;
- }
- ASSERT(m->dirty == 0);
- ASSERT(!pmap_page_is_write_mapped(m));
-
- ASSERT(db->db_size > PAGE_SIZE);
- bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
- tocpy = MIN(db->db_size - bufoff, PAGESIZE);
- va = zfs_map_page(m, &sf);
- bcopy((char *)db->db_data + bufoff, va, tocpy);
- if (tocpy < PAGESIZE) {
- ASSERT(i == *rahead - 1);
- ASSERT((db->db_size & PAGE_MASK) != 0);
- bzero(va + tocpy, PAGESIZE - tocpy);
- }
- zfs_unmap_page(sf);
- vm_page_valid(m);
- if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
- vm_page_activate(m);
- else
- vm_page_deactivate(m);
- vm_page_sunbusy(m);
- }
- *rahead = i;
-
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (0);
-}
-#endif /* illumos */
-#endif /* _KERNEL */
-
-/*
- * Allocate a loaned anonymous arc buffer.
- */
-arc_buf_t *
-dmu_request_arcbuf(dmu_buf_t *handle, int size)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
-
- return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
-}
-
-/*
- * Free a loaned arc buffer.
- */
-void
-dmu_return_arcbuf(arc_buf_t *buf)
-{
- arc_return_buf(buf, FTAG);
- arc_buf_destroy(buf, FTAG);
-}
-
-/*
- * When possible directly assign passed loaned arc buffer to a dbuf.
- * If this is not possible copy the contents of passed arc buf via
- * dmu_write().
- */
-void
-dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
- uint64_t blkid;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, 0, offset);
- VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
- rw_exit(&dn->dn_struct_rwlock);
-
- /*
- * We can only assign if the offset is aligned, the arc buf is the
- * same size as the dbuf, and the dbuf is not metadata.
- */
- if (offset == db->db.db_offset && blksz == db->db.db_size) {
-#ifdef _KERNEL
- curthread->td_ru.ru_oublock++;
-#ifdef RACCT
- if (racct_enable) {
- PROC_LOCK(curproc);
- racct_add_force(curproc, RACCT_WRITEBPS, blksz);
- racct_add_force(curproc, RACCT_WRITEIOPS, 1);
- PROC_UNLOCK(curproc);
- }
-#endif /* RACCT */
-#endif /* _KERNEL */
- dbuf_assign_arcbuf(db, buf, tx);
- dbuf_rele(db, FTAG);
- } else {
- objset_t *os;
- uint64_t object;
-
- /* compressed bufs must always be assignable to their dbuf */
- ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
- ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
-
- os = dn->dn_objset;
- object = dn->dn_object;
-
- dbuf_rele(db, FTAG);
- dmu_write(os, object, offset, blksz, buf->b_data, tx);
- dmu_return_arcbuf(buf);
- XUIOSTAT_BUMP(xuiostat_wbuf_copied);
- }
-}
-
-void
-dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
-
- DB_DNODE_ENTER(dbuf);
- dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx);
- DB_DNODE_EXIT(dbuf);
-}
-
-typedef struct {
- dbuf_dirty_record_t *dsa_dr;
- dmu_sync_cb_t *dsa_done;
- zgd_t *dsa_zgd;
- dmu_tx_t *dsa_tx;
-} dmu_sync_arg_t;
-
-/* ARGSUSED */
-static void
-dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
-{
- dmu_sync_arg_t *dsa = varg;
- dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
- blkptr_t *bp = zio->io_bp;
-
- if (zio->io_error == 0) {
- if (BP_IS_HOLE(bp)) {
- /*
- * A block of zeros may compress to a hole, but the
- * block size still needs to be known for replay.
- */
- BP_SET_LSIZE(bp, db->db_size);
- } else if (!BP_IS_EMBEDDED(bp)) {
- ASSERT(BP_GET_LEVEL(bp) == 0);
- bp->blk_fill = 1;
- }
- }
-}
-
-static void
-dmu_sync_late_arrival_ready(zio_t *zio)
-{
- dmu_sync_ready(zio, NULL, zio->io_private);
-}
-
-/* ARGSUSED */
-static void
-dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
-{
- dmu_sync_arg_t *dsa = varg;
- dbuf_dirty_record_t *dr = dsa->dsa_dr;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- zgd_t *zgd = dsa->dsa_zgd;
-
- /*
- * Record the vdev(s) backing this blkptr so they can be flushed after
- * the writes for the lwb have completed.
- */
- if (zio->io_error == 0) {
- zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
- }
-
- mutex_enter(&db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
- if (zio->io_error == 0) {
- dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
- if (dr->dt.dl.dr_nopwrite) {
- blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
-
- ASSERT(BP_EQUAL(bp, bp_orig));
- VERIFY(BP_EQUAL(bp, db->db_blkptr));
- ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
- ASSERT(zio_checksum_table[chksum].ci_flags &
- ZCHECKSUM_FLAG_NOPWRITE);
- }
- dr->dt.dl.dr_overridden_by = *zio->io_bp;
- dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
- dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
-
- /*
- * Old style holes are filled with all zeros, whereas
- * new-style holes maintain their lsize, type, level,
- * and birth time (see zio_write_compress). While we
- * need to reset the BP_SET_LSIZE() call that happened
- * in dmu_sync_ready for old style holes, we do *not*
- * want to wipe out the information contained in new
- * style holes. Thus, only zero out the block pointer if
- * it's an old style hole.
- */
- if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
- dr->dt.dl.dr_overridden_by.blk_birth == 0)
- BP_ZERO(&dr->dt.dl.dr_overridden_by);
- } else {
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- }
- cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
-
- dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
-
- kmem_free(dsa, sizeof (*dsa));
-}
-
-static void
-dmu_sync_late_arrival_done(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- dmu_sync_arg_t *dsa = zio->io_private;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- zgd_t *zgd = dsa->dsa_zgd;
-
- if (zio->io_error == 0) {
- /*
- * Record the vdev(s) backing this blkptr so they can be
- * flushed after the writes for the lwb have completed.
- */
- zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-
- if (!BP_IS_HOLE(bp)) {
- ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
- ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
- ASSERT(zio->io_bp->blk_birth == zio->io_txg);
- ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
- zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
- }
- }
-
- dmu_tx_commit(dsa->dsa_tx);
-
- dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
-
- abd_put(zio->io_abd);
- kmem_free(dsa, sizeof (*dsa));
-}
-
-static int
-dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
- zio_prop_t *zp, zbookmark_phys_t *zb)
-{
- dmu_sync_arg_t *dsa;
- dmu_tx_t *tx;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
- if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
- dmu_tx_abort(tx);
- /* Make zl_get_data do txg_waited_synced() */
- return (SET_ERROR(EIO));
- }
-
- /*
- * In order to prevent the zgd's lwb from being free'd prior to
- * dmu_sync_late_arrival_done() being called, we have to ensure
- * the lwb's "max txg" takes this tx's txg into account.
- */
- zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
-
- dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
- dsa->dsa_dr = NULL;
- dsa->dsa_done = done;
- dsa->dsa_zgd = zgd;
- dsa->dsa_tx = tx;
-
- /*
- * Since we are currently syncing this txg, it's nontrivial to
- * determine what BP to nopwrite against, so we disable nopwrite.
- *
- * When syncing, the db_blkptr is initially the BP of the previous
- * txg. We can not nopwrite against it because it will be changed
- * (this is similar to the non-late-arrival case where the dbuf is
- * dirty in a future txg).
- *
- * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
- * We can not nopwrite against it because although the BP will not
- * (typically) be changed, the data has not yet been persisted to this
- * location.
- *
- * Finally, when dbuf_write_done() is called, it is theoretically
- * possible to always nopwrite, because the data that was written in
- * this txg is the same data that we are trying to write. However we
- * would need to check that this dbuf is not dirty in any future
- * txg's (as we do in the normal dmu_sync() path). For simplicity, we
- * don't nopwrite in this case.
- */
- zp->zp_nopwrite = B_FALSE;
-
- zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
- abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
- zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
- dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
- dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
-
- return (0);
-}
-
-/*
- * Intent log support: sync the block associated with db to disk.
- * N.B. and XXX: the caller is responsible for making sure that the
- * data isn't changing while dmu_sync() is writing it.
- *
- * Return values:
- *
- * EEXIST: this txg has already been synced, so there's nothing to do.
- * The caller should not log the write.
- *
- * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
- * The caller should not log the write.
- *
- * EALREADY: this block is already in the process of being synced.
- * The caller should track its progress (somehow).
- *
- * EIO: could not do the I/O.
- * The caller should do a txg_wait_synced().
- *
- * 0: the I/O has been initiated.
- * The caller should log this blkptr in the done callback.
- * It is possible that the I/O will fail, in which case
- * the error will be reported to the done callback and
- * propagated to pio from zio_done().
- */
-int
-dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
- objset_t *os = db->db_objset;
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dbuf_dirty_record_t *dr;
- dmu_sync_arg_t *dsa;
- zbookmark_phys_t zb;
- zio_prop_t zp;
- dnode_t *dn;
-
- ASSERT(pio != NULL);
- ASSERT(txg != 0);
-
- SET_BOOKMARK(&zb, ds->ds_object,
- db->db.db_object, db->db_level, db->db_blkid);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
- DB_DNODE_EXIT(db);
-
- /*
- * If we're frozen (running ziltest), we always need to generate a bp.
- */
- if (txg > spa_freeze_txg(os->os_spa))
- return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
-
- /*
- * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
- * and us. If we determine that this txg is not yet syncing,
- * but it begins to sync a moment later, that's OK because the
- * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
- */
- mutex_enter(&db->db_mtx);
-
- if (txg <= spa_last_synced_txg(os->os_spa)) {
- /*
- * This txg has already synced. There's nothing to do.
- */
- mutex_exit(&db->db_mtx);
- return (SET_ERROR(EEXIST));
- }
-
- if (txg <= spa_syncing_txg(os->os_spa)) {
- /*
- * This txg is currently syncing, so we can't mess with
- * the dirty record anymore; just write a new log block.
- */
- mutex_exit(&db->db_mtx);
- return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
- }
-
- dr = db->db_last_dirty;
- while (dr && dr->dr_txg != txg)
- dr = dr->dr_next;
-
- if (dr == NULL) {
- /*
- * There's no dr for this dbuf, so it must have been freed.
- * There's no need to log writes to freed blocks, so we're done.
- */
- mutex_exit(&db->db_mtx);
- return (SET_ERROR(ENOENT));
- }
-
- ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
-
- if (db->db_blkptr != NULL) {
- /*
- * We need to fill in zgd_bp with the current blkptr so that
- * the nopwrite code can check if we're writing the same
- * data that's already on disk. We can only nopwrite if we
- * are sure that after making the copy, db_blkptr will not
- * change until our i/o completes. We ensure this by
- * holding the db_mtx, and only allowing nopwrite if the
- * block is not already dirty (see below). This is verified
- * by dmu_sync_done(), which VERIFYs that the db_blkptr has
- * not changed.
- */
- *zgd->zgd_bp = *db->db_blkptr;
- }
-
- /*
- * Assume the on-disk data is X, the current syncing data (in
- * txg - 1) is Y, and the current in-memory data is Z (currently
- * in dmu_sync).
- *
- * We usually want to perform a nopwrite if X and Z are the
- * same. However, if Y is different (i.e. the BP is going to
- * change before this write takes effect), then a nopwrite will
- * be incorrect - we would override with X, which could have
- * been freed when Y was written.
- *
- * (Note that this is not a concern when we are nop-writing from
- * syncing context, because X and Y must be identical, because
- * all previous txgs have been synced.)
- *
- * Therefore, we disable nopwrite if the current BP could change
- * before this TXG. There are two ways it could change: by
- * being dirty (dr_next is non-NULL), or by being freed
- * (dnode_block_freed()). This behavior is verified by
- * zio_done(), which VERIFYs that the override BP is identical
- * to the on-disk BP.
- */
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
- zp.zp_nopwrite = B_FALSE;
- DB_DNODE_EXIT(db);
-
- ASSERT(dr->dr_txg == txg);
- if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
- dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * We have already issued a sync write for this buffer,
- * or this buffer has already been synced. It could not
- * have been dirtied since, or we would have cleared the state.
- */
- mutex_exit(&db->db_mtx);
- return (SET_ERROR(EALREADY));
- }
-
- ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
- dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
- mutex_exit(&db->db_mtx);
-
- dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
- dsa->dsa_dr = dr;
- dsa->dsa_done = done;
- dsa->dsa_zgd = zgd;
- dsa->dsa_tx = NULL;
-
- zio_nowait(arc_write(pio, os->os_spa, txg,
- zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
- &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
-
- return (0);
-}
-
-int
-dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
- err = dnode_set_blksz(dn, size, ibs, tx);
- dnode_rele(dn, FTAG);
- return (err);
-}
-
-void
-dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- /*
- * Send streams include each object's checksum function. This
- * check ensures that the receiving system can understand the
- * checksum function transmitted.
- */
- ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
-
- VERIFY0(dnode_hold(os, object, FTAG, &dn));
- ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
- dn->dn_checksum = checksum;
- dnode_setdirty(dn, tx);
- dnode_rele(dn, FTAG);
-}
-
-void
-dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- /*
- * Send streams include each object's compression function. This
- * check ensures that the receiving system can understand the
- * compression function transmitted.
- */
- ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
-
- VERIFY0(dnode_hold(os, object, FTAG, &dn));
- dn->dn_compress = compress;
- dnode_setdirty(dn, tx);
- dnode_rele(dn, FTAG);
-}
-
-int zfs_mdcomp_disable = 0;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
-
-/*
- * When the "redundant_metadata" property is set to "most", only indirect
- * blocks of this level and higher will have an additional ditto block.
- */
-int zfs_redundant_metadata_most_ditto_level = 2;
-
-void
-dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
-{
- dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
- boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
- (wp & WP_SPILL));
- enum zio_checksum checksum = os->os_checksum;
- enum zio_compress compress = os->os_compress;
- enum zio_checksum dedup_checksum = os->os_dedup_checksum;
- boolean_t dedup = B_FALSE;
- boolean_t nopwrite = B_FALSE;
- boolean_t dedup_verify = os->os_dedup_verify;
- int copies = os->os_copies;
-
- /*
- * We maintain different write policies for each of the following
- * types of data:
- * 1. metadata
- * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
- * 3. all other level 0 blocks
- */
- if (ismd) {
- if (zfs_mdcomp_disable) {
- compress = ZIO_COMPRESS_EMPTY;
- } else {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- compress = zio_compress_select(os->os_spa,
- ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
- }
-
- /*
- * Metadata always gets checksummed. If the data
- * checksum is multi-bit correctable, and it's not a
- * ZBT-style checksum, then it's suitable for metadata
- * as well. Otherwise, the metadata checksum defaults
- * to fletcher4.
- */
- if (!(zio_checksum_table[checksum].ci_flags &
- ZCHECKSUM_FLAG_METADATA) ||
- (zio_checksum_table[checksum].ci_flags &
- ZCHECKSUM_FLAG_EMBEDDED))
- checksum = ZIO_CHECKSUM_FLETCHER_4;
-
- if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
- (os->os_redundant_metadata ==
- ZFS_REDUNDANT_METADATA_MOST &&
- (level >= zfs_redundant_metadata_most_ditto_level ||
- DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
- copies++;
- } else if (wp & WP_NOFILL) {
- ASSERT(level == 0);
-
- /*
- * If we're writing preallocated blocks, we aren't actually
- * writing them so don't set any policy properties. These
- * blocks are currently only used by an external subsystem
- * outside of zfs (i.e. dump) and not written by the zio
- * pipeline.
- */
- compress = ZIO_COMPRESS_OFF;
- checksum = ZIO_CHECKSUM_NOPARITY;
- } else {
- compress = zio_compress_select(os->os_spa, dn->dn_compress,
- compress);
-
- checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
- zio_checksum_select(dn->dn_checksum, checksum) :
- dedup_checksum;
-
- /*
- * Determine dedup setting. If we are in dmu_sync(),
- * we won't actually dedup now because that's all
- * done in syncing context; but we do want to use the
- * dedup checkum. If the checksum is not strong
- * enough to ensure unique signatures, force
- * dedup_verify.
- */
- if (dedup_checksum != ZIO_CHECKSUM_OFF) {
- dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
- if (!(zio_checksum_table[checksum].ci_flags &
- ZCHECKSUM_FLAG_DEDUP))
- dedup_verify = B_TRUE;
- }
-
- /*
- * Enable nopwrite if we have secure enough checksum
- * algorithm (see comment in zio_nop_write) and
- * compression is enabled. We don't enable nopwrite if
- * dedup is enabled as the two features are mutually
- * exclusive.
- */
- nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
- ZCHECKSUM_FLAG_NOPWRITE) &&
- compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
- }
-
- zp->zp_checksum = checksum;
- zp->zp_compress = compress;
- ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
-
- zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
- zp->zp_level = level;
- zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
- zp->zp_dedup = dedup;
- zp->zp_dedup_verify = dedup && dedup_verify;
- zp->zp_nopwrite = nopwrite;
- zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
- os->os_zpl_special_smallblock : 0;
-}
-
-int
-dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
-{
- dnode_t *dn;
- int err;
-
- /*
- * Sync any current changes before
- * we go trundling through the block pointers.
- */
- err = dmu_object_wait_synced(os, object);
- if (err) {
- return (err);
- }
-
- err = dnode_hold(os, object, FTAG, &dn);
- if (err) {
- return (err);
- }
-
- err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-/*
- * Given the ZFS object, if it contains any dirty nodes
- * this function flushes all dirty blocks to disk. This
- * ensures the DMU object info is updated. A more efficient
- * future version might just find the TXG with the maximum
- * ID and wait for that to be synced.
- */
-int
-dmu_object_wait_synced(objset_t *os, uint64_t object)
-{
- dnode_t *dn;
- int error, i;
-
- error = dnode_hold(os, object, FTAG, &dn);
- if (error) {
- return (error);
- }
-
- for (i = 0; i < TXG_SIZE; i++) {
- if (list_link_active(&dn->dn_dirty_link[i])) {
- break;
- }
- }
- dnode_rele(dn, FTAG);
- if (i != TXG_SIZE) {
- txg_wait_synced(dmu_objset_pool(os), 0);
- }
-
- return (0);
-}
-
-void
-__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
-{
- dnode_phys_t *dnp = dn->dn_phys;
-
- doi->doi_data_block_size = dn->dn_datablksz;
- doi->doi_metadata_block_size = dn->dn_indblkshift ?
- 1ULL << dn->dn_indblkshift : 0;
- doi->doi_type = dn->dn_type;
- doi->doi_bonus_type = dn->dn_bonustype;
- doi->doi_bonus_size = dn->dn_bonuslen;
- doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
- doi->doi_indirection = dn->dn_nlevels;
- doi->doi_checksum = dn->dn_checksum;
- doi->doi_compress = dn->dn_compress;
- doi->doi_nblkptr = dn->dn_nblkptr;
- doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
- doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
- doi->doi_fill_count = 0;
- for (int i = 0; i < dnp->dn_nblkptr; i++)
- doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
-}
-
-void
-dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
-{
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- mutex_enter(&dn->dn_mtx);
-
- __dmu_object_info_from_dnode(dn, doi);
-
- mutex_exit(&dn->dn_mtx);
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-/*
- * Get information on a DMU object.
- * If doi is NULL, just indicates whether the object exists.
- */
-int
-dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
-{
- dnode_t *dn;
- int err = dnode_hold(os, object, FTAG, &dn);
-
- if (err)
- return (err);
-
- if (doi != NULL)
- dmu_object_info_from_dnode(dn, doi);
-
- dnode_rele(dn, FTAG);
- return (0);
-}
-
-/*
- * As above, but faster; can be used when you have a held dbuf in hand.
- */
-void
-dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- DB_DNODE_ENTER(db);
- dmu_object_info_from_dnode(DB_DNODE(db), doi);
- DB_DNODE_EXIT(db);
-}
-
-/*
- * Faster still when you only care about the size.
- * This is specifically optimized for zfs_getattr().
- */
-void
-dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
- u_longlong_t *nblk512)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- *blksize = dn->dn_datablksz;
- /* add in number of slots used for the dnode itself */
- *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
- SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
- DB_DNODE_EXIT(db);
-}
-
-void
-dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- *dnsize = dn->dn_num_slots << DNODE_SHIFT;
- DB_DNODE_EXIT(db);
-}
-
-void
-byteswap_uint64_array(void *vbuf, size_t size)
-{
- uint64_t *buf = vbuf;
- size_t count = size >> 3;
- int i;
-
- ASSERT((size & 7) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_64(buf[i]);
-}
-
-void
-byteswap_uint32_array(void *vbuf, size_t size)
-{
- uint32_t *buf = vbuf;
- size_t count = size >> 2;
- int i;
-
- ASSERT((size & 3) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_32(buf[i]);
-}
-
-void
-byteswap_uint16_array(void *vbuf, size_t size)
-{
- uint16_t *buf = vbuf;
- size_t count = size >> 1;
- int i;
-
- ASSERT((size & 1) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_16(buf[i]);
-}
-
-/* ARGSUSED */
-void
-byteswap_uint8_array(void *vbuf, size_t size)
-{
-}
-
-void
-dmu_init(void)
-{
- abd_init();
- zfs_dbgmsg_init();
- sa_cache_init();
- xuio_stat_init();
- dmu_objset_init();
- dnode_init();
- zfetch_init();
- zio_compress_init();
- l2arc_init();
- arc_init();
- dbuf_init();
-}
-
-void
-dmu_fini(void)
-{
- arc_fini(); /* arc depends on l2arc, so arc must go first */
- l2arc_fini();
- zfetch_fini();
- zio_compress_fini();
- dbuf_fini();
- dnode_fini();
- dmu_objset_fini();
- xuio_stat_fini();
- sa_cache_fini();
- zfs_dbgmsg_fini();
- abd_fini();
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
@@ -1,251 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- */
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-#include <sys/zfs_znode.h>
-
-struct diffarg {
- struct file *da_fp; /* file to which we are reporting */
- offset_t *da_offp;
- int da_err; /* error that stopped diff search */
- dmu_diff_record_t da_ddr;
- kthread_t *da_td;
-};
-
-static int
-write_bytes(struct diffarg *da)
-{
- struct uio auio;
- struct iovec aiov;
-
- aiov.iov_base = (caddr_t)&da->da_ddr;
- aiov.iov_len = sizeof (da->da_ddr);
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_resid = aiov.iov_len;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_rw = UIO_WRITE;
- auio.uio_offset = (off_t)-1;
- auio.uio_td = da->da_td;
-#ifdef _KERNEL
- if (da->da_fp->f_type == DTYPE_VNODE)
- bwillwrite();
- return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td));
-#else
- fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
- return (EOPNOTSUPP);
-#endif
-}
-
-static int
-write_record(struct diffarg *da)
-{
-
- if (da->da_ddr.ddr_type == DDR_NONE) {
- da->da_err = 0;
- return (0);
- }
-
- da->da_err = write_bytes(da);
- *da->da_offp += sizeof (da->da_ddr);
- return (da->da_err);
-}
-
-static int
-report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
-{
- ASSERT(first <= last);
- if (da->da_ddr.ddr_type != DDR_FREE ||
- first != da->da_ddr.ddr_last + 1) {
- if (write_record(da) != 0)
- return (da->da_err);
- da->da_ddr.ddr_type = DDR_FREE;
- da->da_ddr.ddr_first = first;
- da->da_ddr.ddr_last = last;
- return (0);
- }
- da->da_ddr.ddr_last = last;
- return (0);
-}
-
-static int
-report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
-{
- ASSERT(dnp != NULL);
- if (dnp->dn_type == DMU_OT_NONE)
- return (report_free_dnode_range(da, object, object));
-
- if (da->da_ddr.ddr_type != DDR_INUSE ||
- object != da->da_ddr.ddr_last + 1) {
- if (write_record(da) != 0)
- return (da->da_err);
- da->da_ddr.ddr_type = DDR_INUSE;
- da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
- return (0);
- }
- da->da_ddr.ddr_last = object;
- return (0);
-}
-
-#define DBP_SPAN(dnp, level) \
- (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
-
-/* ARGSUSED */
-static int
-diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- struct diffarg *da = arg;
- int err = 0;
-
- if (issig(JUSTLOOKING) && issig(FORREAL))
- return (SET_ERROR(EINTR));
-
- if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
- return (0);
-
- if (BP_IS_HOLE(bp)) {
- uint64_t span = DBP_SPAN(dnp, zb->zb_level);
- uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
-
- err = report_free_dnode_range(da, dnobj,
- dnobj + (span >> DNODE_SHIFT) - 1);
- if (err)
- return (err);
- } else if (zb->zb_level == 0) {
- dnode_phys_t *blk;
- arc_buf_t *abuf;
- arc_flags_t aflags = ARC_FLAG_WAIT;
- int blksz = BP_GET_LSIZE(bp);
- int i;
-
- if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &aflags, zb) != 0)
- return (SET_ERROR(EIO));
-
- blk = abuf->b_data;
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
- uint64_t dnobj = (zb->zb_blkid <<
- (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
- err = report_dnode(da, dnobj, blk+i);
- if (err)
- break;
- }
- arc_buf_destroy(abuf, &abuf);
- if (err)
- return (err);
- /* Don't care about the data blocks */
- return (TRAVERSE_VISIT_NO_CHILDREN);
- }
- return (0);
-}
-
-int
-dmu_diff(const char *tosnap_name, const char *fromsnap_name,
-#ifdef illumos
- struct vnode *vp, offset_t *offp)
-#else
- struct file *fp, offset_t *offp)
-#endif
-{
- struct diffarg da;
- dsl_dataset_t *fromsnap;
- dsl_dataset_t *tosnap;
- dsl_pool_t *dp;
- int error;
- uint64_t fromtxg;
-
- if (strchr(tosnap_name, '@') == NULL ||
- strchr(fromsnap_name, '@') == NULL)
- return (SET_ERROR(EINVAL));
-
- error = dsl_pool_hold(tosnap_name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
- if (error != 0) {
- dsl_dataset_rele(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
- dsl_dataset_rele(fromsnap, FTAG);
- dsl_dataset_rele(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (SET_ERROR(EXDEV));
- }
-
- fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
- dsl_dataset_rele(fromsnap, FTAG);
-
- dsl_dataset_long_hold(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
-
- da.da_fp = fp;
- da.da_offp = offp;
- da.da_ddr.ddr_type = DDR_NONE;
- da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
- da.da_err = 0;
- da.da_td = curthread;
-
- error = traverse_dataset(tosnap, fromtxg,
- TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
-
- if (error != 0) {
- da.da_err = error;
- } else {
- /* we set the da.da_err we return as side-effect */
- (void) write_record(&da);
- }
-
- dsl_dataset_long_rele(tosnap, FTAG);
- dsl_dataset_rele(tosnap, FTAG);
-
- return (da.da_err);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -1,444 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- * Copyright 2014 HybridCluster. All rights reserved.
- */
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dnode.h>
-#include <sys/zap.h>
-#include <sys/zfeature.h>
-#include <sys/dsl_dataset.h>
-
-/*
- * Each of the concurrent object allocators will grab
- * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
- * grab 128 slots, which is 4 blocks worth. This was experimentally
- * determined to be the lowest value that eliminates the measurable effect
- * of lock contention from this code path.
- */
-int dmu_object_alloc_chunk_shift = 7;
-
-static uint64_t
-dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
- int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
- int dnodesize, dmu_tx_t *tx)
-{
- uint64_t object;
- uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
- (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
- dnode_t *dn = NULL;
- int dn_slots = dnodesize >> DNODE_SHIFT;
- boolean_t restarted = B_FALSE;
- uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
- os->os_obj_next_percpu_len];
- int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
- int error;
-
- if (dn_slots == 0) {
- dn_slots = DNODE_MIN_SLOTS;
- } else {
- ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
- ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
- }
-
- /*
- * The "chunk" of dnodes that is assigned to a CPU-specific
- * allocator needs to be at least one block's worth, to avoid
- * lock contention on the dbuf. It can be at most one L1 block's
- * worth, so that the "rescan after polishing off a L1's worth"
- * logic below will be sure to kick in.
- */
- if (dnodes_per_chunk < DNODES_PER_BLOCK)
- dnodes_per_chunk = DNODES_PER_BLOCK;
- if (dnodes_per_chunk > L1_dnode_count)
- dnodes_per_chunk = L1_dnode_count;
-
-#ifdef __FreeBSD__
- object = atomic_load_64(cpuobj);
-#else
- object = *cpuobj;
-#endif
-
- for (;;) {
- /*
- * If we finished a chunk of dnodes, get a new one from
- * the global allocator.
- */
- if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
- (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
- dn_slots)) {
- DNODE_STAT_BUMP(dnode_alloc_next_chunk);
- mutex_enter(&os->os_obj_lock);
- ASSERT0(P2PHASE(os->os_obj_next_chunk,
- dnodes_per_chunk));
- object = os->os_obj_next_chunk;
-
- /*
- * Each time we polish off a L1 bp worth of dnodes
- * (2^12 objects), move to another L1 bp that's
- * still reasonably sparse (at most 1/4 full). Look
- * from the beginning at most once per txg. If we
- * still can't allocate from that L1 block, search
- * for an empty L0 block, which will quickly skip
- * to the end of the metadnode if the no nearby L0
- * blocks are empty. This fallback avoids a
- * pathology where full dnode blocks containing
- * large dnodes appear sparse because they have a
- * low blk_fill, leading to many failed allocation
- * attempts. In the long term a better mechanism to
- * search for sparse metadnode regions, such as
- * spacemaps, could be implemented.
- *
- * os_scan_dnodes is set during txg sync if enough
- * objects have been freed since the previous
- * rescan to justify backfilling again.
- *
- * Note that dmu_traverse depends on the behavior
- * that we use multiple blocks of the dnode object
- * before going back to reuse objects. Any change
- * to this algorithm should preserve that property
- * or find another solution to the issues described
- * in traverse_visitbp.
- */
- if (P2PHASE(object, L1_dnode_count) == 0) {
- uint64_t offset;
- uint64_t blkfill;
- int minlvl;
- if (os->os_rescan_dnodes) {
- offset = 0;
- os->os_rescan_dnodes = B_FALSE;
- } else {
- offset = object << DNODE_SHIFT;
- }
- blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
- minlvl = restarted ? 1 : 2;
- restarted = B_TRUE;
- error = dnode_next_offset(DMU_META_DNODE(os),
- DNODE_FIND_HOLE, &offset, minlvl,
- blkfill, 0);
- if (error == 0) {
- object = offset >> DNODE_SHIFT;
- }
- }
- /*
- * Note: if "restarted", we may find a L0 that
- * is not suitably aligned.
- */
- os->os_obj_next_chunk =
- P2ALIGN(object, dnodes_per_chunk) +
- dnodes_per_chunk;
- (void) atomic_swap_64(cpuobj, object);
- mutex_exit(&os->os_obj_lock);
- }
-
- /*
- * The value of (*cpuobj) before adding dn_slots is the object
- * ID assigned to us. The value afterwards is the object ID
- * assigned to whoever wants to do an allocation next.
- */
- object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
-
- /*
- * XXX We should check for an i/o error here and return
- * up to our caller. Actually we should pre-read it in
- * dmu_tx_assign(), but there is currently no mechanism
- * to do so.
- */
- error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
- dn_slots, FTAG, &dn);
- if (error == 0) {
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- /*
- * Another thread could have allocated it; check
- * again now that we have the struct lock.
- */
- if (dn->dn_type == DMU_OT_NONE) {
- dnode_allocate(dn, ot, blocksize, 0,
- bonustype, bonuslen, dn_slots, tx);
- rw_exit(&dn->dn_struct_rwlock);
- dmu_tx_add_new_object(tx, dn);
- dnode_rele(dn, FTAG);
- return (object);
- }
- rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
- DNODE_STAT_BUMP(dnode_alloc_race);
- }
-
- /*
- * Skip to next known valid starting point on error. This
- * is the start of the next block of dnodes.
- */
- if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
- object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
- DNODE_STAT_BUMP(dnode_alloc_next_block);
- }
- (void) atomic_swap_64(cpuobj, object);
- }
-}
-
-uint64_t
-dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
- bonuslen, 0, tx));
-}
-
-uint64_t
-dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
- int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
- dmu_tx_t *tx)
-{
- return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
- bonustype, bonuslen, 0, tx));
-}
-
-uint64_t
-dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
- return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
- bonuslen, dnodesize, tx));
-}
-
-int
-dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
- bonuslen, 0, tx));
-}
-
-int
-dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen,
- int dnodesize, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int dn_slots = dnodesize >> DNODE_SHIFT;
- int err;
-
- if (dn_slots == 0)
- dn_slots = DNODE_MIN_SLOTS;
- ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
- ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
-
- if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
- return (SET_ERROR(EBADF));
-
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
- FTAG, &dn);
- if (err)
- return (err);
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
- dmu_tx_add_new_object(tx, dn);
-
- dnode_rele(dn, FTAG);
-
- return (0);
-}
-
-int
-dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
- bonuslen, DNODE_MIN_SIZE, tx));
-}
-
-int
-dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
- int dn_slots = dnodesize >> DNODE_SHIFT;
- int err;
-
- if (dn_slots == 0)
- dn_slots = DNODE_MIN_SLOTS;
-
- if (object == DMU_META_DNODE_OBJECT)
- return (SET_ERROR(EBADF));
-
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
- FTAG, &dn);
- if (err)
- return (err);
-
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
-
- dnode_rele(dn, FTAG);
- return (err);
-}
-
-
-int
-dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
-
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
- FTAG, &dn);
- if (err)
- return (err);
-
- ASSERT(dn->dn_type != DMU_OT_NONE);
- /*
- * If we don't create this free range, we'll leak indirect blocks when
- * we get to freeing the dnode in syncing context.
- */
- dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
- dnode_free(dn, tx);
- dnode_rele(dn, FTAG);
-
- return (0);
-}
-
-/*
- * Return (in *objectp) the next object which is allocated (or a hole)
- * after *object, taking into account only objects that may have been modified
- * after the specified txg.
- */
-int
-dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
-{
- uint64_t offset;
- uint64_t start_obj;
- struct dsl_dataset *ds = os->os_dsl_dataset;
- int error;
-
- if (*objectp == 0) {
- start_obj = 1;
- } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
- uint64_t i = *objectp + 1;
- uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
- dmu_object_info_t doi;
-
- /*
- * Scan through the remaining meta dnode block. The contents
- * of each slot in the block are known so it can be quickly
- * checked. If the block is exhausted without a match then
- * hand off to dnode_next_offset() for further scanning.
- */
- while (i <= last_obj) {
- error = dmu_object_info(os, i, &doi);
- if (error == ENOENT) {
- if (hole) {
- *objectp = i;
- return (0);
- } else {
- i++;
- }
- } else if (error == EEXIST) {
- i++;
- } else if (error == 0) {
- if (hole) {
- i += doi.doi_dnodesize >> DNODE_SHIFT;
- } else {
- *objectp = i;
- return (0);
- }
- } else {
- return (error);
- }
- }
-
- start_obj = i;
- } else {
- start_obj = *objectp + 1;
- }
-
- offset = start_obj << DNODE_SHIFT;
-
- error = dnode_next_offset(DMU_META_DNODE(os),
- (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
-
- *objectp = offset >> DNODE_SHIFT;
-
- return (error);
-}
-
-/*
- * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
- * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
- *
- * Only for use from syncing context, on MOS objects.
- */
-void
-dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- VERIFY0(dnode_hold(mos, object, FTAG, &dn));
- if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
- dnode_rele(dn, FTAG);
- return;
- }
- ASSERT3U(dn->dn_type, ==, old_type);
- ASSERT0(dn->dn_maxblkid);
-
- /*
- * We must initialize the ZAP data before changing the type,
- * so that concurrent calls to *_is_zapified() can determine if
- * the object has been completely zapified by checking the type.
- */
- mzap_create_impl(mos, object, 0, 0, tx);
-
- dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
- DMU_OTN_ZAP_METADATA;
- dnode_setdirty(dn, tx);
- dnode_rele(dn, FTAG);
-
- spa_feature_incr(dmu_objset_spa(mos),
- SPA_FEATURE_EXTENSIBLE_DATASET, tx);
-}
-
-void
-dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
-{
- dnode_t *dn;
- dmu_object_type_t t;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- VERIFY0(dnode_hold(mos, object, FTAG, &dn));
- t = dn->dn_type;
- dnode_rele(dn, FTAG);
-
- if (t == DMU_OTN_ZAP_METADATA) {
- spa_feature_decr(dmu_objset_spa(mos),
- SPA_FEATURE_EXTENSIBLE_DATASET, tx);
- }
- VERIFY0(dmu_object_free(mos, object, tx));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -1,2484 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Nexenta Systems, Inc.
- * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#include <sys/cred.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_deleg.h>
-#include <sys/dnode.h>
-#include <sys/dbuf.h>
-#include <sys/zvol.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/dmu_impl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/sa.h>
-#include <sys/zfs_onexit.h>
-#include <sys/dsl_destroy.h>
-#include <sys/vdev.h>
-#include <sys/zfeature.h>
-#include "zfs_namecheck.h"
-
-/*
- * Needed to close a window in dnode_move() that allows the objset to be freed
- * before it can be safely accessed.
- */
-krwlock_t os_lock;
-
-/*
- * Tunable to overwrite the maximum number of threads for the parallization
- * of dmu_objset_find_dp, needed to speed up the import of pools with many
- * datasets.
- * Default is 4 times the number of leaf vdevs.
- */
-int dmu_find_threads = 0;
-
-/*
- * Backfill lower metadnode objects after this many have been freed.
- * Backfilling negatively impacts object creation rates, so only do it
- * if there are enough holes to fill.
- */
-int dmu_rescan_dnode_threshold = 131072;
-
-static void dmu_objset_find_dp_cb(void *arg);
-
-void
-dmu_objset_init(void)
-{
- rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
-}
-
-void
-dmu_objset_fini(void)
-{
- rw_destroy(&os_lock);
-}
-
-spa_t *
-dmu_objset_spa(objset_t *os)
-{
- return (os->os_spa);
-}
-
-zilog_t *
-dmu_objset_zil(objset_t *os)
-{
- return (os->os_zil);
-}
-
-dsl_pool_t *
-dmu_objset_pool(objset_t *os)
-{
- dsl_dataset_t *ds;
-
- if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
- return (ds->ds_dir->dd_pool);
- else
- return (spa_get_dsl(os->os_spa));
-}
-
-dsl_dataset_t *
-dmu_objset_ds(objset_t *os)
-{
- return (os->os_dsl_dataset);
-}
-
-dmu_objset_type_t
-dmu_objset_type(objset_t *os)
-{
- return (os->os_phys->os_type);
-}
-
-void
-dmu_objset_name(objset_t *os, char *buf)
-{
- dsl_dataset_name(os->os_dsl_dataset, buf);
-}
-
-uint64_t
-dmu_objset_id(objset_t *os)
-{
- dsl_dataset_t *ds = os->os_dsl_dataset;
-
- return (ds ? ds->ds_object : 0);
-}
-
-uint64_t
-dmu_objset_dnodesize(objset_t *os)
-{
- return (os->os_dnodesize);
-}
-
-zfs_sync_type_t
-dmu_objset_syncprop(objset_t *os)
-{
- return (os->os_sync);
-}
-
-zfs_logbias_op_t
-dmu_objset_logbias(objset_t *os)
-{
- return (os->os_logbias);
-}
-
-static void
-checksum_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance should have been done by now.
- */
- ASSERT(newval != ZIO_CHECKSUM_INHERIT);
-
- os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
-}
-
-static void
-compression_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval != ZIO_COMPRESS_INHERIT);
-
- os->os_compress = zio_compress_select(os->os_spa, newval,
- ZIO_COMPRESS_ON);
-}
-
-static void
-copies_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval > 0);
- ASSERT(newval <= spa_max_replication(os->os_spa));
-
- os->os_copies = newval;
-}
-
-static void
-dedup_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
- spa_t *spa = os->os_spa;
- enum zio_checksum checksum;
-
- /*
- * Inheritance should have been done by now.
- */
- ASSERT(newval != ZIO_CHECKSUM_INHERIT);
-
- checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
-
- os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
- os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
-}
-
-static void
-primary_cache_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
- newval == ZFS_CACHE_METADATA);
-
- os->os_primary_cache = newval;
-}
-
-static void
-secondary_cache_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
- newval == ZFS_CACHE_METADATA);
-
- os->os_secondary_cache = newval;
-}
-
-static void
-sync_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
- newval == ZFS_SYNC_DISABLED);
-
- os->os_sync = newval;
- if (os->os_zil)
- zil_set_sync(os->os_zil, newval);
-}
-
-static void
-redundant_metadata_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
- newval == ZFS_REDUNDANT_METADATA_MOST);
-
- os->os_redundant_metadata = newval;
-}
-
-static void
-dnodesize_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- switch (newval) {
- case ZFS_DNSIZE_LEGACY:
- os->os_dnodesize = DNODE_MIN_SIZE;
- break;
- case ZFS_DNSIZE_AUTO:
- /*
- * Choose a dnode size that will work well for most
- * workloads if the user specified "auto". Future code
- * improvements could dynamically select a dnode size
- * based on observed workload patterns.
- */
- os->os_dnodesize = DNODE_MIN_SIZE * 2;
- break;
- case ZFS_DNSIZE_1K:
- case ZFS_DNSIZE_2K:
- case ZFS_DNSIZE_4K:
- case ZFS_DNSIZE_8K:
- case ZFS_DNSIZE_16K:
- os->os_dnodesize = newval;
- break;
- }
-}
-
-static void
-smallblk_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
- ASSERT(ISP2(newval));
-
- os->os_zpl_special_smallblock = newval;
-}
-
-static void
-logbias_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
- newval == ZFS_LOGBIAS_THROUGHPUT);
- os->os_logbias = newval;
- if (os->os_zil)
- zil_set_logbias(os->os_zil, newval);
-}
-
-static void
-recordsize_changed_cb(void *arg, uint64_t newval)
-{
- objset_t *os = arg;
-
- os->os_recordsize = newval;
-}
-
-void
-dmu_objset_byteswap(void *buf, size_t size)
-{
- objset_phys_t *osp = buf;
-
- ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
- dnode_byteswap(&osp->os_meta_dnode);
- byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
- osp->os_type = BSWAP_64(osp->os_type);
- osp->os_flags = BSWAP_64(osp->os_flags);
- if (size == sizeof (objset_phys_t)) {
- dnode_byteswap(&osp->os_userused_dnode);
- dnode_byteswap(&osp->os_groupused_dnode);
- }
-}
-
-/*
- * The hash is a CRC-based hash of the objset_t pointer and the object number.
- */
-static uint64_t
-dnode_hash(const objset_t *os, uint64_t obj)
-{
- uintptr_t osv = (uintptr_t)os;
- uint64_t crc = -1ULL;
-
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- /*
- * The low 6 bits of the pointer don't have much entropy, because
- * the objset_t is larger than 2^6 bytes long.
- */
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
-
- crc ^= (osv>>14) ^ (obj>>24);
-
- return (crc);
-}
-
-unsigned int
-dnode_multilist_index_func(multilist_t *ml, void *obj)
-{
- dnode_t *dn = obj;
- return (dnode_hash(dn->dn_objset, dn->dn_object) %
- multilist_get_num_sublists(ml));
-}
-
-/*
- * Instantiates the objset_t in-memory structure corresponding to the
- * objset_phys_t that's pointed to by the specified blkptr_t.
- */
-int
-dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- objset_t **osp)
-{
- objset_t *os;
- int i, err;
-
- ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
-
-#if 0
- /*
- * The $ORIGIN dataset (if it exists) doesn't have an associated
- * objset, so there's no reason to open it. The $ORIGIN dataset
- * will not exist on pools older than SPA_VERSION_ORIGIN.
- */
- if (ds != NULL && spa_get_dsl(spa) != NULL &&
- spa_get_dsl(spa)->dp_origin_snap != NULL) {
- ASSERT3P(ds->ds_dir, !=,
- spa_get_dsl(spa)->dp_origin_snap->ds_dir);
- }
-#endif
-
- os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
- os->os_dsl_dataset = ds;
- os->os_spa = spa;
- os->os_rootbp = bp;
- if (!BP_IS_HOLE(os->os_rootbp)) {
- arc_flags_t aflags = ARC_FLAG_WAIT;
- zbookmark_phys_t zb;
- SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-
- if (DMU_OS_IS_L2CACHEABLE(os))
- aflags |= ARC_FLAG_L2CACHE;
-
- dprintf_bp(os->os_rootbp, "reading %s", "");
- err = arc_read(NULL, spa, os->os_rootbp,
- arc_getbuf_func, &os->os_phys_buf,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
- if (err != 0) {
- kmem_free(os, sizeof (objset_t));
- /* convert checksum errors into IO errors */
- if (err == ECKSUM)
- err = SET_ERROR(EIO);
- return (err);
- }
-
- /* Increase the blocksize if we are permitted. */
- if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
- arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
- arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
- ARC_BUFC_METADATA, sizeof (objset_phys_t));
- bzero(buf->b_data, sizeof (objset_phys_t));
- bcopy(os->os_phys_buf->b_data, buf->b_data,
- arc_buf_size(os->os_phys_buf));
- arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
- os->os_phys_buf = buf;
- }
-
- os->os_phys = os->os_phys_buf->b_data;
- os->os_flags = os->os_phys->os_flags;
- } else {
- int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
- sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
- os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
- ARC_BUFC_METADATA, size);
- os->os_phys = os->os_phys_buf->b_data;
- bzero(os->os_phys, size);
- }
-
- /*
- * Note: the changed_cb will be called once before the register
- * func returns, thus changing the checksum/compression from the
- * default (fletcher2/off). Snapshots don't need to know about
- * checksum/compression/copies.
- */
- if (ds != NULL) {
- boolean_t needlock = B_FALSE;
-
- /*
- * Note: it's valid to open the objset if the dataset is
- * long-held, in which case the pool_config lock will not
- * be held.
- */
- if (!dsl_pool_config_held(dmu_objset_pool(os))) {
- needlock = B_TRUE;
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
- }
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
- primary_cache_changed_cb, os);
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
- secondary_cache_changed_cb, os);
- }
- if (!ds->ds_is_snapshot) {
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- checksum_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- compression_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_COPIES),
- copies_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_DEDUP),
- dedup_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_LOGBIAS),
- logbias_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_SYNC),
- sync_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(
- ZFS_PROP_REDUNDANT_METADATA),
- redundant_metadata_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
- recordsize_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_DNODESIZE),
- dnodesize_changed_cb, os);
- }
- if (err == 0) {
- err = dsl_prop_register(ds,
- zfs_prop_to_name(
- ZFS_PROP_SPECIAL_SMALL_BLOCKS),
- smallblk_changed_cb, os);
- }
- }
- if (needlock)
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
- if (err != 0) {
- arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
- kmem_free(os, sizeof (objset_t));
- return (err);
- }
- } else {
- /* It's the meta-objset. */
- os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- os->os_compress = ZIO_COMPRESS_ON;
- os->os_copies = spa_max_replication(spa);
- os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
- os->os_dedup_verify = B_FALSE;
- os->os_logbias = ZFS_LOGBIAS_LATENCY;
- os->os_sync = ZFS_SYNC_STANDARD;
- os->os_primary_cache = ZFS_CACHE_ALL;
- os->os_secondary_cache = ZFS_CACHE_ALL;
- os->os_dnodesize = DNODE_MIN_SIZE;
- }
- /*
- * These properties will be filled in by the logic in zfs_get_zplprop()
- * when they are queried for the first time.
- */
- os->os_version = OBJSET_PROP_UNINITIALIZED;
- os->os_normalization = OBJSET_PROP_UNINITIALIZED;
- os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
- os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
-
- if (ds == NULL || !ds->ds_is_snapshot)
- os->os_zil_header = os->os_phys->os_zil_header;
- os->os_zil = zil_alloc(os, &os->os_zil_header);
-
- for (i = 0; i < TXG_SIZE; i++) {
- os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[i]),
- dnode_multilist_index_func);
- }
- list_create(&os->os_dnodes, sizeof (dnode_t),
- offsetof(dnode_t, dn_link));
- list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_link));
-
- mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
- os->os_obj_next_percpu_len = boot_ncpus;
- os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
- sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
-
- dnode_special_open(os, &os->os_phys->os_meta_dnode,
- DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
- if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
- dnode_special_open(os, &os->os_phys->os_userused_dnode,
- DMU_USERUSED_OBJECT, &os->os_userused_dnode);
- dnode_special_open(os, &os->os_phys->os_groupused_dnode,
- DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
- }
-
- *osp = os;
- return (0);
-}
-
-int
-dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
-{
- int err = 0;
-
- /*
- * We shouldn't be doing anything with dsl_dataset_t's unless the
- * pool_config lock is held, or the dataset is long-held.
- */
- ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
- dsl_dataset_long_held(ds));
-
- mutex_enter(&ds->ds_opening_lock);
- if (ds->ds_objset == NULL) {
- objset_t *os;
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, dsl_dataset_get_blkptr(ds), &os);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
-
- if (err == 0) {
- mutex_enter(&ds->ds_lock);
- ASSERT(ds->ds_objset == NULL);
- ds->ds_objset = os;
- mutex_exit(&ds->ds_lock);
- }
- }
- *osp = ds->ds_objset;
- mutex_exit(&ds->ds_opening_lock);
- return (err);
-}
-
-/*
- * Holds the pool while the objset is held. Therefore only one objset
- * can be held at a time.
- */
-int
-dmu_objset_hold(const char *name, void *tag, objset_t **osp)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- int err;
-
- err = dsl_pool_hold(name, tag, &dp);
- if (err != 0)
- return (err);
- err = dsl_dataset_hold(dp, name, tag, &ds);
- if (err != 0) {
- dsl_pool_rele(dp, tag);
- return (err);
- }
-
- err = dmu_objset_from_ds(ds, osp);
- if (err != 0) {
- dsl_dataset_rele(ds, tag);
- dsl_pool_rele(dp, tag);
- }
-
- return (err);
-}
-
-static int
-dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp)
-{
- int err;
-
- err = dmu_objset_from_ds(ds, osp);
- if (err != 0) {
- dsl_dataset_disown(ds, tag);
- } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
- dsl_dataset_disown(ds, tag);
- return (SET_ERROR(EINVAL));
- } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
- dsl_dataset_disown(ds, tag);
- return (SET_ERROR(EROFS));
- }
- return (err);
-}
-
-/*
- * dsl_pool must not be held when this is called.
- * Upon successful return, there will be a longhold on the dataset,
- * and the dsl_pool will not be held.
- */
-int
-dmu_objset_own(const char *name, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- int err;
-
- err = dsl_pool_hold(name, FTAG, &dp);
- if (err != 0)
- return (err);
- err = dsl_dataset_own(dp, name, tag, &ds);
- if (err != 0) {
- dsl_pool_rele(dp, FTAG);
- return (err);
- }
- err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
- dsl_pool_rele(dp, FTAG);
-
- return (err);
-}
-
-int
-dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp)
-{
- dsl_dataset_t *ds;
- int err;
-
- err = dsl_dataset_own_obj(dp, obj, tag, &ds);
- if (err != 0)
- return (err);
-
- return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
-}
-
-void
-dmu_objset_rele(objset_t *os, void *tag)
-{
- dsl_pool_t *dp = dmu_objset_pool(os);
- dsl_dataset_rele(os->os_dsl_dataset, tag);
- dsl_pool_rele(dp, tag);
-}
-
-/*
- * When we are called, os MUST refer to an objset associated with a dataset
- * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
- * == tag. We will then release and reacquire ownership of the dataset while
- * holding the pool config_rwlock to avoid intervening namespace or ownership
- * changes may occur.
- *
- * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
- * release the hold on its dataset and acquire a new one on the dataset of the
- * same name so that it can be partially torn down and reconstructed.
- */
-void
-dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
- void *tag)
-{
- dsl_pool_t *dp;
- char name[ZFS_MAX_DATASET_NAME_LEN];
-
- VERIFY3P(ds, !=, NULL);
- VERIFY3P(ds->ds_owner, ==, tag);
- VERIFY(dsl_dataset_long_held(ds));
-
- dsl_dataset_name(ds, name);
- dp = ds->ds_dir->dd_pool;
- dsl_pool_config_enter(dp, FTAG);
- dsl_dataset_disown(ds, tag);
- VERIFY0(dsl_dataset_own(dp, name, tag, newds));
- dsl_pool_config_exit(dp, FTAG);
-}
-
-void
-dmu_objset_disown(objset_t *os, void *tag)
-{
- dsl_dataset_disown(os->os_dsl_dataset, tag);
-}
-
-void
-dmu_objset_evict_dbufs(objset_t *os)
-{
- dnode_t dn_marker;
- dnode_t *dn;
-
- mutex_enter(&os->os_lock);
- dn = list_head(&os->os_dnodes);
- while (dn != NULL) {
- /*
- * Skip dnodes without holds. We have to do this dance
- * because dnode_add_ref() only works if there is already a
- * hold. If the dnode has no holds, then it has no dbufs.
- */
- if (dnode_add_ref(dn, FTAG)) {
- list_insert_after(&os->os_dnodes, dn, &dn_marker);
- mutex_exit(&os->os_lock);
-
- dnode_evict_dbufs(dn);
- dnode_rele(dn, FTAG);
-
- mutex_enter(&os->os_lock);
- dn = list_next(&os->os_dnodes, &dn_marker);
- list_remove(&os->os_dnodes, &dn_marker);
- } else {
- dn = list_next(&os->os_dnodes, dn);
- }
- }
- mutex_exit(&os->os_lock);
-
- if (DMU_USERUSED_DNODE(os) != NULL) {
- dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
- dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
- }
- dnode_evict_dbufs(DMU_META_DNODE(os));
-}
-
-/*
- * Objset eviction processing is split into into two pieces.
- * The first marks the objset as evicting, evicts any dbufs that
- * have a refcount of zero, and then queues up the objset for the
- * second phase of eviction. Once os->os_dnodes has been cleared by
- * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
- * The second phase closes the special dnodes, dequeues the objset from
- * the list of those undergoing eviction, and finally frees the objset.
- *
- * NOTE: Due to asynchronous eviction processing (invocation of
- * dnode_buf_pageout()), it is possible for the meta dnode for the
- * objset to have no holds even though os->os_dnodes is not empty.
- */
-void
-dmu_objset_evict(objset_t *os)
-{
- dsl_dataset_t *ds = os->os_dsl_dataset;
-
- for (int t = 0; t < TXG_SIZE; t++)
- ASSERT(!dmu_objset_is_dirty(os, t));
-
- if (ds)
- dsl_prop_unregister_all(ds, os);
-
- if (os->os_sa)
- sa_tear_down(os);
-
- dmu_objset_evict_dbufs(os);
-
- mutex_enter(&os->os_lock);
- spa_evicting_os_register(os->os_spa, os);
- if (list_is_empty(&os->os_dnodes)) {
- mutex_exit(&os->os_lock);
- dmu_objset_evict_done(os);
- } else {
- mutex_exit(&os->os_lock);
- }
-}
-
-void
-dmu_objset_evict_done(objset_t *os)
-{
- ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
-
- dnode_special_close(&os->os_meta_dnode);
- if (DMU_USERUSED_DNODE(os)) {
- dnode_special_close(&os->os_userused_dnode);
- dnode_special_close(&os->os_groupused_dnode);
- }
- zil_free(os->os_zil);
-
- arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
-
- /*
- * This is a barrier to prevent the objset from going away in
- * dnode_move() until we can safely ensure that the objset is still in
- * use. We consider the objset valid before the barrier and invalid
- * after the barrier.
- */
- rw_enter(&os_lock, RW_READER);
- rw_exit(&os_lock);
-
- kmem_free(os->os_obj_next_percpu,
- os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
-
- mutex_destroy(&os->os_lock);
- mutex_destroy(&os->os_userused_lock);
- mutex_destroy(&os->os_obj_lock);
- mutex_destroy(&os->os_user_ptr_lock);
- for (int i = 0; i < TXG_SIZE; i++) {
- multilist_destroy(os->os_dirty_dnodes[i]);
- }
- spa_evicting_os_deregister(os->os_spa, os);
- kmem_free(os, sizeof (objset_t));
-}
-
-timestruc_t
-dmu_objset_snap_cmtime(objset_t *os)
-{
- return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
-}
-
-/* called from dsl for meta-objset */
-objset_t *
-dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- dmu_objset_type_t type, dmu_tx_t *tx)
-{
- objset_t *os;
- dnode_t *mdn;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- if (ds != NULL)
- VERIFY0(dmu_objset_from_ds(ds, &os));
- else
- VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
-
- mdn = DMU_META_DNODE(os);
-
- dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
- DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
-
- /*
- * We don't want to have to increase the meta-dnode's nlevels
- * later, because then we could do it in quescing context while
- * we are also accessing it in open context.
- *
- * This precaution is not necessary for the MOS (ds == NULL),
- * because the MOS is only updated in syncing context.
- * This is most fortunate: the MOS is the only objset that
- * needs to be synced multiple times as spa_sync() iterates
- * to convergence, so minimizing its dn_nlevels matters.
- */
- if (ds != NULL) {
- int levels = 1;
-
- /*
- * Determine the number of levels necessary for the meta-dnode
- * to contain DN_MAX_OBJECT dnodes. Note that in order to
- * ensure that we do not overflow 64 bits, there has to be
- * a nlevels that gives us a number of blocks > DN_MAX_OBJECT
- * but < 2^64. Therefore,
- * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
- * less than (64 - log2(DN_MAX_OBJECT)) (16).
- */
- while ((uint64_t)mdn->dn_nblkptr <<
- (mdn->dn_datablkshift - DNODE_SHIFT +
- (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
- DN_MAX_OBJECT)
- levels++;
-
- mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
- mdn->dn_nlevels = levels;
- }
-
- ASSERT(type != DMU_OST_NONE);
- ASSERT(type != DMU_OST_ANY);
- ASSERT(type < DMU_OST_NUMTYPES);
- os->os_phys->os_type = type;
- if (dmu_objset_userused_enabled(os)) {
- os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
- os->os_flags = os->os_phys->os_flags;
- }
-
- dsl_dataset_dirty(ds, tx);
-
- return (os);
-}
-
-typedef struct dmu_objset_create_arg {
- const char *doca_name;
- cred_t *doca_cred;
- void (*doca_userfunc)(objset_t *os, void *arg,
- cred_t *cr, dmu_tx_t *tx);
- void *doca_userarg;
- dmu_objset_type_t doca_type;
- uint64_t doca_flags;
-} dmu_objset_create_arg_t;
-
-/*ARGSUSED*/
-static int
-dmu_objset_create_check(void *arg, dmu_tx_t *tx)
-{
- dmu_objset_create_arg_t *doca = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dir_t *pdd;
- dsl_dataset_t *parentds;
- objset_t *parentos;
- const char *tail;
- int error;
-
- if (strchr(doca->doca_name, '@') != NULL)
- return (SET_ERROR(EINVAL));
-
- if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
-
- if (dataset_nestcheck(doca->doca_name) != 0)
- return (SET_ERROR(ENAMETOOLONG));
-
- error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
- if (error != 0)
- return (error);
- if (tail == NULL) {
- dsl_dir_rele(pdd, FTAG);
- return (SET_ERROR(EEXIST));
- }
- error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
- doca->doca_cred);
- if (error != 0) {
- dsl_dir_rele(pdd, FTAG);
- return (error);
- }
-
- /* can't create below anything but filesystems (eg. no ZVOLs) */
- error = dsl_dataset_hold_obj(pdd->dd_pool,
- dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
- if (error != 0) {
- dsl_dir_rele(pdd, FTAG);
- return (error);
- }
- error = dmu_objset_from_ds(parentds, &parentos);
- if (error != 0) {
- dsl_dataset_rele(parentds, FTAG);
- dsl_dir_rele(pdd, FTAG);
- return (error);
- }
- if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
- dsl_dataset_rele(parentds, FTAG);
- dsl_dir_rele(pdd, FTAG);
- return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
- }
- dsl_dataset_rele(parentds, FTAG);
- dsl_dir_rele(pdd, FTAG);
-
- return (error);
-}
-
-static void
-dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
-{
- dmu_objset_create_arg_t *doca = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dir_t *pdd;
- const char *tail;
- dsl_dataset_t *ds;
- uint64_t obj;
- blkptr_t *bp;
- objset_t *os;
-
- VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
-
- obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
- doca->doca_cred, tx);
-
- VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- bp = dsl_dataset_get_blkptr(ds);
- os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
- ds, bp, doca->doca_type, tx);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
-
- if (doca->doca_userfunc != NULL) {
- doca->doca_userfunc(os, doca->doca_userarg,
- doca->doca_cred, tx);
- }
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- zvol_create_minors(dp->dp_spa, doca->doca_name);
-#endif
- spa_history_log_internal_ds(ds, "create", tx, "");
- dsl_dataset_rele(ds, FTAG);
- dsl_dir_rele(pdd, FTAG);
-}
-
-int
-dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
- void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
-{
- dmu_objset_create_arg_t doca;
-
- doca.doca_name = name;
- doca.doca_cred = CRED();
- doca.doca_flags = flags;
- doca.doca_userfunc = func;
- doca.doca_userarg = arg;
- doca.doca_type = type;
-
- return (dsl_sync_task(name,
- dmu_objset_create_check, dmu_objset_create_sync, &doca,
- 5, ZFS_SPACE_CHECK_NORMAL));
-}
-
-typedef struct dmu_objset_clone_arg {
- const char *doca_clone;
- const char *doca_origin;
- cred_t *doca_cred;
-} dmu_objset_clone_arg_t;
-
-/*ARGSUSED*/
-static int
-dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
-{
- dmu_objset_clone_arg_t *doca = arg;
- dsl_dir_t *pdd;
- const char *tail;
- int error;
- dsl_dataset_t *origin;
- dsl_pool_t *dp = dmu_tx_pool(tx);
-
- if (strchr(doca->doca_clone, '@') != NULL)
- return (SET_ERROR(EINVAL));
-
- if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
-
- error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
- if (error != 0)
- return (error);
- if (tail == NULL) {
- dsl_dir_rele(pdd, FTAG);
- return (SET_ERROR(EEXIST));
- }
-
- error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
- doca->doca_cred);
- if (error != 0) {
- dsl_dir_rele(pdd, FTAG);
- return (SET_ERROR(EDQUOT));
- }
- dsl_dir_rele(pdd, FTAG);
-
- error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
- if (error != 0)
- return (error);
-
- /* You can only clone snapshots, not the head datasets. */
- if (!origin->ds_is_snapshot) {
- dsl_dataset_rele(origin, FTAG);
- return (SET_ERROR(EINVAL));
- }
- dsl_dataset_rele(origin, FTAG);
-
- return (0);
-}
-
-static void
-dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
-{
- dmu_objset_clone_arg_t *doca = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dir_t *pdd;
- const char *tail;
- dsl_dataset_t *origin, *ds;
- uint64_t obj;
- char namebuf[ZFS_MAX_DATASET_NAME_LEN];
-
- VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
- VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
-
- obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
- doca->doca_cred, tx);
-
- VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
- dsl_dataset_name(origin, namebuf);
-#if defined(__FreeBSD__) && defined(_KERNEL)
- zvol_create_minors(dp->dp_spa, doca->doca_clone);
-#endif
- spa_history_log_internal_ds(ds, "clone", tx,
- "origin=%s (%llu)", namebuf, origin->ds_object);
- dsl_dataset_rele(ds, FTAG);
- dsl_dataset_rele(origin, FTAG);
- dsl_dir_rele(pdd, FTAG);
-}
-
-int
-dmu_objset_clone(const char *clone, const char *origin)
-{
- dmu_objset_clone_arg_t doca;
-
- doca.doca_clone = clone;
- doca.doca_origin = origin;
- doca.doca_cred = CRED();
-
- return (dsl_sync_task(clone,
- dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
- 5, ZFS_SPACE_CHECK_NORMAL));
-}
-
-static int
-dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg)
-{
- int error = 0;
- uint64_t object = 0;
- while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
- error = dmu_object_remap_indirects(os, object,
- last_removed_txg);
- /*
- * If the ZPL removed the object before we managed to dnode_hold
- * it, we would get an ENOENT. If the ZPL declares its intent
- * to remove the object (dnode_free) before we manage to
- * dnode_hold it, we would get an EEXIST. In either case, we
- * want to continue remapping the other objects in the objset;
- * in all other cases, we want to break early.
- */
- if (error != 0 && error != ENOENT && error != EEXIST) {
- break;
- }
- }
- if (error == ESRCH) {
- error = 0;
- }
- return (error);
-}
-
-int
-dmu_objset_remap_indirects(const char *fsname)
-{
- int error = 0;
- objset_t *os = NULL;
- uint64_t last_removed_txg;
- uint64_t remap_start_txg;
- dsl_dir_t *dd;
-
- error = dmu_objset_hold(fsname, FTAG, &os);
- if (error != 0) {
- return (error);
- }
- dd = dmu_objset_ds(os)->ds_dir;
-
- if (!spa_feature_is_enabled(dmu_objset_spa(os),
- SPA_FEATURE_OBSOLETE_COUNTS)) {
- dmu_objset_rele(os, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
-
- if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) {
- dmu_objset_rele(os, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * If there has not been a removal, we're done.
- */
- last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os));
- if (last_removed_txg == -1ULL) {
- dmu_objset_rele(os, FTAG);
- return (0);
- }
-
- /*
- * If we have remapped since the last removal, we're done.
- */
- if (dsl_dir_is_zapified(dd)) {
- uint64_t last_remap_txg;
- if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)),
- dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
- sizeof (last_remap_txg), 1, &last_remap_txg) == 0 &&
- last_remap_txg > last_removed_txg) {
- dmu_objset_rele(os, FTAG);
- return (0);
- }
- }
-
- dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
- dsl_pool_rele(dmu_objset_pool(os), FTAG);
-
- remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os));
- error = dmu_objset_remap_indirects_impl(os, last_removed_txg);
- if (error == 0) {
- /*
- * We update the last_remap_txg to be the start txg so that
- * we can guarantee that every block older than last_remap_txg
- * that can be remapped has been remapped.
- */
- error = dsl_dir_update_last_remap_txg(dd, remap_start_txg);
- }
-
- dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
- dsl_dataset_rele(dmu_objset_ds(os), FTAG);
-
- return (error);
-}
-
-int
-dmu_objset_snapshot_one(const char *fsname, const char *snapname)
-{
- int err;
- char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
- nvlist_t *snaps = fnvlist_alloc();
-
- fnvlist_add_boolean(snaps, longsnap);
- strfree(longsnap);
- err = dsl_dataset_snapshot(snaps, NULL, NULL);
- fnvlist_free(snaps);
- return (err);
-}
-
-static void
-dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- while ((dn = multilist_sublist_head(list)) != NULL) {
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
- ASSERT(dn->dn_dbuf->db_data_pending);
- /*
- * Initialize dn_zio outside dnode_sync() because the
- * meta-dnode needs to set it ouside dnode_sync().
- */
- dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
- ASSERT(dn->dn_zio);
-
- ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
- multilist_sublist_remove(list, dn);
-
- /*
- * If we are not doing useraccounting (os_synced_dnodes == NULL)
- * we are done with this dnode for this txg. Unset dn_dirty_txg
- * if later txgs aren't dirtying it so that future holders do
- * not get a stale value. Otherwise, we will do this in
- * userquota_updates_task() when processing has completely
- * finished for this txg.
- */
- multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
- if (newlist != NULL) {
- (void) dnode_add_ref(dn, newlist);
- multilist_insert(newlist, dn);
- } else {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_dirty_txg == tx->tx_txg)
- dn->dn_dirty_txg = 0;
- mutex_exit(&dn->dn_mtx);
- }
-
- dnode_sync(dn, tx);
- }
-}
-
-/* ARGSUSED */
-static void
-dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
- blkptr_t *bp = zio->io_bp;
- objset_t *os = arg;
- dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
-
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
- ASSERT0(BP_GET_LEVEL(bp));
-
- /*
- * Update rootbp fill count: it should be the number of objects
- * allocated in the object set (not counting the "special"
- * objects that are stored in the objset_phys_t -- the meta
- * dnode and user/group accounting objects).
- */
- bp->blk_fill = 0;
- for (int i = 0; i < dnp->dn_nblkptr; i++)
- bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
- if (os->os_dsl_dataset != NULL)
- rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
- *os->os_rootbp = *bp;
- if (os->os_dsl_dataset != NULL)
- rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
-}
-
-/* ARGSUSED */
-static void
-dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
- blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- objset_t *os = arg;
-
- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(BP_EQUAL(bp, bp_orig));
- } else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
- dsl_dataset_block_born(ds, bp, tx);
- }
- kmem_free(bp, sizeof (*bp));
-}
-
-typedef struct sync_dnodes_arg {
- multilist_t *sda_list;
- int sda_sublist_idx;
- multilist_t *sda_newlist;
- dmu_tx_t *sda_tx;
-} sync_dnodes_arg_t;
-
-static void
-sync_dnodes_task(void *arg)
-{
- sync_dnodes_arg_t *sda = arg;
-
- multilist_sublist_t *ms =
- multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
-
- dmu_objset_sync_dnodes(ms, sda->sda_tx);
-
- multilist_sublist_unlock(ms);
-
- kmem_free(sda, sizeof (*sda));
-}
-
-
-/* called from dsl */
-void
-dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
-{
- int txgoff;
- zbookmark_phys_t zb;
- zio_prop_t zp;
- zio_t *zio;
- list_t *list;
- dbuf_dirty_record_t *dr;
- int num_sublists;
- multilist_t *ml;
- blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
- *blkptr_copy = *os->os_rootbp;
-
- dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* XXX the write_done callback should really give us the tx... */
- os->os_synctx = tx;
-
- if (os->os_dsl_dataset == NULL) {
- /*
- * This is the MOS. If we have upgraded,
- * spa_max_replication() could change, so reset
- * os_copies here.
- */
- os->os_copies = spa_max_replication(os->os_spa);
- }
-
- /*
- * Create the root block IO
- */
- SET_BOOKMARK(&zb, os->os_dsl_dataset ?
- os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- arc_release(os->os_phys_buf, &os->os_phys_buf);
-
- dmu_write_policy(os, NULL, 0, 0, &zp);
-
- zio = arc_write(pio, os->os_spa, tx->tx_txg,
- blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
- &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
- os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-
- /*
- * Sync special dnodes - the parent IO for the sync is the root block
- */
- DMU_META_DNODE(os)->dn_zio = zio;
- dnode_sync(DMU_META_DNODE(os), tx);
-
- os->os_phys->os_flags = os->os_flags;
-
- if (DMU_USERUSED_DNODE(os) &&
- DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
- DMU_USERUSED_DNODE(os)->dn_zio = zio;
- dnode_sync(DMU_USERUSED_DNODE(os), tx);
- DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
- dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
- }
-
- txgoff = tx->tx_txg & TXG_MASK;
-
- if (dmu_objset_userused_enabled(os)) {
- /*
- * We must create the list here because it uses the
- * dn_dirty_link[] of this txg. But it may already
- * exist because we call dsl_dataset_sync() twice per txg.
- */
- if (os->os_synced_dnodes == NULL) {
- os->os_synced_dnodes =
- multilist_create(sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[txgoff]),
- dnode_multilist_index_func);
- } else {
- ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
- offsetof(dnode_t, dn_dirty_link[txgoff]));
- }
- }
-
- ml = os->os_dirty_dnodes[txgoff];
- num_sublists = multilist_get_num_sublists(ml);
- for (int i = 0; i < num_sublists; i++) {
- if (multilist_sublist_is_empty_idx(ml, i))
- continue;
- sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
- sda->sda_list = ml;
- sda->sda_sublist_idx = i;
- sda->sda_tx = tx;
- (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
- sync_dnodes_task, sda, 0);
- /* callback frees sda */
- }
- taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
-
- list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
- while ((dr = list_head(list)) != NULL) {
- ASSERT0(dr->dr_dbuf->db_level);
- list_remove(list, dr);
- if (dr->dr_zio)
- zio_nowait(dr->dr_zio);
- }
-
- /* Enable dnode backfill if enough objects have been freed. */
- if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
- os->os_rescan_dnodes = B_TRUE;
- os->os_freed_dnodes = 0;
- }
-
- /*
- * Free intent log blocks up to this tx.
- */
- zil_sync(os->os_zil, tx);
- os->os_phys->os_zil_header = os->os_zil_header;
- zio_nowait(zio);
-}
-
-boolean_t
-dmu_objset_is_dirty(objset_t *os, uint64_t txg)
-{
- return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
-}
-
-static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
-
-void
-dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
-{
- used_cbs[ost] = cb;
-}
-
-boolean_t
-dmu_objset_userused_enabled(objset_t *os)
-{
- return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
- used_cbs[os->os_phys->os_type] != NULL &&
- DMU_USERUSED_DNODE(os) != NULL);
-}
-
-typedef struct userquota_node {
- uint64_t uqn_id;
- int64_t uqn_delta;
- avl_node_t uqn_node;
-} userquota_node_t;
-
-typedef struct userquota_cache {
- avl_tree_t uqc_user_deltas;
- avl_tree_t uqc_group_deltas;
-} userquota_cache_t;
-
-static int
-userquota_compare(const void *l, const void *r)
-{
- const userquota_node_t *luqn = l;
- const userquota_node_t *ruqn = r;
-
- if (luqn->uqn_id < ruqn->uqn_id)
- return (-1);
- if (luqn->uqn_id > ruqn->uqn_id)
- return (1);
- return (0);
-}
-
-static void
-do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
-{
- void *cookie;
- userquota_node_t *uqn;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- cookie = NULL;
- while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
- &cookie)) != NULL) {
- /*
- * os_userused_lock protects against concurrent calls to
- * zap_increment_int(). It's needed because zap_increment_int()
- * is not thread-safe (i.e. not atomic).
- */
- mutex_enter(&os->os_userused_lock);
- VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
- uqn->uqn_id, uqn->uqn_delta, tx));
- mutex_exit(&os->os_userused_lock);
- kmem_free(uqn, sizeof (*uqn));
- }
- avl_destroy(&cache->uqc_user_deltas);
-
- cookie = NULL;
- while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
- &cookie)) != NULL) {
- mutex_enter(&os->os_userused_lock);
- VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
- uqn->uqn_id, uqn->uqn_delta, tx));
- mutex_exit(&os->os_userused_lock);
- kmem_free(uqn, sizeof (*uqn));
- }
- avl_destroy(&cache->uqc_group_deltas);
-}
-
-static void
-userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
-{
- userquota_node_t search = { .uqn_id = id };
- avl_index_t idx;
-
- userquota_node_t *uqn = avl_find(avl, &search, &idx);
- if (uqn == NULL) {
- uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
- uqn->uqn_id = id;
- avl_insert(avl, uqn, idx);
- }
- uqn->uqn_delta += delta;
-}
-
-static void
-do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
- uint64_t user, uint64_t group, boolean_t subtract)
-{
- if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
- int64_t delta = DNODE_MIN_SIZE + used;
- if (subtract)
- delta = -delta;
-
- userquota_update_cache(&cache->uqc_user_deltas, user, delta);
- userquota_update_cache(&cache->uqc_group_deltas, group, delta);
- }
-}
-
-typedef struct userquota_updates_arg {
- objset_t *uua_os;
- int uua_sublist_idx;
- dmu_tx_t *uua_tx;
-} userquota_updates_arg_t;
-
-static void
-userquota_updates_task(void *arg)
-{
- userquota_updates_arg_t *uua = arg;
- objset_t *os = uua->uua_os;
- dmu_tx_t *tx = uua->uua_tx;
- dnode_t *dn;
- userquota_cache_t cache = { 0 };
-
- multilist_sublist_t *list =
- multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
-
- ASSERT(multilist_sublist_head(list) == NULL ||
- dmu_objset_userused_enabled(os));
- avl_create(&cache.uqc_user_deltas, userquota_compare,
- sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
- avl_create(&cache.uqc_group_deltas, userquota_compare,
- sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
-
- while ((dn = multilist_sublist_head(list)) != NULL) {
- int flags;
- ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
- ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
- dn->dn_phys->dn_flags &
- DNODE_FLAG_USERUSED_ACCOUNTED);
-
- flags = dn->dn_id_flags;
- ASSERT(flags);
- if (flags & DN_ID_OLD_EXIST) {
- do_userquota_update(&cache,
- dn->dn_oldused, dn->dn_oldflags,
- dn->dn_olduid, dn->dn_oldgid, B_TRUE);
- }
- if (flags & DN_ID_NEW_EXIST) {
- do_userquota_update(&cache,
- DN_USED_BYTES(dn->dn_phys),
- dn->dn_phys->dn_flags, dn->dn_newuid,
- dn->dn_newgid, B_FALSE);
- }
-
- mutex_enter(&dn->dn_mtx);
- dn->dn_oldused = 0;
- dn->dn_oldflags = 0;
- if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
- dn->dn_olduid = dn->dn_newuid;
- dn->dn_oldgid = dn->dn_newgid;
- dn->dn_id_flags |= DN_ID_OLD_EXIST;
- if (dn->dn_bonuslen == 0)
- dn->dn_id_flags |= DN_ID_CHKED_SPILL;
- else
- dn->dn_id_flags |= DN_ID_CHKED_BONUS;
- }
- dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
- if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
- dn->dn_dirty_txg = 0;
- mutex_exit(&dn->dn_mtx);
-
- multilist_sublist_remove(list, dn);
- dnode_rele(dn, os->os_synced_dnodes);
- }
- do_userquota_cacheflush(os, &cache, tx);
- multilist_sublist_unlock(list);
- kmem_free(uua, sizeof (*uua));
-}
-
-void
-dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
-{
- int num_sublists;
-
- if (!dmu_objset_userused_enabled(os))
- return;
-
- /* Allocate the user/groupused objects if necessary. */
- if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
- VERIFY0(zap_create_claim(os,
- DMU_USERUSED_OBJECT,
- DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
- VERIFY0(zap_create_claim(os,
- DMU_GROUPUSED_OBJECT,
- DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
- }
-
- num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
- for (int i = 0; i < num_sublists; i++) {
- if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i))
- continue;
- userquota_updates_arg_t *uua =
- kmem_alloc(sizeof (*uua), KM_SLEEP);
- uua->uua_os = os;
- uua->uua_sublist_idx = i;
- uua->uua_tx = tx;
- /* note: caller does taskq_wait() */
- (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
- userquota_updates_task, uua, 0);
- /* callback frees uua */
- }
-}
-
-/*
- * Returns a pointer to data to find uid/gid from
- *
- * If a dirty record for transaction group that is syncing can't
- * be found then NULL is returned. In the NULL case it is assumed
- * the uid/gid aren't changing.
- */
-static void *
-dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dbuf_dirty_record_t *dr, **drp;
- void *data;
-
- if (db->db_dirtycnt == 0)
- return (db->db.db_data); /* Nothing is changing */
-
- for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
- if (dr->dr_txg == tx->tx_txg)
- break;
-
- if (dr == NULL) {
- data = NULL;
- } else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(dr->dr_dbuf);
- dn = DB_DNODE(dr->dr_dbuf);
-
- if (dn->dn_bonuslen == 0 &&
- dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
- data = dr->dt.dl.dr_data->b_data;
- else
- data = dr->dt.dl.dr_data;
-
- DB_DNODE_EXIT(dr->dr_dbuf);
- }
-
- return (data);
-}
-
-void
-dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
-{
- objset_t *os = dn->dn_objset;
- void *data = NULL;
- dmu_buf_impl_t *db = NULL;
- uint64_t *user = NULL;
- uint64_t *group = NULL;
- int flags = dn->dn_id_flags;
- int error;
- boolean_t have_spill = B_FALSE;
-
- if (!dmu_objset_userused_enabled(dn->dn_objset))
- return;
-
- if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
- DN_ID_CHKED_SPILL)))
- return;
-
- if (before && dn->dn_bonuslen != 0)
- data = DN_BONUS(dn->dn_phys);
- else if (!before && dn->dn_bonuslen != 0) {
- if (dn->dn_bonus) {
- db = dn->dn_bonus;
- mutex_enter(&db->db_mtx);
- data = dmu_objset_userquota_find_data(db, tx);
- } else {
- data = DN_BONUS(dn->dn_phys);
- }
- } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
- int rf = 0;
-
- if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
- rf |= DB_RF_HAVESTRUCT;
- error = dmu_spill_hold_by_dnode(dn,
- rf | DB_RF_MUST_SUCCEED,
- FTAG, (dmu_buf_t **)&db);
- ASSERT(error == 0);
- mutex_enter(&db->db_mtx);
- data = (before) ? db->db.db_data :
- dmu_objset_userquota_find_data(db, tx);
- have_spill = B_TRUE;
- } else {
- mutex_enter(&dn->dn_mtx);
- dn->dn_id_flags |= DN_ID_CHKED_BONUS;
- mutex_exit(&dn->dn_mtx);
- return;
- }
-
- if (before) {
- ASSERT(data);
- user = &dn->dn_olduid;
- group = &dn->dn_oldgid;
- } else if (data) {
- user = &dn->dn_newuid;
- group = &dn->dn_newgid;
- }
-
- /*
- * Must always call the callback in case the object
- * type has changed and that type isn't an object type to track
- */
- error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
- user, group);
-
- /*
- * Preserve existing uid/gid when the callback can't determine
- * what the new uid/gid are and the callback returned EEXIST.
- * The EEXIST error tells us to just use the existing uid/gid.
- * If we don't know what the old values are then just assign
- * them to 0, since that is a new file being created.
- */
- if (!before && data == NULL && error == EEXIST) {
- if (flags & DN_ID_OLD_EXIST) {
- dn->dn_newuid = dn->dn_olduid;
- dn->dn_newgid = dn->dn_oldgid;
- } else {
- dn->dn_newuid = 0;
- dn->dn_newgid = 0;
- }
- error = 0;
- }
-
- if (db)
- mutex_exit(&db->db_mtx);
-
- mutex_enter(&dn->dn_mtx);
- if (error == 0 && before)
- dn->dn_id_flags |= DN_ID_OLD_EXIST;
- if (error == 0 && !before)
- dn->dn_id_flags |= DN_ID_NEW_EXIST;
-
- if (have_spill) {
- dn->dn_id_flags |= DN_ID_CHKED_SPILL;
- } else {
- dn->dn_id_flags |= DN_ID_CHKED_BONUS;
- }
- mutex_exit(&dn->dn_mtx);
- if (have_spill)
- dmu_buf_rele((dmu_buf_t *)db, FTAG);
-}
-
-boolean_t
-dmu_objset_userspace_present(objset_t *os)
-{
- return (os->os_phys->os_flags &
- OBJSET_FLAG_USERACCOUNTING_COMPLETE);
-}
-
-int
-dmu_objset_userspace_upgrade(objset_t *os)
-{
- uint64_t obj;
- int err = 0;
-
- if (dmu_objset_userspace_present(os))
- return (0);
- if (!dmu_objset_userused_enabled(os))
- return (SET_ERROR(ENOTSUP));
- if (dmu_objset_is_snapshot(os))
- return (SET_ERROR(EINVAL));
-
- /*
- * We simply need to mark every object dirty, so that it will be
- * synced out and now accounted. If this is called
- * concurrently, or if we already did some work before crashing,
- * that's fine, since we track each object's accounted state
- * independently.
- */
-
- for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
- dmu_tx_t *tx;
- dmu_buf_t *db;
- int objerr;
-
- if (issig(JUSTLOOKING) && issig(FORREAL))
- return (SET_ERROR(EINTR));
-
- objerr = dmu_bonus_hold(os, obj, FTAG, &db);
- if (objerr != 0)
- continue;
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, obj);
- objerr = dmu_tx_assign(tx, TXG_WAIT);
- if (objerr != 0) {
- dmu_tx_abort(tx);
- continue;
- }
- dmu_buf_will_dirty(db, tx);
- dmu_buf_rele(db, FTAG);
- dmu_tx_commit(tx);
- }
-
- os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
- txg_wait_synced(dmu_objset_pool(os), 0);
- return (0);
-}
-
-void
-dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp)
-{
- dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
- usedobjsp, availobjsp);
-}
-
-uint64_t
-dmu_objset_fsid_guid(objset_t *os)
-{
- return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
-}
-
-void
-dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
-{
- stat->dds_type = os->os_phys->os_type;
- if (os->os_dsl_dataset)
- dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
-}
-
-void
-dmu_objset_stats(objset_t *os, nvlist_t *nv)
-{
- ASSERT(os->os_dsl_dataset ||
- os->os_phys->os_type == DMU_OST_META);
-
- if (os->os_dsl_dataset != NULL)
- dsl_dataset_stats(os->os_dsl_dataset, nv);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
- os->os_phys->os_type);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
- dmu_objset_userspace_present(os));
-}
-
-int
-dmu_objset_is_snapshot(objset_t *os)
-{
- if (os->os_dsl_dataset != NULL)
- return (os->os_dsl_dataset->ds_is_snapshot);
- else
- return (B_FALSE);
-}
-
-int
-dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
- boolean_t *conflict)
-{
- dsl_dataset_t *ds = os->os_dsl_dataset;
- uint64_t ignored;
-
- if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
- return (SET_ERROR(ENOENT));
-
- return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
- MT_NORMALIZE, real, maxlen, conflict));
-}
-
-int
-dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
-{
- dsl_dataset_t *ds = os->os_dsl_dataset;
- zap_cursor_t cursor;
- zap_attribute_t attr;
-
- ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
-
- if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
- return (SET_ERROR(ENOENT));
-
- zap_cursor_init_serialized(&cursor,
- ds->ds_dir->dd_pool->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
-
- if (zap_cursor_retrieve(&cursor, &attr) != 0) {
- zap_cursor_fini(&cursor);
- return (SET_ERROR(ENOENT));
- }
-
- if (strlen(attr.za_name) + 1 > namelen) {
- zap_cursor_fini(&cursor);
- return (SET_ERROR(ENAMETOOLONG));
- }
-
- (void) strcpy(name, attr.za_name);
- if (idp)
- *idp = attr.za_first_integer;
- if (case_conflict)
- *case_conflict = attr.za_normalization_conflict;
- zap_cursor_advance(&cursor);
- *offp = zap_cursor_serialize(&cursor);
- zap_cursor_fini(&cursor);
-
- return (0);
-}
-
-int
-dmu_dir_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp)
-{
- dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
- zap_cursor_t cursor;
- zap_attribute_t attr;
-
- /* there is no next dir on a snapshot! */
- if (os->os_dsl_dataset->ds_object !=
- dsl_dir_phys(dd)->dd_head_dataset_obj)
- return (SET_ERROR(ENOENT));
-
- zap_cursor_init_serialized(&cursor,
- dd->dd_pool->dp_meta_objset,
- dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
-
- if (zap_cursor_retrieve(&cursor, &attr) != 0) {
- zap_cursor_fini(&cursor);
- return (SET_ERROR(ENOENT));
- }
-
- if (strlen(attr.za_name) + 1 > namelen) {
- zap_cursor_fini(&cursor);
- return (SET_ERROR(ENAMETOOLONG));
- }
-
- (void) strcpy(name, attr.za_name);
- if (idp)
- *idp = attr.za_first_integer;
- zap_cursor_advance(&cursor);
- *offp = zap_cursor_serialize(&cursor);
- zap_cursor_fini(&cursor);
-
- return (0);
-}
-
-typedef struct dmu_objset_find_ctx {
- taskq_t *dc_tq;
- dsl_pool_t *dc_dp;
- uint64_t dc_ddobj;
- char *dc_ddname; /* last component of ddobj's name */
- int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
- void *dc_arg;
- int dc_flags;
- kmutex_t *dc_error_lock;
- int *dc_error;
-} dmu_objset_find_ctx_t;
-
-static void
-dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
-{
- dsl_pool_t *dp = dcp->dc_dp;
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- zap_cursor_t zc;
- zap_attribute_t *attr;
- uint64_t thisobj;
- int err = 0;
-
- /* don't process if there already was an error */
- if (*dcp->dc_error != 0)
- goto out;
-
- /*
- * Note: passing the name (dc_ddname) here is optional, but it
- * improves performance because we don't need to call
- * zap_value_search() to determine the name.
- */
- err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
- if (err != 0)
- goto out;
-
- /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
- if (dd->dd_myname[0] == '$') {
- dsl_dir_rele(dd, FTAG);
- goto out;
- }
-
- thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
- attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-
- /*
- * Iterate over all children.
- */
- if (dcp->dc_flags & DS_FIND_CHILDREN) {
- for (zap_cursor_init(&zc, dp->dp_meta_objset,
- dsl_dir_phys(dd)->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT3U(attr->za_integer_length, ==,
- sizeof (uint64_t));
- ASSERT3U(attr->za_num_integers, ==, 1);
-
- dmu_objset_find_ctx_t *child_dcp =
- kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
- *child_dcp = *dcp;
- child_dcp->dc_ddobj = attr->za_first_integer;
- child_dcp->dc_ddname = spa_strdup(attr->za_name);
- if (dcp->dc_tq != NULL)
- (void) taskq_dispatch(dcp->dc_tq,
- dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
- else
- dmu_objset_find_dp_impl(child_dcp);
- }
- zap_cursor_fini(&zc);
- }
-
- /*
- * Iterate over all snapshots.
- */
- if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
- dsl_dataset_t *ds;
- err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
-
- if (err == 0) {
- uint64_t snapobj;
-
- snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
- dsl_dataset_rele(ds, FTAG);
-
- for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT3U(attr->za_integer_length, ==,
- sizeof (uint64_t));
- ASSERT3U(attr->za_num_integers, ==, 1);
-
- err = dsl_dataset_hold_obj(dp,
- attr->za_first_integer, FTAG, &ds);
- if (err != 0)
- break;
- err = dcp->dc_func(dp, ds, dcp->dc_arg);
- dsl_dataset_rele(ds, FTAG);
- if (err != 0)
- break;
- }
- zap_cursor_fini(&zc);
- }
- }
-
- kmem_free(attr, sizeof (zap_attribute_t));
-
- if (err != 0) {
- dsl_dir_rele(dd, FTAG);
- goto out;
- }
-
- /*
- * Apply to self.
- */
- err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
-
- /*
- * Note: we hold the dir while calling dsl_dataset_hold_obj() so
- * that the dir will remain cached, and we won't have to re-instantiate
- * it (which could be expensive due to finding its name via
- * zap_value_search()).
- */
- dsl_dir_rele(dd, FTAG);
- if (err != 0)
- goto out;
- err = dcp->dc_func(dp, ds, dcp->dc_arg);
- dsl_dataset_rele(ds, FTAG);
-
-out:
- if (err != 0) {
- mutex_enter(dcp->dc_error_lock);
- /* only keep first error */
- if (*dcp->dc_error == 0)
- *dcp->dc_error = err;
- mutex_exit(dcp->dc_error_lock);
- }
-
- if (dcp->dc_ddname != NULL)
- spa_strfree(dcp->dc_ddname);
- kmem_free(dcp, sizeof (*dcp));
-}
-
-static void
-dmu_objset_find_dp_cb(void *arg)
-{
- dmu_objset_find_ctx_t *dcp = arg;
- dsl_pool_t *dp = dcp->dc_dp;
-
- /*
- * We need to get a pool_config_lock here, as there are several
- * asssert(pool_config_held) down the stack. Getting a lock via
- * dsl_pool_config_enter is risky, as it might be stalled by a
- * pending writer. This would deadlock, as the write lock can
- * only be granted when our parent thread gives up the lock.
- * The _prio interface gives us priority over a pending writer.
- */
- dsl_pool_config_enter_prio(dp, FTAG);
-
- dmu_objset_find_dp_impl(dcp);
-
- dsl_pool_config_exit(dp, FTAG);
-}
-
-/*
- * Find objsets under and including ddobj, call func(ds) on each.
- * The order for the enumeration is completely undefined.
- * func is called with dsl_pool_config held.
- */
-int
-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
- int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
-{
- int error = 0;
- taskq_t *tq = NULL;
- int ntasks;
- dmu_objset_find_ctx_t *dcp;
- kmutex_t err_lock;
-
- mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
- dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
- dcp->dc_tq = NULL;
- dcp->dc_dp = dp;
- dcp->dc_ddobj = ddobj;
- dcp->dc_ddname = NULL;
- dcp->dc_func = func;
- dcp->dc_arg = arg;
- dcp->dc_flags = flags;
- dcp->dc_error_lock = &err_lock;
- dcp->dc_error = &error;
-
- if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
- /*
- * In case a write lock is held we can't make use of
- * parallelism, as down the stack of the worker threads
- * the lock is asserted via dsl_pool_config_held.
- * In case of a read lock this is solved by getting a read
- * lock in each worker thread, which isn't possible in case
- * of a writer lock. So we fall back to the synchronous path
- * here.
- * In the future it might be possible to get some magic into
- * dsl_pool_config_held in a way that it returns true for
- * the worker threads so that a single lock held from this
- * thread suffices. For now, stay single threaded.
- */
- dmu_objset_find_dp_impl(dcp);
- mutex_destroy(&err_lock);
-
- return (error);
- }
-
- ntasks = dmu_find_threads;
- if (ntasks == 0)
- ntasks = vdev_count_leaves(dp->dp_spa) * 4;
- tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
- INT_MAX, 0);
- if (tq == NULL) {
- kmem_free(dcp, sizeof (*dcp));
- mutex_destroy(&err_lock);
-
- return (SET_ERROR(ENOMEM));
- }
- dcp->dc_tq = tq;
-
- /* dcp will be freed by task */
- (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
-
- /*
- * PORTING: this code relies on the property of taskq_wait to wait
- * until no more tasks are queued and no more tasks are active. As
- * we always queue new tasks from within other tasks, task_wait
- * reliably waits for the full recursion to finish, even though we
- * enqueue new tasks after taskq_wait has been called.
- * On platforms other than illumos, taskq_wait may not have this
- * property.
- */
- taskq_wait(tq);
- taskq_destroy(tq);
- mutex_destroy(&err_lock);
-
- return (error);
-}
-
-/*
- * Find all objsets under name, and for each, call 'func(child_name, arg)'.
- * The dp_config_rwlock must not be held when this is called, and it
- * will not be held when the callback is called.
- * Therefore this function should only be used when the pool is not changing
- * (e.g. in syncing context), or the callback can deal with the possible races.
- */
-static int
-dmu_objset_find_impl(spa_t *spa, const char *name,
- int func(const char *, void *), void *arg, int flags)
-{
- dsl_dir_t *dd;
- dsl_pool_t *dp = spa_get_dsl(spa);
- dsl_dataset_t *ds;
- zap_cursor_t zc;
- zap_attribute_t *attr;
- char *child;
- uint64_t thisobj;
- int err;
-
- dsl_pool_config_enter(dp, FTAG);
-
- err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
- if (err != 0) {
- dsl_pool_config_exit(dp, FTAG);
- return (err);
- }
-
- /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
- if (dd->dd_myname[0] == '$') {
- dsl_dir_rele(dd, FTAG);
- dsl_pool_config_exit(dp, FTAG);
- return (0);
- }
-
- thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
- attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-
- /*
- * Iterate over all children.
- */
- if (flags & DS_FIND_CHILDREN) {
- for (zap_cursor_init(&zc, dp->dp_meta_objset,
- dsl_dir_phys(dd)->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT3U(attr->za_integer_length, ==,
- sizeof (uint64_t));
- ASSERT3U(attr->za_num_integers, ==, 1);
-
- child = kmem_asprintf("%s/%s", name, attr->za_name);
- dsl_pool_config_exit(dp, FTAG);
- err = dmu_objset_find_impl(spa, child,
- func, arg, flags);
- dsl_pool_config_enter(dp, FTAG);
- strfree(child);
- if (err != 0)
- break;
- }
- zap_cursor_fini(&zc);
-
- if (err != 0) {
- dsl_dir_rele(dd, FTAG);
- dsl_pool_config_exit(dp, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
- return (err);
- }
- }
-
- /*
- * Iterate over all snapshots.
- */
- if (flags & DS_FIND_SNAPSHOTS) {
- err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
-
- if (err == 0) {
- uint64_t snapobj;
-
- snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
- dsl_dataset_rele(ds, FTAG);
-
- for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT3U(attr->za_integer_length, ==,
- sizeof (uint64_t));
- ASSERT3U(attr->za_num_integers, ==, 1);
-
- child = kmem_asprintf("%s@%s",
- name, attr->za_name);
- dsl_pool_config_exit(dp, FTAG);
- err = func(child, arg);
- dsl_pool_config_enter(dp, FTAG);
- strfree(child);
- if (err != 0)
- break;
- }
- zap_cursor_fini(&zc);
- }
- }
-
- dsl_dir_rele(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
- dsl_pool_config_exit(dp, FTAG);
-
- if (err != 0)
- return (err);
-
- /* Apply to self. */
- return (func(name, arg));
-}
-
-/*
- * See comment above dmu_objset_find_impl().
- */
-int
-dmu_objset_find(char *name, int func(const char *, void *), void *arg,
- int flags)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(name, &spa, FTAG);
- if (error != 0)
- return (error);
- error = dmu_objset_find_impl(spa, name, func, arg, flags);
- spa_close(spa, FTAG);
- return (error);
-}
-
-void
-dmu_objset_set_user(objset_t *os, void *user_ptr)
-{
- ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
- os->os_user_ptr = user_ptr;
-}
-
-void *
-dmu_objset_get_user(objset_t *os)
-{
- ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
- return (os->os_user_ptr);
-}
-
-/*
- * Determine name of filesystem, given name of snapshot.
- * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
- */
-int
-dmu_fsname(const char *snapname, char *buf)
-{
- char *atp = strchr(snapname, '@');
- if (atp == NULL)
- return (SET_ERROR(EINVAL));
- if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
- (void) strlcpy(buf, snapname, atp - snapname + 1);
- return (0);
-}
-
-/*
- * Call when we think we're going to write/free space in open context to track
- * the amount of dirty data in the open txg, which is also the amount
- * of memory that can not be evicted until this txg syncs.
- */
-void
-dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = os->os_dsl_dataset;
- int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
-
- if (ds != NULL) {
- dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
- dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -1,3550 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
- * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright 2014 HybridCluster. All rights reserved.
- * Copyright 2016 RackTop Systems.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
- */
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-#include <sys/zfs_znode.h>
-#include <zfs_fletcher.h>
-#include <sys/avl.h>
-#include <sys/ddt.h>
-#include <sys/zfs_onexit.h>
-#include <sys/dmu_send.h>
-#include <sys/dsl_destroy.h>
-#include <sys/blkptr.h>
-#include <sys/dsl_bookmark.h>
-#include <sys/zfeature.h>
-#include <sys/bqueue.h>
-#ifdef __FreeBSD__
-#include <sys/zvol.h>
-#endif
-
-#ifdef __FreeBSD__
-#undef dump_write
-#define dump_write dmu_dump_write
-#endif
-
-/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
-int zfs_send_corrupt_data = B_FALSE;
-int zfs_send_queue_length = 16 * 1024 * 1024;
-int zfs_recv_queue_length = 16 * 1024 * 1024;
-/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
-int zfs_send_set_freerecords_bit = B_TRUE;
-
-#ifdef _KERNEL
-TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
-#endif
-
-static char *dmu_recv_tag = "dmu_recv_tag";
-const char *recv_clone_name = "%recv";
-
-/*
- * Use this to override the recordsize calculation for fast zfs send estimates.
- */
-uint64_t zfs_override_estimate_recordsize = 0;
-
-#define BP_SPAN(datablkszsec, indblkshift, level) \
- (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (indblkshift - SPA_BLKPTRSHIFT)))
-
-static void byteswap_record(dmu_replay_record_t *drr);
-
-struct send_thread_arg {
- bqueue_t q;
- dsl_dataset_t *ds; /* Dataset to traverse */
- uint64_t fromtxg; /* Traverse from this txg */
- int flags; /* flags to pass to traverse_dataset */
- int error_code;
- boolean_t cancel;
- zbookmark_phys_t resume;
-};
-
-struct send_block_record {
- boolean_t eos_marker; /* Marks the end of the stream */
- blkptr_t bp;
- zbookmark_phys_t zb;
- uint8_t indblkshift;
- uint16_t datablkszsec;
- bqueue_node_t ln;
-};
-
-static int
-dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
-{
- dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
- struct uio auio;
- struct iovec aiov;
-
- /*
- * The code does not rely on this (len being a multiple of 8). We keep
- * this assertion because of the corresponding assertion in
- * receive_read(). Keeping this assertion ensures that we do not
- * inadvertently break backwards compatibility (causing the assertion
- * in receive_read() to trigger on old software).
- *
- * Removing the assertions could be rolled into a new feature that uses
- * data that isn't 8-byte aligned; if the assertions were removed, a
- * feature flag would have to be added.
- */
-
- ASSERT0(len % 8);
-
- aiov.iov_base = buf;
- aiov.iov_len = len;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_resid = len;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_rw = UIO_WRITE;
- auio.uio_offset = (off_t)-1;
- auio.uio_td = dsp->dsa_td;
-#ifdef _KERNEL
- if (dsp->dsa_fp->f_type == DTYPE_VNODE)
- bwillwrite();
- dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0,
- dsp->dsa_td);
-#else
- fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
- dsp->dsa_err = EOPNOTSUPP;
-#endif
- mutex_enter(&ds->ds_sendstream_lock);
- *dsp->dsa_off += len;
- mutex_exit(&ds->ds_sendstream_lock);
-
- return (dsp->dsa_err);
-}
-
-/*
- * For all record types except BEGIN, fill in the checksum (overlaid in
- * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
- * up to the start of the checksum itself.
- */
-static int
-dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
-{
- ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- (void) fletcher_4_incremental_native(dsp->dsa_drr,
- offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- &dsp->dsa_zc);
- if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
- dsp->dsa_sent_begin = B_TRUE;
- } else {
- ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
- drr_checksum.drr_checksum));
- dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
- }
- if (dsp->dsa_drr->drr_type == DRR_END) {
- dsp->dsa_sent_end = B_TRUE;
- }
- (void) fletcher_4_incremental_native(&dsp->dsa_drr->
- drr_u.drr_checksum.drr_checksum,
- sizeof (zio_cksum_t), &dsp->dsa_zc);
- if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
- return (SET_ERROR(EINTR));
- if (payload_len != 0) {
- (void) fletcher_4_incremental_native(payload, payload_len,
- &dsp->dsa_zc);
- if (dump_bytes(dsp, payload, payload_len) != 0)
- return (SET_ERROR(EINTR));
- }
- return (0);
-}
-
-/*
- * Fill in the drr_free struct, or perform aggregation if the previous record is
- * also a free record, and the two are adjacent.
- *
- * Note that we send free records even for a full send, because we want to be
- * able to receive a full send as a clone, which requires a list of all the free
- * and freeobject records that were generated on the source.
- */
-static int
-dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
- uint64_t length)
-{
- struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
-
- /*
- * When we receive a free record, dbuf_free_range() assumes
- * that the receiving system doesn't have any dbufs in the range
- * being freed. This is always true because there is a one-record
- * constraint: we only send one WRITE record for any given
- * object,offset. We know that the one-record constraint is
- * true because we always send data in increasing order by
- * object,offset.
- *
- * If the increasing-order constraint ever changes, we should find
- * another way to assert that the one-record constraint is still
- * satisfied.
- */
- ASSERT(object > dsp->dsa_last_data_object ||
- (object == dsp->dsa_last_data_object &&
- offset > dsp->dsa_last_data_offset));
-
- if (length != -1ULL && offset + length < offset)
- length = -1ULL;
-
- /*
- * If there is a pending op, but it's not PENDING_FREE, push it out,
- * since free block aggregation can only be done for blocks of the
- * same type (i.e., DRR_FREE records can only be aggregated with
- * other DRR_FREE records. DRR_FREEOBJECTS records can only be
- * aggregated with other DRR_FREEOBJECTS records.
- */
- if (dsp->dsa_pending_op != PENDING_NONE &&
- dsp->dsa_pending_op != PENDING_FREE) {
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- dsp->dsa_pending_op = PENDING_NONE;
- }
-
- if (dsp->dsa_pending_op == PENDING_FREE) {
- /*
- * There should never be a PENDING_FREE if length is -1
- * (because dump_dnode is the only place where this
- * function is called with a -1, and only after flushing
- * any pending record).
- */
- ASSERT(length != -1ULL);
- /*
- * Check to see whether this free block can be aggregated
- * with pending one.
- */
- if (drrf->drr_object == object && drrf->drr_offset +
- drrf->drr_length == offset) {
- drrf->drr_length += length;
- return (0);
- } else {
- /* not a continuation. Push out pending record */
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- dsp->dsa_pending_op = PENDING_NONE;
- }
- }
- /* create a FREE record and make it pending */
- bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
- dsp->dsa_drr->drr_type = DRR_FREE;
- drrf->drr_object = object;
- drrf->drr_offset = offset;
- drrf->drr_length = length;
- drrf->drr_toguid = dsp->dsa_toguid;
- if (length == -1ULL) {
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- } else {
- dsp->dsa_pending_op = PENDING_FREE;
- }
-
- return (0);
-}
-
-static int
-dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp,
- void *data)
-{
- uint64_t payload_size;
- struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
-
- /*
- * We send data in increasing object, offset order.
- * See comment in dump_free() for details.
- */
- ASSERT(object > dsp->dsa_last_data_object ||
- (object == dsp->dsa_last_data_object &&
- offset > dsp->dsa_last_data_offset));
- dsp->dsa_last_data_object = object;
- dsp->dsa_last_data_offset = offset + lsize - 1;
-
- /*
- * If there is any kind of pending aggregation (currently either
- * a grouping of free objects or free blocks), push it out to
- * the stream, since aggregation can't be done across operations
- * of different types.
- */
- if (dsp->dsa_pending_op != PENDING_NONE) {
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- dsp->dsa_pending_op = PENDING_NONE;
- }
- /* write a WRITE record */
- bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
- dsp->dsa_drr->drr_type = DRR_WRITE;
- drrw->drr_object = object;
- drrw->drr_type = type;
- drrw->drr_offset = offset;
- drrw->drr_toguid = dsp->dsa_toguid;
- drrw->drr_logical_size = lsize;
-
- /* only set the compression fields if the buf is compressed */
- if (lsize != psize) {
- ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED);
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT(!BP_SHOULD_BYTESWAP(bp));
- ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
- ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
- ASSERT3S(psize, >, 0);
- ASSERT3S(lsize, >=, psize);
-
- drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
- drrw->drr_compressed_size = psize;
- payload_size = drrw->drr_compressed_size;
- } else {
- payload_size = drrw->drr_logical_size;
- }
-
- if (bp == NULL || BP_IS_EMBEDDED(bp)) {
- /*
- * There's no pre-computed checksum for partial-block
- * writes or embedded BP's, so (like
- * fletcher4-checkummed blocks) userland will have to
- * compute a dedup-capable checksum itself.
- */
- drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
- } else {
- drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
- if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
- ZCHECKSUM_FLAG_DEDUP)
- drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
- DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
- DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
- DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
- drrw->drr_key.ddk_cksum = bp->blk_cksum;
- }
-
- if (dump_record(dsp, data, payload_size) != 0)
- return (SET_ERROR(EINTR));
- return (0);
-}
-
-static int
-dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
- int blksz, const blkptr_t *bp)
-{
- char buf[BPE_PAYLOAD_SIZE];
- struct drr_write_embedded *drrw =
- &(dsp->dsa_drr->drr_u.drr_write_embedded);
-
- if (dsp->dsa_pending_op != PENDING_NONE) {
- if (dump_record(dsp, NULL, 0) != 0)
- return (EINTR);
- dsp->dsa_pending_op = PENDING_NONE;
- }
-
- ASSERT(BP_IS_EMBEDDED(bp));
-
- bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
- dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
- drrw->drr_object = object;
- drrw->drr_offset = offset;
- drrw->drr_length = blksz;
- drrw->drr_toguid = dsp->dsa_toguid;
- drrw->drr_compression = BP_GET_COMPRESS(bp);
- drrw->drr_etype = BPE_GET_ETYPE(bp);
- drrw->drr_lsize = BPE_GET_LSIZE(bp);
- drrw->drr_psize = BPE_GET_PSIZE(bp);
-
- decode_embedded_bp_compressed(bp, buf);
-
- if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
- return (EINTR);
- return (0);
-}
-
-static int
-dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
-{
- struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
-
- if (dsp->dsa_pending_op != PENDING_NONE) {
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- dsp->dsa_pending_op = PENDING_NONE;
- }
-
- /* write a SPILL record */
- bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
- dsp->dsa_drr->drr_type = DRR_SPILL;
- drrs->drr_object = object;
- drrs->drr_length = blksz;
- drrs->drr_toguid = dsp->dsa_toguid;
-
- if (dump_record(dsp, data, blksz) != 0)
- return (SET_ERROR(EINTR));
- return (0);
-}
-
-static int
-dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
-{
- struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
-
- /*
- * If there is a pending op, but it's not PENDING_FREEOBJECTS,
- * push it out, since free block aggregation can only be done for
- * blocks of the same type (i.e., DRR_FREE records can only be
- * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
- * can only be aggregated with other DRR_FREEOBJECTS records.
- */
- if (dsp->dsa_pending_op != PENDING_NONE &&
- dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- dsp->dsa_pending_op = PENDING_NONE;
- }
- if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
- /*
- * See whether this free object array can be aggregated
- * with pending one
- */
- if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
- drrfo->drr_numobjs += numobjs;
- return (0);
- } else {
- /* can't be aggregated. Push out pending record */
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- dsp->dsa_pending_op = PENDING_NONE;
- }
- }
-
- /* write a FREEOBJECTS record */
- bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
- dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
- drrfo->drr_firstobj = firstobj;
- drrfo->drr_numobjs = numobjs;
- drrfo->drr_toguid = dsp->dsa_toguid;
-
- dsp->dsa_pending_op = PENDING_FREEOBJECTS;
-
- return (0);
-}
-
-static int
-dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
-{
- struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
-
- if (object < dsp->dsa_resume_object) {
- /*
- * Note: when resuming, we will visit all the dnodes in
- * the block of dnodes that we are resuming from. In
- * this case it's unnecessary to send the dnodes prior to
- * the one we are resuming from. We should be at most one
- * block's worth of dnodes behind the resume point.
- */
- ASSERT3U(dsp->dsa_resume_object - object, <,
- 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
- return (0);
- }
-
- if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
- return (dump_freeobjects(dsp, object, 1));
-
- if (dsp->dsa_pending_op != PENDING_NONE) {
- if (dump_record(dsp, NULL, 0) != 0)
- return (SET_ERROR(EINTR));
- dsp->dsa_pending_op = PENDING_NONE;
- }
-
- /* write an OBJECT record */
- bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
- dsp->dsa_drr->drr_type = DRR_OBJECT;
- drro->drr_object = object;
- drro->drr_type = dnp->dn_type;
- drro->drr_bonustype = dnp->dn_bonustype;
- drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- drro->drr_bonuslen = dnp->dn_bonuslen;
- drro->drr_dn_slots = dnp->dn_extra_slots + 1;
- drro->drr_checksumtype = dnp->dn_checksum;
- drro->drr_compress = dnp->dn_compress;
- drro->drr_toguid = dsp->dsa_toguid;
-
- if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
- drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
-
- if (dump_record(dsp, DN_BONUS(dnp),
- P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
- return (SET_ERROR(EINTR));
- }
-
- /* Free anything past the end of the file. */
- if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
- return (SET_ERROR(EINTR));
- if (dsp->dsa_err != 0)
- return (SET_ERROR(EINTR));
- return (0);
-}
-
-static boolean_t
-backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
-{
- if (!BP_IS_EMBEDDED(bp))
- return (B_FALSE);
-
- /*
- * Compression function must be legacy, or explicitly enabled.
- */
- if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
- !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
- return (B_FALSE);
-
- /*
- * Embed type must be explicitly enabled.
- */
- switch (BPE_GET_ETYPE(bp)) {
- case BP_EMBEDDED_TYPE_DATA:
- if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
- return (B_TRUE);
- break;
- default:
- return (B_FALSE);
- }
- return (B_FALSE);
-}
-
-/*
- * This is the callback function to traverse_dataset that acts as the worker
- * thread for dmu_send_impl.
- */
-/*ARGSUSED*/
-static int
-send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
-{
- struct send_thread_arg *sta = arg;
- struct send_block_record *record;
- uint64_t record_size;
- int err = 0;
-
- ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
- zb->zb_object >= sta->resume.zb_object);
-
- if (sta->cancel)
- return (SET_ERROR(EINTR));
-
- if (bp == NULL) {
- ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
- return (0);
- } else if (zb->zb_level < 0) {
- return (0);
- }
-
- record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
- record->eos_marker = B_FALSE;
- record->bp = *bp;
- record->zb = *zb;
- record->indblkshift = dnp->dn_indblkshift;
- record->datablkszsec = dnp->dn_datablkszsec;
- record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- bqueue_enqueue(&sta->q, record, record_size);
-
- return (err);
-}
-
-/*
- * This function kicks off the traverse_dataset. It also handles setting the
- * error code of the thread in case something goes wrong, and pushes the End of
- * Stream record when the traverse_dataset call has finished. If there is no
- * dataset to traverse, the thread immediately pushes End of Stream marker.
- */
-static void
-send_traverse_thread(void *arg)
-{
- struct send_thread_arg *st_arg = arg;
- int err;
- struct send_block_record *data;
-
- if (st_arg->ds != NULL) {
- err = traverse_dataset_resume(st_arg->ds,
- st_arg->fromtxg, &st_arg->resume,
- st_arg->flags, send_cb, st_arg);
-
- if (err != EINTR)
- st_arg->error_code = err;
- }
- data = kmem_zalloc(sizeof (*data), KM_SLEEP);
- data->eos_marker = B_TRUE;
- bqueue_enqueue(&st_arg->q, data, 1);
- thread_exit();
-}
-
-/*
- * This function actually handles figuring out what kind of record needs to be
- * dumped, reading the data (which has hopefully been prefetched), and calling
- * the appropriate helper function.
- */
-static int
-do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
-{
- dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
- const blkptr_t *bp = &data->bp;
- const zbookmark_phys_t *zb = &data->zb;
- uint8_t indblkshift = data->indblkshift;
- uint16_t dblkszsec = data->datablkszsec;
- spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
- dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
- int err = 0;
-
- ASSERT3U(zb->zb_level, >=, 0);
-
- ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
- zb->zb_object >= dsa->dsa_resume_object);
-
- if (zb->zb_object != DMU_META_DNODE_OBJECT &&
- DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
- return (0);
- } else if (BP_IS_HOLE(bp) &&
- zb->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
- uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
- err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
- } else if (BP_IS_HOLE(bp)) {
- uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
- uint64_t offset = zb->zb_blkid * span;
- err = dump_free(dsa, zb->zb_object, offset, span);
- } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
- return (0);
- } else if (type == DMU_OT_DNODE) {
- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
- arc_flags_t aflags = ARC_FLAG_WAIT;
- arc_buf_t *abuf;
-
- ASSERT0(zb->zb_level);
-
- if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &aflags, zb) != 0)
- return (SET_ERROR(EIO));
-
- dnode_phys_t *blk = abuf->b_data;
- uint64_t dnobj = zb->zb_blkid * epb;
- for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
- err = dump_dnode(dsa, dnobj + i, blk + i);
- if (err != 0)
- break;
- }
- arc_buf_destroy(abuf, &abuf);
- } else if (type == DMU_OT_SA) {
- arc_flags_t aflags = ARC_FLAG_WAIT;
- arc_buf_t *abuf;
- int blksz = BP_GET_LSIZE(bp);
-
- if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &aflags, zb) != 0)
- return (SET_ERROR(EIO));
-
- err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
- arc_buf_destroy(abuf, &abuf);
- } else if (backup_do_embed(dsa, bp)) {
- /* it's an embedded level-0 block of a regular object */
- int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
- ASSERT0(zb->zb_level);
- err = dump_write_embedded(dsa, zb->zb_object,
- zb->zb_blkid * blksz, blksz, bp);
- } else {
- /* it's a level-0 block of a regular object */
- arc_flags_t aflags = ARC_FLAG_WAIT;
- arc_buf_t *abuf;
- int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
- uint64_t offset;
-
- /*
- * If we have large blocks stored on disk but the send flags
- * don't allow us to send large blocks, we split the data from
- * the arc buf into chunks.
- */
- boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
- !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
- /*
- * We should only request compressed data from the ARC if all
- * the following are true:
- * - stream compression was requested
- * - we aren't splitting large blocks into smaller chunks
- * - the data won't need to be byteswapped before sending
- * - this isn't an embedded block
- * - this isn't metadata (if receiving on a different endian
- * system it can be byteswapped more easily)
- */
- boolean_t request_compressed =
- (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
- !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
- !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
-
- ASSERT0(zb->zb_level);
- ASSERT(zb->zb_object > dsa->dsa_resume_object ||
- (zb->zb_object == dsa->dsa_resume_object &&
- zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
-
- ASSERT0(zb->zb_level);
- ASSERT(zb->zb_object > dsa->dsa_resume_object ||
- (zb->zb_object == dsa->dsa_resume_object &&
- zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
-
- ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
-
- enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
- if (request_compressed)
- zioflags |= ZIO_FLAG_RAW;
- if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
- if (zfs_send_corrupt_data) {
- /* Send a block filled with 0x"zfs badd bloc" */
- abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
- blksz);
- uint64_t *ptr;
- for (ptr = abuf->b_data;
- (char *)ptr < (char *)abuf->b_data + blksz;
- ptr++)
- *ptr = 0x2f5baddb10cULL;
- } else {
- return (SET_ERROR(EIO));
- }
- }
-
- offset = zb->zb_blkid * blksz;
-
- if (split_large_blocks) {
- ASSERT3U(arc_get_compression(abuf), ==,
- ZIO_COMPRESS_OFF);
- char *buf = abuf->b_data;
- while (blksz > 0 && err == 0) {
- int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
- err = dump_write(dsa, type, zb->zb_object,
- offset, n, n, NULL, buf);
- offset += n;
- buf += n;
- blksz -= n;
- }
- } else {
- err = dump_write(dsa, type, zb->zb_object, offset,
- blksz, arc_buf_size(abuf), bp, abuf->b_data);
- }
- arc_buf_destroy(abuf, &abuf);
- }
-
- ASSERT(err == 0 || err == EINTR);
- return (err);
-}
-
-/*
- * Pop the new data off the queue, and free the old data.
- */
-static struct send_block_record *
-get_next_record(bqueue_t *bq, struct send_block_record *data)
-{
- struct send_block_record *tmp = bqueue_dequeue(bq);
- kmem_free(data, sizeof (*data));
- return (tmp);
-}
-
-/*
- * Actually do the bulk of the work in a zfs send.
- *
- * Note: Releases dp using the specified tag.
- */
-static int
-dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
- zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
- boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
- int outfd, uint64_t resumeobj, uint64_t resumeoff,
-#ifdef illumos
- vnode_t *vp, offset_t *off)
-#else
- struct file *fp, offset_t *off)
-#endif
-{
- objset_t *os;
- dmu_replay_record_t *drr;
- dmu_sendarg_t *dsp;
- int err;
- uint64_t fromtxg = 0;
- uint64_t featureflags = 0;
- struct send_thread_arg to_arg = { 0 };
-
- err = dmu_objset_from_ds(to_ds, &os);
- if (err != 0) {
- dsl_pool_rele(dp, tag);
- return (err);
- }
-
- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
- drr->drr_type = DRR_BEGIN;
- drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
- DMU_SUBSTREAM);
-
-#ifdef _KERNEL
- if (dmu_objset_type(os) == DMU_OST_ZFS) {
- uint64_t version;
- if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
- kmem_free(drr, sizeof (dmu_replay_record_t));
- dsl_pool_rele(dp, tag);
- return (SET_ERROR(EINVAL));
- }
- if (version >= ZPL_VERSION_SA) {
- featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
- }
- }
-#endif
-
- if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
- featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
- if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
- featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
- if (embedok &&
- spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
- featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
- if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
- featureflags |= DMU_BACKUP_FEATURE_LZ4;
- }
- if (compressok) {
- featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
- }
- if ((featureflags &
- (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) !=
- 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
- featureflags |= DMU_BACKUP_FEATURE_LZ4;
- }
-
- if (resumeobj != 0 || resumeoff != 0) {
- featureflags |= DMU_BACKUP_FEATURE_RESUMING;
- }
-
- DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
- featureflags);
-
- drr->drr_u.drr_begin.drr_creation_time =
- dsl_dataset_phys(to_ds)->ds_creation_time;
- drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
- if (is_clone)
- drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
- drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
- if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
- drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
- if (zfs_send_set_freerecords_bit)
- drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
-
- if (ancestor_zb != NULL) {
- drr->drr_u.drr_begin.drr_fromguid =
- ancestor_zb->zbm_guid;
- fromtxg = ancestor_zb->zbm_creation_txg;
- }
- dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
- if (!to_ds->ds_is_snapshot) {
- (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
- sizeof (drr->drr_u.drr_begin.drr_toname));
- }
-
- dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
-
- dsp->dsa_drr = drr;
- dsp->dsa_outfd = outfd;
- dsp->dsa_proc = curproc;
- dsp->dsa_td = curthread;
- dsp->dsa_fp = fp;
- dsp->dsa_os = os;
- dsp->dsa_off = off;
- dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
- dsp->dsa_pending_op = PENDING_NONE;
- dsp->dsa_featureflags = featureflags;
- dsp->dsa_resume_object = resumeobj;
- dsp->dsa_resume_offset = resumeoff;
-
- mutex_enter(&to_ds->ds_sendstream_lock);
- list_insert_head(&to_ds->ds_sendstreams, dsp);
- mutex_exit(&to_ds->ds_sendstream_lock);
-
- dsl_dataset_long_hold(to_ds, FTAG);
- dsl_pool_rele(dp, tag);
-
- void *payload = NULL;
- size_t payload_len = 0;
- if (resumeobj != 0 || resumeoff != 0) {
- dmu_object_info_t to_doi;
- err = dmu_object_info(os, resumeobj, &to_doi);
- if (err != 0)
- goto out;
- SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
- resumeoff / to_doi.doi_data_block_size);
-
- nvlist_t *nvl = fnvlist_alloc();
- fnvlist_add_uint64(nvl, "resume_object", resumeobj);
- fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
- payload = fnvlist_pack(nvl, &payload_len);
- drr->drr_payloadlen = payload_len;
- fnvlist_free(nvl);
- }
-
- err = dump_record(dsp, payload, payload_len);
- fnvlist_pack_free(payload, payload_len);
- if (err != 0) {
- err = dsp->dsa_err;
- goto out;
- }
-
- err = bqueue_init(&to_arg.q, zfs_send_queue_length,
- offsetof(struct send_block_record, ln));
- to_arg.error_code = 0;
- to_arg.cancel = B_FALSE;
- to_arg.ds = to_ds;
- to_arg.fromtxg = fromtxg;
- to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
- (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0,
- TS_RUN, minclsyspri);
-
- struct send_block_record *to_data;
- to_data = bqueue_dequeue(&to_arg.q);
-
- while (!to_data->eos_marker && err == 0) {
- err = do_dump(dsp, to_data);
- to_data = get_next_record(&to_arg.q, to_data);
- if (issig(JUSTLOOKING) && issig(FORREAL))
- err = EINTR;
- }
-
- if (err != 0) {
- to_arg.cancel = B_TRUE;
- while (!to_data->eos_marker) {
- to_data = get_next_record(&to_arg.q, to_data);
- }
- }
- kmem_free(to_data, sizeof (*to_data));
-
- bqueue_destroy(&to_arg.q);
-
- if (err == 0 && to_arg.error_code != 0)
- err = to_arg.error_code;
-
- if (err != 0)
- goto out;
-
- if (dsp->dsa_pending_op != PENDING_NONE)
- if (dump_record(dsp, NULL, 0) != 0)
- err = SET_ERROR(EINTR);
-
- if (err != 0) {
- if (err == EINTR && dsp->dsa_err != 0)
- err = dsp->dsa_err;
- goto out;
- }
-
- bzero(drr, sizeof (dmu_replay_record_t));
- drr->drr_type = DRR_END;
- drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
- drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
-
- if (dump_record(dsp, NULL, 0) != 0)
- err = dsp->dsa_err;
-
-out:
- mutex_enter(&to_ds->ds_sendstream_lock);
- list_remove(&to_ds->ds_sendstreams, dsp);
- mutex_exit(&to_ds->ds_sendstream_lock);
-
- VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
-
- kmem_free(drr, sizeof (dmu_replay_record_t));
- kmem_free(dsp, sizeof (dmu_sendarg_t));
-
- dsl_dataset_long_rele(to_ds, FTAG);
-
- return (err);
-}
-
-int
-dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
-#ifdef illumos
- int outfd, vnode_t *vp, offset_t *off)
-#else
- int outfd, struct file *fp, offset_t *off)
-#endif
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- dsl_dataset_t *fromds = NULL;
- int err;
-
- err = dsl_pool_hold(pool, FTAG, &dp);
- if (err != 0)
- return (err);
-
- err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
- if (err != 0) {
- dsl_pool_rele(dp, FTAG);
- return (err);
- }
-
- if (fromsnap != 0) {
- zfs_bookmark_phys_t zb;
- boolean_t is_clone;
-
- err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
- if (err != 0) {
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (err);
- }
- if (!dsl_dataset_is_before(ds, fromds, 0))
- err = SET_ERROR(EXDEV);
- zb.zbm_creation_time =
- dsl_dataset_phys(fromds)->ds_creation_time;
- zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
- zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
- is_clone = (fromds->ds_dir != ds->ds_dir);
- dsl_dataset_rele(fromds, FTAG);
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, compressok, outfd, 0, 0, fp, off);
- } else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, compressok, outfd, 0, 0, fp, off);
- }
- dsl_dataset_rele(ds, FTAG);
- return (err);
-}
-
-int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
- boolean_t large_block_ok, boolean_t compressok, int outfd,
- uint64_t resumeobj, uint64_t resumeoff,
-#ifdef illumos
- vnode_t *vp, offset_t *off)
-#else
- struct file *fp, offset_t *off)
-#endif
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- int err;
- boolean_t owned = B_FALSE;
-
- if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
- return (SET_ERROR(EINVAL));
-
- err = dsl_pool_hold(tosnap, FTAG, &dp);
- if (err != 0)
- return (err);
-
- if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
- /*
- * We are sending a filesystem or volume. Ensure
- * that it doesn't change by owning the dataset.
- */
- err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
- owned = B_TRUE;
- } else {
- err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
- }
- if (err != 0) {
- dsl_pool_rele(dp, FTAG);
- return (err);
- }
-
- if (fromsnap != NULL) {
- zfs_bookmark_phys_t zb;
- boolean_t is_clone = B_FALSE;
- int fsnamelen = strchr(tosnap, '@') - tosnap;
-
- /*
- * If the fromsnap is in a different filesystem, then
- * mark the send stream as a clone.
- */
- if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
- (fromsnap[fsnamelen] != '@' &&
- fromsnap[fsnamelen] != '#')) {
- is_clone = B_TRUE;
- }
-
- if (strchr(fromsnap, '@')) {
- dsl_dataset_t *fromds;
- err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
- if (err == 0) {
- if (!dsl_dataset_is_before(ds, fromds, 0))
- err = SET_ERROR(EXDEV);
- zb.zbm_creation_time =
- dsl_dataset_phys(fromds)->ds_creation_time;
- zb.zbm_creation_txg =
- dsl_dataset_phys(fromds)->ds_creation_txg;
- zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
- is_clone = (ds->ds_dir != fromds->ds_dir);
- dsl_dataset_rele(fromds, FTAG);
- }
- } else {
- err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
- }
- if (err != 0) {
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (err);
- }
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, compressok,
- outfd, resumeobj, resumeoff, fp, off);
- } else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, compressok,
- outfd, resumeobj, resumeoff, fp, off);
- }
- if (owned)
- dsl_dataset_disown(ds, FTAG);
- else
- dsl_dataset_rele(ds, FTAG);
- return (err);
-}
-
-static int
-dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
- uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
-{
- int err = 0;
- uint64_t size;
- /*
- * Assume that space (both on-disk and in-stream) is dominated by
- * data. We will adjust for indirect blocks and the copies property,
- * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
- */
- uint64_t recordsize;
- uint64_t record_count;
- objset_t *os;
- VERIFY0(dmu_objset_from_ds(ds, &os));
-
- /* Assume all (uncompressed) blocks are recordsize. */
- if (zfs_override_estimate_recordsize != 0) {
- recordsize = zfs_override_estimate_recordsize;
- } else if (os->os_phys->os_type == DMU_OST_ZVOL) {
- err = dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
- } else {
- err = dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
- }
- if (err != 0)
- return (err);
- record_count = uncompressed / recordsize;
-
- /*
- * If we're estimating a send size for a compressed stream, use the
- * compressed data size to estimate the stream size. Otherwise, use the
- * uncompressed data size.
- */
- size = stream_compressed ? compressed : uncompressed;
-
- /*
- * Subtract out approximate space used by indirect blocks.
- * Assume most space is used by data blocks (non-indirect, non-dnode).
- * Assume no ditto blocks or internal fragmentation.
- *
- * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
- * block.
- */
- size -= record_count * sizeof (blkptr_t);
-
- /* Add in the space for the record associated with each block. */
- size += record_count * sizeof (dmu_replay_record_t);
-
- *sizep = size;
-
- return (0);
-}
-
-int
-dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
- boolean_t stream_compressed, uint64_t *sizep)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- int err;
- uint64_t uncomp, comp;
-
- ASSERT(dsl_pool_config_held(dp));
-
- /* tosnap must be a snapshot */
- if (!ds->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- /* fromsnap, if provided, must be a snapshot */
- if (fromds != NULL && !fromds->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- /*
- * fromsnap must be an earlier snapshot from the same fs as tosnap,
- * or the origin's fs.
- */
- if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
- return (SET_ERROR(EXDEV));
-
- /* Get compressed and uncompressed size estimates of changed data. */
- if (fromds == NULL) {
- uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
- comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
- } else {
- uint64_t used;
- err = dsl_dataset_space_written(fromds, ds,
- &used, &comp, &uncomp);
- if (err != 0)
- return (err);
- }
-
- err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
- stream_compressed, sizep);
- /*
- * Add the size of the BEGIN and END records to the estimate.
- */
- *sizep += 2 * sizeof (dmu_replay_record_t);
- return (err);
-}
-
-struct calculate_send_arg {
- uint64_t uncompressed;
- uint64_t compressed;
-};
-
-/*
- * Simple callback used to traverse the blocks of a snapshot and sum their
- * uncompressed and compressed sizes.
- */
-/* ARGSUSED */
-static int
-dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- struct calculate_send_arg *space = arg;
- if (bp != NULL && !BP_IS_HOLE(bp)) {
- space->uncompressed += BP_GET_UCSIZE(bp);
- space->compressed += BP_GET_PSIZE(bp);
- }
- return (0);
-}
-
-/*
- * Given a desination snapshot and a TXG, calculate the approximate size of a
- * send stream sent from that TXG. from_txg may be zero, indicating that the
- * whole snapshot will be sent.
- */
-int
-dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
- boolean_t stream_compressed, uint64_t *sizep)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- int err;
- struct calculate_send_arg size = { 0 };
-
- ASSERT(dsl_pool_config_held(dp));
-
- /* tosnap must be a snapshot */
- if (!ds->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- /* verify that from_txg is before the provided snapshot was taken */
- if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
- return (SET_ERROR(EXDEV));
- }
-
- /*
- * traverse the blocks of the snapshot with birth times after
- * from_txg, summing their uncompressed size
- */
- err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
- dmu_calculate_send_traversal, &size);
- if (err)
- return (err);
-
- err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
- size.compressed, stream_compressed, sizep);
- return (err);
-}
-
-typedef struct dmu_recv_begin_arg {
- const char *drba_origin;
- dmu_recv_cookie_t *drba_cookie;
- cred_t *drba_cred;
- uint64_t drba_snapobj;
-} dmu_recv_begin_arg_t;
-
-static int
-recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
- uint64_t fromguid)
-{
- uint64_t val;
- uint64_t children;
- int error;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- /* Temporary clone name must not exist. */
- error = zap_lookup(dp->dp_meta_objset,
- dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
- 8, 1, &val);
- if (error != ENOENT)
- return (error == 0 ? SET_ERROR(EBUSY) : error);
-
- /* Resume state must not be set. */
- if (dsl_dataset_has_resume_receive_state(ds))
- return (SET_ERROR(EBUSY));
-
- /* New snapshot name must not exist. */
- error = zap_lookup(dp->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_snapnames_zapobj,
- drba->drba_cookie->drc_tosnap, 8, 1, &val);
- if (error != ENOENT)
- return (error == 0 ? SET_ERROR(EEXIST) : error);
-
- /* must not have children if receiving a ZVOL */
- error = zap_count(dp->dp_meta_objset,
- dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
- if (error != 0)
- return (error);
- if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
- children > 0)
- return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
-
- /*
- * Check snapshot limit before receiving. We'll recheck again at the
- * end, but might as well abort before receiving if we're already over
- * the limit.
- *
- * Note that we do not check the file system limit with
- * dsl_dir_fscount_check because the temporary %clones don't count
- * against that limit.
- */
- error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
- NULL, drba->drba_cred);
- if (error != 0)
- return (error);
-
- if (fromguid != 0) {
- dsl_dataset_t *snap;
- uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
-
- /* Find snapshot in this dir that matches fromguid. */
- while (obj != 0) {
- error = dsl_dataset_hold_obj(dp, obj, FTAG,
- &snap);
- if (error != 0)
- return (SET_ERROR(ENODEV));
- if (snap->ds_dir != ds->ds_dir) {
- dsl_dataset_rele(snap, FTAG);
- return (SET_ERROR(ENODEV));
- }
- if (dsl_dataset_phys(snap)->ds_guid == fromguid)
- break;
- obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
- dsl_dataset_rele(snap, FTAG);
- }
- if (obj == 0)
- return (SET_ERROR(ENODEV));
-
- if (drba->drba_cookie->drc_force) {
- drba->drba_snapobj = obj;
- } else {
- /*
- * If we are not forcing, there must be no
- * changes since fromsnap.
- */
- if (dsl_dataset_modified_since_snap(ds, snap)) {
- dsl_dataset_rele(snap, FTAG);
- return (SET_ERROR(ETXTBSY));
- }
- drba->drba_snapobj = ds->ds_prev->ds_object;
- }
-
- dsl_dataset_rele(snap, FTAG);
- } else {
- /* if full, then must be forced */
- if (!drba->drba_cookie->drc_force)
- return (SET_ERROR(EEXIST));
- /* start from $ORIGIN@$ORIGIN, if supported */
- drba->drba_snapobj = dp->dp_origin_snap != NULL ?
- dp->dp_origin_snap->ds_object : 0;
- }
-
- return (0);
-
-}
-
-static int
-dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
-{
- dmu_recv_begin_arg_t *drba = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
- uint64_t fromguid = drrb->drr_fromguid;
- int flags = drrb->drr_flags;
- int error;
- uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
- dsl_dataset_t *ds;
- const char *tofs = drba->drba_cookie->drc_tofs;
-
- /* already checked */
- ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
- ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
-
- if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
- DMU_COMPOUNDSTREAM ||
- drrb->drr_type >= DMU_OST_NUMTYPES ||
- ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
- return (SET_ERROR(EINVAL));
-
- /* Verify pool version supports SA if SA_SPILL feature set */
- if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
- spa_version(dp->dp_spa) < SPA_VERSION_SA)
- return (SET_ERROR(ENOTSUP));
-
- if (drba->drba_cookie->drc_resumable &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
- return (SET_ERROR(ENOTSUP));
-
- /*
- * The receiving code doesn't know how to translate a WRITE_EMBEDDED
- * record to a plain WRITE record, so the pool must have the
- * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
- * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
- */
- if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
- return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
- return (SET_ERROR(ENOTSUP));
-
- /*
- * The receiving code doesn't know how to translate large blocks
- * to smaller ones, so the pool must have the LARGE_BLOCKS
- * feature enabled if the stream has LARGE_BLOCKS. Same with
- * large dnodes.
- */
- if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
- return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
- return (SET_ERROR(ENOTSUP));
-
- error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
- if (error == 0) {
- /* target fs already exists; recv into temp clone */
-
- /* Can't recv a clone into an existing fs */
- if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- error = recv_begin_check_existing_impl(drba, ds, fromguid);
- dsl_dataset_rele(ds, FTAG);
- } else if (error == ENOENT) {
- /* target fs does not exist; must be a full backup or clone */
- char buf[ZFS_MAX_DATASET_NAME_LEN];
- objset_t *os;
-
- /*
- * If it's a non-clone incremental, we are missing the
- * target fs, so fail the recv.
- */
- if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
- drba->drba_origin))
- return (SET_ERROR(ENOENT));
-
- /*
- * If we're receiving a full send as a clone, and it doesn't
- * contain all the necessary free records and freeobject
- * records, reject it.
- */
- if (fromguid == 0 && drba->drba_origin &&
- !(flags & DRR_FLAG_FREERECORDS))
- return (SET_ERROR(EINVAL));
-
- /* Open the parent of tofs */
- ASSERT3U(strlen(tofs), <, sizeof (buf));
- (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
- error = dsl_dataset_hold(dp, buf, FTAG, &ds);
- if (error != 0)
- return (error);
-
- /*
- * Check filesystem and snapshot limits before receiving. We'll
- * recheck snapshot limits again at the end (we create the
- * filesystems and increment those counts during begin_sync).
- */
- error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
- ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
- ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- /* can't recv below anything but filesystems (eg. no ZVOLs) */
- error = dmu_objset_from_ds(ds, &os);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
- if (dmu_objset_type(os) != DMU_OST_ZFS) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
- }
-
- if (drba->drba_origin != NULL) {
- dsl_dataset_t *origin;
- error = dsl_dataset_hold(dp, drba->drba_origin,
- FTAG, &origin);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
- if (!origin->ds_is_snapshot) {
- dsl_dataset_rele(origin, FTAG);
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
- if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
- fromguid != 0) {
- dsl_dataset_rele(origin, FTAG);
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENODEV));
- }
- dsl_dataset_rele(origin, FTAG);
- }
-
- dsl_dataset_rele(ds, FTAG);
- error = 0;
- }
- return (error);
-}
-
-static void
-dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
-{
- dmu_recv_begin_arg_t *drba = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
- struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
- const char *tofs = drba->drba_cookie->drc_tofs;
- dsl_dataset_t *ds, *newds;
- uint64_t dsobj;
- int error;
- uint64_t crflags = 0;
-
- if (drrb->drr_flags & DRR_FLAG_CI_DATA)
- crflags |= DS_FLAG_CI_DATASET;
-
- error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
- if (error == 0) {
- /* create temporary clone */
- dsl_dataset_t *snap = NULL;
- if (drba->drba_snapobj != 0) {
- VERIFY0(dsl_dataset_hold_obj(dp,
- drba->drba_snapobj, FTAG, &snap));
- }
- dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
- snap, crflags, drba->drba_cred, tx);
- if (drba->drba_snapobj != 0)
- dsl_dataset_rele(snap, FTAG);
- dsl_dataset_rele(ds, FTAG);
- } else {
- dsl_dir_t *dd;
- const char *tail;
- dsl_dataset_t *origin = NULL;
-
- VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
-
- if (drba->drba_origin != NULL) {
- VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
- FTAG, &origin));
- }
-
- /* Create new dataset. */
- dsobj = dsl_dataset_create_sync(dd,
- strrchr(tofs, '/') + 1,
- origin, crflags, drba->drba_cred, tx);
- if (origin != NULL)
- dsl_dataset_rele(origin, FTAG);
- dsl_dir_rele(dd, FTAG);
- drba->drba_cookie->drc_newfs = B_TRUE;
- }
- VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
-
- if (drba->drba_cookie->drc_resumable) {
- dsl_dataset_zapify(newds, tx);
- if (drrb->drr_fromguid != 0) {
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
- 8, 1, &drrb->drr_fromguid, tx));
- }
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
- 8, 1, &drrb->drr_toguid, tx));
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
- 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
- uint64_t one = 1;
- uint64_t zero = 0;
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
- 8, 1, &one, tx));
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
- 8, 1, &zero, tx));
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
- 8, 1, &zero, tx));
- if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
- 8, 1, &one, tx));
- }
- if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_EMBED_DATA) {
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
- 8, 1, &one, tx));
- }
- if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_COMPRESSED) {
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
- 8, 1, &one, tx));
- }
- }
-
- dmu_buf_will_dirty(newds->ds_dbuf, tx);
- dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
-
- /*
- * If we actually created a non-clone, we need to create the
- * objset in our new dataset.
- */
- rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
- if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
- (void) dmu_objset_create_impl(dp->dp_spa,
- newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
- }
- rrw_exit(&newds->ds_bp_rwlock, FTAG);
-
- drba->drba_cookie->drc_ds = newds;
-
- spa_history_log_internal_ds(newds, "receive", tx, "");
-}
-
-static int
-dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
-{
- dmu_recv_begin_arg_t *drba = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
- int error;
- uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
- dsl_dataset_t *ds;
- const char *tofs = drba->drba_cookie->drc_tofs;
-
- /* 6 extra bytes for /%recv */
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
-
- /* already checked */
- ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
- ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
-
- if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
- DMU_COMPOUNDSTREAM ||
- drrb->drr_type >= DMU_OST_NUMTYPES)
- return (SET_ERROR(EINVAL));
-
- /* Verify pool version supports SA if SA_SPILL feature set */
- if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
- spa_version(dp->dp_spa) < SPA_VERSION_SA)
- return (SET_ERROR(ENOTSUP));
-
- /*
- * The receiving code doesn't know how to translate a WRITE_EMBEDDED
- * record to a plain WRITE record, so the pool must have the
- * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
- * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
- */
- if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
- return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
- return (SET_ERROR(ENOTSUP));
-
- /*
- * The receiving code doesn't know how to translate large blocks
- * to smaller ones, so the pool must have the LARGE_BLOCKS
- * feature enabled if the stream has LARGE_BLOCKS. Same with
- * large dnodes.
- */
- if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
- return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
- return (SET_ERROR(ENOTSUP));
-
- (void) snprintf(recvname, sizeof (recvname), "%s/%s",
- tofs, recv_clone_name);
-
- if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
- /* %recv does not exist; continue in tofs */
- error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
- if (error != 0)
- return (error);
- }
-
- /* check that ds is marked inconsistent */
- if (!DS_IS_INCONSISTENT(ds)) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- /* check that there is resuming data, and that the toguid matches */
- if (!dsl_dataset_is_zapified(ds)) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
- uint64_t val;
- error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
- if (error != 0 || drrb->drr_toguid != val) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Check if the receive is still running. If so, it will be owned.
- * Note that nothing else can own the dataset (e.g. after the receive
- * fails) because it will be marked inconsistent.
- */
- if (dsl_dataset_has_owner(ds)) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EBUSY));
- }
-
- /* There should not be any snapshots of this fs yet. */
- if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Note: resume point will be checked when we process the first WRITE
- * record.
- */
-
- /* check that the origin matches */
- val = 0;
- (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
- if (drrb->drr_fromguid != val) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
-{
- dmu_recv_begin_arg_t *drba = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- const char *tofs = drba->drba_cookie->drc_tofs;
- dsl_dataset_t *ds;
- uint64_t dsobj;
- /* 6 extra bytes for /%recv */
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
-
- (void) snprintf(recvname, sizeof (recvname), "%s/%s",
- tofs, recv_clone_name);
-
- if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
- /* %recv does not exist; continue in tofs */
- VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
- drba->drba_cookie->drc_newfs = B_TRUE;
- }
-
- /* clear the inconsistent flag so that we can own it */
- ASSERT(DS_IS_INCONSISTENT(ds));
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
- dsobj = ds->ds_object;
- dsl_dataset_rele(ds, FTAG);
-
- VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
-
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
-
- drba->drba_cookie->drc_ds = ds;
-
- spa_history_log_internal_ds(ds, "resume receive", tx, "");
-}
-
-/*
- * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
- * succeeds; otherwise we will leak the holds on the datasets.
- */
-int
-dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
- boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
-{
- dmu_recv_begin_arg_t drba = { 0 };
-
- bzero(drc, sizeof (dmu_recv_cookie_t));
- drc->drc_drr_begin = drr_begin;
- drc->drc_drrb = &drr_begin->drr_u.drr_begin;
- drc->drc_tosnap = tosnap;
- drc->drc_tofs = tofs;
- drc->drc_force = force;
- drc->drc_resumable = resumable;
- drc->drc_cred = CRED();
- drc->drc_clone = (origin != NULL);
-
- if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
- drc->drc_byteswap = B_TRUE;
- (void) fletcher_4_incremental_byteswap(drr_begin,
- sizeof (dmu_replay_record_t), &drc->drc_cksum);
- byteswap_record(drr_begin);
- } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
- (void) fletcher_4_incremental_native(drr_begin,
- sizeof (dmu_replay_record_t), &drc->drc_cksum);
- } else {
- return (SET_ERROR(EINVAL));
- }
-
- drba.drba_origin = origin;
- drba.drba_cookie = drc;
- drba.drba_cred = CRED();
-
- if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_RESUMING) {
- return (dsl_sync_task(tofs,
- dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
- &drba, 5, ZFS_SPACE_CHECK_NORMAL));
- } else {
- return (dsl_sync_task(tofs,
- dmu_recv_begin_check, dmu_recv_begin_sync,
- &drba, 5, ZFS_SPACE_CHECK_NORMAL));
- }
-}
-
-struct receive_record_arg {
- dmu_replay_record_t header;
- void *payload; /* Pointer to a buffer containing the payload */
- /*
- * If the record is a write, pointer to the arc_buf_t containing the
- * payload.
- */
- arc_buf_t *write_buf;
- int payload_size;
- uint64_t bytes_read; /* bytes read from stream when record created */
- boolean_t eos_marker; /* Marks the end of the stream */
- bqueue_node_t node;
-};
-
-struct receive_writer_arg {
- objset_t *os;
- boolean_t byteswap;
- bqueue_t q;
-
- /*
- * These three args are used to signal to the main thread that we're
- * done.
- */
- kmutex_t mutex;
- kcondvar_t cv;
- boolean_t done;
-
- int err;
- /* A map from guid to dataset to help handle dedup'd streams. */
- avl_tree_t *guid_to_ds_map;
- boolean_t resumable;
- uint64_t last_object;
- uint64_t last_offset;
- uint64_t max_object; /* highest object ID referenced in stream */
- uint64_t bytes_read; /* bytes read when current record created */
-};
-
-struct objlist {
- list_t list; /* List of struct receive_objnode. */
- /*
- * Last object looked up. Used to assert that objects are being looked
- * up in ascending order.
- */
- uint64_t last_lookup;
-};
-
-struct receive_objnode {
- list_node_t node;
- uint64_t object;
-};
-
-struct receive_arg {
- objset_t *os;
- kthread_t *td;
- struct file *fp;
- uint64_t voff; /* The current offset in the stream */
- uint64_t bytes_read;
- /*
- * A record that has had its payload read in, but hasn't yet been handed
- * off to the worker thread.
- */
- struct receive_record_arg *rrd;
- /* A record that has had its header read in, but not its payload. */
- struct receive_record_arg *next_rrd;
- zio_cksum_t cksum;
- zio_cksum_t prev_cksum;
- int err;
- boolean_t byteswap;
- /* Sorted list of objects not to issue prefetches for. */
- struct objlist ignore_objlist;
-};
-
-typedef struct guid_map_entry {
- uint64_t guid;
- dsl_dataset_t *gme_ds;
- avl_node_t avlnode;
-} guid_map_entry_t;
-
-static int
-guid_compare(const void *arg1, const void *arg2)
-{
- const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
- const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;
-
- return (AVL_CMP(gmep1->guid, gmep2->guid));
-}
-
-static void
-free_guid_map_onexit(void *arg)
-{
- avl_tree_t *ca = arg;
- void *cookie = NULL;
- guid_map_entry_t *gmep;
-
- while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
- dsl_dataset_long_rele(gmep->gme_ds, gmep);
- dsl_dataset_rele(gmep->gme_ds, gmep);
- kmem_free(gmep, sizeof (guid_map_entry_t));
- }
- avl_destroy(ca);
- kmem_free(ca, sizeof (avl_tree_t));
-}
-
-static int
-restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid)
-{
- struct uio auio;
- struct iovec aiov;
- int error;
-
- aiov.iov_base = buf;
- aiov.iov_len = len;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_resid = len;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_rw = UIO_READ;
- auio.uio_offset = off;
- auio.uio_td = ra->td;
-#ifdef _KERNEL
- error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
-#else
- fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
- error = EOPNOTSUPP;
-#endif
- *resid = auio.uio_resid;
- return (error);
-}
-
-static int
-receive_read(struct receive_arg *ra, int len, void *buf)
-{
- int done = 0;
-
- /*
- * The code doesn't rely on this (lengths being multiples of 8). See
- * comment in dump_bytes.
- */
- ASSERT0(len % 8);
-
- while (done < len) {
- ssize_t resid;
-
- ra->err = restore_bytes(ra, buf + done,
- len - done, ra->voff, &resid);
-
- if (resid == len - done) {
- /*
- * Note: ECKSUM indicates that the receive
- * was interrupted and can potentially be resumed.
- */
- ra->err = SET_ERROR(ECKSUM);
- }
- ra->voff += len - done - resid;
- done = len - resid;
- if (ra->err != 0)
- return (ra->err);
- }
-
- ra->bytes_read += len;
-
- ASSERT3U(done, ==, len);
- return (0);
-}
-
-noinline static void
-byteswap_record(dmu_replay_record_t *drr)
-{
-#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
-#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
- drr->drr_type = BSWAP_32(drr->drr_type);
- drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
-
- switch (drr->drr_type) {
- case DRR_BEGIN:
- DO64(drr_begin.drr_magic);
- DO64(drr_begin.drr_versioninfo);
- DO64(drr_begin.drr_creation_time);
- DO32(drr_begin.drr_type);
- DO32(drr_begin.drr_flags);
- DO64(drr_begin.drr_toguid);
- DO64(drr_begin.drr_fromguid);
- break;
- case DRR_OBJECT:
- DO64(drr_object.drr_object);
- DO32(drr_object.drr_type);
- DO32(drr_object.drr_bonustype);
- DO32(drr_object.drr_blksz);
- DO32(drr_object.drr_bonuslen);
- DO64(drr_object.drr_toguid);
- break;
- case DRR_FREEOBJECTS:
- DO64(drr_freeobjects.drr_firstobj);
- DO64(drr_freeobjects.drr_numobjs);
- DO64(drr_freeobjects.drr_toguid);
- break;
- case DRR_WRITE:
- DO64(drr_write.drr_object);
- DO32(drr_write.drr_type);
- DO64(drr_write.drr_offset);
- DO64(drr_write.drr_logical_size);
- DO64(drr_write.drr_toguid);
- ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
- DO64(drr_write.drr_key.ddk_prop);
- DO64(drr_write.drr_compressed_size);
- break;
- case DRR_WRITE_BYREF:
- DO64(drr_write_byref.drr_object);
- DO64(drr_write_byref.drr_offset);
- DO64(drr_write_byref.drr_length);
- DO64(drr_write_byref.drr_toguid);
- DO64(drr_write_byref.drr_refguid);
- DO64(drr_write_byref.drr_refobject);
- DO64(drr_write_byref.drr_refoffset);
- ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
- drr_key.ddk_cksum);
- DO64(drr_write_byref.drr_key.ddk_prop);
- break;
- case DRR_WRITE_EMBEDDED:
- DO64(drr_write_embedded.drr_object);
- DO64(drr_write_embedded.drr_offset);
- DO64(drr_write_embedded.drr_length);
- DO64(drr_write_embedded.drr_toguid);
- DO32(drr_write_embedded.drr_lsize);
- DO32(drr_write_embedded.drr_psize);
- break;
- case DRR_FREE:
- DO64(drr_free.drr_object);
- DO64(drr_free.drr_offset);
- DO64(drr_free.drr_length);
- DO64(drr_free.drr_toguid);
- break;
- case DRR_SPILL:
- DO64(drr_spill.drr_object);
- DO64(drr_spill.drr_length);
- DO64(drr_spill.drr_toguid);
- break;
- case DRR_END:
- DO64(drr_end.drr_toguid);
- ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
- break;
- }
-
- if (drr->drr_type != DRR_BEGIN) {
- ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
- }
-
-#undef DO64
-#undef DO32
-}
-
-static inline uint8_t
-deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
-{
- if (bonus_type == DMU_OT_SA) {
- return (1);
- } else {
- return (1 +
- ((DN_OLD_MAX_BONUSLEN -
- MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
- }
-}
-
-static void
-save_resume_state(struct receive_writer_arg *rwa,
- uint64_t object, uint64_t offset, dmu_tx_t *tx)
-{
- int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
-
- if (!rwa->resumable)
- return;
-
- /*
- * We use ds_resume_bytes[] != 0 to indicate that we need to
- * update this on disk, so it must not be 0.
- */
- ASSERT(rwa->bytes_read != 0);
-
- /*
- * We only resume from write records, which have a valid
- * (non-meta-dnode) object number.
- */
- ASSERT(object != 0);
-
- /*
- * For resuming to work correctly, we must receive records in order,
- * sorted by object,offset. This is checked by the callers, but
- * assert it here for good measure.
- */
- ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
- ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
- offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
- ASSERT3U(rwa->bytes_read, >=,
- rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
-
- rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
- rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
- rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
-}
-
-noinline static int
-receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
- void *data)
-{
- dmu_object_info_t doi;
- dmu_tx_t *tx;
- uint64_t object;
- int err;
- uint8_t dn_slots = drro->drr_dn_slots != 0 ?
- drro->drr_dn_slots : DNODE_MIN_SLOTS;
-
- if (drro->drr_type == DMU_OT_NONE ||
- !DMU_OT_IS_VALID(drro->drr_type) ||
- !DMU_OT_IS_VALID(drro->drr_bonustype) ||
- drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
- drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
- P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
- drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
- drro->drr_bonuslen >
- DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
- dn_slots >
- (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
- return (SET_ERROR(EINVAL));
- }
-
- err = dmu_object_info(rwa->os, drro->drr_object, &doi);
-
- if (err != 0 && err != ENOENT && err != EEXIST)
- return (SET_ERROR(EINVAL));
-
- if (drro->drr_object > rwa->max_object)
- rwa->max_object = drro->drr_object;
-
- /*
- * If we are losing blkptrs or changing the block size this must
- * be a new file instance. We must clear out the previous file
- * contents before we can change this type of metadata in the dnode.
- */
- if (err == 0) {
- int nblkptr;
-
- object = drro->drr_object;
-
- nblkptr = deduce_nblkptr(drro->drr_bonustype,
- drro->drr_bonuslen);
-
- if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr ||
- dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
- err = dmu_free_long_range(rwa->os, drro->drr_object,
- 0, DMU_OBJECT_END);
- if (err != 0)
- return (SET_ERROR(EINVAL));
- }
- } else if (err == EEXIST) {
- /*
- * The object requested is currently an interior slot of a
- * multi-slot dnode. This will be resolved when the next txg
- * is synced out, since the send stream will have told us
- * to free this slot when we freed the associated dnode
- * earlier in the stream.
- */
- txg_wait_synced(dmu_objset_pool(rwa->os), 0);
- object = drro->drr_object;
- } else {
- /* object is free and we are about to allocate a new one */
- object = DMU_NEW_OBJECT;
- }
-
- /*
- * If this is a multi-slot dnode there is a chance that this
- * object will expand into a slot that is already used by
- * another object from the previous snapshot. We must free
- * these objects before we attempt to allocate the new dnode.
- */
- if (dn_slots > 1) {
- boolean_t need_sync = B_FALSE;
-
- for (uint64_t slot = drro->drr_object + 1;
- slot < drro->drr_object + dn_slots;
- slot++) {
- dmu_object_info_t slot_doi;
-
- err = dmu_object_info(rwa->os, slot, &slot_doi);
- if (err == ENOENT || err == EEXIST)
- continue;
- else if (err != 0)
- return (err);
-
- err = dmu_free_long_object(rwa->os, slot);
-
- if (err != 0)
- return (err);
-
- need_sync = B_TRUE;
- }
-
- if (need_sync)
- txg_wait_synced(dmu_objset_pool(rwa->os), 0);
- }
-
- tx = dmu_tx_create(rwa->os);
- dmu_tx_hold_bonus(tx, object);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- if (object == DMU_NEW_OBJECT) {
- /* currently free, want to be allocated */
- err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
- drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen,
- dn_slots << DNODE_SHIFT, tx);
- } else if (drro->drr_type != doi.doi_type ||
- drro->drr_blksz != doi.doi_data_block_size ||
- drro->drr_bonustype != doi.doi_bonus_type ||
- drro->drr_bonuslen != doi.doi_bonus_size ||
- drro->drr_dn_slots != (doi.doi_dnodesize >> DNODE_SHIFT)) {
- /* currently allocated, but with different properties */
- err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
- drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen,
- drro->drr_dn_slots << DNODE_SHIFT, tx);
- }
- if (err != 0) {
- dmu_tx_commit(tx);
- return (SET_ERROR(EINVAL));
- }
-
- dmu_object_set_checksum(rwa->os, drro->drr_object,
- drro->drr_checksumtype, tx);
- dmu_object_set_compress(rwa->os, drro->drr_object,
- drro->drr_compress, tx);
-
- if (data != NULL) {
- dmu_buf_t *db;
-
- VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
-
- ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
- bcopy(data, db->db_data, drro->drr_bonuslen);
- if (rwa->byteswap) {
- dmu_object_byteswap_t byteswap =
- DMU_OT_BYTESWAP(drro->drr_bonustype);
- dmu_ot_byteswap[byteswap].ob_func(db->db_data,
- drro->drr_bonuslen);
- }
- dmu_buf_rele(db, FTAG);
- }
- dmu_tx_commit(tx);
-
- return (0);
-}
-
-/* ARGSUSED */
-noinline static int
-receive_freeobjects(struct receive_writer_arg *rwa,
- struct drr_freeobjects *drrfo)
-{
- uint64_t obj;
- int next_err = 0;
-
- if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
- return (SET_ERROR(EINVAL));
-
- for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
- obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
- next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(rwa->os, obj, NULL);
- if (err == ENOENT)
- continue;
- else if (err != 0)
- return (err);
-
- err = dmu_free_long_object(rwa->os, obj);
- if (err != 0)
- return (err);
-
- if (obj > rwa->max_object)
- rwa->max_object = obj;
- }
- if (next_err != ESRCH)
- return (next_err);
- return (0);
-}
-
-noinline static int
-receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
- arc_buf_t *abuf)
-{
- dmu_tx_t *tx;
- int err;
-
- if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
- !DMU_OT_IS_VALID(drrw->drr_type))
- return (SET_ERROR(EINVAL));
-
- /*
- * For resuming to work, records must be in increasing order
- * by (object, offset).
- */
- if (drrw->drr_object < rwa->last_object ||
- (drrw->drr_object == rwa->last_object &&
- drrw->drr_offset < rwa->last_offset)) {
- return (SET_ERROR(EINVAL));
- }
- rwa->last_object = drrw->drr_object;
- rwa->last_offset = drrw->drr_offset;
-
- if (rwa->last_object > rwa->max_object)
- rwa->max_object = rwa->last_object;
-
- if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
- return (SET_ERROR(EINVAL));
-
- tx = dmu_tx_create(rwa->os);
- dmu_tx_hold_write(tx, drrw->drr_object,
- drrw->drr_offset, drrw->drr_logical_size);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- return (err);
- }
- if (rwa->byteswap) {
- dmu_object_byteswap_t byteswap =
- DMU_OT_BYTESWAP(drrw->drr_type);
- dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
- DRR_WRITE_PAYLOAD_SIZE(drrw));
- }
-
- /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */
- dmu_buf_t *bonus;
- if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
- return (SET_ERROR(EINVAL));
- dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
-
- /*
- * Note: If the receive fails, we want the resume stream to start
- * with the same record that we last successfully received (as opposed
- * to the next record), so that we can verify that we are
- * resuming from the correct location.
- */
- save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
- dmu_tx_commit(tx);
- dmu_buf_rele(bonus, FTAG);
-
- return (0);
-}
-
-/*
- * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
- * streams to refer to a copy of the data that is already on the
- * system because it came in earlier in the stream. This function
- * finds the earlier copy of the data, and uses that copy instead of
- * data from the stream to fulfill this write.
- */
-static int
-receive_write_byref(struct receive_writer_arg *rwa,
- struct drr_write_byref *drrwbr)
-{
- dmu_tx_t *tx;
- int err;
- guid_map_entry_t gmesrch;
- guid_map_entry_t *gmep;
- avl_index_t where;
- objset_t *ref_os = NULL;
- dmu_buf_t *dbp;
-
- if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
- return (SET_ERROR(EINVAL));
-
- /*
- * If the GUID of the referenced dataset is different from the
- * GUID of the target dataset, find the referenced dataset.
- */
- if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
- gmesrch.guid = drrwbr->drr_refguid;
- if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
- &where)) == NULL) {
- return (SET_ERROR(EINVAL));
- }
- if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
- return (SET_ERROR(EINVAL));
- } else {
- ref_os = rwa->os;
- }
-
- if (drrwbr->drr_object > rwa->max_object)
- rwa->max_object = drrwbr->drr_object;
-
- err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
- drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
- if (err != 0)
- return (err);
-
- tx = dmu_tx_create(rwa->os);
-
- dmu_tx_hold_write(tx, drrwbr->drr_object,
- drrwbr->drr_offset, drrwbr->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- return (err);
- }
- dmu_write(rwa->os, drrwbr->drr_object,
- drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
- dmu_buf_rele(dbp, FTAG);
-
- /* See comment in restore_write. */
- save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
- dmu_tx_commit(tx);
- return (0);
-}
-
-static int
-receive_write_embedded(struct receive_writer_arg *rwa,
- struct drr_write_embedded *drrwe, void *data)
-{
- dmu_tx_t *tx;
- int err;
-
- if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
- return (EINVAL);
-
- if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
- return (EINVAL);
-
- if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
- return (EINVAL);
- if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
- return (EINVAL);
-
- if (drrwe->drr_object > rwa->max_object)
- rwa->max_object = drrwe->drr_object;
-
- tx = dmu_tx_create(rwa->os);
-
- dmu_tx_hold_write(tx, drrwe->drr_object,
- drrwe->drr_offset, drrwe->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- dmu_write_embedded(rwa->os, drrwe->drr_object,
- drrwe->drr_offset, data, drrwe->drr_etype,
- drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
- rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
-
- /* See comment in restore_write. */
- save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
- dmu_tx_commit(tx);
- return (0);
-}
-
-static int
-receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
- void *data)
-{
- dmu_tx_t *tx;
- dmu_buf_t *db, *db_spill;
- int err;
-
- if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
- return (SET_ERROR(EINVAL));
-
- if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
- return (SET_ERROR(EINVAL));
-
- if (drrs->drr_object > rwa->max_object)
- rwa->max_object = drrs->drr_object;
-
- VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
- if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
- dmu_buf_rele(db, FTAG);
- return (err);
- }
-
- tx = dmu_tx_create(rwa->os);
-
- dmu_tx_hold_spill(tx, db->db_object);
-
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_buf_rele(db, FTAG);
- dmu_buf_rele(db_spill, FTAG);
- dmu_tx_abort(tx);
- return (err);
- }
- dmu_buf_will_dirty(db_spill, tx);
-
- if (db_spill->db_size < drrs->drr_length)
- VERIFY(0 == dbuf_spill_set_blksz(db_spill,
- drrs->drr_length, tx));
- bcopy(data, db_spill->db_data, drrs->drr_length);
-
- dmu_buf_rele(db, FTAG);
- dmu_buf_rele(db_spill, FTAG);
-
- dmu_tx_commit(tx);
- return (0);
-}
-
-/* ARGSUSED */
-noinline static int
-receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
-{
- int err;
-
- if (drrf->drr_length != -1ULL &&
- drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
- return (SET_ERROR(EINVAL));
-
- if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
- return (SET_ERROR(EINVAL));
-
- if (drrf->drr_object > rwa->max_object)
- rwa->max_object = drrf->drr_object;
-
- err = dmu_free_long_range(rwa->os, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length);
-
- return (err);
-}
-
-/* used to destroy the drc_ds on error */
-static void
-dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
-{
- if (drc->drc_resumable) {
- /* wait for our resume state to be written to disk */
- txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
- } else {
- char name[ZFS_MAX_DATASET_NAME_LEN];
- dsl_dataset_name(drc->drc_ds, name);
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
- (void) dsl_destroy_head(name);
- }
-}
-
-static void
-receive_cksum(struct receive_arg *ra, int len, void *buf)
-{
- if (ra->byteswap) {
- (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
- } else {
- (void) fletcher_4_incremental_native(buf, len, &ra->cksum);
- }
-}
-
-/*
- * Read the payload into a buffer of size len, and update the current record's
- * payload field.
- * Allocate ra->next_rrd and read the next record's header into
- * ra->next_rrd->header.
- * Verify checksum of payload and next record.
- */
-static int
-receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
-{
- int err;
-
- if (len != 0) {
- ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
- err = receive_read(ra, len, buf);
- if (err != 0)
- return (err);
- receive_cksum(ra, len, buf);
-
- /* note: rrd is NULL when reading the begin record's payload */
- if (ra->rrd != NULL) {
- ra->rrd->payload = buf;
- ra->rrd->payload_size = len;
- ra->rrd->bytes_read = ra->bytes_read;
- }
- }
-
- ra->prev_cksum = ra->cksum;
-
- ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
- err = receive_read(ra, sizeof (ra->next_rrd->header),
- &ra->next_rrd->header);
- ra->next_rrd->bytes_read = ra->bytes_read;
- if (err != 0) {
- kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
- ra->next_rrd = NULL;
- return (err);
- }
- if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
- kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
- ra->next_rrd = NULL;
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Note: checksum is of everything up to but not including the
- * checksum itself.
- */
- ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- receive_cksum(ra,
- offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- &ra->next_rrd->header);
-
- zio_cksum_t cksum_orig =
- ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
- zio_cksum_t *cksump =
- &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
-
- if (ra->byteswap)
- byteswap_record(&ra->next_rrd->header);
-
- if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
- !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
- kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
- ra->next_rrd = NULL;
- return (SET_ERROR(ECKSUM));
- }
-
- receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
-
- return (0);
-}
-
-static void
-objlist_create(struct objlist *list)
-{
- list_create(&list->list, sizeof (struct receive_objnode),
- offsetof(struct receive_objnode, node));
- list->last_lookup = 0;
-}
-
-static void
-objlist_destroy(struct objlist *list)
-{
- for (struct receive_objnode *n = list_remove_head(&list->list);
- n != NULL; n = list_remove_head(&list->list)) {
- kmem_free(n, sizeof (*n));
- }
- list_destroy(&list->list);
-}
-
-/*
- * This function looks through the objlist to see if the specified object number
- * is contained in the objlist. In the process, it will remove all object
- * numbers in the list that are smaller than the specified object number. Thus,
- * any lookup of an object number smaller than a previously looked up object
- * number will always return false; therefore, all lookups should be done in
- * ascending order.
- */
-static boolean_t
-objlist_exists(struct objlist *list, uint64_t object)
-{
- struct receive_objnode *node = list_head(&list->list);
- ASSERT3U(object, >=, list->last_lookup);
- list->last_lookup = object;
- while (node != NULL && node->object < object) {
- VERIFY3P(node, ==, list_remove_head(&list->list));
- kmem_free(node, sizeof (*node));
- node = list_head(&list->list);
- }
- return (node != NULL && node->object == object);
-}
-
-/*
- * The objlist is a list of object numbers stored in ascending order. However,
- * the insertion of new object numbers does not seek out the correct location to
- * store a new object number; instead, it appends it to the list for simplicity.
- * Thus, any users must take care to only insert new object numbers in ascending
- * order.
- */
-static void
-objlist_insert(struct objlist *list, uint64_t object)
-{
- struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
- node->object = object;
-#ifdef ZFS_DEBUG
- struct receive_objnode *last_object = list_tail(&list->list);
- uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
- ASSERT3U(node->object, >, last_objnum);
-#endif
- list_insert_tail(&list->list, node);
-}
-
-/*
- * Issue the prefetch reads for any necessary indirect blocks.
- *
- * We use the object ignore list to tell us whether or not to issue prefetches
- * for a given object. We do this for both correctness (in case the blocksize
- * of an object has changed) and performance (if the object doesn't exist, don't
- * needlessly try to issue prefetches). We also trim the list as we go through
- * the stream to prevent it from growing to an unbounded size.
- *
- * The object numbers within will always be in sorted order, and any write
- * records we see will also be in sorted order, but they're not sorted with
- * respect to each other (i.e. we can get several object records before
- * receiving each object's write records). As a result, once we've reached a
- * given object number, we can safely remove any reference to lower object
- * numbers in the ignore list. In practice, we receive up to 32 object records
- * before receiving write records, so the list can have up to 32 nodes in it.
- */
-/* ARGSUSED */
-static void
-receive_read_prefetch(struct receive_arg *ra,
- uint64_t object, uint64_t offset, uint64_t length)
-{
- if (!objlist_exists(&ra->ignore_objlist, object)) {
- dmu_prefetch(ra->os, object, 1, offset, length,
- ZIO_PRIORITY_SYNC_READ);
- }
-}
-
-/*
- * Read records off the stream, issuing any necessary prefetches.
- */
-static int
-receive_read_record(struct receive_arg *ra)
-{
- int err;
-
- switch (ra->rrd->header.drr_type) {
- case DRR_OBJECT:
- {
- struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
- uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
- void *buf = kmem_zalloc(size, KM_SLEEP);
- dmu_object_info_t doi;
- err = receive_read_payload_and_next_header(ra, size, buf);
- if (err != 0) {
- kmem_free(buf, size);
- return (err);
- }
- err = dmu_object_info(ra->os, drro->drr_object, &doi);
- /*
- * See receive_read_prefetch for an explanation why we're
- * storing this object in the ignore_obj_list.
- */
- if (err == ENOENT ||
- (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
- objlist_insert(&ra->ignore_objlist, drro->drr_object);
- err = 0;
- }
- return (err);
- }
- case DRR_FREEOBJECTS:
- {
- err = receive_read_payload_and_next_header(ra, 0, NULL);
- return (err);
- }
- case DRR_WRITE:
- {
- struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
- arc_buf_t *abuf;
- boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
- if (DRR_WRITE_COMPRESSED(drrw)) {
- ASSERT3U(drrw->drr_compressed_size, >, 0);
- ASSERT3U(drrw->drr_logical_size, >=,
- drrw->drr_compressed_size);
- ASSERT(!is_meta);
- abuf = arc_loan_compressed_buf(
- dmu_objset_spa(ra->os),
- drrw->drr_compressed_size, drrw->drr_logical_size,
- drrw->drr_compressiontype);
- } else {
- abuf = arc_loan_buf(dmu_objset_spa(ra->os),
- is_meta, drrw->drr_logical_size);
- }
-
- err = receive_read_payload_and_next_header(ra,
- DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
- if (err != 0) {
- dmu_return_arcbuf(abuf);
- return (err);
- }
- ra->rrd->write_buf = abuf;
- receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
- drrw->drr_logical_size);
- return (err);
- }
- case DRR_WRITE_BYREF:
- {
- struct drr_write_byref *drrwb =
- &ra->rrd->header.drr_u.drr_write_byref;
- err = receive_read_payload_and_next_header(ra, 0, NULL);
- receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
- drrwb->drr_length);
- return (err);
- }
- case DRR_WRITE_EMBEDDED:
- {
- struct drr_write_embedded *drrwe =
- &ra->rrd->header.drr_u.drr_write_embedded;
- uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
- void *buf = kmem_zalloc(size, KM_SLEEP);
-
- err = receive_read_payload_and_next_header(ra, size, buf);
- if (err != 0) {
- kmem_free(buf, size);
- return (err);
- }
-
- receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
- drrwe->drr_length);
- return (err);
- }
- case DRR_FREE:
- {
- /*
- * It might be beneficial to prefetch indirect blocks here, but
- * we don't really have the data to decide for sure.
- */
- err = receive_read_payload_and_next_header(ra, 0, NULL);
- return (err);
- }
- case DRR_END:
- {
- struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
- if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
- return (SET_ERROR(ECKSUM));
- return (0);
- }
- case DRR_SPILL:
- {
- struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
- void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
- err = receive_read_payload_and_next_header(ra, drrs->drr_length,
- buf);
- if (err != 0)
- kmem_free(buf, drrs->drr_length);
- return (err);
- }
- default:
- return (SET_ERROR(EINVAL));
- }
-}
-
-/*
- * Commit the records to the pool.
- */
-static int
-receive_process_record(struct receive_writer_arg *rwa,
- struct receive_record_arg *rrd)
-{
- int err;
-
- /* Processing in order, therefore bytes_read should be increasing. */
- ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
- rwa->bytes_read = rrd->bytes_read;
-
- switch (rrd->header.drr_type) {
- case DRR_OBJECT:
- {
- struct drr_object *drro = &rrd->header.drr_u.drr_object;
- err = receive_object(rwa, drro, rrd->payload);
- kmem_free(rrd->payload, rrd->payload_size);
- rrd->payload = NULL;
- return (err);
- }
- case DRR_FREEOBJECTS:
- {
- struct drr_freeobjects *drrfo =
- &rrd->header.drr_u.drr_freeobjects;
- return (receive_freeobjects(rwa, drrfo));
- }
- case DRR_WRITE:
- {
- struct drr_write *drrw = &rrd->header.drr_u.drr_write;
- err = receive_write(rwa, drrw, rrd->write_buf);
- /* if receive_write() is successful, it consumes the arc_buf */
- if (err != 0)
- dmu_return_arcbuf(rrd->write_buf);
- rrd->write_buf = NULL;
- rrd->payload = NULL;
- return (err);
- }
- case DRR_WRITE_BYREF:
- {
- struct drr_write_byref *drrwbr =
- &rrd->header.drr_u.drr_write_byref;
- return (receive_write_byref(rwa, drrwbr));
- }
- case DRR_WRITE_EMBEDDED:
- {
- struct drr_write_embedded *drrwe =
- &rrd->header.drr_u.drr_write_embedded;
- err = receive_write_embedded(rwa, drrwe, rrd->payload);
- kmem_free(rrd->payload, rrd->payload_size);
- rrd->payload = NULL;
- return (err);
- }
- case DRR_FREE:
- {
- struct drr_free *drrf = &rrd->header.drr_u.drr_free;
- return (receive_free(rwa, drrf));
- }
- case DRR_SPILL:
- {
- struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
- err = receive_spill(rwa, drrs, rrd->payload);
- kmem_free(rrd->payload, rrd->payload_size);
- rrd->payload = NULL;
- return (err);
- }
- default:
- return (SET_ERROR(EINVAL));
- }
-}
-
-/*
- * dmu_recv_stream's worker thread; pull records off the queue, and then call
- * receive_process_record When we're done, signal the main thread and exit.
- */
-static void
-receive_writer_thread(void *arg)
-{
- struct receive_writer_arg *rwa = arg;
- struct receive_record_arg *rrd;
- for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
- rrd = bqueue_dequeue(&rwa->q)) {
- /*
- * If there's an error, the main thread will stop putting things
- * on the queue, but we need to clear everything in it before we
- * can exit.
- */
- if (rwa->err == 0) {
- rwa->err = receive_process_record(rwa, rrd);
- } else if (rrd->write_buf != NULL) {
- dmu_return_arcbuf(rrd->write_buf);
- rrd->write_buf = NULL;
- rrd->payload = NULL;
- } else if (rrd->payload != NULL) {
- kmem_free(rrd->payload, rrd->payload_size);
- rrd->payload = NULL;
- }
- kmem_free(rrd, sizeof (*rrd));
- }
- kmem_free(rrd, sizeof (*rrd));
- mutex_enter(&rwa->mutex);
- rwa->done = B_TRUE;
- cv_signal(&rwa->cv);
- mutex_exit(&rwa->mutex);
- thread_exit();
-}
-
-static int
-resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
-{
- uint64_t val;
- objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
- uint64_t dsobj = dmu_objset_id(ra->os);
- uint64_t resume_obj, resume_off;
-
- if (nvlist_lookup_uint64(begin_nvl,
- "resume_object", &resume_obj) != 0 ||
- nvlist_lookup_uint64(begin_nvl,
- "resume_offset", &resume_off) != 0) {
- return (SET_ERROR(EINVAL));
- }
- VERIFY0(zap_lookup(mos, dsobj,
- DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
- if (resume_obj != val)
- return (SET_ERROR(EINVAL));
- VERIFY0(zap_lookup(mos, dsobj,
- DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
- if (resume_off != val)
- return (SET_ERROR(EINVAL));
-
- return (0);
-}
-
-/*
- * Read in the stream's records, one by one, and apply them to the pool. There
- * are two threads involved; the thread that calls this function will spin up a
- * worker thread, read the records off the stream one by one, and issue
- * prefetches for any necessary indirect blocks. It will then push the records
- * onto an internal blocking queue. The worker thread will pull the records off
- * the queue, and actually write the data into the DMU. This way, the worker
- * thread doesn't have to wait for reads to complete, since everything it needs
- * (the indirect blocks) will be prefetched.
- *
- * NB: callers *must* call dmu_recv_end() if this succeeds.
- */
-int
-dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
- int cleanup_fd, uint64_t *action_handlep)
-{
- int err = 0;
- struct receive_arg ra = { 0 };
- struct receive_writer_arg rwa = { 0 };
- int featureflags;
- nvlist_t *begin_nvl = NULL;
-
- ra.byteswap = drc->drc_byteswap;
- ra.cksum = drc->drc_cksum;
- ra.td = curthread;
- ra.fp = fp;
- ra.voff = *voffp;
-
- if (dsl_dataset_is_zapified(drc->drc_ds)) {
- (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
- drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
- sizeof (ra.bytes_read), 1, &ra.bytes_read);
- }
-
- objlist_create(&ra.ignore_objlist);
-
- /* these were verified in dmu_recv_begin */
- ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
- DMU_SUBSTREAM);
- ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
-
- /*
- * Open the objset we are modifying.
- */
- VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
-
- ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
-
- featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
-
- /* if this stream is dedup'ed, set up the avl tree for guid mapping */
- if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
- minor_t minor;
-
- if (cleanup_fd == -1) {
- ra.err = SET_ERROR(EBADF);
- goto out;
- }
- ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
- if (ra.err != 0) {
- cleanup_fd = -1;
- goto out;
- }
-
- if (*action_handlep == 0) {
- rwa.guid_to_ds_map =
- kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(rwa.guid_to_ds_map, guid_compare,
- sizeof (guid_map_entry_t),
- offsetof(guid_map_entry_t, avlnode));
- err = zfs_onexit_add_cb(minor,
- free_guid_map_onexit, rwa.guid_to_ds_map,
- action_handlep);
- if (ra.err != 0)
- goto out;
- } else {
- err = zfs_onexit_cb_data(minor, *action_handlep,
- (void **)&rwa.guid_to_ds_map);
- if (ra.err != 0)
- goto out;
- }
-
- drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
- }
-
- uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
- void *payload = NULL;
- if (payloadlen != 0)
- payload = kmem_alloc(payloadlen, KM_SLEEP);
-
- err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
- if (err != 0) {
- if (payloadlen != 0)
- kmem_free(payload, payloadlen);
- goto out;
- }
- if (payloadlen != 0) {
- err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
- kmem_free(payload, payloadlen);
- if (err != 0)
- goto out;
- }
-
- if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
- err = resume_check(&ra, begin_nvl);
- if (err != 0)
- goto out;
- }
-
- (void) bqueue_init(&rwa.q, zfs_recv_queue_length,
- offsetof(struct receive_record_arg, node));
- cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
- rwa.os = ra.os;
- rwa.byteswap = drc->drc_byteswap;
- rwa.resumable = drc->drc_resumable;
-
- (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
- TS_RUN, minclsyspri);
- /*
- * We're reading rwa.err without locks, which is safe since we are the
- * only reader, and the worker thread is the only writer. It's ok if we
- * miss a write for an iteration or two of the loop, since the writer
- * thread will keep freeing records we send it until we send it an eos
- * marker.
- *
- * We can leave this loop in 3 ways: First, if rwa.err is
- * non-zero. In that case, the writer thread will free the rrd we just
- * pushed. Second, if we're interrupted; in that case, either it's the
- * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
- * has been handed off to the writer thread who will free it. Finally,
- * if receive_read_record fails or we're at the end of the stream, then
- * we free ra.rrd and exit.
- */
- while (rwa.err == 0) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
- err = SET_ERROR(EINTR);
- break;
- }
-
- ASSERT3P(ra.rrd, ==, NULL);
- ra.rrd = ra.next_rrd;
- ra.next_rrd = NULL;
- /* Allocates and loads header into ra.next_rrd */
- err = receive_read_record(&ra);
-
- if (ra.rrd->header.drr_type == DRR_END || err != 0) {
- kmem_free(ra.rrd, sizeof (*ra.rrd));
- ra.rrd = NULL;
- break;
- }
-
- bqueue_enqueue(&rwa.q, ra.rrd,
- sizeof (struct receive_record_arg) + ra.rrd->payload_size);
- ra.rrd = NULL;
- }
- if (ra.next_rrd == NULL)
- ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
- ra.next_rrd->eos_marker = B_TRUE;
- bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
-
- mutex_enter(&rwa.mutex);
- while (!rwa.done) {
- cv_wait(&rwa.cv, &rwa.mutex);
- }
- mutex_exit(&rwa.mutex);
-
- /*
- * If we are receiving a full stream as a clone, all object IDs which
- * are greater than the maximum ID referenced in the stream are
- * by definition unused and must be freed. Note that it's possible that
- * we've resumed this send and the first record we received was the END
- * record. In that case, max_object would be 0, but we shouldn't start
- * freeing all objects from there; instead we should start from the
- * resumeobj.
- */
- if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
- uint64_t obj;
- if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0)
- obj = 0;
- if (rwa.max_object > obj)
- obj = rwa.max_object;
- obj++;
- int free_err = 0;
- int next_err = 0;
-
- while (next_err == 0) {
- free_err = dmu_free_long_object(rwa.os, obj);
- if (free_err != 0 && free_err != ENOENT)
- break;
-
- next_err = dmu_object_next(rwa.os, &obj, FALSE, 0);
- }
-
- if (err == 0) {
- if (free_err != 0 && free_err != ENOENT)
- err = free_err;
- else if (next_err != ESRCH)
- err = next_err;
- }
- }
-
- cv_destroy(&rwa.cv);
- mutex_destroy(&rwa.mutex);
- bqueue_destroy(&rwa.q);
- if (err == 0)
- err = rwa.err;
-
-out:
- nvlist_free(begin_nvl);
- if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
- zfs_onexit_fd_rele(cleanup_fd);
-
- if (err != 0) {
- /*
- * Clean up references. If receive is not resumable,
- * destroy what we created, so we don't leave it in
- * the inconsistent state.
- */
- dmu_recv_cleanup_ds(drc);
- }
-
- *voffp = ra.voff;
- objlist_destroy(&ra.ignore_objlist);
- return (err);
-}
-
-static int
-dmu_recv_end_check(void *arg, dmu_tx_t *tx)
-{
- dmu_recv_cookie_t *drc = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- int error;
-
- ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
-
- if (!drc->drc_newfs) {
- dsl_dataset_t *origin_head;
-
- error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
- if (error != 0)
- return (error);
- if (drc->drc_force) {
- /*
- * We will destroy any snapshots in tofs (i.e. before
- * origin_head) that are after the origin (which is
- * the snap before drc_ds, because drc_ds can not
- * have any snaps of its own).
- */
- uint64_t obj;
-
- obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
- while (obj !=
- dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
- dsl_dataset_t *snap;
- error = dsl_dataset_hold_obj(dp, obj, FTAG,
- &snap);
- if (error != 0)
- break;
- if (snap->ds_dir != origin_head->ds_dir)
- error = SET_ERROR(EINVAL);
- if (error == 0) {
- error = dsl_destroy_snapshot_check_impl(
- snap, B_FALSE);
- }
- obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
- dsl_dataset_rele(snap, FTAG);
- if (error != 0)
- break;
- }
- if (error != 0) {
- dsl_dataset_rele(origin_head, FTAG);
- return (error);
- }
- }
- error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
- origin_head, drc->drc_force, drc->drc_owner, tx);
- if (error != 0) {
- dsl_dataset_rele(origin_head, FTAG);
- return (error);
- }
- error = dsl_dataset_snapshot_check_impl(origin_head,
- drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
- dsl_dataset_rele(origin_head, FTAG);
- if (error != 0)
- return (error);
-
- error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
- } else {
- error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
- drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
- }
- return (error);
-}
-
-static void
-dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
-{
- dmu_recv_cookie_t *drc = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
-
- spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
- tx, "snap=%s", drc->drc_tosnap);
-
- if (!drc->drc_newfs) {
- dsl_dataset_t *origin_head;
-
- VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
- &origin_head));
-
- if (drc->drc_force) {
- /*
- * Destroy any snapshots of drc_tofs (origin_head)
- * after the origin (the snap before drc_ds).
- */
- uint64_t obj;
-
- obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
- while (obj !=
- dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
- dsl_dataset_t *snap;
- VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
- &snap));
- ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
- obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
- dsl_destroy_snapshot_sync_impl(snap,
- B_FALSE, tx);
- dsl_dataset_rele(snap, FTAG);
- }
- }
- VERIFY3P(drc->drc_ds->ds_prev, ==,
- origin_head->ds_prev);
-
- dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
- origin_head, tx);
- dsl_dataset_snapshot_sync_impl(origin_head,
- drc->drc_tosnap, tx);
-
- /* set snapshot's creation time and guid */
- dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
- dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
- drc->drc_drrb->drr_creation_time;
- dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
- drc->drc_drrb->drr_toguid;
- dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
- ~DS_FLAG_INCONSISTENT;
-
- dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
- dsl_dataset_phys(origin_head)->ds_flags &=
- ~DS_FLAG_INCONSISTENT;
-
- drc->drc_newsnapobj =
- dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
-
- dsl_dataset_rele(origin_head, FTAG);
- dsl_destroy_head_sync_impl(drc->drc_ds, tx);
-
- if (drc->drc_owner != NULL)
- VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
- } else {
- dsl_dataset_t *ds = drc->drc_ds;
-
- dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
-
- /* set snapshot's creation time and guid */
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
- drc->drc_drrb->drr_creation_time;
- dsl_dataset_phys(ds->ds_prev)->ds_guid =
- drc->drc_drrb->drr_toguid;
- dsl_dataset_phys(ds->ds_prev)->ds_flags &=
- ~DS_FLAG_INCONSISTENT;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
- if (dsl_dataset_has_resume_receive_state(ds)) {
- (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_FROMGUID, tx);
- (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_OBJECT, tx);
- (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_OFFSET, tx);
- (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_BYTES, tx);
- (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_TOGUID, tx);
- (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_TONAME, tx);
- }
- drc->drc_newsnapobj =
- dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
- }
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- zvol_create_minors(dp->dp_spa, drc->drc_tofs);
-#endif
-
- /*
- * Release the hold from dmu_recv_begin. This must be done before
- * we return to open context, so that when we free the dataset's dnode,
- * we can evict its bonus buffer.
- */
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
- drc->drc_ds = NULL;
-}
-
-static int
-add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *snapds;
- guid_map_entry_t *gmep;
- int err;
-
- ASSERT(guid_map != NULL);
-
- err = dsl_pool_hold(name, FTAG, &dp);
- if (err != 0)
- return (err);
- gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
- err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
- if (err == 0) {
- gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
- gmep->gme_ds = snapds;
- avl_add(guid_map, gmep);
- dsl_dataset_long_hold(snapds, gmep);
- } else
- kmem_free(gmep, sizeof (*gmep));
-
- dsl_pool_rele(dp, FTAG);
- return (err);
-}
-
-static int dmu_recv_end_modified_blocks = 3;
-
-static int
-dmu_recv_existing_end(dmu_recv_cookie_t *drc)
-{
-#ifdef _KERNEL
- /*
- * We will be destroying the ds; make sure its origin is unmounted if
- * necessary.
- */
- char name[ZFS_MAX_DATASET_NAME_LEN];
- dsl_dataset_name(drc->drc_ds, name);
- zfs_destroy_unmount_origin(name);
-#endif
-
- return (dsl_sync_task(drc->drc_tofs,
- dmu_recv_end_check, dmu_recv_end_sync, drc,
- dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
-}
-
-static int
-dmu_recv_new_end(dmu_recv_cookie_t *drc)
-{
- return (dsl_sync_task(drc->drc_tofs,
- dmu_recv_end_check, dmu_recv_end_sync, drc,
- dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
-}
-
-int
-dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
-{
- int error;
-
- drc->drc_owner = owner;
-
- if (drc->drc_newfs)
- error = dmu_recv_new_end(drc);
- else
- error = dmu_recv_existing_end(drc);
-
- if (error != 0) {
- dmu_recv_cleanup_ds(drc);
- } else if (drc->drc_guid_to_ds_map != NULL) {
- (void) add_ds_to_guidmap(drc->drc_tofs,
- drc->drc_guid_to_ds_map,
- drc->drc_newsnapobj);
- }
- return (error);
-}
-
-/*
- * Return TRUE if this objset is currently being received into.
- */
-boolean_t
-dmu_objset_is_receiving(objset_t *os)
-{
- return (os->os_dsl_dataset != NULL &&
- os->os_dsl_dataset->ds_owner == dmu_recv_tag);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -1,712 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2015 Chunwei Chen. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dnode.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/dmu_impl.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/callb.h>
-#include <sys/zfeature.h>
-
-int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
-boolean_t send_holes_without_birth_time = B_TRUE;
-
-#ifdef _KERNEL
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN,
- &send_holes_without_birth_time, 0, "Send holes without birth time");
-#endif
-
-typedef struct prefetch_data {
- kmutex_t pd_mtx;
- kcondvar_t pd_cv;
- int32_t pd_bytes_fetched;
- int pd_flags;
- boolean_t pd_cancel;
- boolean_t pd_exited;
- zbookmark_phys_t pd_resume;
-} prefetch_data_t;
-
-typedef struct traverse_data {
- spa_t *td_spa;
- uint64_t td_objset;
- blkptr_t *td_rootbp;
- uint64_t td_min_txg;
- zbookmark_phys_t *td_resume;
- int td_flags;
- prefetch_data_t *td_pfd;
- boolean_t td_paused;
- uint64_t td_hole_birth_enabled_txg;
- blkptr_cb_t *td_func;
- void *td_arg;
- boolean_t td_realloc_possible;
-} traverse_data_t;
-
-static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
- uint64_t objset, uint64_t object);
-static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
- uint64_t objset, uint64_t object);
-
-static int
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- traverse_data_t *td = arg;
- zbookmark_phys_t zb;
-
- if (BP_IS_HOLE(bp))
- return (0);
-
- if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
- return (-1);
-
- SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
- (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
-
- return (0);
-}
-
-static int
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
- traverse_data_t *td = arg;
-
- if (lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_phys_t zb;
-
- if (BP_IS_HOLE(bp))
- return (0);
-
- if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return (0);
-
- SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
- ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
-
- (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
- td->td_arg);
- }
- return (0);
-}
-
-static void
-traverse_zil(traverse_data_t *td, zil_header_t *zh)
-{
- uint64_t claim_txg = zh->zh_claim_txg;
-
- /*
- * We only want to visit blocks that have been claimed but not yet
- * replayed; plus blocks that are already stable in read-only mode.
- */
- if (claim_txg == 0 && spa_writeable(td->td_spa))
- return;
-
- zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
- (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
- claim_txg);
- zil_free(zilog);
-}
-
-typedef enum resume_skip {
- RESUME_SKIP_ALL,
- RESUME_SKIP_NONE,
- RESUME_SKIP_CHILDREN
-} resume_skip_t;
-
-/*
- * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
- * the block indicated by zb does not need to be visited at all. Returns
- * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
- * resume point. This indicates that this block should be visited but not its
- * children (since they must have been visited in a previous traversal).
- * Otherwise returns RESUME_SKIP_NONE.
- */
-static resume_skip_t
-resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
- const zbookmark_phys_t *zb)
-{
- if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
- /*
- * If we already visited this bp & everything below,
- * don't bother doing it again.
- */
- if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
- return (RESUME_SKIP_ALL);
-
- /*
- * If we found the block we're trying to resume from, zero
- * the bookmark out to indicate that we have resumed.
- */
- if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
- bzero(td->td_resume, sizeof (*zb));
- if (td->td_flags & TRAVERSE_POST)
- return (RESUME_SKIP_CHILDREN);
- }
- }
- return (RESUME_SKIP_NONE);
-}
-
-static void
-traverse_prefetch_metadata(traverse_data_t *td,
- const blkptr_t *bp, const zbookmark_phys_t *zb)
-{
- arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-
- if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
- return;
- /*
- * If we are in the process of resuming, don't prefetch, because
- * some children will not be needed (and in fact may have already
- * been freed).
- */
- if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
- return;
- if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
- return;
- if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
- return;
-
- (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-}
-
-static boolean_t
-prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
-{
- ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
- BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
- return (B_FALSE);
- return (B_TRUE);
-}
-
-static int
-traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
- const blkptr_t *bp, const zbookmark_phys_t *zb)
-{
- zbookmark_phys_t czb;
- int err = 0;
- arc_buf_t *buf = NULL;
- prefetch_data_t *pd = td->td_pfd;
- boolean_t hard = td->td_flags & TRAVERSE_HARD;
-
- switch (resume_skip_check(td, dnp, zb)) {
- case RESUME_SKIP_ALL:
- return (0);
- case RESUME_SKIP_CHILDREN:
- goto post;
- case RESUME_SKIP_NONE:
- break;
- default:
- ASSERT(0);
- }
-
- if (bp->blk_birth == 0) {
- /*
- * Since this block has a birth time of 0 it must be one of
- * two things: a hole created before the
- * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
- * which has always been a hole in an object.
- *
- * If a file is written sparsely, then the unwritten parts of
- * the file were "always holes" -- that is, they have been
- * holes since this object was allocated. However, we (and
- * our callers) can not necessarily tell when an object was
- * allocated. Therefore, if it's possible that this object
- * was freed and then its object number reused, we need to
- * visit all the holes with birth==0.
- *
- * If it isn't possible that the object number was reused,
- * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
- * all the blocks we will visit as part of this traversal,
- * then this hole must have always existed, so we can skip
- * it. We visit blocks born after (exclusive) td_min_txg.
- *
- * Note that the meta-dnode cannot be reallocated.
- */
- if (!send_holes_without_birth_time &&
- (!td->td_realloc_possible ||
- zb->zb_object == DMU_META_DNODE_OBJECT) &&
- td->td_hole_birth_enabled_txg <= td->td_min_txg)
- return (0);
- } else if (bp->blk_birth <= td->td_min_txg) {
- return (0);
- }
-
- if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
- uint64_t size = BP_GET_LSIZE(bp);
- mutex_enter(&pd->pd_mtx);
- ASSERT(pd->pd_bytes_fetched >= 0);
- while (pd->pd_bytes_fetched < size && !pd->pd_exited)
- cv_wait(&pd->pd_cv, &pd->pd_mtx);
- pd->pd_bytes_fetched -= size;
- cv_broadcast(&pd->pd_cv);
- mutex_exit(&pd->pd_mtx);
- }
-
- if (BP_IS_HOLE(bp)) {
- err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
- if (err != 0)
- goto post;
- return (0);
- }
-
- if (td->td_flags & TRAVERSE_PRE) {
- err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
- td->td_arg);
- if (err == TRAVERSE_VISIT_NO_CHILDREN)
- return (0);
- if (err != 0)
- goto post;
- }
-
- if (BP_GET_LEVEL(bp) > 0) {
- arc_flags_t flags = ARC_FLAG_WAIT;
- int i;
- blkptr_t *cbp;
- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
-
- err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err != 0)
- goto post;
- cbp = buf->b_data;
-
- for (i = 0; i < epb; i++) {
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1,
- zb->zb_blkid * epb + i);
- traverse_prefetch_metadata(td, &cbp[i], &czb);
- }
-
- /* recursively visitbp() blocks below this */
- for (i = 0; i < epb; i++) {
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1,
- zb->zb_blkid * epb + i);
- err = traverse_visitbp(td, dnp, &cbp[i], &czb);
- if (err != 0)
- break;
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- arc_flags_t flags = ARC_FLAG_WAIT;
- int i;
- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
- err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err != 0)
- goto post;
- dnode_phys_t *child_dnp = buf->b_data;
-
- for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
- prefetch_dnode_metadata(td, &child_dnp[i],
- zb->zb_objset, zb->zb_blkid * epb + i);
- }
-
- /* recursively visitbp() blocks below this */
- for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
- err = traverse_dnode(td, &child_dnp[i],
- zb->zb_objset, zb->zb_blkid * epb + i);
- if (err != 0)
- break;
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- arc_flags_t flags = ARC_FLAG_WAIT;
-
- err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err != 0)
- goto post;
-
- objset_phys_t *osp = buf->b_data;
- prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
- DMU_META_DNODE_OBJECT);
- /*
- * See the block comment above for the goal of this variable.
- * If the maxblkid of the meta-dnode is 0, then we know that
- * we've never had more than DNODES_PER_BLOCK objects in the
- * dataset, which means we can't have reused any object ids.
- */
- if (osp->os_meta_dnode.dn_maxblkid == 0)
- td->td_realloc_possible = B_FALSE;
-
- if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
- zb->zb_objset, DMU_GROUPUSED_OBJECT);
- prefetch_dnode_metadata(td, &osp->os_userused_dnode,
- zb->zb_objset, DMU_USERUSED_OBJECT);
- }
-
- err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
- DMU_META_DNODE_OBJECT);
- if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- err = traverse_dnode(td, &osp->os_groupused_dnode,
- zb->zb_objset, DMU_GROUPUSED_OBJECT);
- }
- if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- err = traverse_dnode(td, &osp->os_userused_dnode,
- zb->zb_objset, DMU_USERUSED_OBJECT);
- }
- }
-
- if (buf)
- arc_buf_destroy(buf, &buf);
-
-post:
- if (err == 0 && (td->td_flags & TRAVERSE_POST))
- err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
-
- if (hard && (err == EIO || err == ECKSUM)) {
- /*
- * Ignore this disk error as requested by the HARD flag,
- * and continue traversal.
- */
- err = 0;
- }
-
- /*
- * If we are stopping here, set td_resume.
- */
- if (td->td_resume != NULL && err != 0 && !td->td_paused) {
- td->td_resume->zb_objset = zb->zb_objset;
- td->td_resume->zb_object = zb->zb_object;
- td->td_resume->zb_level = 0;
- /*
- * If we have stopped on an indirect block (e.g. due to
- * i/o error), we have not visited anything below it.
- * Set the bookmark to the first level-0 block that we need
- * to visit. This way, the resuming code does not need to
- * deal with resuming from indirect blocks.
- *
- * Note, if zb_level <= 0, dnp may be NULL, so we don't want
- * to dereference it.
- */
- td->td_resume->zb_blkid = zb->zb_blkid;
- if (zb->zb_level > 0) {
- td->td_resume->zb_blkid <<= zb->zb_level *
- (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
- }
- td->td_paused = B_TRUE;
- }
-
- return (err);
-}
-
-static void
-prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
- uint64_t objset, uint64_t object)
-{
- int j;
- zbookmark_phys_t czb;
-
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
- traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
- }
-
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
- }
-}
-
-static int
-traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
- uint64_t objset, uint64_t object)
-{
- int j, err = 0;
- zbookmark_phys_t czb;
-
- if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
- object < td->td_resume->zb_object)
- return (0);
-
- if (td->td_flags & TRAVERSE_PRE) {
- SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
- ZB_DNODE_BLKID);
- err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
- td->td_arg);
- if (err == TRAVERSE_VISIT_NO_CHILDREN)
- return (0);
- if (err != 0)
- return (err);
- }
-
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
- err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
- if (err != 0)
- break;
- }
-
- if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
- SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
- }
-
- if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
- SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
- ZB_DNODE_BLKID);
- err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
- td->td_arg);
- if (err == TRAVERSE_VISIT_NO_CHILDREN)
- return (0);
- if (err != 0)
- return (err);
- }
- return (err);
-}
-
-/* ARGSUSED */
-static int
-traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- prefetch_data_t *pfd = arg;
- arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
- ARC_FLAG_PRESCIENT_PREFETCH;
-
- ASSERT(pfd->pd_bytes_fetched >= 0);
- if (bp == NULL)
- return (0);
- if (pfd->pd_cancel)
- return (SET_ERROR(EINTR));
-
- if (!prefetch_needed(pfd, bp))
- return (0);
-
- mutex_enter(&pfd->pd_mtx);
- while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
- cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
- pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
- cv_broadcast(&pfd->pd_cv);
- mutex_exit(&pfd->pd_mtx);
-
- (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
-
- return (0);
-}
-
-static void
-traverse_prefetch_thread(void *arg)
-{
- traverse_data_t *td_main = arg;
- traverse_data_t td = *td_main;
- zbookmark_phys_t czb;
-
- td.td_func = traverse_prefetcher;
- td.td_arg = td_main->td_pfd;
- td.td_pfd = NULL;
- td.td_resume = &td_main->td_pfd->pd_resume;
-
- SET_BOOKMARK(&czb, td.td_objset,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
-
- mutex_enter(&td_main->td_pfd->pd_mtx);
- td_main->td_pfd->pd_exited = B_TRUE;
- cv_broadcast(&td_main->td_pfd->pd_cv);
- mutex_exit(&td_main->td_pfd->pd_mtx);
-}
-
-/*
- * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
- * in syncing context).
- */
-static int
-traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
- uint64_t txg_start, zbookmark_phys_t *resume, int flags,
- blkptr_cb_t func, void *arg)
-{
- traverse_data_t td;
- prefetch_data_t pd = { 0 };
- zbookmark_phys_t czb;
- int err;
-
- ASSERT(ds == NULL || objset == ds->ds_object);
- ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
-
- td.td_spa = spa;
- td.td_objset = objset;
- td.td_rootbp = rootbp;
- td.td_min_txg = txg_start;
- td.td_resume = resume;
- td.td_func = func;
- td.td_arg = arg;
- td.td_pfd = &pd;
- td.td_flags = flags;
- td.td_paused = B_FALSE;
- td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
-
- if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
- VERIFY(spa_feature_enabled_txg(spa,
- SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
- } else {
- td.td_hole_birth_enabled_txg = UINT64_MAX;
- }
-
- pd.pd_flags = flags;
- if (resume != NULL)
- pd.pd_resume = *resume;
- mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
-
- /* See comment on ZIL traversal in dsl_scan_visitds. */
- if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
- arc_flags_t flags = ARC_FLAG_WAIT;
- objset_phys_t *osp;
- arc_buf_t *buf;
-
- err = arc_read(NULL, td.td_spa, rootbp,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
- if (err != 0)
- return (err);
-
- osp = buf->b_data;
- traverse_zil(&td, &osp->os_zil_header);
- arc_buf_destroy(buf, &buf);
- }
-
- if (!(flags & TRAVERSE_PREFETCH_DATA) ||
- 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
- &td, TQ_NOQUEUE))
- pd.pd_exited = B_TRUE;
-
- SET_BOOKMARK(&czb, td.td_objset,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- err = traverse_visitbp(&td, NULL, rootbp, &czb);
-
- mutex_enter(&pd.pd_mtx);
- pd.pd_cancel = B_TRUE;
- cv_broadcast(&pd.pd_cv);
- while (!pd.pd_exited)
- cv_wait(&pd.pd_cv, &pd.pd_mtx);
- mutex_exit(&pd.pd_mtx);
-
- mutex_destroy(&pd.pd_mtx);
- cv_destroy(&pd.pd_cv);
-
- return (err);
-}
-
-/*
- * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
- * in syncing context).
- */
-int
-traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
- zbookmark_phys_t *resume,
- int flags, blkptr_cb_t func, void *arg)
-{
- return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
- &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
-}
-
-int
-traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
- int flags, blkptr_cb_t func, void *arg)
-{
- return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
-}
-
-int
-traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
- uint64_t txg_start, zbookmark_phys_t *resume, int flags,
- blkptr_cb_t func, void *arg)
-{
- return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
- blkptr, txg_start, resume, flags, func, arg));
-}
-
-/*
- * NB: pool must not be changing on-disk (eg, from zdb or sync context).
- */
-int
-traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
- blkptr_cb_t func, void *arg)
-{
- int err;
- dsl_pool_t *dp = spa_get_dsl(spa);
- objset_t *mos = dp->dp_meta_objset;
- boolean_t hard = (flags & TRAVERSE_HARD);
-
- /* visit the MOS */
- err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
- txg_start, NULL, flags, func, arg);
- if (err != 0)
- return (err);
-
- /* visit each dataset */
- for (uint64_t obj = 1; err == 0;
- err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
- dmu_object_info_t doi;
-
- err = dmu_object_info(mos, obj, &doi);
- if (err != 0) {
- if (hard)
- continue;
- break;
- }
-
- if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
- dsl_dataset_t *ds;
- uint64_t txg = txg_start;
-
- dsl_pool_config_enter(dp, FTAG);
- err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
- dsl_pool_config_exit(dp, FTAG);
- if (err != 0) {
- if (hard)
- continue;
- break;
- }
- if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
- txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
- err = traverse_dataset(ds, txg, flags, func, arg);
- dsl_dataset_rele(ds, FTAG);
- if (err != 0)
- break;
- }
- }
- if (err == ESRCH)
- err = 0;
- return (err);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -1,1345 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/zap_impl.h>
-#include <sys/spa.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/zfs_context.h>
-#include <sys/varargs.h>
-
-typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
- uint64_t arg1, uint64_t arg2);
-
-
-dmu_tx_t *
-dmu_tx_create_dd(dsl_dir_t *dd)
-{
- dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
- tx->tx_dir = dd;
- if (dd != NULL)
- tx->tx_pool = dd->dd_pool;
- list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
- offsetof(dmu_tx_hold_t, txh_node));
- list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
- offsetof(dmu_tx_callback_t, dcb_node));
- tx->tx_start = gethrtime();
- return (tx);
-}
-
-dmu_tx_t *
-dmu_tx_create(objset_t *os)
-{
- dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
- tx->tx_objset = os;
- return (tx);
-}
-
-dmu_tx_t *
-dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
-{
- dmu_tx_t *tx = dmu_tx_create_dd(NULL);
-
- txg_verify(dp->dp_spa, txg);
- tx->tx_pool = dp;
- tx->tx_txg = txg;
- tx->tx_anyobj = TRUE;
-
- return (tx);
-}
-
-int
-dmu_tx_is_syncing(dmu_tx_t *tx)
-{
- return (tx->tx_anyobj);
-}
-
-int
-dmu_tx_private_ok(dmu_tx_t *tx)
-{
- return (tx->tx_anyobj);
-}
-
-static dmu_tx_hold_t *
-dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
- uint64_t arg1, uint64_t arg2)
-{
- dmu_tx_hold_t *txh;
-
- if (dn != NULL) {
- (void) zfs_refcount_add(&dn->dn_holds, tx);
- if (tx->tx_txg != 0) {
- mutex_enter(&dn->dn_mtx);
- /*
- * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
- * problem, but there's no way for it to happen (for
- * now, at least).
- */
- ASSERT(dn->dn_assigned_txg == 0);
- dn->dn_assigned_txg = tx->tx_txg;
- (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
- mutex_exit(&dn->dn_mtx);
- }
- }
-
- txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
- txh->txh_tx = tx;
- txh->txh_dnode = dn;
- zfs_refcount_create(&txh->txh_space_towrite);
- zfs_refcount_create(&txh->txh_memory_tohold);
- txh->txh_type = type;
- txh->txh_arg1 = arg1;
- txh->txh_arg2 = arg2;
- list_insert_tail(&tx->tx_holds, txh);
-
- return (txh);
-}
-
-static dmu_tx_hold_t *
-dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
- enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
-{
- dnode_t *dn = NULL;
- dmu_tx_hold_t *txh;
- int err;
-
- if (object != DMU_NEW_OBJECT) {
- err = dnode_hold(os, object, FTAG, &dn);
- if (err != 0) {
- tx->tx_err = err;
- return (NULL);
- }
- }
- txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
- if (dn != NULL)
- dnode_rele(dn, FTAG);
- return (txh);
-}
-
-void
-dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
-{
- /*
- * If we're syncing, they can manipulate any object anyhow, and
- * the hold on the dnode_t can cause problems.
- */
- if (!dmu_tx_is_syncing(tx))
- (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
-}
-
-/*
- * This function reads specified data from disk. The specified data will
- * be needed to perform the transaction -- i.e, it will be read after
- * we do dmu_tx_assign(). There are two reasons that we read the data now
- * (before dmu_tx_assign()):
- *
- * 1. Reading it now has potentially better performance. The transaction
- * has not yet been assigned, so the TXG is not held open, and also the
- * caller typically has less locks held when calling dmu_tx_hold_*() than
- * after the transaction has been assigned. This reduces the lock (and txg)
- * hold times, thus reducing lock contention.
- *
- * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
- * that are detected before they start making changes to the DMU state
- * (i.e. now). Once the transaction has been assigned, and some DMU
- * state has been changed, it can be difficult to recover from an i/o
- * error (e.g. to undo the changes already made in memory at the DMU
- * layer). Typically code to do so does not exist in the caller -- it
- * assumes that the data has already been cached and thus i/o errors are
- * not possible.
- *
- * It has been observed that the i/o initiated here can be a performance
- * problem, and it appears to be optional, because we don't look at the
- * data which is read. However, removing this read would only serve to
- * move the work elsewhere (after the dmu_tx_assign()), where it may
- * have a greater impact on performance (in addition to the impact on
- * fault tolerance noted above).
- */
-static int
-dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
-{
- int err;
- dmu_buf_impl_t *db;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold_level(dn, level, blkid, FTAG);
- rw_exit(&dn->dn_struct_rwlock);
- if (db == NULL)
- return (SET_ERROR(EIO));
- err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
- dbuf_rele(db, FTAG);
- return (err);
-}
-
-/* ARGSUSED */
-static void
-dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
-{
- dnode_t *dn = txh->txh_dnode;
- int err = 0;
-
- if (len == 0)
- return;
-
- (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
-
- if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
- err = SET_ERROR(EFBIG);
-
- if (dn == NULL)
- return;
-
- /*
- * For i/o error checking, read the blocks that will be needed
- * to perform the write: the first and last level-0 blocks (if
- * they are not aligned, i.e. if they are partial-block writes),
- * and all the level-1 blocks.
- */
- if (dn->dn_maxblkid == 0) {
- if (off < dn->dn_datablksz &&
- (off > 0 || len < dn->dn_datablksz)) {
- err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- }
- }
- } else {
- zio_t *zio = zio_root(dn->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
-
- /* first level-0 block */
- uint64_t start = off >> dn->dn_datablkshift;
- if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
- err = dmu_tx_check_ioerr(zio, dn, 0, start);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- }
- }
-
- /* last level-0 block */
- uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
- if (end != start && end <= dn->dn_maxblkid &&
- P2PHASE(off + len, dn->dn_datablksz)) {
- err = dmu_tx_check_ioerr(zio, dn, 0, end);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- }
- }
-
- /* level-1 blocks */
- if (dn->dn_nlevels > 1) {
- int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (uint64_t i = (start >> shft) + 1;
- i < end >> shft; i++) {
- err = dmu_tx_check_ioerr(zio, dn, 1, i);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- }
- }
- }
-
- err = zio_wait(zio);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- }
- }
-}
-
-static void
-dmu_tx_count_dnode(dmu_tx_hold_t *txh)
-{
- (void) zfs_refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE,
- FTAG);
-}
-
-void
-dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT0(tx->tx_txg);
- ASSERT3U(len, <=, DMU_MAX_ACCESS);
- ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_WRITE, off, len);
- if (txh != NULL) {
- dmu_tx_count_write(txh, off, len);
- dmu_tx_count_dnode(txh);
- }
-}
-
-void
-dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_WRITE, 0, 0);
- if (txh == NULL)
- return;
-
- dnode_t *dn = txh->txh_dnode;
- (void) zfs_refcount_add_many(&txh->txh_space_towrite,
- 1ULL << dn->dn_indblkshift, FTAG);
- dmu_tx_count_dnode(txh);
-}
-
-void
-dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT0(tx->tx_txg);
- ASSERT3U(len, <=, DMU_MAX_ACCESS);
- ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
-
- txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
- if (txh != NULL) {
- dmu_tx_count_write(txh, off, len);
- dmu_tx_count_dnode(txh);
- }
-}
-
-/*
- * This function marks the transaction as being a "net free". The end
- * result is that refquotas will be disabled for this transaction, and
- * this transaction will be able to use half of the pool space overhead
- * (see dsl_pool_adjustedsize()). Therefore this function should only
- * be called for transactions that we expect will not cause a net increase
- * in the amount of space used (but it's OK if that is occasionally not true).
- */
-void
-dmu_tx_mark_netfree(dmu_tx_t *tx)
-{
- tx->tx_netfree = B_TRUE;
-}
-
-static void
-dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
-{
- dmu_tx_t *tx;
- dnode_t *dn;
- int err;
-
- tx = txh->txh_tx;
- ASSERT(tx->tx_txg == 0);
-
- dn = txh->txh_dnode;
- dmu_tx_count_dnode(txh);
-
- if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
- return;
- if (len == DMU_OBJECT_END)
- len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
-
-
- /*
- * For i/o error checking, we read the first and last level-0
- * blocks if they are not aligned, and all the level-1 blocks.
- *
- * Note: dbuf_free_range() assumes that we have not instantiated
- * any level-0 dbufs that will be completely freed. Therefore we must
- * exercise care to not read or count the first and last blocks
- * if they are blocksize-aligned.
- */
- if (dn->dn_datablkshift == 0) {
- if (off != 0 || len < dn->dn_datablksz)
- dmu_tx_count_write(txh, 0, dn->dn_datablksz);
- } else {
- /* first block will be modified if it is not aligned */
- if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
- dmu_tx_count_write(txh, off, 1);
- /* last block will be modified if it is not aligned */
- if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
- dmu_tx_count_write(txh, off + len, 1);
- }
-
- /*
- * Check level-1 blocks.
- */
- if (dn->dn_nlevels > 1) {
- int shift = dn->dn_datablkshift + dn->dn_indblkshift -
- SPA_BLKPTRSHIFT;
- uint64_t start = off >> shift;
- uint64_t end = (off + len) >> shift;
-
- ASSERT(dn->dn_indblkshift != 0);
-
- /*
- * dnode_reallocate() can result in an object with indirect
- * blocks having an odd data block size. In this case,
- * just check the single block.
- */
- if (dn->dn_datablkshift == 0)
- start = end = 0;
-
- zio_t *zio = zio_root(tx->tx_pool->dp_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- for (uint64_t i = start; i <= end; i++) {
- uint64_t ibyte = i << shift;
- err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
- i = ibyte >> shift;
- if (err == ESRCH || i > end)
- break;
- if (err != 0) {
- tx->tx_err = err;
- (void) zio_wait(zio);
- return;
- }
-
- (void) zfs_refcount_add_many(&txh->txh_memory_tohold,
- 1 << dn->dn_indblkshift, FTAG);
-
- err = dmu_tx_check_ioerr(zio, dn, 1, i);
- if (err != 0) {
- tx->tx_err = err;
- (void) zio_wait(zio);
- return;
- }
- }
- err = zio_wait(zio);
- if (err != 0) {
- tx->tx_err = err;
- return;
- }
- }
-}
-
-void
-dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
-{
- dmu_tx_hold_t *txh;
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_FREE, off, len);
- if (txh != NULL)
- (void) dmu_tx_hold_free_impl(txh, off, len);
-}
-
-void
-dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
-{
- dmu_tx_hold_t *txh;
-
- txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
- if (txh != NULL)
- (void) dmu_tx_hold_free_impl(txh, off, len);
-}
-
-static void
-dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
-{
- dmu_tx_t *tx = txh->txh_tx;
- dnode_t *dn;
- int err;
-
- ASSERT(tx->tx_txg == 0);
-
- dn = txh->txh_dnode;
-
- dmu_tx_count_dnode(txh);
-
- /*
- * Modifying a almost-full microzap is around the worst case (128KB)
- *
- * If it is a fat zap, the worst case would be 7*16KB=112KB:
- * - 3 blocks overwritten: target leaf, ptrtbl block, header block
- * - 4 new blocks written if adding:
- * - 2 blocks for possibly split leaves,
- * - 2 grown ptrtbl blocks
- */
- (void) zfs_refcount_add_many(&txh->txh_space_towrite,
- MZAP_MAX_BLKSZ, FTAG);
-
- if (dn == NULL)
- return;
-
- ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
-
- if (dn->dn_maxblkid == 0 || name == NULL) {
- /*
- * This is a microzap (only one block), or we don't know
- * the name. Check the first block for i/o errors.
- */
- err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
- if (err != 0) {
- tx->tx_err = err;
- }
- } else {
- /*
- * Access the name so that we'll check for i/o errors to
- * the leaf blocks, etc. We ignore ENOENT, as this name
- * may not yet exist.
- */
- err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
- if (err == EIO || err == ECKSUM || err == ENXIO) {
- tx->tx_err = err;
- }
- }
-}
-
-void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT0(tx->tx_txg);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_ZAP, add, (uintptr_t)name);
- if (txh != NULL)
- dmu_tx_hold_zap_impl(txh, name);
-}
-
-void
-dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT0(tx->tx_txg);
- ASSERT(dn != NULL);
-
- txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
- if (txh != NULL)
- dmu_tx_hold_zap_impl(txh, name);
-}
-
-void
-dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_BONUS, 0, 0);
- if (txh)
- dmu_tx_count_dnode(txh);
-}
-
-void
-dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT0(tx->tx_txg);
-
- txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
- if (txh)
- dmu_tx_count_dnode(txh);
-}
-
-void
-dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
-{
- dmu_tx_hold_t *txh;
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- DMU_NEW_OBJECT, THT_SPACE, space, 0);
-
- (void) zfs_refcount_add_many(&txh->txh_space_towrite, space, FTAG);
-}
-
-#ifdef ZFS_DEBUG
-void
-dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
-{
- boolean_t match_object = B_FALSE;
- boolean_t match_offset = B_FALSE;
-
- DB_DNODE_ENTER(db);
- dnode_t *dn = DB_DNODE(db);
- ASSERT(tx->tx_txg != 0);
- ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
- ASSERT3U(dn->dn_object, ==, db->db.db_object);
-
- if (tx->tx_anyobj) {
- DB_DNODE_EXIT(db);
- return;
- }
-
- /* XXX No checking on the meta dnode for now */
- if (db->db.db_object == DMU_META_DNODE_OBJECT) {
- DB_DNODE_EXIT(db);
- return;
- }
-
- for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
- txh = list_next(&tx->tx_holds, txh)) {
- ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
- if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
- match_object = TRUE;
- if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
- int datablkshift = dn->dn_datablkshift ?
- dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int shift = datablkshift + epbs * db->db_level;
- uint64_t beginblk = shift >= 64 ? 0 :
- (txh->txh_arg1 >> shift);
- uint64_t endblk = shift >= 64 ? 0 :
- ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
- uint64_t blkid = db->db_blkid;
-
- /* XXX txh_arg2 better not be zero... */
-
- dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
- txh->txh_type, beginblk, endblk);
-
- switch (txh->txh_type) {
- case THT_WRITE:
- if (blkid >= beginblk && blkid <= endblk)
- match_offset = TRUE;
- /*
- * We will let this hold work for the bonus
- * or spill buffer so that we don't need to
- * hold it when creating a new object.
- */
- if (blkid == DMU_BONUS_BLKID ||
- blkid == DMU_SPILL_BLKID)
- match_offset = TRUE;
- /*
- * They might have to increase nlevels,
- * thus dirtying the new TLIBs. Or the
- * might have to change the block size,
- * thus dirying the new lvl=0 blk=0.
- */
- if (blkid == 0)
- match_offset = TRUE;
- break;
- case THT_FREE:
- /*
- * We will dirty all the level 1 blocks in
- * the free range and perhaps the first and
- * last level 0 block.
- */
- if (blkid >= beginblk && (blkid <= endblk ||
- txh->txh_arg2 == DMU_OBJECT_END))
- match_offset = TRUE;
- break;
- case THT_SPILL:
- if (blkid == DMU_SPILL_BLKID)
- match_offset = TRUE;
- break;
- case THT_BONUS:
- if (blkid == DMU_BONUS_BLKID)
- match_offset = TRUE;
- break;
- case THT_ZAP:
- match_offset = TRUE;
- break;
- case THT_NEWOBJECT:
- match_object = TRUE;
- break;
- default:
- ASSERT(!"bad txh_type");
- }
- }
- if (match_object && match_offset) {
- DB_DNODE_EXIT(db);
- return;
- }
- }
- DB_DNODE_EXIT(db);
- panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
- (u_longlong_t)db->db.db_object, db->db_level,
- (u_longlong_t)db->db_blkid);
-}
-#endif
-
-/*
- * If we can't do 10 iops, something is wrong. Let us go ahead
- * and hit zfs_dirty_data_max.
- */
-hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
-int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
-
-/*
- * We delay transactions when we've determined that the backend storage
- * isn't able to accommodate the rate of incoming writes.
- *
- * If there is already a transaction waiting, we delay relative to when
- * that transaction finishes waiting. This way the calculated min_time
- * is independent of the number of threads concurrently executing
- * transactions.
- *
- * If we are the only waiter, wait relative to when the transaction
- * started, rather than the current time. This credits the transaction for
- * "time already served", e.g. reading indirect blocks.
- *
- * The minimum time for a transaction to take is calculated as:
- * min_time = scale * (dirty - min) / (max - dirty)
- * min_time is then capped at zfs_delay_max_ns.
- *
- * The delay has two degrees of freedom that can be adjusted via tunables.
- * The percentage of dirty data at which we start to delay is defined by
- * zfs_delay_min_dirty_percent. This should typically be at or above
- * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
- * delay after writing at full speed has failed to keep up with the incoming
- * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
- * speaking, this variable determines the amount of delay at the midpoint of
- * the curve.
- *
- * delay
- * 10ms +-------------------------------------------------------------*+
- * | *|
- * 9ms + *+
- * | *|
- * 8ms + *+
- * | * |
- * 7ms + * +
- * | * |
- * 6ms + * +
- * | * |
- * 5ms + * +
- * | * |
- * 4ms + * +
- * | * |
- * 3ms + * +
- * | * |
- * 2ms + (midpoint) * +
- * | | ** |
- * 1ms + v *** +
- * | zfs_delay_scale ----------> ******** |
- * 0 +-------------------------------------*********----------------+
- * 0% <- zfs_dirty_data_max -> 100%
- *
- * Note that since the delay is added to the outstanding time remaining on the
- * most recent transaction, the delay is effectively the inverse of IOPS.
- * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
- * was chosen such that small changes in the amount of accumulated dirty data
- * in the first 3/4 of the curve yield relatively small differences in the
- * amount of delay.
- *
- * The effects can be easier to understand when the amount of delay is
- * represented on a log scale:
- *
- * delay
- * 100ms +-------------------------------------------------------------++
- * + +
- * | |
- * + *+
- * 10ms + *+
- * + ** +
- * | (midpoint) ** |
- * + | ** +
- * 1ms + v **** +
- * + zfs_delay_scale ----------> ***** +
- * | **** |
- * + **** +
- * 100us + ** +
- * + * +
- * | * |
- * + * +
- * 10us + * +
- * + +
- * | |
- * + +
- * +--------------------------------------------------------------+
- * 0% <- zfs_dirty_data_max -> 100%
- *
- * Note here that only as the amount of dirty data approaches its limit does
- * the delay start to increase rapidly. The goal of a properly tuned system
- * should be to keep the amount of dirty data out of that range by first
- * ensuring that the appropriate limits are set for the I/O scheduler to reach
- * optimal throughput on the backend storage, and then by changing the value
- * of zfs_delay_scale to increase the steepness of the curve.
- */
-static void
-dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
-{
- dsl_pool_t *dp = tx->tx_pool;
- uint64_t delay_min_bytes =
- zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
- hrtime_t wakeup, min_tx_time, now;
-
- if (dirty <= delay_min_bytes)
- return;
-
- /*
- * The caller has already waited until we are under the max.
- * We make them pass us the amount of dirty data so we don't
- * have to handle the case of it being >= the max, which could
- * cause a divide-by-zero if it's == the max.
- */
- ASSERT3U(dirty, <, zfs_dirty_data_max);
-
- now = gethrtime();
- min_tx_time = zfs_delay_scale *
- (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
- if (now > tx->tx_start + min_tx_time)
- return;
-
- min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
-
- DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
- uint64_t, min_tx_time);
-
- mutex_enter(&dp->dp_lock);
- wakeup = MAX(tx->tx_start + min_tx_time,
- dp->dp_last_wakeup + min_tx_time);
- dp->dp_last_wakeup = wakeup;
- mutex_exit(&dp->dp_lock);
-
-#ifdef _KERNEL
-#ifdef illumos
- mutex_enter(&curthread->t_delay_lock);
- while (cv_timedwait_hires(&curthread->t_delay_cv,
- &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
- CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
- continue;
- mutex_exit(&curthread->t_delay_lock);
-#else
- pause_sbt("dmu_tx_delay", nstosbt(wakeup),
- nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE);
-#endif
-#else
- hrtime_t delta = wakeup - gethrtime();
- struct timespec ts;
- ts.tv_sec = delta / NANOSEC;
- ts.tv_nsec = delta % NANOSEC;
- (void) nanosleep(&ts, NULL);
-#endif
-}
-
-/*
- * This routine attempts to assign the transaction to a transaction group.
- * To do so, we must determine if there is sufficient free space on disk.
- *
- * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
- * on it), then it is assumed that there is sufficient free space,
- * unless there's insufficient slop space in the pool (see the comment
- * above spa_slop_shift in spa_misc.c).
- *
- * If it is not a "netfree" transaction, then if the data already on disk
- * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
- * ENOSPC. Otherwise, if the current rough estimate of pending changes,
- * plus the rough estimate of this transaction's changes, may exceed the
- * allowed usage, then this will fail with ERESTART, which will cause the
- * caller to wait for the pending changes to be written to disk (by waiting
- * for the next TXG to open), and then check the space usage again.
- *
- * The rough estimate of pending changes is comprised of the sum of:
- *
- * - this transaction's holds' txh_space_towrite
- *
- * - dd_tempreserved[], which is the sum of in-flight transactions'
- * holds' txh_space_towrite (i.e. those transactions that have called
- * dmu_tx_assign() but not yet called dmu_tx_commit()).
- *
- * - dd_space_towrite[], which is the amount of dirtied dbufs.
- *
- * Note that all of these values are inflated by spa_get_worst_case_asize(),
- * which means that we may get ERESTART well before we are actually in danger
- * of running out of space, but this also mitigates any small inaccuracies
- * in the rough estimate (e.g. txh_space_towrite doesn't take into account
- * indirect blocks, and dd_space_towrite[] doesn't take into account changes
- * to the MOS).
- *
- * Note that due to this algorithm, it is possible to exceed the allowed
- * usage by one transaction. Also, as we approach the allowed usage,
- * we will allow a very limited amount of changes into each TXG, thus
- * decreasing performance.
- */
-static int
-dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
-{
- spa_t *spa = tx->tx_pool->dp_spa;
-
- ASSERT0(tx->tx_txg);
-
- if (tx->tx_err)
- return (tx->tx_err);
-
- if (spa_suspended(spa)) {
- /*
- * If the user has indicated a blocking failure mode
- * then return ERESTART which will block in dmu_tx_wait().
- * Otherwise, return EIO so that an error can get
- * propagated back to the VOP calls.
- *
- * Note that we always honor the txg_how flag regardless
- * of the failuremode setting.
- */
- if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
- !(txg_how & TXG_WAIT))
- return (SET_ERROR(EIO));
-
- return (SET_ERROR(ERESTART));
- }
-
- if (!tx->tx_dirty_delayed &&
- dsl_pool_need_dirty_delay(tx->tx_pool)) {
- tx->tx_wait_dirty = B_TRUE;
- return (SET_ERROR(ERESTART));
- }
-
- tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
- tx->tx_needassign_txh = NULL;
-
- /*
- * NB: No error returns are allowed after txg_hold_open, but
- * before processing the dnode holds, due to the
- * dmu_tx_unassign() logic.
- */
-
- uint64_t towrite = 0;
- uint64_t tohold = 0;
- for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
- txh = list_next(&tx->tx_holds, txh)) {
- dnode_t *dn = txh->txh_dnode;
- if (dn != NULL) {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_assigned_txg == tx->tx_txg - 1) {
- mutex_exit(&dn->dn_mtx);
- tx->tx_needassign_txh = txh;
- return (SET_ERROR(ERESTART));
- }
- if (dn->dn_assigned_txg == 0)
- dn->dn_assigned_txg = tx->tx_txg;
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
- (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
- mutex_exit(&dn->dn_mtx);
- }
- towrite += zfs_refcount_count(&txh->txh_space_towrite);
- tohold += zfs_refcount_count(&txh->txh_memory_tohold);
- }
-
- /* needed allocation: worst-case estimate of write space */
- uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
- /* calculate memory footprint estimate */
- uint64_t memory = towrite + tohold;
-
- if (tx->tx_dir != NULL && asize != 0) {
- int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
- asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
- if (err != 0)
- return (err);
- }
-
- return (0);
-}
-
-static void
-dmu_tx_unassign(dmu_tx_t *tx)
-{
- if (tx->tx_txg == 0)
- return;
-
- txg_rele_to_quiesce(&tx->tx_txgh);
-
- /*
- * Walk the transaction's hold list, removing the hold on the
- * associated dnode, and notifying waiters if the refcount drops to 0.
- */
- for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
- txh != tx->tx_needassign_txh;
- txh = list_next(&tx->tx_holds, txh)) {
- dnode_t *dn = txh->txh_dnode;
-
- if (dn == NULL)
- continue;
- mutex_enter(&dn->dn_mtx);
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
-
- if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
- dn->dn_assigned_txg = 0;
- cv_broadcast(&dn->dn_notxholds);
- }
- mutex_exit(&dn->dn_mtx);
- }
-
- txg_rele_to_sync(&tx->tx_txgh);
-
- tx->tx_lasttried_txg = tx->tx_txg;
- tx->tx_txg = 0;
-}
-
-/*
- * Assign tx to a transaction group; txg_how is a bitmask:
- *
- * If TXG_WAIT is set and the currently open txg is full, this function
- * will wait until there's a new txg. This should be used when no locks
- * are being held. With this bit set, this function will only fail if
- * we're truly out of space (or over quota).
- *
- * If TXG_WAIT is *not* set and we can't assign into the currently open
- * txg without blocking, this function will return immediately with
- * ERESTART. This should be used whenever locks are being held. On an
- * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
- * and try again.
- *
- * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
- * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
- * details on the throttle). This is used by the VFS operations, after
- * they have already called dmu_tx_wait() (though most likely on a
- * different tx).
- */
-int
-dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
-{
- int err;
-
- ASSERT(tx->tx_txg == 0);
- ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
- ASSERT(!dsl_pool_sync_context(tx->tx_pool));
-
- /* If we might wait, we must not hold the config lock. */
- IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
-
- if ((txg_how & TXG_NOTHROTTLE))
- tx->tx_dirty_delayed = B_TRUE;
-
- while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
- dmu_tx_unassign(tx);
-
- if (err != ERESTART || !(txg_how & TXG_WAIT))
- return (err);
-
- dmu_tx_wait(tx);
- }
-
- txg_rele_to_quiesce(&tx->tx_txgh);
-
- return (0);
-}
-
-void
-dmu_tx_wait(dmu_tx_t *tx)
-{
- spa_t *spa = tx->tx_pool->dp_spa;
- dsl_pool_t *dp = tx->tx_pool;
-
- ASSERT(tx->tx_txg == 0);
- ASSERT(!dsl_pool_config_held(tx->tx_pool));
-
- if (tx->tx_wait_dirty) {
- /*
- * dmu_tx_try_assign() has determined that we need to wait
- * because we've consumed much or all of the dirty buffer
- * space.
- */
- mutex_enter(&dp->dp_lock);
- while (dp->dp_dirty_total >= zfs_dirty_data_max)
- cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
- uint64_t dirty = dp->dp_dirty_total;
- mutex_exit(&dp->dp_lock);
-
- dmu_tx_delay(tx, dirty);
-
- tx->tx_wait_dirty = B_FALSE;
-
- /*
- * Note: setting tx_dirty_delayed only has effect if the
- * caller used TX_WAIT. Otherwise they are going to
- * destroy this tx and try again. The common case,
- * zfs_write(), uses TX_WAIT.
- */
- tx->tx_dirty_delayed = B_TRUE;
- } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
- /*
- * If the pool is suspended we need to wait until it
- * is resumed. Note that it's possible that the pool
- * has become active after this thread has tried to
- * obtain a tx. If that's the case then tx_lasttried_txg
- * would not have been set.
- */
- txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
- } else if (tx->tx_needassign_txh) {
- /*
- * A dnode is assigned to the quiescing txg. Wait for its
- * transaction to complete.
- */
- dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
-
- mutex_enter(&dn->dn_mtx);
- while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
- cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
- mutex_exit(&dn->dn_mtx);
- tx->tx_needassign_txh = NULL;
- } else {
- /*
- * If we have a lot of dirty data just wait until we sync
- * out a TXG at which point we'll hopefully have synced
- * a portion of the changes.
- */
- txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
- }
-}
-
-static void
-dmu_tx_destroy(dmu_tx_t *tx)
-{
- dmu_tx_hold_t *txh;
-
- while ((txh = list_head(&tx->tx_holds)) != NULL) {
- dnode_t *dn = txh->txh_dnode;
-
- list_remove(&tx->tx_holds, txh);
- zfs_refcount_destroy_many(&txh->txh_space_towrite,
- zfs_refcount_count(&txh->txh_space_towrite));
- zfs_refcount_destroy_many(&txh->txh_memory_tohold,
- zfs_refcount_count(&txh->txh_memory_tohold));
- kmem_free(txh, sizeof (dmu_tx_hold_t));
- if (dn != NULL)
- dnode_rele(dn, tx);
- }
-
- list_destroy(&tx->tx_callbacks);
- list_destroy(&tx->tx_holds);
- kmem_free(tx, sizeof (dmu_tx_t));
-}
-
-void
-dmu_tx_commit(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_txg != 0);
-
- /*
- * Go through the transaction's hold list and remove holds on
- * associated dnodes, notifying waiters if no holds remain.
- */
- for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
- txh = list_next(&tx->tx_holds, txh)) {
- dnode_t *dn = txh->txh_dnode;
-
- if (dn == NULL)
- continue;
-
- mutex_enter(&dn->dn_mtx);
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
-
- if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
- dn->dn_assigned_txg = 0;
- cv_broadcast(&dn->dn_notxholds);
- }
- mutex_exit(&dn->dn_mtx);
- }
-
- if (tx->tx_tempreserve_cookie)
- dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
-
- if (!list_is_empty(&tx->tx_callbacks))
- txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
-
- if (tx->tx_anyobj == FALSE)
- txg_rele_to_sync(&tx->tx_txgh);
-
- dmu_tx_destroy(tx);
-}
-
-void
-dmu_tx_abort(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_txg == 0);
-
- /*
- * Call any registered callbacks with an error code.
- */
- if (!list_is_empty(&tx->tx_callbacks))
- dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
-
- dmu_tx_destroy(tx);
-}
-
-uint64_t
-dmu_tx_get_txg(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_txg != 0);
- return (tx->tx_txg);
-}
-
-dsl_pool_t *
-dmu_tx_pool(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_pool != NULL);
- return (tx->tx_pool);
-}
-
-void
-dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
-{
- dmu_tx_callback_t *dcb;
-
- dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
-
- dcb->dcb_func = func;
- dcb->dcb_data = data;
-
- list_insert_tail(&tx->tx_callbacks, dcb);
-}
-
-/*
- * Call all the commit callbacks on a list, with a given error code.
- */
-void
-dmu_tx_do_callbacks(list_t *cb_list, int error)
-{
- dmu_tx_callback_t *dcb;
-
- while ((dcb = list_head(cb_list)) != NULL) {
- list_remove(cb_list, dcb);
- dcb->dcb_func(dcb->dcb_data, error);
- kmem_free(dcb, sizeof (dmu_tx_callback_t));
- }
-}
-
-/*
- * Interface to hold a bunch of attributes.
- * used for creating new files.
- * attrsize is the total size of all attributes
- * to be added during object creation
- *
- * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
- */
-
-/*
- * hold necessary attribute name for attribute registration.
- * should be a very rare case where this is needed. If it does
- * happen it would only happen on the first write to the file system.
- */
-static void
-dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
-{
- if (!sa->sa_need_attr_registration)
- return;
-
- for (int i = 0; i != sa->sa_num_attrs; i++) {
- if (!sa->sa_attr_table[i].sa_registered) {
- if (sa->sa_reg_attr_obj)
- dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
- B_TRUE, sa->sa_attr_table[i].sa_name);
- else
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
- B_TRUE, sa->sa_attr_table[i].sa_name);
- }
- }
-}
-
-void
-dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
-{
- dmu_tx_hold_t *txh;
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
- THT_SPILL, 0, 0);
- if (txh != NULL)
- (void) zfs_refcount_add_many(&txh->txh_space_towrite,
- SPA_OLD_MAXBLOCKSIZE, FTAG);
-}
-
-void
-dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
-{
- sa_os_t *sa = tx->tx_objset->os_sa;
-
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-
- if (tx->tx_objset->os_sa->sa_master_obj == 0)
- return;
-
- if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
- dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
- } else {
- dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
- dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
- }
-
- dmu_tx_sa_registration_hold(sa, tx);
-
- if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
- return;
-
- (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
- THT_SPILL, 0, 0);
-}
-
-/*
- * Hold SA attribute
- *
- * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
- *
- * variable_size is the total size of all variable sized attributes
- * passed to this function. It is not the total size of all
- * variable size attributes that *may* exist on this object.
- */
-void
-dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
-{
- uint64_t object;
- sa_os_t *sa = tx->tx_objset->os_sa;
-
- ASSERT(hdl != NULL);
-
- object = sa_handle_object(hdl);
-
- dmu_tx_hold_bonus(tx, object);
-
- if (tx->tx_objset->os_sa->sa_master_obj == 0)
- return;
-
- if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
- tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
- dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
- dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
- }
-
- dmu_tx_sa_registration_hold(sa, tx);
-
- if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
- dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
-
- if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
- ASSERT(tx->tx_txg == 0);
- dmu_tx_hold_spill(tx, object);
- } else {
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- if (dn->dn_have_spill) {
- ASSERT(tx->tx_txg == 0);
- dmu_tx_hold_spill(tx, object);
- }
- DB_DNODE_EXIT(db);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -1,374 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dnode.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/dmu.h>
-#include <sys/dbuf.h>
-#include <sys/kstat.h>
-
-/*
- * This tunable disables predictive prefetch. Note that it leaves "prescient"
- * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
- * prescient prefetch never issues i/os that end up not being needed,
- * so it can't hurt performance.
- */
-boolean_t zfs_prefetch_disable = B_FALSE;
-
-/* max # of streams per zfetch */
-uint32_t zfetch_max_streams = 8;
-/* min time before stream reclaim */
-uint32_t zfetch_min_sec_reap = 2;
-/* max bytes to prefetch per stream (default 8MB) */
-uint32_t zfetch_max_distance = 8 * 1024 * 1024;
-/* max bytes to prefetch indirects for per stream (default 64MB) */
-uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
-/* max number of bytes in an array_read in which we allow prefetching (1MB) */
-uint64_t zfetch_array_rd_sz = 1024 * 1024;
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
- &zfs_prefetch_disable, 0, "Disable prefetch");
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS ZFETCH");
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
- &zfetch_max_streams, 0, "Max # of streams per zfetch");
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
- &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
- &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
- &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
-SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
- &zfetch_array_rd_sz, 0,
- "Number of bytes in a array_read at which we stop prefetching");
-
-typedef struct zfetch_stats {
- kstat_named_t zfetchstat_hits;
- kstat_named_t zfetchstat_misses;
- kstat_named_t zfetchstat_max_streams;
-} zfetch_stats_t;
-
-static zfetch_stats_t zfetch_stats = {
- { "hits", KSTAT_DATA_UINT64 },
- { "misses", KSTAT_DATA_UINT64 },
- { "max_streams", KSTAT_DATA_UINT64 },
-};
-
-#define ZFETCHSTAT_BUMP(stat) \
- atomic_inc_64(&zfetch_stats.stat.value.ui64);
-
-kstat_t *zfetch_ksp;
-
-void
-zfetch_init(void)
-{
- zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
- KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
-
- if (zfetch_ksp != NULL) {
- zfetch_ksp->ks_data = &zfetch_stats;
- kstat_install(zfetch_ksp);
- }
-}
-
-void
-zfetch_fini(void)
-{
- if (zfetch_ksp != NULL) {
- kstat_delete(zfetch_ksp);
- zfetch_ksp = NULL;
- }
-}
-
-/*
- * This takes a pointer to a zfetch structure and a dnode. It performs the
- * necessary setup for the zfetch structure, grokking data from the
- * associated dnode.
- */
-void
-dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
-{
- if (zf == NULL)
- return;
-
- zf->zf_dnode = dno;
-
- list_create(&zf->zf_stream, sizeof (zstream_t),
- offsetof(zstream_t, zs_node));
-
- rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
-}
-
-static void
-dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
-{
- ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
- list_remove(&zf->zf_stream, zs);
- mutex_destroy(&zs->zs_lock);
- kmem_free(zs, sizeof (*zs));
-}
-
-/*
- * Clean-up state associated with a zfetch structure (e.g. destroy the
- * streams). This doesn't free the zfetch_t itself, that's left to the caller.
- */
-void
-dmu_zfetch_fini(zfetch_t *zf)
-{
- zstream_t *zs;
-
- ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
-
- rw_enter(&zf->zf_rwlock, RW_WRITER);
- while ((zs = list_head(&zf->zf_stream)) != NULL)
- dmu_zfetch_stream_remove(zf, zs);
- rw_exit(&zf->zf_rwlock);
- list_destroy(&zf->zf_stream);
- rw_destroy(&zf->zf_rwlock);
-
- zf->zf_dnode = NULL;
-}
-
-/*
- * If there aren't too many streams already, create a new stream.
- * The "blkid" argument is the next block that we expect this stream to access.
- * While we're here, clean up old streams (which haven't been
- * accessed for at least zfetch_min_sec_reap seconds).
- */
-static void
-dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
-{
- zstream_t *zs_next;
- int numstreams = 0;
-
- ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
- /*
- * Clean up old streams.
- */
- for (zstream_t *zs = list_head(&zf->zf_stream);
- zs != NULL; zs = zs_next) {
- zs_next = list_next(&zf->zf_stream, zs);
- if (((gethrtime() - zs->zs_atime) / NANOSEC) >
- zfetch_min_sec_reap)
- dmu_zfetch_stream_remove(zf, zs);
- else
- numstreams++;
- }
-
- /*
- * The maximum number of streams is normally zfetch_max_streams,
- * but for small files we lower it such that it's at least possible
- * for all the streams to be non-overlapping.
- *
- * If we are already at the maximum number of streams for this file,
- * even after removing old streams, then don't create this stream.
- */
- uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
- zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
- zfetch_max_distance));
- if (numstreams >= max_streams) {
- ZFETCHSTAT_BUMP(zfetchstat_max_streams);
- return;
- }
-
- zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
- zs->zs_blkid = blkid;
- zs->zs_pf_blkid = blkid;
- zs->zs_ipf_blkid = blkid;
- zs->zs_atime = gethrtime();
- mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
-
- list_insert_head(&zf->zf_stream, zs);
-}
-
-/*
- * This is the predictive prefetch entry point. It associates dnode access
- * specified with blkid and nblks arguments with prefetch stream, predicts
- * further accesses based on that stats and initiates speculative prefetch.
- * fetch_data argument specifies whether actual data blocks should be fetched:
- * FALSE -- prefetch only indirect blocks for predicted data blocks;
- * TRUE -- prefetch predicted data blocks plus following indirect blocks.
- */
-void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
-{
- zstream_t *zs;
- int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
- int64_t pf_ahead_blks, max_blks;
- int epbs, max_dist_blks, pf_nblks, ipf_nblks;
- uint64_t end_of_access_blkid = blkid + nblks;
- spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
-
- if (zfs_prefetch_disable)
- return;
-
- /*
- * If we haven't yet loaded the indirect vdevs' mappings, we
- * can only read from blocks that we carefully ensure are on
- * concrete vdevs (or previously-loaded indirect vdevs). So we
- * can't allow the predictive prefetcher to attempt reads of other
- * blocks (e.g. of the MOS's dnode obejct).
- */
- if (!spa_indirect_vdevs_loaded(spa))
- return;
-
- /*
- * As a fast path for small (single-block) files, ignore access
- * to the first block.
- */
- if (blkid == 0)
- return;
-
- rw_enter(&zf->zf_rwlock, RW_READER);
-
- /*
- * Find matching prefetch stream. Depending on whether the accesses
- * are block-aligned, first block of the new access may either follow
- * the last block of the previous access, or be equal to it.
- */
- for (zs = list_head(&zf->zf_stream); zs != NULL;
- zs = list_next(&zf->zf_stream, zs)) {
- if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
- mutex_enter(&zs->zs_lock);
- /*
- * zs_blkid could have changed before we
- * acquired zs_lock; re-check them here.
- */
- if (blkid == zs->zs_blkid) {
- break;
- } else if (blkid + 1 == zs->zs_blkid) {
- blkid++;
- nblks--;
- if (nblks == 0) {
- /* Already prefetched this before. */
- mutex_exit(&zs->zs_lock);
- rw_exit(&zf->zf_rwlock);
- return;
- }
- break;
- }
- mutex_exit(&zs->zs_lock);
- }
- }
-
- if (zs == NULL) {
- /*
- * This access is not part of any existing stream. Create
- * a new stream for it.
- */
- ZFETCHSTAT_BUMP(zfetchstat_misses);
- if (rw_tryupgrade(&zf->zf_rwlock))
- dmu_zfetch_stream_create(zf, end_of_access_blkid);
- rw_exit(&zf->zf_rwlock);
- return;
- }
-
- /*
- * This access was to a block that we issued a prefetch for on
- * behalf of this stream. Issue further prefetches for this stream.
- *
- * Normally, we start prefetching where we stopped
- * prefetching last (zs_pf_blkid). But when we get our first
- * hit on this stream, zs_pf_blkid == zs_blkid, we don't
- * want to prefetch the block we just accessed. In this case,
- * start just after the block we just accessed.
- */
- pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
-
- /*
- * Double our amount of prefetched data, but don't let the
- * prefetch get further ahead than zfetch_max_distance.
- */
- if (fetch_data) {
- max_dist_blks =
- zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
- /*
- * Previously, we were (zs_pf_blkid - blkid) ahead. We
- * want to now be double that, so read that amount again,
- * plus the amount we are catching up by (i.e. the amount
- * read just now).
- */
- pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
- max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
- pf_nblks = MIN(pf_ahead_blks, max_blks);
- } else {
- pf_nblks = 0;
- }
-
- zs->zs_pf_blkid = pf_start + pf_nblks;
-
- /*
- * Do the same for indirects, starting from where we stopped last,
- * or where we will stop reading data blocks (and the indirects
- * that point to them).
- */
- ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
- max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
- /*
- * We want to double our distance ahead of the data prefetch
- * (or reader, if we are not prefetching data). Previously, we
- * were (zs_ipf_blkid - blkid) ahead. To double that, we read
- * that amount again, plus the amount we are catching up by
- * (i.e. the amount read now + the amount of data prefetched now).
- */
- pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
- max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
- ipf_nblks = MIN(pf_ahead_blks, max_blks);
- zs->zs_ipf_blkid = ipf_start + ipf_nblks;
-
- epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
- ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
- ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
-
- zs->zs_atime = gethrtime();
- zs->zs_blkid = end_of_access_blkid;
- mutex_exit(&zs->zs_lock);
- rw_exit(&zf->zf_rwlock);
-
- /*
- * dbuf_prefetch() is asynchronous (even when it needs to read
- * indirect blocks), but we still prefer to drop our locks before
- * calling it to reduce the time we hold them.
- */
-
- for (int i = 0; i < pf_nblks; i++) {
- dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
- }
- for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
- dbuf_prefetch(zf->zf_dnode, 1, iblk,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
- }
- ZFETCHSTAT_BUMP(zfetchstat_hits);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -1,2418 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 RackTop Systems.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/range_tree.h>
-
-dnode_stats_t dnode_stats = {
- { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
- { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
- { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
- { "dnode_allocate", KSTAT_DATA_UINT64 },
- { "dnode_reallocate", KSTAT_DATA_UINT64 },
- { "dnode_buf_evict", KSTAT_DATA_UINT64 },
- { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
- { "dnode_alloc_race", KSTAT_DATA_UINT64 },
- { "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
- { "dnode_move_invalid", KSTAT_DATA_UINT64 },
- { "dnode_move_recheck1", KSTAT_DATA_UINT64 },
- { "dnode_move_recheck2", KSTAT_DATA_UINT64 },
- { "dnode_move_special", KSTAT_DATA_UINT64 },
- { "dnode_move_handle", KSTAT_DATA_UINT64 },
- { "dnode_move_rwlock", KSTAT_DATA_UINT64 },
- { "dnode_move_active", KSTAT_DATA_UINT64 },
-};
-
-static kstat_t *dnode_ksp;
-static kmem_cache_t *dnode_cache;
-
-static dnode_phys_t dnode_phys_zero;
-
-int zfs_default_bs = SPA_MINBLOCKSHIFT;
-int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
- &zfs_default_bs, 0, "Default dnode block shift");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
- &zfs_default_ibs, 0, "Default dnode indirect block shift");
-
-#ifdef illumos
-#ifdef _KERNEL
-static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
-#endif /* _KERNEL */
-#endif
-
-static int
-dbuf_compare(const void *x1, const void *x2)
-{
- const dmu_buf_impl_t *d1 = x1;
- const dmu_buf_impl_t *d2 = x2;
-
- int cmp = AVL_CMP(d1->db_level, d2->db_level);
- if (likely(cmp))
- return (cmp);
-
- cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
- if (likely(cmp))
- return (cmp);
-
- if (d1->db_state == DB_SEARCH) {
- ASSERT3S(d2->db_state, !=, DB_SEARCH);
- return (-1);
- } else if (d2->db_state == DB_SEARCH) {
- ASSERT3S(d1->db_state, !=, DB_SEARCH);
- return (1);
- }
-
- return (AVL_PCMP(d1, d2));
-}
-
-/* ARGSUSED */
-static int
-dnode_cons(void *arg, void *unused, int kmflag)
-{
- dnode_t *dn = arg;
- int i;
-
- rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
- mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
-
- /*
- * Every dbuf has a reference, and dropping a tracked reference is
- * O(number of references), so don't track dn_holds.
- */
- zfs_refcount_create_untracked(&dn->dn_holds);
- zfs_refcount_create(&dn->dn_tx_holds);
- list_link_init(&dn->dn_link);
-
- bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
- bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
- bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
- bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
- bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
- bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
- bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
-
- for (i = 0; i < TXG_SIZE; i++) {
- multilist_link_init(&dn->dn_dirty_link[i]);
- dn->dn_free_ranges[i] = NULL;
- list_create(&dn->dn_dirty_records[i],
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- }
-
- dn->dn_allocated_txg = 0;
- dn->dn_free_txg = 0;
- dn->dn_assigned_txg = 0;
- dn->dn_dirty_txg = 0;
- dn->dn_dirtyctx = 0;
- dn->dn_dirtyctx_firstset = NULL;
- dn->dn_bonus = NULL;
- dn->dn_have_spill = B_FALSE;
- dn->dn_zio = NULL;
- dn->dn_oldused = 0;
- dn->dn_oldflags = 0;
- dn->dn_olduid = 0;
- dn->dn_oldgid = 0;
- dn->dn_newuid = 0;
- dn->dn_newgid = 0;
- dn->dn_id_flags = 0;
-
- dn->dn_dbufs_count = 0;
- avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_link));
-
- dn->dn_moved = 0;
- POINTER_INVALIDATE(&dn->dn_objset);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dnode_dest(void *arg, void *unused)
-{
- int i;
- dnode_t *dn = arg;
-
- rw_destroy(&dn->dn_struct_rwlock);
- mutex_destroy(&dn->dn_mtx);
- mutex_destroy(&dn->dn_dbufs_mtx);
- cv_destroy(&dn->dn_notxholds);
- zfs_refcount_destroy(&dn->dn_holds);
- zfs_refcount_destroy(&dn->dn_tx_holds);
- ASSERT(!list_link_active(&dn->dn_link));
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
- ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
- list_destroy(&dn->dn_dirty_records[i]);
- ASSERT0(dn->dn_next_nblkptr[i]);
- ASSERT0(dn->dn_next_nlevels[i]);
- ASSERT0(dn->dn_next_indblkshift[i]);
- ASSERT0(dn->dn_next_bonustype[i]);
- ASSERT0(dn->dn_rm_spillblk[i]);
- ASSERT0(dn->dn_next_bonuslen[i]);
- ASSERT0(dn->dn_next_blksz[i]);
- }
-
- ASSERT0(dn->dn_allocated_txg);
- ASSERT0(dn->dn_free_txg);
- ASSERT0(dn->dn_assigned_txg);
- ASSERT0(dn->dn_dirty_txg);
- ASSERT0(dn->dn_dirtyctx);
- ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
- ASSERT3P(dn->dn_bonus, ==, NULL);
- ASSERT(!dn->dn_have_spill);
- ASSERT3P(dn->dn_zio, ==, NULL);
- ASSERT0(dn->dn_oldused);
- ASSERT0(dn->dn_oldflags);
- ASSERT0(dn->dn_olduid);
- ASSERT0(dn->dn_oldgid);
- ASSERT0(dn->dn_newuid);
- ASSERT0(dn->dn_newgid);
- ASSERT0(dn->dn_id_flags);
-
- ASSERT0(dn->dn_dbufs_count);
- avl_destroy(&dn->dn_dbufs);
-}
-
-void
-dnode_init(void)
-{
- ASSERT(dnode_cache == NULL);
- dnode_cache = kmem_cache_create("dnode_t",
- sizeof (dnode_t),
- 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
-#ifdef _KERNEL
- kmem_cache_set_move(dnode_cache, dnode_move);
-
- dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
- KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (dnode_ksp != NULL) {
- dnode_ksp->ks_data = &dnode_stats;
- kstat_install(dnode_ksp);
- }
-#endif /* _KERNEL */
-}
-
-void
-dnode_fini(void)
-{
- if (dnode_ksp != NULL) {
- kstat_delete(dnode_ksp);
- dnode_ksp = NULL;
- }
-
- kmem_cache_destroy(dnode_cache);
- dnode_cache = NULL;
-}
-
-
-#ifdef ZFS_DEBUG
-void
-dnode_verify(dnode_t *dn)
-{
- int drop_struct_lock = FALSE;
-
- ASSERT(dn->dn_phys);
- ASSERT(dn->dn_objset);
- ASSERT(dn->dn_handle->dnh_dnode == dn);
-
- ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
-
- if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
- return;
-
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
- if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
- int i;
- int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
- ASSERT3U(dn->dn_indblkshift, >=, 0);
- ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
- if (dn->dn_datablkshift) {
- ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
- ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
- ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
- }
- ASSERT3U(dn->dn_nlevels, <=, 30);
- ASSERT(DMU_OT_IS_VALID(dn->dn_type));
- ASSERT3U(dn->dn_nblkptr, >=, 1);
- ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
- ASSERT3U(dn->dn_datablksz, ==,
- dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
- ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, max_bonuslen);
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
- }
- }
- if (dn->dn_phys->dn_type != DMU_OT_NONE)
- ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
- ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
- if (dn->dn_dbuf != NULL) {
- ASSERT3P(dn->dn_phys, ==,
- (dnode_phys_t *)dn->dn_dbuf->db.db_data +
- (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
- }
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
-}
-#endif
-
-void
-dnode_byteswap(dnode_phys_t *dnp)
-{
- uint64_t *buf64 = (void*)&dnp->dn_blkptr;
- int i;
-
- if (dnp->dn_type == DMU_OT_NONE) {
- bzero(dnp, sizeof (dnode_phys_t));
- return;
- }
-
- dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
- dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
- dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
- dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
- dnp->dn_used = BSWAP_64(dnp->dn_used);
-
- /*
- * dn_nblkptr is only one byte, so it's OK to read it in either
- * byte order. We can't read dn_bouslen.
- */
- ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
- ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
- for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
- buf64[i] = BSWAP_64(buf64[i]);
-
- /*
- * OK to check dn_bonuslen for zero, because it won't matter if
- * we have the wrong byte order. This is necessary because the
- * dnode dnode is smaller than a regular dnode.
- */
- if (dnp->dn_bonuslen != 0) {
- /*
- * Note that the bonus length calculated here may be
- * longer than the actual bonus buffer. This is because
- * we always put the bonus buffer after the last block
- * pointer (instead of packing it against the end of the
- * dnode buffer).
- */
- int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- int slots = dnp->dn_extra_slots + 1;
- size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
- ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
- dmu_object_byteswap_t byteswap =
- DMU_OT_BYTESWAP(dnp->dn_bonustype);
- dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
- }
-
- /* Swap SPILL block if we have one */
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
- byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
-
-}
-
-void
-dnode_buf_byteswap(void *vbuf, size_t size)
-{
- int i = 0;
-
- ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
- ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
-
- while (i < size) {
- dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
- dnode_byteswap(dnp);
-
- i += DNODE_MIN_SIZE;
- if (dnp->dn_type != DMU_OT_NONE)
- i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
- }
-}
-
-void
-dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
-{
- ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
-
- dnode_setdirty(dn, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
- (dn->dn_nblkptr-1) * sizeof (blkptr_t));
- dn->dn_bonuslen = newsize;
- if (newsize == 0)
- dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
- else
- dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-void
-dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
-{
- ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
- dnode_setdirty(dn, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- dn->dn_bonustype = newtype;
- dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-void
-dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
-{
- ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
- dnode_setdirty(dn, tx);
- dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
- dn->dn_have_spill = B_FALSE;
-}
-
-static void
-dnode_setdblksz(dnode_t *dn, int size)
-{
- ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
- 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
- dn->dn_datablksz = size;
- dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
- dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
-}
-
-static dnode_t *
-dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
- uint64_t object, dnode_handle_t *dnh)
-{
- dnode_t *dn;
-
- dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
-#ifdef _KERNEL
- ASSERT(!POINTER_IS_VALID(dn->dn_objset));
-#endif /* _KERNEL */
- dn->dn_moved = 0;
-
- /*
- * Defer setting dn_objset until the dnode is ready to be a candidate
- * for the dnode_move() callback.
- */
- dn->dn_object = object;
- dn->dn_dbuf = db;
- dn->dn_handle = dnh;
- dn->dn_phys = dnp;
-
- if (dnp->dn_datablkszsec) {
- dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- } else {
- dn->dn_datablksz = 0;
- dn->dn_datablkszsec = 0;
- dn->dn_datablkshift = 0;
- }
- dn->dn_indblkshift = dnp->dn_indblkshift;
- dn->dn_nlevels = dnp->dn_nlevels;
- dn->dn_type = dnp->dn_type;
- dn->dn_nblkptr = dnp->dn_nblkptr;
- dn->dn_checksum = dnp->dn_checksum;
- dn->dn_compress = dnp->dn_compress;
- dn->dn_bonustype = dnp->dn_bonustype;
- dn->dn_bonuslen = dnp->dn_bonuslen;
- dn->dn_num_slots = dnp->dn_extra_slots + 1;
- dn->dn_maxblkid = dnp->dn_maxblkid;
- dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
- dn->dn_id_flags = 0;
-
- dmu_zfetch_init(&dn->dn_zfetch, dn);
-
- ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
- ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
- ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
-
- mutex_enter(&os->os_lock);
-
- /*
- * Exclude special dnodes from os_dnodes so an empty os_dnodes
- * signifies that the special dnodes have no references from
- * their children (the entries in os_dnodes). This allows
- * dnode_destroy() to easily determine if the last child has
- * been removed and then complete eviction of the objset.
- */
- if (!DMU_OBJECT_IS_SPECIAL(object))
- list_insert_head(&os->os_dnodes, dn);
- membar_producer();
-
- /*
- * Everything else must be valid before assigning dn_objset
- * makes the dnode eligible for dnode_move().
- */
- dn->dn_objset = os;
-
- dnh->dnh_dnode = dn;
- mutex_exit(&os->os_lock);
-
- arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
-
- return (dn);
-}
-
-/*
- * Caller must be holding the dnode handle, which is released upon return.
- */
-static void
-dnode_destroy(dnode_t *dn)
-{
- objset_t *os = dn->dn_objset;
- boolean_t complete_os_eviction = B_FALSE;
-
- ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
-
- mutex_enter(&os->os_lock);
- POINTER_INVALIDATE(&dn->dn_objset);
- if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
- list_remove(&os->os_dnodes, dn);
- complete_os_eviction =
- list_is_empty(&os->os_dnodes) &&
- list_link_active(&os->os_evicting_node);
- }
- mutex_exit(&os->os_lock);
-
- /* the dnode can no longer move, so we can release the handle */
- if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
- zrl_remove(&dn->dn_handle->dnh_zrlock);
-
- dn->dn_allocated_txg = 0;
- dn->dn_free_txg = 0;
- dn->dn_assigned_txg = 0;
- dn->dn_dirty_txg = 0;
-
- dn->dn_dirtyctx = 0;
- if (dn->dn_dirtyctx_firstset != NULL) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
- if (dn->dn_bonus != NULL) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_destroy(dn->dn_bonus);
- dn->dn_bonus = NULL;
- }
- dn->dn_zio = NULL;
-
- dn->dn_have_spill = B_FALSE;
- dn->dn_oldused = 0;
- dn->dn_oldflags = 0;
- dn->dn_olduid = 0;
- dn->dn_oldgid = 0;
- dn->dn_newuid = 0;
- dn->dn_newgid = 0;
- dn->dn_id_flags = 0;
-
- dmu_zfetch_fini(&dn->dn_zfetch);
- kmem_cache_free(dnode_cache, dn);
- arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
-
- if (complete_os_eviction)
- dmu_objset_evict_done(os);
-}
-
-void
-dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
-{
- int i;
-
- ASSERT3U(dn_slots, >, 0);
- ASSERT3U(dn_slots << DNODE_SHIFT, <=,
- spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
- ASSERT3U(blocksize, <=,
- spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
- if (blocksize == 0)
- blocksize = 1 << zfs_default_bs;
- else
- blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
-
- if (ibs == 0)
- ibs = zfs_default_ibs;
-
- ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
-
- dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
- " blocksize=%d ibs=%d dn_slots=%d\n",
- dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
- DNODE_STAT_BUMP(dnode_allocate);
-
- ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
- ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
- ASSERT(ot != DMU_OT_NONE);
- ASSERT(DMU_OT_IS_VALID(ot));
- ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype == DMU_OT_SA && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0));
- ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
- ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT0(dn->dn_maxblkid);
- ASSERT0(dn->dn_allocated_txg);
- ASSERT0(dn->dn_dirty_txg);
- ASSERT0(dn->dn_assigned_txg);
- ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
- ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
- ASSERT(avl_is_empty(&dn->dn_dbufs));
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT0(dn->dn_next_nblkptr[i]);
- ASSERT0(dn->dn_next_nlevels[i]);
- ASSERT0(dn->dn_next_indblkshift[i]);
- ASSERT0(dn->dn_next_bonuslen[i]);
- ASSERT0(dn->dn_next_bonustype[i]);
- ASSERT0(dn->dn_rm_spillblk[i]);
- ASSERT0(dn->dn_next_blksz[i]);
- ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
- ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
- ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
- }
-
- dn->dn_type = ot;
- dnode_setdblksz(dn, blocksize);
- dn->dn_indblkshift = ibs;
- dn->dn_nlevels = 1;
- dn->dn_num_slots = dn_slots;
- if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
- dn->dn_nblkptr = 1;
- else {
- dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
- 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
- SPA_BLKPTRSHIFT));
- }
-
- dn->dn_bonustype = bonustype;
- dn->dn_bonuslen = bonuslen;
- dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
- dn->dn_compress = ZIO_COMPRESS_INHERIT;
- dn->dn_dirtyctx = 0;
-
- dn->dn_free_txg = 0;
- if (dn->dn_dirtyctx_firstset) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
-
- dn->dn_allocated_txg = tx->tx_txg;
- dn->dn_id_flags = 0;
-
- dnode_setdirty(dn, tx);
- dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
- dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
- dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
- dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
-}
-
-void
-dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
-{
- int nblkptr;
-
- ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=,
- spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
- ASSERT0(blocksize % SPA_MINBLOCKSIZE);
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- ASSERT(tx->tx_txg != 0);
- ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0) ||
- (bonustype == DMU_OT_SA && bonuslen == 0));
- ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=,
- DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
- ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
-
- dnode_free_interior_slots(dn);
- DNODE_STAT_BUMP(dnode_reallocate);
-
- /* clean up any unreferenced dbufs */
- dnode_evict_dbufs(dn);
-
- dn->dn_id_flags = 0;
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- dnode_setdirty(dn, tx);
- if (dn->dn_datablksz != blocksize) {
- /* change blocksize */
- ASSERT(dn->dn_maxblkid == 0 &&
- (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
- dnode_block_freed(dn, 0)));
- dnode_setdblksz(dn, blocksize);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
- }
- if (dn->dn_bonuslen != bonuslen)
- dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
-
- if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
- nblkptr = 1;
- else
- nblkptr = MIN(DN_MAX_NBLKPTR,
- 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
- SPA_BLKPTRSHIFT));
- if (dn->dn_bonustype != bonustype)
- dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
- if (dn->dn_nblkptr != nblkptr)
- dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
- if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- dbuf_rm_spill(dn, tx);
- dnode_rm_spill(dn, tx);
- }
- rw_exit(&dn->dn_struct_rwlock);
-
- /* change type */
- dn->dn_type = ot;
-
- /* change bonus size and type */
- mutex_enter(&dn->dn_mtx);
- dn->dn_bonustype = bonustype;
- dn->dn_bonuslen = bonuslen;
- dn->dn_num_slots = dn_slots;
- dn->dn_nblkptr = nblkptr;
- dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
- dn->dn_compress = ZIO_COMPRESS_INHERIT;
- ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
-
- /* fix up the bonus db_size */
- if (dn->dn_bonus) {
- dn->dn_bonus->db.db_size =
- DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
- (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
- ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
- }
-
- dn->dn_allocated_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-}
-
-#ifdef _KERNEL
-static void
-dnode_move_impl(dnode_t *odn, dnode_t *ndn)
-{
- int i;
-
- ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
- ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
- ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
- ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
-
- /* Copy fields. */
- ndn->dn_objset = odn->dn_objset;
- ndn->dn_object = odn->dn_object;
- ndn->dn_dbuf = odn->dn_dbuf;
- ndn->dn_handle = odn->dn_handle;
- ndn->dn_phys = odn->dn_phys;
- ndn->dn_type = odn->dn_type;
- ndn->dn_bonuslen = odn->dn_bonuslen;
- ndn->dn_bonustype = odn->dn_bonustype;
- ndn->dn_nblkptr = odn->dn_nblkptr;
- ndn->dn_checksum = odn->dn_checksum;
- ndn->dn_compress = odn->dn_compress;
- ndn->dn_nlevels = odn->dn_nlevels;
- ndn->dn_indblkshift = odn->dn_indblkshift;
- ndn->dn_datablkshift = odn->dn_datablkshift;
- ndn->dn_datablkszsec = odn->dn_datablkszsec;
- ndn->dn_datablksz = odn->dn_datablksz;
- ndn->dn_maxblkid = odn->dn_maxblkid;
- ndn->dn_num_slots = odn->dn_num_slots;
- bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
- sizeof (odn->dn_next_type));
- bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
- sizeof (odn->dn_next_nblkptr));
- bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
- sizeof (odn->dn_next_nlevels));
- bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
- sizeof (odn->dn_next_indblkshift));
- bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
- sizeof (odn->dn_next_bonustype));
- bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
- sizeof (odn->dn_rm_spillblk));
- bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
- sizeof (odn->dn_next_bonuslen));
- bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
- sizeof (odn->dn_next_blksz));
- for (i = 0; i < TXG_SIZE; i++) {
- list_move_tail(&ndn->dn_dirty_records[i],
- &odn->dn_dirty_records[i]);
- }
- bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
- sizeof (odn->dn_free_ranges));
- ndn->dn_allocated_txg = odn->dn_allocated_txg;
- ndn->dn_free_txg = odn->dn_free_txg;
- ndn->dn_assigned_txg = odn->dn_assigned_txg;
- ndn->dn_dirty_txg = odn->dn_dirty_txg;
- ndn->dn_dirtyctx = odn->dn_dirtyctx;
- ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
- ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
- zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
- ASSERT(avl_is_empty(&ndn->dn_dbufs));
- avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
- ndn->dn_dbufs_count = odn->dn_dbufs_count;
- ndn->dn_bonus = odn->dn_bonus;
- ndn->dn_have_spill = odn->dn_have_spill;
- ndn->dn_zio = odn->dn_zio;
- ndn->dn_oldused = odn->dn_oldused;
- ndn->dn_oldflags = odn->dn_oldflags;
- ndn->dn_olduid = odn->dn_olduid;
- ndn->dn_oldgid = odn->dn_oldgid;
- ndn->dn_newuid = odn->dn_newuid;
- ndn->dn_newgid = odn->dn_newgid;
- ndn->dn_id_flags = odn->dn_id_flags;
- dmu_zfetch_init(&ndn->dn_zfetch, NULL);
- list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
- ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
-
- /*
- * Update back pointers. Updating the handle fixes the back pointer of
- * every descendant dbuf as well as the bonus dbuf.
- */
- ASSERT(ndn->dn_handle->dnh_dnode == odn);
- ndn->dn_handle->dnh_dnode = ndn;
- if (ndn->dn_zfetch.zf_dnode == odn) {
- ndn->dn_zfetch.zf_dnode = ndn;
- }
-
- /*
- * Invalidate the original dnode by clearing all of its back pointers.
- */
- odn->dn_dbuf = NULL;
- odn->dn_handle = NULL;
- avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_link));
- odn->dn_dbufs_count = 0;
- odn->dn_bonus = NULL;
- odn->dn_zfetch.zf_dnode = NULL;
-
- /*
- * Set the low bit of the objset pointer to ensure that dnode_move()
- * recognizes the dnode as invalid in any subsequent callback.
- */
- POINTER_INVALIDATE(&odn->dn_objset);
-
- /*
- * Satisfy the destructor.
- */
- for (i = 0; i < TXG_SIZE; i++) {
- list_create(&odn->dn_dirty_records[i],
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- odn->dn_free_ranges[i] = NULL;
- odn->dn_next_nlevels[i] = 0;
- odn->dn_next_indblkshift[i] = 0;
- odn->dn_next_bonustype[i] = 0;
- odn->dn_rm_spillblk[i] = 0;
- odn->dn_next_bonuslen[i] = 0;
- odn->dn_next_blksz[i] = 0;
- }
- odn->dn_allocated_txg = 0;
- odn->dn_free_txg = 0;
- odn->dn_assigned_txg = 0;
- odn->dn_dirty_txg = 0;
- odn->dn_dirtyctx = 0;
- odn->dn_dirtyctx_firstset = NULL;
- odn->dn_have_spill = B_FALSE;
- odn->dn_zio = NULL;
- odn->dn_oldused = 0;
- odn->dn_oldflags = 0;
- odn->dn_olduid = 0;
- odn->dn_oldgid = 0;
- odn->dn_newuid = 0;
- odn->dn_newgid = 0;
- odn->dn_id_flags = 0;
-
- /*
- * Mark the dnode.
- */
- ndn->dn_moved = 1;
- odn->dn_moved = (uint8_t)-1;
-}
-
-#ifdef illumos
-/*ARGSUSED*/
-static kmem_cbrc_t
-dnode_move(void *buf, void *newbuf, size_t size, void *arg)
-{
- dnode_t *odn = buf, *ndn = newbuf;
- objset_t *os;
- int64_t refcount;
- uint32_t dbufs;
-
- /*
- * The dnode is on the objset's list of known dnodes if the objset
- * pointer is valid. We set the low bit of the objset pointer when
- * freeing the dnode to invalidate it, and the memory patterns written
- * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
- * A newly created dnode sets the objset pointer last of all to indicate
- * that the dnode is known and in a valid state to be moved by this
- * function.
- */
- os = odn->dn_objset;
- if (!POINTER_IS_VALID(os)) {
- DNODE_STAT_BUMP(dnode_move_invalid);
- return (KMEM_CBRC_DONT_KNOW);
- }
-
- /*
- * Ensure that the objset does not go away during the move.
- */
- rw_enter(&os_lock, RW_WRITER);
- if (os != odn->dn_objset) {
- rw_exit(&os_lock);
- DNODE_STAT_BUMP(dnode_move_recheck1);
- return (KMEM_CBRC_DONT_KNOW);
- }
-
- /*
- * If the dnode is still valid, then so is the objset. We know that no
- * valid objset can be freed while we hold os_lock, so we can safely
- * ensure that the objset remains in use.
- */
- mutex_enter(&os->os_lock);
-
- /*
- * Recheck the objset pointer in case the dnode was removed just before
- * acquiring the lock.
- */
- if (os != odn->dn_objset) {
- mutex_exit(&os->os_lock);
- rw_exit(&os_lock);
- DNODE_STAT_BUMP(dnode_move_recheck2);
- return (KMEM_CBRC_DONT_KNOW);
- }
-
- /*
- * At this point we know that as long as we hold os->os_lock, the dnode
- * cannot be freed and fields within the dnode can be safely accessed.
- * The objset listing this dnode cannot go away as long as this dnode is
- * on its list.
- */
- rw_exit(&os_lock);
- if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
- mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_special);
- return (KMEM_CBRC_NO);
- }
- ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
-
- /*
- * Lock the dnode handle to prevent the dnode from obtaining any new
- * holds. This also prevents the descendant dbufs and the bonus dbuf
- * from accessing the dnode, so that we can discount their holds. The
- * handle is safe to access because we know that while the dnode cannot
- * go away, neither can its handle. Once we hold dnh_zrlock, we can
- * safely move any dnode referenced only by dbufs.
- */
- if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
- mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_handle);
- return (KMEM_CBRC_LATER);
- }
-
- /*
- * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
- * We need to guarantee that there is a hold for every dbuf in order to
- * determine whether the dnode is actively referenced. Falsely matching
- * a dbuf to an active hold would lead to an unsafe move. It's possible
- * that a thread already having an active dnode hold is about to add a
- * dbuf, and we can't compare hold and dbuf counts while the add is in
- * progress.
- */
- if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
- zrl_exit(&odn->dn_handle->dnh_zrlock);
- mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_rwlock);
- return (KMEM_CBRC_LATER);
- }
-
- /*
- * A dbuf may be removed (evicted) without an active dnode hold. In that
- * case, the dbuf count is decremented under the handle lock before the
- * dbuf's hold is released. This order ensures that if we count the hold
- * after the dbuf is removed but before its hold is released, we will
- * treat the unmatched hold as active and exit safely. If we count the
- * hold before the dbuf is removed, the hold is discounted, and the
- * removal is blocked until the move completes.
- */
- refcount = zfs_refcount_count(&odn->dn_holds);
- ASSERT(refcount >= 0);
- dbufs = DN_DBUFS_COUNT(odn);
-
- /* We can't have more dbufs than dnode holds. */
- ASSERT3U(dbufs, <=, refcount);
- DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
- uint32_t, dbufs);
-
- if (refcount > dbufs) {
- rw_exit(&odn->dn_struct_rwlock);
- zrl_exit(&odn->dn_handle->dnh_zrlock);
- mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_active);
- return (KMEM_CBRC_LATER);
- }
-
- rw_exit(&odn->dn_struct_rwlock);
-
- /*
- * At this point we know that anyone with a hold on the dnode is not
- * actively referencing it. The dnode is known and in a valid state to
- * move. We're holding the locks needed to execute the critical section.
- */
- dnode_move_impl(odn, ndn);
-
- list_link_replace(&odn->dn_link, &ndn->dn_link);
- /* If the dnode was safe to move, the refcount cannot have changed. */
- ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
- ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
- zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
- mutex_exit(&os->os_lock);
-
- return (KMEM_CBRC_YES);
-}
-#endif /* illumos */
-#endif /* _KERNEL */
-
-static void
-dnode_slots_hold(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
- zrl_add(&dnh->dnh_zrlock);
- }
-}
-
-static void
-dnode_slots_rele(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
-
- if (zrl_is_locked(&dnh->dnh_zrlock))
- zrl_exit(&dnh->dnh_zrlock);
- else
- zrl_remove(&dnh->dnh_zrlock);
- }
-}
-
-static int
-dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
-
- if (!zrl_tryenter(&dnh->dnh_zrlock)) {
- for (int j = idx; j < i; j++) {
- dnh = &children->dnc_children[j];
- zrl_exit(&dnh->dnh_zrlock);
- }
-
- return (0);
- }
- }
-
- return (1);
-}
-
-static void
-dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
- dnh->dnh_dnode = ptr;
- }
-}
-
-static boolean_t
-dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- /*
- * If all dnode slots are either already free or
- * evictable return B_TRUE.
- */
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
- dnode_t *dn = dnh->dnh_dnode;
-
- if (dn == DN_SLOT_FREE) {
- continue;
- } else if (DN_SLOT_IS_PTR(dn)) {
- mutex_enter(&dn->dn_mtx);
- boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
- zfs_refcount_is_zero(&dn->dn_holds) &&
- !DNODE_IS_DIRTY(dn));
- mutex_exit(&dn->dn_mtx);
-
- if (!can_free)
- return (B_FALSE);
- else
- continue;
- } else {
- return (B_FALSE);
- }
- }
-
- return (B_TRUE);
-}
-
-static void
-dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
-
- ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
-
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
- dnode_destroy(dnh->dnh_dnode);
- dnh->dnh_dnode = DN_SLOT_FREE;
- }
- }
-}
-
-void
-dnode_free_interior_slots(dnode_t *dn)
-{
- dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
- int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
- int idx = (dn->dn_object & (epb - 1)) + 1;
- int slots = dn->dn_num_slots - 1;
-
- if (slots == 0)
- return;
-
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- while (!dnode_slots_tryenter(children, idx, slots))
- DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
-
- dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
- dnode_slots_rele(children, idx, slots);
-}
-
-void
-dnode_special_close(dnode_handle_t *dnh)
-{
- dnode_t *dn = dnh->dnh_dnode;
-
- /*
- * Wait for final references to the dnode to clear. This can
- * only happen if the arc is asynchronously evicting state that
- * has a hold on this dnode while we are trying to evict this
- * dnode.
- */
- while (zfs_refcount_count(&dn->dn_holds) > 0)
- delay(1);
- ASSERT(dn->dn_dbuf == NULL ||
- dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
- zrl_add(&dnh->dnh_zrlock);
- dnode_destroy(dn); /* implicit zrl_remove() */
- zrl_destroy(&dnh->dnh_zrlock);
- dnh->dnh_dnode = NULL;
-}
-
-void
-dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
- dnode_handle_t *dnh)
-{
- dnode_t *dn;
-
- zrl_init(&dnh->dnh_zrlock);
- zrl_tryenter(&dnh->dnh_zrlock);
-
- dn = dnode_create(os, dnp, NULL, object, dnh);
- DNODE_VERIFY(dn);
-
- zrl_exit(&dnh->dnh_zrlock);
-}
-
-static void
-dnode_buf_evict_async(void *dbu)
-{
- dnode_children_t *dnc = dbu;
-
- DNODE_STAT_BUMP(dnode_buf_evict);
-
- for (int i = 0; i < dnc->dnc_count; i++) {
- dnode_handle_t *dnh = &dnc->dnc_children[i];
- dnode_t *dn;
-
- /*
- * The dnode handle lock guards against the dnode moving to
- * another valid address, so there is no need here to guard
- * against changes to or from NULL.
- */
- if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- zrl_destroy(&dnh->dnh_zrlock);
- dnh->dnh_dnode = DN_SLOT_UNINIT;
- continue;
- }
-
- zrl_add(&dnh->dnh_zrlock);
- dn = dnh->dnh_dnode;
- /*
- * If there are holds on this dnode, then there should
- * be holds on the dnode's containing dbuf as well; thus
- * it wouldn't be eligible for eviction and this function
- * would not have been called.
- */
- ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
- ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
-
- dnode_destroy(dn); /* implicit zrl_remove() for first slot */
- zrl_destroy(&dnh->dnh_zrlock);
- dnh->dnh_dnode = DN_SLOT_UNINIT;
- }
- kmem_free(dnc, sizeof (dnode_children_t) +
- dnc->dnc_count * sizeof (dnode_handle_t));
-}
-
-/*
- * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
- * to ensure the hole at the specified object offset is large enough to
- * hold the dnode being created. The slots parameter is also used to ensure
- * a dnode does not span multiple dnode blocks. In both of these cases, if
- * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
- * are only possible when using DNODE_MUST_BE_FREE.
- *
- * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
- * dnode_hold_impl() will check if the requested dnode is already consumed
- * as an extra dnode slot by an large dnode, in which case it returns
- * ENOENT.
- *
- * errors:
- * EINVAL - invalid object number or flags.
- * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
- * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
- * - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
- * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
- * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
- * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
- * EIO - i/o error error when reading the meta dnode dbuf.
- * succeeds even for free dnodes.
- */
-int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
- void *tag, dnode_t **dnp)
-{
- int epb, idx, err, i;
- int drop_struct_lock = FALSE;
- int type;
- uint64_t blk;
- dnode_t *mdn, *dn;
- dmu_buf_impl_t *db;
- dnode_children_t *dnc;
- dnode_phys_t *dn_block;
- dnode_phys_t *dn_block_begin;
- dnode_handle_t *dnh;
-
- ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
- ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
-
- /*
- * If you are holding the spa config lock as writer, you shouldn't
- * be asking the DMU to do *anything* unless it's the root pool
- * which may require us to read from the root filesystem while
- * holding some (not all) of the locks as writer.
- */
- ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
- (spa_is_root(os->os_spa) &&
- spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
-
- ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
-
- if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
- dn = (object == DMU_USERUSED_OBJECT) ?
- DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
- if (dn == NULL)
- return (SET_ERROR(ENOENT));
- type = dn->dn_type;
- if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
- return (SET_ERROR(ENOENT));
- if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
- return (SET_ERROR(EEXIST));
- DNODE_VERIFY(dn);
- (void) zfs_refcount_add(&dn->dn_holds, tag);
- *dnp = dn;
- return (0);
- }
-
- if (object == 0 || object >= DN_MAX_OBJECT)
- return (SET_ERROR(EINVAL));
-
- mdn = DMU_META_DNODE(os);
- ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
-
- DNODE_VERIFY(mdn);
-
- if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
- rw_enter(&mdn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
- blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
-
- db = dbuf_hold(mdn, blk, FTAG);
- if (drop_struct_lock)
- rw_exit(&mdn->dn_struct_rwlock);
- if (db == NULL) {
- DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
- return (SET_ERROR(EIO));
- }
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
- if (err) {
- DNODE_STAT_BUMP(dnode_hold_dbuf_read);
- dbuf_rele(db, FTAG);
- return (err);
- }
-
- ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
- epb = db->db.db_size >> DNODE_SHIFT;
-
- idx = object & (epb - 1);
- dn_block = (dnode_phys_t *)db->db.db_data;
-
- ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
- dnc = dmu_buf_get_user(&db->db);
- dnh = NULL;
- if (dnc == NULL) {
- dnode_children_t *winner;
- int skip = 0;
-
- dnc = kmem_zalloc(sizeof (dnode_children_t) +
- epb * sizeof (dnode_handle_t), KM_SLEEP);
- dnc->dnc_count = epb;
- dnh = &dnc->dnc_children[0];
-
- /* Initialize dnode slot status from dnode_phys_t */
- for (int i = 0; i < epb; i++) {
- zrl_init(&dnh[i].dnh_zrlock);
-
- if (skip) {
- skip--;
- continue;
- }
-
- if (dn_block[i].dn_type != DMU_OT_NONE) {
- int interior = dn_block[i].dn_extra_slots;
-
- dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
- dnode_set_slots(dnc, i + 1, interior,
- DN_SLOT_INTERIOR);
- skip = interior;
- } else {
- dnh[i].dnh_dnode = DN_SLOT_FREE;
- skip = 0;
- }
- }
-
- dmu_buf_init_user(&dnc->dnc_dbu, NULL,
- dnode_buf_evict_async, NULL);
- winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
- if (winner != NULL) {
-
- for (int i = 0; i < epb; i++)
- zrl_destroy(&dnh[i].dnh_zrlock);
-
- kmem_free(dnc, sizeof (dnode_children_t) +
- epb * sizeof (dnode_handle_t));
- dnc = winner;
- }
- }
-
- ASSERT(dnc->dnc_count == epb);
- dn = DN_SLOT_UNINIT;
-
- if (flag & DNODE_MUST_BE_ALLOCATED) {
- slots = 1;
-
- while (dn == DN_SLOT_UNINIT) {
- dnode_slots_hold(dnc, idx, slots);
- dnh = &dnc->dnc_children[idx];
-
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- dn = dnh->dnh_dnode;
- break;
- } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
- DNODE_STAT_BUMP(dnode_hold_alloc_interior);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(EEXIST));
- } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
- DNODE_STAT_BUMP(dnode_hold_alloc_misses);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOENT));
- }
-
- dnode_slots_rele(dnc, idx, slots);
- if (!dnode_slots_tryenter(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
- continue;
- }
-
- /*
- * Someone else won the race and called dnode_create()
- * after we checked DN_SLOT_IS_PTR() above but before
- * we acquired the lock.
- */
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
- dn = dnh->dnh_dnode;
- } else {
- dn = dnode_create(os, dn_block + idx, db,
- object, dnh);
- }
- }
-
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
- DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
- mutex_exit(&dn->dn_mtx);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOENT));
- }
-
- DNODE_STAT_BUMP(dnode_hold_alloc_hits);
- } else if (flag & DNODE_MUST_BE_FREE) {
-
- if (idx + slots - 1 >= DNODES_PER_BLOCK) {
- DNODE_STAT_BUMP(dnode_hold_free_overflow);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- while (dn == DN_SLOT_UNINIT) {
- dnode_slots_hold(dnc, idx, slots);
-
- if (!dnode_check_slots_free(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_free_misses);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- dnode_slots_rele(dnc, idx, slots);
- if (!dnode_slots_tryenter(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
- continue;
- }
-
- if (!dnode_check_slots_free(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- /*
- * Allocated but otherwise free dnodes which would
- * be in the interior of a multi-slot dnodes need
- * to be freed. Single slot dnodes can be safely
- * re-purposed as a performance optimization.
- */
- if (slots > 1)
- dnode_reclaim_slots(dnc, idx + 1, slots - 1);
-
- dnh = &dnc->dnc_children[idx];
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- dn = dnh->dnh_dnode;
- } else {
- dn = dnode_create(os, dn_block + idx, db,
- object, dnh);
- }
- }
-
- mutex_enter(&dn->dn_mtx);
- if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
- DNODE_STAT_BUMP(dnode_hold_free_refcount);
- mutex_exit(&dn->dn_mtx);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(EEXIST));
- }
-
- dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
- DNODE_STAT_BUMP(dnode_hold_free_hits);
- } else {
- dbuf_rele(db, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- if (dn->dn_free_txg) {
- DNODE_STAT_BUMP(dnode_hold_free_txg);
- type = dn->dn_type;
- mutex_exit(&dn->dn_mtx);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
- ENOENT : EEXIST));
- }
-
- if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dnh);
-
- mutex_exit(&dn->dn_mtx);
-
- /* Now we can rely on the hold to prevent the dnode from moving. */
- dnode_slots_rele(dnc, idx, slots);
-
- DNODE_VERIFY(dn);
- ASSERT3P(dn->dn_dbuf, ==, db);
- ASSERT3U(dn->dn_object, ==, object);
- dbuf_rele(db, FTAG);
-
- *dnp = dn;
- return (0);
-}
-
-/*
- * Return held dnode if the object is allocated, NULL if not.
- */
-int
-dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
-{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
- dnp));
-}
-
-/*
- * Can only add a reference if there is already at least one
- * reference on the dnode. Returns FALSE if unable to add a
- * new reference.
- */
-boolean_t
-dnode_add_ref(dnode_t *dn, void *tag)
-{
- mutex_enter(&dn->dn_mtx);
- if (zfs_refcount_is_zero(&dn->dn_holds)) {
- mutex_exit(&dn->dn_mtx);
- return (FALSE);
- }
- VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
- mutex_exit(&dn->dn_mtx);
- return (TRUE);
-}
-
-void
-dnode_rele(dnode_t *dn, void *tag)
-{
- mutex_enter(&dn->dn_mtx);
- dnode_rele_and_unlock(dn, tag, B_FALSE);
-}
-
-void
-dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
-{
- uint64_t refs;
- /* Get while the hold prevents the dnode from moving. */
- dmu_buf_impl_t *db = dn->dn_dbuf;
- dnode_handle_t *dnh = dn->dn_handle;
-
- refs = zfs_refcount_remove(&dn->dn_holds, tag);
- mutex_exit(&dn->dn_mtx);
-
- /*
- * It's unsafe to release the last hold on a dnode by dnode_rele() or
- * indirectly by dbuf_rele() while relying on the dnode handle to
- * prevent the dnode from moving, since releasing the last hold could
- * result in the dnode's parent dbuf evicting its dnode handles. For
- * that reason anyone calling dnode_rele() or dbuf_rele() without some
- * other direct or indirect hold on the dnode must first drop the dnode
- * handle.
- */
- ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
-
- /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
- if (refs == 0 && db != NULL) {
- /*
- * Another thread could add a hold to the dnode handle in
- * dnode_hold_impl() while holding the parent dbuf. Since the
- * hold on the parent dbuf prevents the handle from being
- * destroyed, the hold on the handle is OK. We can't yet assert
- * that the handle has zero references, but that will be
- * asserted anyway when the handle gets destroyed.
- */
- mutex_enter(&db->db_mtx);
- dbuf_rele_and_unlock(db, dnh, evicting);
- }
-}
-
-void
-dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
-{
- objset_t *os = dn->dn_objset;
- uint64_t txg = tx->tx_txg;
-
- if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
- dsl_dataset_dirty(os->os_dsl_dataset, tx);
- return;
- }
-
- DNODE_VERIFY(dn);
-
-#ifdef ZFS_DEBUG
- mutex_enter(&dn->dn_mtx);
- ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
- ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
- mutex_exit(&dn->dn_mtx);
-#endif
-
- /*
- * Determine old uid/gid when necessary
- */
- dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
-
- multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
- multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
-
- /*
- * If we are already marked dirty, we're done.
- */
- if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
- multilist_sublist_unlock(mls);
- return;
- }
-
- ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
- !avl_is_empty(&dn->dn_dbufs));
- ASSERT(dn->dn_datablksz != 0);
- ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
- ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
- ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
-
- dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
- dn->dn_object, txg);
-
- multilist_sublist_insert_head(mls, dn);
-
- multilist_sublist_unlock(mls);
-
- /*
- * The dnode maintains a hold on its containing dbuf as
- * long as there are holds on it. Each instantiated child
- * dbuf maintains a hold on the dnode. When the last child
- * drops its hold, the dnode will drop its hold on the
- * containing dbuf. We add a "dirty hold" here so that the
- * dnode will hang around after we finish processing its
- * children.
- */
- VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
-
- (void) dbuf_dirty(dn->dn_dbuf, tx);
-
- dsl_dataset_dirty(os->os_dsl_dataset, tx);
-}
-
-void
-dnode_free(dnode_t *dn, dmu_tx_t *tx)
-{
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
- mutex_exit(&dn->dn_mtx);
- return;
- }
- dn->dn_free_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-
- dnode_setdirty(dn, tx);
-}
-
-/*
- * Try to change the block size for the indicated dnode. This can only
- * succeed if there are no blocks allocated or dirty beyond first block
- */
-int
-dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- int err;
-
- ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
- if (size == 0)
- size = SPA_MINBLOCKSIZE;
- else
- size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
-
- if (ibs == dn->dn_indblkshift)
- ibs = 0;
-
- if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
- return (0);
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-
- /* Check for any allocated blocks beyond the first */
- if (dn->dn_maxblkid != 0)
- goto fail;
-
- mutex_enter(&dn->dn_dbufs_mtx);
- for (db = avl_first(&dn->dn_dbufs); db != NULL;
- db = AVL_NEXT(&dn->dn_dbufs, db)) {
- if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_blkid != DMU_SPILL_BLKID) {
- mutex_exit(&dn->dn_dbufs_mtx);
- goto fail;
- }
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-
- if (ibs && dn->dn_nlevels != 1)
- goto fail;
-
- /* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
- if (err == 0)
- dbuf_new_size(db, size, tx);
- else if (err != ENOENT)
- goto fail;
-
- dnode_setdblksz(dn, size);
- dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
- if (ibs) {
- dn->dn_indblkshift = ibs;
- dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
- }
- /* rele after we have fixed the blocksize in the dnode */
- if (db)
- dbuf_rele(db, FTAG);
-
- rw_exit(&dn->dn_struct_rwlock);
- return (0);
-
-fail:
- rw_exit(&dn->dn_struct_rwlock);
- return (SET_ERROR(ENOTSUP));
-}
-
-/* read-holding callers must not rely on the lock being continuously held */
-void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
-{
- uint64_t txgoff = tx->tx_txg & TXG_MASK;
- int epbs, new_nlevels;
- uint64_t sz;
-
- ASSERT(blkid != DMU_BONUS_BLKID);
-
- ASSERT(have_read ?
- RW_READ_HELD(&dn->dn_struct_rwlock) :
- RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- /*
- * if we have a read-lock, check to see if we need to do any work
- * before upgrading to a write-lock.
- */
- if (have_read) {
- if (blkid <= dn->dn_maxblkid)
- return;
-
- if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
- rw_exit(&dn->dn_struct_rwlock);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- }
- }
-
- if (blkid <= dn->dn_maxblkid)
- goto out;
-
- dn->dn_maxblkid = blkid;
-
- /*
- * Compute the number of levels necessary to support the new maxblkid.
- */
- new_nlevels = 1;
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (sz = dn->dn_nblkptr;
- sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
- new_nlevels++;
-
- if (new_nlevels > dn->dn_nlevels) {
- int old_nlevels = dn->dn_nlevels;
- dmu_buf_impl_t *db;
- list_t *list;
- dbuf_dirty_record_t *new, *dr, *dr_next;
-
- dn->dn_nlevels = new_nlevels;
-
- ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
- dn->dn_next_nlevels[txgoff] = new_nlevels;
-
- /* dirty the left indirects */
- db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
- ASSERT(db != NULL);
- new = dbuf_dirty(db, tx);
- dbuf_rele(db, FTAG);
-
- /* transfer the dirty records to the new indirect */
- mutex_enter(&dn->dn_mtx);
- mutex_enter(&new->dt.di.dr_mtx);
- list = &dn->dn_dirty_records[txgoff];
- for (dr = list_head(list); dr; dr = dr_next) {
- dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
- if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
- list_remove(&dn->dn_dirty_records[txgoff], dr);
- list_insert_tail(&new->dt.di.dr_children, dr);
- dr->dr_parent = new;
- }
- }
- mutex_exit(&new->dt.di.dr_mtx);
- mutex_exit(&dn->dn_mtx);
- }
-
-out:
- if (have_read)
- rw_downgrade(&dn->dn_struct_rwlock);
-}
-
-static void
-dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
- if (db != NULL) {
- dmu_buf_will_dirty(&db->db, tx);
- dbuf_rele(db, FTAG);
- }
-}
-
-/*
- * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
- * and end_blkid.
- */
-static void
-dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t db_search;
- dmu_buf_impl_t *db;
- avl_index_t where;
-
- mutex_enter(&dn->dn_dbufs_mtx);
-
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
- for (;;) {
-
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
- if (db == NULL)
- db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
-
- if (db == NULL || db->db_level != 1 ||
- db->db_blkid >= end_blkid) {
- break;
- }
-
- /*
- * Setup the next blkid we want to search for.
- */
- db_search.db_blkid = db->db_blkid + 1;
- ASSERT3U(db->db_blkid, >=, start_blkid);
-
- /*
- * If the dbuf transitions to DB_EVICTING while we're trying
- * to dirty it, then we will be unable to discover it in
- * the dbuf hash table. This will result in a call to
- * dbuf_create() which needs to acquire the dn_dbufs_mtx
- * lock. To avoid a deadlock, we drop the lock before
- * dirtying the level-1 dbuf.
- */
- mutex_exit(&dn->dn_dbufs_mtx);
- dnode_dirty_l1(dn, db->db_blkid, tx);
- mutex_enter(&dn->dn_dbufs_mtx);
- }
-
-#ifdef ZFS_DEBUG
- /*
- * Walk all the in-core level-1 dbufs and verify they have been dirtied.
- */
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
- if (db == NULL)
- db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
- for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
- if (db->db_level != 1 || db->db_blkid >= end_blkid)
- break;
- ASSERT(db->db_dirtycnt > 0);
- }
-#endif
- mutex_exit(&dn->dn_dbufs_mtx);
-}
-
-void
-dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- uint64_t blkoff, blkid, nblks;
- int blksz, blkshift, head, tail;
- int trunc = FALSE;
- int epbs;
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- blksz = dn->dn_datablksz;
- blkshift = dn->dn_datablkshift;
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- if (len == DMU_OBJECT_END) {
- len = UINT64_MAX - off;
- trunc = TRUE;
- }
-
- /*
- * First, block align the region to free:
- */
- if (ISP2(blksz)) {
- head = P2NPHASE(off, blksz);
- blkoff = P2PHASE(off, blksz);
- if ((off >> blkshift) > dn->dn_maxblkid)
- goto out;
- } else {
- ASSERT(dn->dn_maxblkid == 0);
- if (off == 0 && len >= blksz) {
- /*
- * Freeing the whole block; fast-track this request.
- */
- blkid = 0;
- nblks = 1;
- if (dn->dn_nlevels > 1)
- dnode_dirty_l1(dn, 0, tx);
- goto done;
- } else if (off >= blksz) {
- /* Freeing past end-of-data */
- goto out;
- } else {
- /* Freeing part of the block. */
- head = blksz - off;
- ASSERT3U(head, >, 0);
- }
- blkoff = off;
- }
- /* zero out any partial block data at the start of the range */
- if (head) {
- ASSERT3U(blkoff + head, ==, blksz);
- if (len < head)
- head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
- TRUE, FALSE, FTAG, &db) == 0) {
- caddr_t data;
-
- /* don't dirty if it isn't on disk and isn't dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dmu_buf_will_dirty(&db->db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- data = db->db.db_data;
- bzero(data + blkoff, head);
- }
- dbuf_rele(db, FTAG);
- }
- off += head;
- len -= head;
- }
-
- /* If the range was less than one block, we're done */
- if (len == 0)
- goto out;
-
- /* If the remaining range is past end of file, we're done */
- if ((off >> blkshift) > dn->dn_maxblkid)
- goto out;
-
- ASSERT(ISP2(blksz));
- if (trunc)
- tail = 0;
- else
- tail = P2PHASE(len, blksz);
-
- ASSERT0(P2PHASE(off, blksz));
- /* zero out any partial block data at the end of the range */
- if (tail) {
- if (len < tail)
- tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
- TRUE, FALSE, FTAG, &db) == 0) {
- /* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dmu_buf_will_dirty(&db->db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- bzero(db->db.db_data, tail);
- }
- dbuf_rele(db, FTAG);
- }
- len -= tail;
- }
-
- /* If the range did not include a full block, we are done */
- if (len == 0)
- goto out;
-
- ASSERT(IS_P2ALIGNED(off, blksz));
- ASSERT(trunc || IS_P2ALIGNED(len, blksz));
- blkid = off >> blkshift;
- nblks = len >> blkshift;
- if (trunc)
- nblks += 1;
-
- /*
- * Dirty all the indirect blocks in this range. Note that only
- * the first and last indirect blocks can actually be written
- * (if they were partially freed) -- they must be dirtied, even if
- * they do not exist on disk yet. The interior blocks will
- * be freed by free_children(), so they will not actually be written.
- * Even though these interior blocks will not be written, we
- * dirty them for two reasons:
- *
- * - It ensures that the indirect blocks remain in memory until
- * syncing context. (They have already been prefetched by
- * dmu_tx_hold_free(), so we don't have to worry about reading
- * them serially here.)
- *
- * - The dirty space accounting will put pressure on the txg sync
- * mechanism to begin syncing, and to delay transactions if there
- * is a large amount of freeing. Even though these indirect
- * blocks will not be written, we could need to write the same
- * amount of space if we copy the freed BPs into deadlists.
- */
- if (dn->dn_nlevels > 1) {
- uint64_t first, last;
-
- first = blkid >> epbs;
- dnode_dirty_l1(dn, first, tx);
- if (trunc)
- last = dn->dn_maxblkid >> epbs;
- else
- last = (blkid + nblks - 1) >> epbs;
- if (last != first)
- dnode_dirty_l1(dn, last, tx);
-
- dnode_dirty_l1range(dn, first, last, tx);
-
- int shift = dn->dn_datablkshift + dn->dn_indblkshift -
- SPA_BLKPTRSHIFT;
- for (uint64_t i = first + 1; i < last; i++) {
- /*
- * Set i to the blockid of the next non-hole
- * level-1 indirect block at or after i. Note
- * that dnode_next_offset() operates in terms of
- * level-0-equivalent bytes.
- */
- uint64_t ibyte = i << shift;
- int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
- &ibyte, 2, 1, 0);
- i = ibyte >> shift;
- if (i >= last)
- break;
-
- /*
- * Normally we should not see an error, either
- * from dnode_next_offset() or dbuf_hold_level()
- * (except for ESRCH from dnode_next_offset).
- * If there is an i/o error, then when we read
- * this block in syncing context, it will use
- * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
- * to the "failmode" property. dnode_next_offset()
- * doesn't have a flag to indicate MUSTSUCCEED.
- */
- if (err != 0)
- break;
-
- dnode_dirty_l1(dn, i, tx);
- }
- }
-
-done:
- /*
- * Add this range to the dnode range list.
- * We will finish up this free operation in the syncing phase.
- */
- mutex_enter(&dn->dn_mtx);
- int txgoff = tx->tx_txg & TXG_MASK;
- if (dn->dn_free_ranges[txgoff] == NULL) {
- dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
- }
- range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
- range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- mutex_exit(&dn->dn_mtx);
-
- dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
- dnode_setdirty(dn, tx);
-out:
-
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-static boolean_t
-dnode_spill_freed(dnode_t *dn)
-{
- int i;
-
- mutex_enter(&dn->dn_mtx);
- for (i = 0; i < TXG_SIZE; i++) {
- if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
- break;
- }
- mutex_exit(&dn->dn_mtx);
- return (i < TXG_SIZE);
-}
-
-/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
-uint64_t
-dnode_block_freed(dnode_t *dn, uint64_t blkid)
-{
- void *dp = spa_get_dsl(dn->dn_objset->os_spa);
- int i;
-
- if (blkid == DMU_BONUS_BLKID)
- return (FALSE);
-
- /*
- * If we're in the process of opening the pool, dp will not be
- * set yet, but there shouldn't be anything dirty.
- */
- if (dp == NULL)
- return (FALSE);
-
- if (dn->dn_free_txg)
- return (TRUE);
-
- if (blkid == DMU_SPILL_BLKID)
- return (dnode_spill_freed(dn));
-
- mutex_enter(&dn->dn_mtx);
- for (i = 0; i < TXG_SIZE; i++) {
- if (dn->dn_free_ranges[i] != NULL &&
- range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
- break;
- }
- mutex_exit(&dn->dn_mtx);
- return (i < TXG_SIZE);
-}
-
-/* call from syncing context when we actually write/free space for this dnode */
-void
-dnode_diduse_space(dnode_t *dn, int64_t delta)
-{
- uint64_t space;
- dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
- dn, dn->dn_phys,
- (u_longlong_t)dn->dn_phys->dn_used,
- (longlong_t)delta);
-
- mutex_enter(&dn->dn_mtx);
- space = DN_USED_BYTES(dn->dn_phys);
- if (delta > 0) {
- ASSERT3U(space + delta, >=, space); /* no overflow */
- } else {
- ASSERT3U(space, >=, -delta); /* no underflow */
- }
- space += delta;
- if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
- ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
- ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
- dn->dn_phys->dn_used = space >> DEV_BSHIFT;
- } else {
- dn->dn_phys->dn_used = space;
- dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
- }
- mutex_exit(&dn->dn_mtx);
-}
-
-/*
- * Scans a block at the indicated "level" looking for a hole or data,
- * depending on 'flags'.
- *
- * If level > 0, then we are scanning an indirect block looking at its
- * pointers. If level == 0, then we are looking at a block of dnodes.
- *
- * If we don't find what we are looking for in the block, we return ESRCH.
- * Otherwise, return with *offset pointing to the beginning (if searching
- * forwards) or end (if searching backwards) of the range covered by the
- * block pointer we matched on (or dnode).
- *
- * The basic search algorithm used below by dnode_next_offset() is to
- * use this function to search up the block tree (widen the search) until
- * we find something (i.e., we don't return ESRCH) and then search back
- * down the tree (narrow the search) until we reach our original search
- * level.
- */
-static int
-dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
-{
- dmu_buf_impl_t *db = NULL;
- void *data = NULL;
- uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- uint64_t epb = 1ULL << epbs;
- uint64_t minfill, maxfill;
- boolean_t hole;
- int i, inc, error, span;
-
- dprintf("probing object %llu offset %llx level %d of %u\n",
- dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
- hole = ((flags & DNODE_FIND_HOLE) != 0);
- inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
- ASSERT(txg == 0 || !hole);
-
- if (lvl == dn->dn_phys->dn_nlevels) {
- error = 0;
- epb = dn->dn_phys->dn_nblkptr;
- data = dn->dn_phys->dn_blkptr;
- } else {
- uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
- if (error) {
- if (error != ENOENT)
- return (error);
- if (hole)
- return (0);
- /*
- * This can only happen when we are searching up
- * the block tree for data. We don't really need to
- * adjust the offset, as we will just end up looking
- * at the pointer to this block in its parent, and its
- * going to be unallocated, so we will skip over it.
- */
- return (SET_ERROR(ESRCH));
- }
- error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
- if (error) {
- dbuf_rele(db, FTAG);
- return (error);
- }
- data = db->db.db_data;
- }
-
-
- if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
- db->db_blkptr->blk_birth <= txg ||
- BP_IS_HOLE(db->db_blkptr))) {
- /*
- * This can only happen when we are searching up the tree
- * and these conditions mean that we need to keep climbing.
- */
- error = SET_ERROR(ESRCH);
- } else if (lvl == 0) {
- dnode_phys_t *dnp = data;
-
- ASSERT(dn->dn_type == DMU_OT_DNODE);
- ASSERT(!(flags & DNODE_FIND_BACKWARDS));
-
- for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
- i < blkfill; i += dnp[i].dn_extra_slots + 1) {
- if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
- break;
- }
-
- if (i == blkfill)
- error = SET_ERROR(ESRCH);
-
- *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
- (i << DNODE_SHIFT);
- } else {
- blkptr_t *bp = data;
- uint64_t start = *offset;
- span = (lvl - 1) * epbs + dn->dn_datablkshift;
- minfill = 0;
- maxfill = blkfill << ((lvl - 1) * epbs);
-
- if (hole)
- maxfill--;
- else
- minfill++;
-
- *offset = *offset >> span;
- for (i = BF64_GET(*offset, 0, epbs);
- i >= 0 && i < epb; i += inc) {
- if (BP_GET_FILL(&bp[i]) >= minfill &&
- BP_GET_FILL(&bp[i]) <= maxfill &&
- (hole || bp[i].blk_birth > txg))
- break;
- if (inc > 0 || *offset > 0)
- *offset += inc;
- }
- *offset = *offset << span;
- if (inc < 0) {
- /* traversing backwards; position offset at the end */
- ASSERT3U(*offset, <=, start);
- *offset = MIN(*offset + (1ULL << span) - 1, start);
- } else if (*offset < start) {
- *offset = start;
- }
- if (i < 0 || i >= epb)
- error = SET_ERROR(ESRCH);
- }
-
- if (db)
- dbuf_rele(db, FTAG);
-
- return (error);
-}
-
-/*
- * Find the next hole, data, or sparse region at or after *offset.
- * The value 'blkfill' tells us how many items we expect to find
- * in an L0 data block; this value is 1 for normal objects,
- * DNODES_PER_BLOCK for the meta dnode, and some fraction of
- * DNODES_PER_BLOCK when searching for sparse regions thereof.
- *
- * Examples:
- *
- * dnode_next_offset(dn, flags, offset, 1, 1, 0);
- * Finds the next/previous hole/data in a file.
- * Used in dmu_offset_next().
- *
- * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
- * Finds the next free/allocated dnode an objset's meta-dnode.
- * Only finds objects that have new contents since txg (ie.
- * bonus buffer changes and content removal are ignored).
- * Used in dmu_object_next().
- *
- * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
- * Finds the next L2 meta-dnode bp that's at most 1/4 full.
- * Used in dmu_object_alloc().
- */
-int
-dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
- int minlvl, uint64_t blkfill, uint64_t txg)
-{
- uint64_t initial_offset = *offset;
- int lvl, maxlvl;
- int error = 0;
-
- if (!(flags & DNODE_FIND_HAVELOCK))
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
-
- if (dn->dn_phys->dn_nlevels == 0) {
- error = SET_ERROR(ESRCH);
- goto out;
- }
-
- if (dn->dn_datablkshift == 0) {
- if (*offset < dn->dn_datablksz) {
- if (flags & DNODE_FIND_HOLE)
- *offset = dn->dn_datablksz;
- } else {
- error = SET_ERROR(ESRCH);
- }
- goto out;
- }
-
- maxlvl = dn->dn_phys->dn_nlevels;
-
- for (lvl = minlvl; lvl <= maxlvl; lvl++) {
- error = dnode_next_offset_level(dn,
- flags, offset, lvl, blkfill, txg);
- if (error != ESRCH)
- break;
- }
-
- while (error == 0 && --lvl >= minlvl) {
- error = dnode_next_offset_level(dn,
- flags, offset, lvl, blkfill, txg);
- }
-
- /*
- * There's always a "virtual hole" at the end of the object, even
- * if all BP's which physically exist are non-holes.
- */
- if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
- minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
- error = 0;
- }
-
- if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
- initial_offset < *offset : initial_offset > *offset))
- error = SET_ERROR(ESRCH);
-out:
- if (!(flags & DNODE_FIND_HAVELOCK))
- rw_exit(&dn->dn_struct_rwlock);
-
- return (error);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -1,779 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-#include <sys/range_tree.h>
-#include <sys/zfeature.h>
-
-static void
-dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- int txgoff = tx->tx_txg & TXG_MASK;
- int nblkptr = dn->dn_phys->dn_nblkptr;
- int old_toplvl = dn->dn_phys->dn_nlevels - 1;
- int new_level = dn->dn_next_nlevels[txgoff];
- int i;
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-
- /* this dnode can't be paged out because it's dirty */
- ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
- ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
-
- db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
- ASSERT(db != NULL);
-
- dn->dn_phys->dn_nlevels = new_level;
- dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
- dn->dn_object, dn->dn_phys->dn_nlevels);
-
- /* transfer dnode's block pointers to new indirect block */
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
- ASSERT(db->db.db_data);
- ASSERT(arc_released(db->db_buf));
- ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
- bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
- sizeof (blkptr_t) * nblkptr);
- arc_buf_freeze(db->db_buf);
-
- /* set dbuf's parent pointers to new indirect buf */
- for (i = 0; i < nblkptr; i++) {
- dmu_buf_impl_t *child =
- dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
-
- if (child == NULL)
- continue;
-#ifdef DEBUG
- DB_DNODE_ENTER(child);
- ASSERT3P(DB_DNODE(child), ==, dn);
- DB_DNODE_EXIT(child);
-#endif /* DEBUG */
- if (child->db_parent && child->db_parent != dn->dn_dbuf) {
- ASSERT(child->db_parent->db_level == db->db_level);
- ASSERT(child->db_blkptr !=
- &dn->dn_phys->dn_blkptr[child->db_blkid]);
- mutex_exit(&child->db_mtx);
- continue;
- }
- ASSERT(child->db_parent == NULL ||
- child->db_parent == dn->dn_dbuf);
-
- child->db_parent = db;
- dbuf_add_ref(db, child);
- if (db->db.db_data)
- child->db_blkptr = (blkptr_t *)db->db.db_data + i;
- else
- child->db_blkptr = NULL;
- dprintf_dbuf_bp(child, child->db_blkptr,
- "changed db_blkptr to new indirect %s", "");
-
- mutex_exit(&child->db_mtx);
- }
-
- bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
-
- dbuf_rele(db, FTAG);
-
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-static void
-free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- uint64_t bytesfreed = 0;
-
- dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
-
- for (int i = 0; i < num; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
-
- bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
- ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
-
- /*
- * Save some useful information on the holes being
- * punched, including logical size, type, and indirection
- * level. Retaining birth time enables detection of when
- * holes are punched for reducing the number of free
- * records transmitted during a zfs send.
- */
-
- uint64_t lsize = BP_GET_LSIZE(bp);
- dmu_object_type_t type = BP_GET_TYPE(bp);
- uint64_t lvl = BP_GET_LEVEL(bp);
-
- bzero(bp, sizeof (blkptr_t));
-
- if (spa_feature_is_active(dn->dn_objset->os_spa,
- SPA_FEATURE_HOLE_BIRTH)) {
- BP_SET_LSIZE(bp, lsize);
- BP_SET_TYPE(bp, type);
- BP_SET_LEVEL(bp, lvl);
- BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
- }
- }
- dnode_diduse_space(dn, -bytesfreed);
-}
-
-#ifdef ZFS_DEBUG
-static void
-free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
-{
- int off, num;
- int i, err, epbs;
- uint64_t txg = tx->tx_txg;
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- off = start - (db->db_blkid * 1<<epbs);
- num = end - start + 1;
-
- ASSERT3U(off, >=, 0);
- ASSERT3U(num, >=, 0);
- ASSERT3U(db->db_level, >, 0);
- ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
- ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
- ASSERT(db->db_blkptr != NULL);
-
- for (i = off; i < off+num; i++) {
- uint64_t *buf;
- dmu_buf_impl_t *child;
- dbuf_dirty_record_t *dr;
- int j;
-
- ASSERT(db->db_level == 1);
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
- rw_exit(&dn->dn_struct_rwlock);
- if (err == ENOENT)
- continue;
- ASSERT(err == 0);
- ASSERT(child->db_level == 0);
- dr = child->db_last_dirty;
- while (dr && dr->dr_txg > txg)
- dr = dr->dr_next;
- ASSERT(dr == NULL || dr->dr_txg == txg);
-
- /* data_old better be zeroed */
- if (dr) {
- buf = dr->dt.dl.dr_data->b_data;
- for (j = 0; j < child->db.db_size >> 3; j++) {
- if (buf[j] != 0) {
- panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- (void *)child, i, off, num);
- }
- }
- }
-
- /*
- * db_data better be zeroed unless it's dirty in a
- * future txg.
- */
- mutex_enter(&child->db_mtx);
- buf = child->db.db_data;
- if (buf != NULL && child->db_state != DB_FILL &&
- child->db_last_dirty == NULL) {
- for (j = 0; j < child->db.db_size >> 3; j++) {
- if (buf[j] != 0) {
- panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- (void *)child, i, off, num);
- }
- }
- }
- mutex_exit(&child->db_mtx);
-
- dbuf_rele(child, FTAG);
- }
- DB_DNODE_EXIT(db);
-}
-#endif
-
-/*
- * We don't usually free the indirect blocks here. If in one txg we have a
- * free_range and a write to the same indirect block, it's important that we
- * preserve the hole's birth times. Therefore, we don't free any any indirect
- * blocks in free_children(). If an indirect block happens to turn into all
- * holes, it will be freed by dbuf_write_children_ready, which happens at a
- * point in the syncing process where we know for certain the contents of the
- * indirect block.
- *
- * However, if we're freeing a dnode, its space accounting must go to zero
- * before we actually try to free the dnode, or we will trip an assertion. In
- * addition, we know the case described above cannot occur, because the dnode is
- * being freed. Therefore, we free the indirect blocks immediately in that
- * case.
- */
-static void
-free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
- boolean_t free_indirects, dmu_tx_t *tx)
-{
- dnode_t *dn;
- blkptr_t *bp;
- dmu_buf_impl_t *subdb;
- uint64_t start, end, dbstart, dbend;
- unsigned int epbs, shift, i;
-
- /*
- * There is a small possibility that this block will not be cached:
- * 1 - if level > 1 and there are no children with level <= 1
- * 2 - if this block was evicted since we read it from
- * dmu_tx_hold_free().
- */
- if (db->db_state != DB_CACHED)
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
-
- /*
- * If we modify this indirect block, and we are not freeing the
- * dnode (!free_indirects), then this indirect block needs to get
- * written to disk by dbuf_write(). If it is dirty, we know it will
- * be written (otherwise, we would have incorrect on-disk state
- * because the space would be freed but still referenced by the BP
- * in this indirect block). Therefore we VERIFY that it is
- * dirty.
- *
- * Our VERIFY covers some cases that do not actually have to be
- * dirty, but the open-context code happens to dirty. E.g. if the
- * blocks we are freeing are all holes, because in that case, we
- * are only freeing part of this indirect block, so it is an
- * ancestor of the first or last block to be freed. The first and
- * last L1 indirect blocks are always dirtied by dnode_free_range().
- */
- VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
-
- dbuf_release_bp(db);
- bp = db->db.db_data;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(epbs, <, 31);
- shift = (db->db_level - 1) * epbs;
- dbstart = db->db_blkid << epbs;
- start = blkid >> shift;
- if (dbstart < start) {
- bp += start - dbstart;
- } else {
- start = dbstart;
- }
- dbend = ((db->db_blkid + 1) << epbs) - 1;
- end = (blkid + nblks - 1) >> shift;
- if (dbend <= end)
- end = dbend;
-
- ASSERT3U(start, <=, end);
-
- if (db->db_level == 1) {
- FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
- } else {
- for (uint64_t id = start; id <= end; id++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
- id, TRUE, FALSE, FTAG, &subdb));
- rw_exit(&dn->dn_struct_rwlock);
- ASSERT3P(bp, ==, subdb->db_blkptr);
-
- free_children(subdb, blkid, nblks, free_indirects, tx);
- dbuf_rele(subdb, FTAG);
- }
- }
-
- if (free_indirects) {
- for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
- ASSERT(BP_IS_HOLE(bp));
- bzero(db->db.db_data, db->db.db_size);
- free_blocks(dn, db->db_blkptr, 1, tx);
- }
-
- DB_DNODE_EXIT(db);
- arc_buf_freeze(db->db_buf);
-}
-
-/*
- * Traverse the indicated range of the provided file
- * and "free" all the blocks contained there.
- */
-static void
-dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
- boolean_t free_indirects, dmu_tx_t *tx)
-{
- blkptr_t *bp = dn->dn_phys->dn_blkptr;
- int dnlevel = dn->dn_phys->dn_nlevels;
- boolean_t trunc = B_FALSE;
-
- if (blkid > dn->dn_phys->dn_maxblkid)
- return;
-
- ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
- if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
- nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
- trunc = B_TRUE;
- }
-
- /* There are no indirect blocks in the object */
- if (dnlevel == 1) {
- if (blkid >= dn->dn_phys->dn_nblkptr) {
- /* this range was never made persistent */
- return;
- }
- ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
- free_blocks(dn, bp + blkid, nblks, tx);
- } else {
- int shift = (dnlevel - 1) *
- (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
- int start = blkid >> shift;
- int end = (blkid + nblks - 1) >> shift;
- dmu_buf_impl_t *db;
-
- ASSERT(start < dn->dn_phys->dn_nblkptr);
- bp += start;
- for (int i = start; i <= end; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
- TRUE, FALSE, FTAG, &db));
- rw_exit(&dn->dn_struct_rwlock);
-
- free_children(db, blkid, nblks, free_indirects, tx);
- dbuf_rele(db, FTAG);
- }
- }
-
- if (trunc) {
- dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
-
- uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
- (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- ASSERT(off < dn->dn_phys->dn_maxblkid ||
- dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
- }
-}
-
-typedef struct dnode_sync_free_range_arg {
- dnode_t *dsfra_dnode;
- dmu_tx_t *dsfra_tx;
- boolean_t dsfra_free_indirects;
-} dnode_sync_free_range_arg_t;
-
-static void
-dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
-{
- dnode_sync_free_range_arg_t *dsfra = arg;
- dnode_t *dn = dsfra->dsfra_dnode;
-
- mutex_exit(&dn->dn_mtx);
- dnode_sync_free_range_impl(dn, blkid, nblks,
- dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
- mutex_enter(&dn->dn_mtx);
-}
-
-/*
- * Try to kick all the dnode's dbufs out of the cache...
- */
-void
-dnode_evict_dbufs(dnode_t *dn)
-{
- dmu_buf_impl_t db_marker;
- dmu_buf_impl_t *db, *db_next;
-
- mutex_enter(&dn->dn_dbufs_mtx);
- for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
-
-#ifdef DEBUG
- DB_DNODE_ENTER(db);
- ASSERT3P(DB_DNODE(db), ==, dn);
- DB_DNODE_EXIT(db);
-#endif /* DEBUG */
-
- mutex_enter(&db->db_mtx);
- if (db->db_state != DB_EVICTING &&
- zfs_refcount_is_zero(&db->db_holds)) {
- db_marker.db_level = db->db_level;
- db_marker.db_blkid = db->db_blkid;
- db_marker.db_state = DB_SEARCH;
- avl_insert_here(&dn->dn_dbufs, &db_marker, db,
- AVL_BEFORE);
-
- /*
- * We need to use the "marker" dbuf rather than
- * simply getting the next dbuf, because
- * dbuf_destroy() may actually remove multiple dbufs.
- * It can call itself recursively on the parent dbuf,
- * which may also be removed from dn_dbufs. The code
- * flow would look like:
- *
- * dbuf_destroy():
- * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
- * if (!cacheable || pending_evict)
- * dbuf_destroy()
- */
- dbuf_destroy(db);
-
- db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
- avl_remove(&dn->dn_dbufs, &db_marker);
- } else {
- db->db_pending_evict = TRUE;
- mutex_exit(&db->db_mtx);
- db_next = AVL_NEXT(&dn->dn_dbufs, db);
- }
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-
- dnode_evict_bonus(dn);
-}
-
-void
-dnode_evict_bonus(dnode_t *dn)
-{
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus != NULL) {
- if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_destroy(dn->dn_bonus);
- dn->dn_bonus = NULL;
- } else {
- dn->dn_bonus->db_pending_evict = TRUE;
- }
- }
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-static void
-dnode_undirty_dbufs(list_t *list)
-{
- dbuf_dirty_record_t *dr;
-
- while (dr = list_head(list)) {
- dmu_buf_impl_t *db = dr->dr_dbuf;
- uint64_t txg = dr->dr_txg;
-
- if (db->db_level != 0)
- dnode_undirty_dbufs(&dr->dt.di.dr_children);
-
- mutex_enter(&db->db_mtx);
- /* XXX - use dbuf_undirty()? */
- list_remove(list, dr);
- ASSERT(db->db_last_dirty == dr);
- db->db_last_dirty = NULL;
- db->db_dirtycnt -= 1;
- if (db->db_level == 0) {
- ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
- dr->dt.dl.dr_data == db->db_buf);
- dbuf_unoverride(dr);
- } else {
- mutex_destroy(&dr->dt.di.dr_mtx);
- list_destroy(&dr->dt.di.dr_children);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
- }
-}
-
-static void
-dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
-{
- int txgoff = tx->tx_txg & TXG_MASK;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- /*
- * Our contents should have been freed in dnode_sync() by the
- * free range record inserted by the caller of dnode_free().
- */
- ASSERT0(DN_USED_BYTES(dn->dn_phys));
- ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
-
- dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
- dnode_evict_dbufs(dn);
-
- /*
- * XXX - It would be nice to assert this, but we may still
- * have residual holds from async evictions from the arc...
- *
- * zfs_obj_to_path() also depends on this being
- * commented out.
- *
- * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1);
- */
-
- /* Undirty next bits */
- dn->dn_next_nlevels[txgoff] = 0;
- dn->dn_next_indblkshift[txgoff] = 0;
- dn->dn_next_blksz[txgoff] = 0;
-
- /* ASSERT(blkptrs are zero); */
- ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(dn->dn_type != DMU_OT_NONE);
-
- ASSERT(dn->dn_free_txg > 0);
- if (dn->dn_allocated_txg != dn->dn_free_txg)
- dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
- bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
- dnode_free_interior_slots(dn);
-
- mutex_enter(&dn->dn_mtx);
- dn->dn_type = DMU_OT_NONE;
- dn->dn_maxblkid = 0;
- dn->dn_allocated_txg = 0;
- dn->dn_free_txg = 0;
- dn->dn_have_spill = B_FALSE;
- dn->dn_num_slots = 1;
- mutex_exit(&dn->dn_mtx);
-
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
-
- dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
- /*
- * Now that we've released our hold, the dnode may
- * be evicted, so we musn't access it.
- */
-}
-
-/*
- * Write out the dnode's dirty buffers.
- */
-void
-dnode_sync(dnode_t *dn, dmu_tx_t *tx)
-{
- dnode_phys_t *dnp = dn->dn_phys;
- int txgoff = tx->tx_txg & TXG_MASK;
- list_t *list = &dn->dn_dirty_records[txgoff];
- static const dnode_phys_t zerodn = { 0 };
- boolean_t kill_spill = B_FALSE;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
- ASSERT(dnp->dn_type != DMU_OT_NONE ||
- bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
- DNODE_VERIFY(dn);
-
- ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
-
- if (dmu_objset_userused_enabled(dn->dn_objset) &&
- !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
- mutex_enter(&dn->dn_mtx);
- dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
- dn->dn_oldflags = dn->dn_phys->dn_flags;
- dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
- mutex_exit(&dn->dn_mtx);
- dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
- } else {
- /* Once we account for it, we should always account for it. */
- ASSERT(!(dn->dn_phys->dn_flags &
- DNODE_FLAG_USERUSED_ACCOUNTED));
- }
-
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_allocated_txg == tx->tx_txg) {
- /* The dnode is newly allocated or reallocated */
- if (dnp->dn_type == DMU_OT_NONE) {
- /* this is a first alloc, not a realloc */
- dnp->dn_nlevels = 1;
- dnp->dn_nblkptr = dn->dn_nblkptr;
- }
-
- dnp->dn_type = dn->dn_type;
- dnp->dn_bonustype = dn->dn_bonustype;
- dnp->dn_bonuslen = dn->dn_bonuslen;
- }
-
- dnp->dn_extra_slots = dn->dn_num_slots - 1;
-
- ASSERT(dnp->dn_nlevels > 1 ||
- BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
- BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- ASSERT(dnp->dn_nlevels < 2 ||
- BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
-
- if (dn->dn_next_type[txgoff] != 0) {
- dnp->dn_type = dn->dn_type;
- dn->dn_next_type[txgoff] = 0;
- }
-
- if (dn->dn_next_blksz[txgoff] != 0) {
- ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
- SPA_MINBLOCKSIZE) == 0);
- ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- dn->dn_maxblkid == 0 || list_head(list) != NULL ||
- dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
- dnp->dn_datablkszsec ||
- !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
- dnp->dn_datablkszsec =
- dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
- dn->dn_next_blksz[txgoff] = 0;
- }
-
- if (dn->dn_next_bonuslen[txgoff] != 0) {
- if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
- dnp->dn_bonuslen = 0;
- else
- dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
- ASSERT(dnp->dn_bonuslen <=
- DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
- dn->dn_next_bonuslen[txgoff] = 0;
- }
-
- if (dn->dn_next_bonustype[txgoff] != 0) {
- ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
- dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
- dn->dn_next_bonustype[txgoff] = 0;
- }
-
- boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
- dn->dn_free_txg <= tx->tx_txg;
-
- /*
- * Remove the spill block if we have been explicitly asked to
- * remove it, or if the object is being removed.
- */
- if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
- kill_spill = B_TRUE;
- dn->dn_rm_spillblk[txgoff] = 0;
- }
-
- if (dn->dn_next_indblkshift[txgoff] != 0) {
- ASSERT(dnp->dn_nlevels == 1);
- dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
- dn->dn_next_indblkshift[txgoff] = 0;
- }
-
- /*
- * Just take the live (open-context) values for checksum and compress.
- * Strictly speaking it's a future leak, but nothing bad happens if we
- * start using the new checksum or compress algorithm a little early.
- */
- dnp->dn_checksum = dn->dn_checksum;
- dnp->dn_compress = dn->dn_compress;
-
- mutex_exit(&dn->dn_mtx);
-
- if (kill_spill) {
- free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
- mutex_enter(&dn->dn_mtx);
- dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
- mutex_exit(&dn->dn_mtx);
- }
-
- /* process all the "freed" ranges in the file */
- if (dn->dn_free_ranges[txgoff] != NULL) {
- dnode_sync_free_range_arg_t dsfra;
- dsfra.dsfra_dnode = dn;
- dsfra.dsfra_tx = tx;
- dsfra.dsfra_free_indirects = freeing_dnode;
- if (freeing_dnode) {
- ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
- 0, dn->dn_maxblkid + 1));
- }
- mutex_enter(&dn->dn_mtx);
- range_tree_vacate(dn->dn_free_ranges[txgoff],
- dnode_sync_free_range, &dsfra);
- range_tree_destroy(dn->dn_free_ranges[txgoff]);
- dn->dn_free_ranges[txgoff] = NULL;
- mutex_exit(&dn->dn_mtx);
- }
-
- if (freeing_dnode) {
- dn->dn_objset->os_freed_dnodes++;
- dnode_sync_free(dn, tx);
- return;
- }
-
- if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- mutex_enter(&ds->ds_lock);
- ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] =
- B_TRUE;
- mutex_exit(&ds->ds_lock);
- }
-
- if (dn->dn_next_nlevels[txgoff]) {
- dnode_increase_indirection(dn, tx);
- dn->dn_next_nlevels[txgoff] = 0;
- }
-
- if (dn->dn_next_nblkptr[txgoff]) {
- /* this should only happen on a realloc */
- ASSERT(dn->dn_allocated_txg == tx->tx_txg);
- if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
- /* zero the new blkptrs we are gaining */
- bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
- sizeof (blkptr_t) *
- (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
-#ifdef ZFS_DEBUG
- } else {
- int i;
- ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
- /* the blkptrs we are losing better be unallocated */
- for (i = dn->dn_next_nblkptr[txgoff];
- i < dnp->dn_nblkptr; i++)
- ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
-#endif
- }
- mutex_enter(&dn->dn_mtx);
- dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
- dn->dn_next_nblkptr[txgoff] = 0;
- mutex_exit(&dn->dn_mtx);
- }
-
- dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
-
- if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
- ASSERT3P(list_head(list), ==, NULL);
- dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
- }
-
- /*
- * Although we have dropped our reference to the dnode, it
- * can't be evicted until its written, and we haven't yet
- * initiated the IO for the dnode's dbuf.
- */
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c
@@ -1,566 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zfeature.h>
-#include <sys/spa.h>
-#include <sys/dsl_bookmark.h>
-#include <zfs_namecheck.h>
-
-static int
-dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
- dsl_dataset_t **dsp, void *tag, char **shortnamep)
-{
- char buf[ZFS_MAX_DATASET_NAME_LEN];
- char *hashp;
-
- if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
- hashp = strchr(fullname, '#');
- if (hashp == NULL)
- return (SET_ERROR(EINVAL));
-
- *shortnamep = hashp + 1;
- if (zfs_component_namecheck(*shortnamep, NULL, NULL))
- return (SET_ERROR(EINVAL));
- (void) strlcpy(buf, fullname, hashp - fullname + 1);
- return (dsl_dataset_hold(dp, buf, tag, dsp));
-}
-
-/*
- * Returns ESRCH if bookmark is not found.
- */
-static int
-dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname,
- zfs_bookmark_phys_t *bmark_phys)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t bmark_zapobj = ds->ds_bookmarks;
- matchtype_t mt = 0;
- int err;
-
- if (bmark_zapobj == 0)
- return (SET_ERROR(ESRCH));
-
- if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
- mt = MT_NORMALIZE;
-
- err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
- sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt,
- NULL, 0, NULL);
-
- return (err == ENOENT ? ESRCH : err);
-}
-
-/*
- * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark
- * does not represents an earlier point in later_ds's timeline.
- *
- * Returns ENOENT if the dataset containing the bookmark does not exist.
- * Returns ESRCH if the dataset exists but the bookmark was not found in it.
- */
-int
-dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
- dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
-{
- char *shortname;
- dsl_dataset_t *ds;
- int error;
-
- error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_bmark_lookup(ds, shortname, bmp);
- if (error == 0 && later_ds != NULL) {
- if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg))
- error = SET_ERROR(EXDEV);
- }
- dsl_dataset_rele(ds, FTAG);
- return (error);
-}
-
-typedef struct dsl_bookmark_create_arg {
- nvlist_t *dbca_bmarks;
- nvlist_t *dbca_errors;
-} dsl_bookmark_create_arg_t;
-
-static int
-dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name,
- dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *bmark_fs;
- char *shortname;
- int error;
- zfs_bookmark_phys_t bmark_phys;
-
- if (!snapds->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- error = dsl_bookmark_hold_ds(dp, bookmark_name,
- &bmark_fs, FTAG, &shortname);
- if (error != 0)
- return (error);
-
- if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) {
- dsl_dataset_rele(bmark_fs, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- error = dsl_dataset_bmark_lookup(bmark_fs, shortname,
- &bmark_phys);
- dsl_dataset_rele(bmark_fs, FTAG);
- if (error == 0)
- return (SET_ERROR(EEXIST));
- if (error == ESRCH)
- return (0);
- return (error);
-}
-
-static int
-dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
-{
- dsl_bookmark_create_arg_t *dbca = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- int rv = 0;
-
- if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
- return (SET_ERROR(ENOTSUP));
-
- for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
- pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
- dsl_dataset_t *snapds;
- int error;
-
- /* note: validity of nvlist checked by ioctl layer */
- error = dsl_dataset_hold(dp, fnvpair_value_string(pair),
- FTAG, &snapds);
- if (error == 0) {
- error = dsl_bookmark_create_check_impl(snapds,
- nvpair_name(pair), tx);
- dsl_dataset_rele(snapds, FTAG);
- }
- if (error != 0) {
- fnvlist_add_int32(dbca->dbca_errors,
- nvpair_name(pair), error);
- rv = error;
- }
- }
-
- return (rv);
-}
-
-static void
-dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_bookmark_create_arg_t *dbca = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
-
- ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS));
-
- for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
- pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
- dsl_dataset_t *snapds, *bmark_fs;
- zfs_bookmark_phys_t bmark_phys;
- char *shortname;
-
- VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair),
- FTAG, &snapds));
- VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
- &bmark_fs, FTAG, &shortname));
- if (bmark_fs->ds_bookmarks == 0) {
- bmark_fs->ds_bookmarks =
- zap_create_norm(mos, U8_TEXTPREP_TOUPPER,
- DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
- spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
-
- dsl_dataset_zapify(bmark_fs, tx);
- VERIFY0(zap_add(mos, bmark_fs->ds_object,
- DS_FIELD_BOOKMARK_NAMES,
- sizeof (bmark_fs->ds_bookmarks), 1,
- &bmark_fs->ds_bookmarks, tx));
- }
-
- bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid;
- bmark_phys.zbm_creation_txg =
- dsl_dataset_phys(snapds)->ds_creation_txg;
- bmark_phys.zbm_creation_time =
- dsl_dataset_phys(snapds)->ds_creation_time;
-
- VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks,
- shortname, sizeof (uint64_t),
- sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
- &bmark_phys, tx));
-
- spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
- "name=%s creation_txg=%llu target_snap=%llu",
- shortname,
- (longlong_t)bmark_phys.zbm_creation_txg,
- (longlong_t)snapds->ds_object);
-
- dsl_dataset_rele(bmark_fs, FTAG);
- dsl_dataset_rele(snapds, FTAG);
- }
-}
-
-/*
- * The bookmarks must all be in the same pool.
- */
-int
-dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
-{
- nvpair_t *pair;
- dsl_bookmark_create_arg_t dbca;
-
- pair = nvlist_next_nvpair(bmarks, NULL);
- if (pair == NULL)
- return (0);
-
- dbca.dbca_bmarks = bmarks;
- dbca.dbca_errors = errors;
-
- return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
- dsl_bookmark_create_sync, &dbca,
- fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
-}
-
-int
-dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
-{
- int err = 0;
- zap_cursor_t zc;
- zap_attribute_t attr;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- uint64_t bmark_zapobj = ds->ds_bookmarks;
- if (bmark_zapobj == 0)
- return (0);
-
- for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj);
- zap_cursor_retrieve(&zc, &attr) == 0;
- zap_cursor_advance(&zc)) {
- char *bmark_name = attr.za_name;
- zfs_bookmark_phys_t bmark_phys;
-
- err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys);
- ASSERT3U(err, !=, ENOENT);
- if (err != 0)
- break;
-
- nvlist_t *out_props = fnvlist_alloc();
- if (nvlist_exists(props,
- zfs_prop_to_name(ZFS_PROP_GUID))) {
- dsl_prop_nvlist_add_uint64(out_props,
- ZFS_PROP_GUID, bmark_phys.zbm_guid);
- }
- if (nvlist_exists(props,
- zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
- dsl_prop_nvlist_add_uint64(out_props,
- ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg);
- }
- if (nvlist_exists(props,
- zfs_prop_to_name(ZFS_PROP_CREATION))) {
- dsl_prop_nvlist_add_uint64(out_props,
- ZFS_PROP_CREATION, bmark_phys.zbm_creation_time);
- }
-
- fnvlist_add_nvlist(outnvl, bmark_name, out_props);
- fnvlist_free(out_props);
- }
- zap_cursor_fini(&zc);
- return (err);
-}
-
-/*
- * Retrieve the bookmarks that exist in the specified dataset, and the
- * requested properties of each bookmark.
- *
- * The "props" nvlist specifies which properties are requested.
- * See lzc_get_bookmarks() for the list of valid properties.
- */
-int
-dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- int err;
-
- err = dsl_pool_hold(dsname, FTAG, &dp);
- if (err != 0)
- return (err);
- err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (err != 0) {
- dsl_pool_rele(dp, FTAG);
- return (err);
- }
-
- err = dsl_get_bookmarks_impl(ds, props, outnvl);
-
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (err);
-}
-
-typedef struct dsl_bookmark_destroy_arg {
- nvlist_t *dbda_bmarks;
- nvlist_t *dbda_success;
- nvlist_t *dbda_errors;
-} dsl_bookmark_destroy_arg_t;
-
-static int
-dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t bmark_zapobj = ds->ds_bookmarks;
- matchtype_t mt = 0;
-
- if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
- mt = MT_NORMALIZE;
-
- return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
-}
-
-static int
-dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
-{
- dsl_bookmark_destroy_arg_t *dbda = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- int rv = 0;
-
- ASSERT(nvlist_empty(dbda->dbda_success));
- ASSERT(nvlist_empty(dbda->dbda_errors));
-
- if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
- return (0);
-
- for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
- pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
- const char *fullname = nvpair_name(pair);
- dsl_dataset_t *ds;
- zfs_bookmark_phys_t bm;
- int error;
- char *shortname;
-
- error = dsl_bookmark_hold_ds(dp, fullname, &ds,
- FTAG, &shortname);
- if (error == ENOENT) {
- /* ignore it; the bookmark is "already destroyed" */
- continue;
- }
- if (error == 0) {
- error = dsl_dataset_bmark_lookup(ds, shortname, &bm);
- dsl_dataset_rele(ds, FTAG);
- if (error == ESRCH) {
- /*
- * ignore it; the bookmark is
- * "already destroyed"
- */
- continue;
- }
- }
- if (error == 0) {
- if (dmu_tx_is_syncing(tx)) {
- fnvlist_add_boolean(dbda->dbda_success,
- fullname);
- }
- } else {
- fnvlist_add_int32(dbda->dbda_errors, fullname, error);
- rv = error;
- }
- }
- return (rv);
-}
-
-static void
-dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_bookmark_destroy_arg_t *dbda = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
-
- for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
- pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
- dsl_dataset_t *ds;
- char *shortname;
- uint64_t zap_cnt;
-
- VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
- &ds, FTAG, &shortname));
- VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx));
-
- /*
- * If all of this dataset's bookmarks have been destroyed,
- * free the zap object and decrement the feature's use count.
- */
- VERIFY0(zap_count(mos, ds->ds_bookmarks,
- &zap_cnt));
- if (zap_cnt == 0) {
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
- ds->ds_bookmarks = 0;
- spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
- VERIFY0(zap_remove(mos, ds->ds_object,
- DS_FIELD_BOOKMARK_NAMES, tx));
- }
-
- spa_history_log_internal_ds(ds, "remove bookmark", tx,
- "name=%s", shortname);
-
- dsl_dataset_rele(ds, FTAG);
- }
-}
-
-/*
- * The bookmarks must all be in the same pool.
- */
-int
-dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
-{
- int rv;
- dsl_bookmark_destroy_arg_t dbda;
- nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
- if (pair == NULL)
- return (0);
-
- dbda.dbda_bmarks = bmarks;
- dbda.dbda_errors = errors;
- dbda.dbda_success = fnvlist_alloc();
-
- rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
- dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
- ZFS_SPACE_CHECK_RESERVED);
- fnvlist_free(dbda.dbda_success);
- return (rv);
-}
-
-typedef struct dsl_bookmark_rename_arg {
- const char *dbra_fsname;
- const char *dbra_oldname;
- const char *dbra_newname;
-} dsl_bookmark_rename_arg_t;
-
-static int
-dsl_bookmark_rename_check(void *arg, dmu_tx_t *tx)
-{
- dsl_bookmark_rename_arg_t *dbra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- zfs_bookmark_phys_t bmark_phys;
- int error;
-
- if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
- return (SET_ERROR(ENOTSUP));
-
- /* Check validity and the full length of the new bookmark name. */
- if (zfs_component_namecheck(dbra->dbra_newname, NULL, NULL))
- return (SET_ERROR(EINVAL));
- if (strlen(dbra->dbra_fsname) + strlen(dbra->dbra_newname) + 1 >=
- ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
-
- error = dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds);
- if (error != 0)
- return (error);
- if (ds->ds_is_snapshot) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
- error = dsl_dataset_bmark_lookup(ds, dbra->dbra_oldname, &bmark_phys);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- error = dsl_dataset_bmark_lookup(ds, dbra->dbra_newname, &bmark_phys);
- dsl_dataset_rele(ds, FTAG);
- if (error == 0)
- return (SET_ERROR(EEXIST));
- if (error != ESRCH)
- return (error);
- return (0);
-}
-
-static void
-dsl_bookmark_rename_sync(void *arg, dmu_tx_t *tx)
-{
- zfs_bookmark_phys_t bmark_phys;
- dsl_bookmark_rename_arg_t *dbra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos;
- dsl_dataset_t *ds;
- uint64_t bmark_zapobj;
- uint64_t int_size, num_ints;
- matchtype_t mt = 0;
- int error;
-
- ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS));
- VERIFY0(dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds));
-
- mos = ds->ds_dir->dd_pool->dp_meta_objset;
- bmark_zapobj = ds->ds_bookmarks;
-
- if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
- mt = MT_NORMALIZE;
-
- VERIFY0(zap_length(mos, bmark_zapobj, dbra->dbra_oldname,
- &int_size, &num_ints));
- ASSERT3U(int_size, ==, sizeof (uint64_t));
- VERIFY0(zap_lookup_norm(mos, bmark_zapobj, dbra->dbra_oldname, int_size,
- num_ints, &bmark_phys, mt, NULL, 0, NULL));
- VERIFY0(zap_remove_norm(mos, bmark_zapobj, dbra->dbra_oldname, mt, tx));
-
- VERIFY0(zap_add(mos, bmark_zapobj, dbra->dbra_newname, int_size,
- num_ints, &bmark_phys, tx));
-
- spa_history_log_internal_ds(ds, "rename bookmark", tx,
- "#%s -> #%s creation_txg=%llu",
- dbra->dbra_oldname, dbra->dbra_newname,
- (longlong_t)bmark_phys.zbm_creation_txg);
-
- dsl_dataset_rele(ds, FTAG);
-}
-
-/*
- * The bookmarks must all be in the same pool.
- */
-int
-dsl_bookmark_rename(const char *fsname, const char *oldbmark,
- const char *newbmark)
-{
- dsl_bookmark_rename_arg_t dbra;
-
- dbra.dbra_fsname = fsname;
- dbra.dbra_oldname = oldbmark;
- dbra.dbra_newname = newbmark;
-
- return (dsl_sync_task(fsname, dsl_bookmark_rename_check,
- dsl_bookmark_rename_sync, &dbra, 1, ZFS_SPACE_CHECK_NORMAL));
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -1,4252 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 RackTop Systems.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_send.h>
-#include <sys/dmu_tx.h>
-#include <sys/arc.h>
-#include <sys/zio.h>
-#include <sys/zap.h>
-#include <sys/zfeature.h>
-#include <sys/unique.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_onexit.h>
-#include <sys/zvol.h>
-#include <sys/dsl_scan.h>
-#include <sys/dsl_deadlist.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_userhold.h>
-#include <sys/dsl_bookmark.h>
-#include <sys/dmu_send.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <zfs_fletcher.h>
-
-SYSCTL_DECL(_vfs_zfs);
-
-/*
- * The SPA supports block sizes up to 16MB. However, very large blocks
- * can have an impact on i/o latency (e.g. tying up a spinning disk for
- * ~300ms), and also potentially on the memory allocator. Therefore,
- * we do not allow the recordsize to be set larger than zfs_max_recordsize
- * (default 1MB). Larger blocks can be created by changing this tunable,
- * and pools with larger blocks can always be imported and used, regardless
- * of this setting.
- */
-int zfs_max_recordsize = 1 * 1024 * 1024;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
- &zfs_max_recordsize, 0,
- "Maximum block size. Expect dragons when tuning this.");
-
-#define SWITCH64(x, y) \
- { \
- uint64_t __tmp = (x); \
- (x) = (y); \
- (y) = __tmp; \
- }
-
-#define DS_REF_MAX (1ULL << 62)
-
-extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
-
-static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
- uint64_t obj, dmu_tx_t *tx);
-static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
- dmu_tx_t *tx);
-
-extern int spa_asize_inflation;
-
-static zil_header_t zero_zil;
-
-/*
- * Figure out how much of this delta should be propogated to the dsl_dir
- * layer. If there's a refreservation, that space has already been
- * partially accounted for in our ancestors.
- */
-static int64_t
-parent_delta(dsl_dataset_t *ds, int64_t delta)
-{
- dsl_dataset_phys_t *ds_phys;
- uint64_t old_bytes, new_bytes;
-
- if (ds->ds_reserved == 0)
- return (delta);
-
- ds_phys = dsl_dataset_phys(ds);
- old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
- new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
-
- ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
- return (new_bytes - old_bytes);
-}
-
-void
-dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
-{
- int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
- int64_t delta;
-
- dprintf_bp(bp, "ds=%p", ds);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* It could have been compressed away to nothing */
- if (BP_IS_HOLE(bp))
- return;
- ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
- ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
- if (ds == NULL) {
- dsl_pool_mos_diduse_space(tx->tx_pool,
- used, compressed, uncompressed);
- return;
- }
-
- ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- mutex_enter(&ds->ds_lock);
- delta = parent_delta(ds, used);
- dsl_dataset_phys(ds)->ds_referenced_bytes += used;
- dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
- dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
- dsl_dataset_phys(ds)->ds_unique_bytes += used;
-
- if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
- ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
- B_TRUE;
- }
-
- spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
- if (f != SPA_FEATURE_NONE)
- ds->ds_feature_activation_needed[f] = B_TRUE;
-
- mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
- compressed, uncompressed, tx);
- dsl_dir_transfer_space(ds->ds_dir, used - delta,
- DD_USED_REFRSRV, DD_USED_HEAD, NULL);
-}
-
-/*
- * Called when the specified segment has been remapped, and is thus no
- * longer referenced in the head dataset. The vdev must be indirect.
- *
- * If the segment is referenced by a snapshot, put it on the remap deadlist.
- * Otherwise, add this segment to the obsolete spacemap.
- */
-void
-dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
- uint64_t size, uint64_t birth, dmu_tx_t *tx)
-{
- spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(birth <= tx->tx_txg);
- ASSERT(!ds->ds_is_snapshot);
-
- if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
- spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
- } else {
- blkptr_t fakebp;
- dva_t *dva = &fakebp.blk_dva[0];
-
- ASSERT(ds != NULL);
-
- mutex_enter(&ds->ds_remap_deadlist_lock);
- if (!dsl_dataset_remap_deadlist_exists(ds)) {
- dsl_dataset_create_remap_deadlist(ds, tx);
- }
- mutex_exit(&ds->ds_remap_deadlist_lock);
-
- BP_ZERO(&fakebp);
- fakebp.blk_birth = birth;
- DVA_SET_VDEV(dva, vdev);
- DVA_SET_OFFSET(dva, offset);
- DVA_SET_ASIZE(dva, size);
-
- dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
- }
-}
-
-int
-dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
- boolean_t async)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- int used = bp_get_dsize_sync(spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- if (BP_IS_HOLE(bp))
- return (0);
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(bp->blk_birth <= tx->tx_txg);
-
- if (ds == NULL) {
- dsl_free(tx->tx_pool, tx->tx_txg, bp);
- dsl_pool_mos_diduse_space(tx->tx_pool,
- -used, -compressed, -uncompressed);
- return (used);
- }
- ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
-
- ASSERT(!ds->ds_is_snapshot);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
- int64_t delta;
-
- dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
- dsl_free(tx->tx_pool, tx->tx_txg, bp);
-
- mutex_enter(&ds->ds_lock);
- ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
- !DS_UNIQUE_IS_ACCURATE(ds));
- delta = parent_delta(ds, -used);
- dsl_dataset_phys(ds)->ds_unique_bytes -= used;
- mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
- delta, -compressed, -uncompressed, tx);
- dsl_dir_transfer_space(ds->ds_dir, -used - delta,
- DD_USED_REFRSRV, DD_USED_HEAD, NULL);
- } else {
- dprintf_bp(bp, "putting on dead list: %s", "");
- if (async) {
- /*
- * We are here as part of zio's write done callback,
- * which means we're a zio interrupt thread. We can't
- * call dsl_deadlist_insert() now because it may block
- * waiting for I/O. Instead, put bp on the deferred
- * queue and let dsl_pool_sync() finish the job.
- */
- bplist_append(&ds->ds_pending_deadlist, bp);
- } else {
- dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
- }
- ASSERT3U(ds->ds_prev->ds_object, ==,
- dsl_dataset_phys(ds)->ds_prev_snap_obj);
- ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
- /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
- if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
- ds->ds_object && bp->blk_birth >
- dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- mutex_enter(&ds->ds_prev->ds_lock);
- dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
- mutex_exit(&ds->ds_prev->ds_lock);
- }
- if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
- dsl_dir_transfer_space(ds->ds_dir, used,
- DD_USED_HEAD, DD_USED_SNAP, tx);
- }
- }
- mutex_enter(&ds->ds_lock);
- ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
- dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
- ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
- dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
- ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
- dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
- mutex_exit(&ds->ds_lock);
-
- return (used);
-}
-
-/*
- * We have to release the fsid syncronously or we risk that a subsequent
- * mount of the same dataset will fail to unique_insert the fsid. This
- * failure would manifest itself as the fsid of this dataset changing
- * between mounts which makes NFS clients quite unhappy.
- */
-static void
-dsl_dataset_evict_sync(void *dbu)
-{
- dsl_dataset_t *ds = dbu;
-
- ASSERT(ds->ds_owner == NULL);
-
- unique_remove(ds->ds_fsid_guid);
-}
-
-static void
-dsl_dataset_evict_async(void *dbu)
-{
- dsl_dataset_t *ds = dbu;
-
- ASSERT(ds->ds_owner == NULL);
-
- ds->ds_dbuf = NULL;
-
- if (ds->ds_objset != NULL)
- dmu_objset_evict(ds->ds_objset);
-
- if (ds->ds_prev) {
- dsl_dataset_rele(ds->ds_prev, ds);
- ds->ds_prev = NULL;
- }
-
- bplist_destroy(&ds->ds_pending_deadlist);
- if (dsl_deadlist_is_open(&ds->ds_deadlist))
- dsl_deadlist_close(&ds->ds_deadlist);
- if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
- dsl_deadlist_close(&ds->ds_remap_deadlist);
- if (ds->ds_dir)
- dsl_dir_async_rele(ds->ds_dir, ds);
-
- ASSERT(!list_link_active(&ds->ds_synced_link));
-
- list_destroy(&ds->ds_prop_cbs);
- if (mutex_owned(&ds->ds_lock))
- mutex_exit(&ds->ds_lock);
- mutex_destroy(&ds->ds_lock);
- if (mutex_owned(&ds->ds_opening_lock))
- mutex_exit(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_sendstream_lock);
- mutex_destroy(&ds->ds_remap_deadlist_lock);
- zfs_refcount_destroy(&ds->ds_longholds);
- rrw_destroy(&ds->ds_bp_rwlock);
-
- kmem_free(ds, sizeof (dsl_dataset_t));
-}
-
-int
-dsl_dataset_get_snapname(dsl_dataset_t *ds)
-{
- dsl_dataset_phys_t *headphys;
- int err;
- dmu_buf_t *headdbuf;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
-
- if (ds->ds_snapname[0])
- return (0);
- if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
- return (0);
-
- err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
- FTAG, &headdbuf);
- if (err != 0)
- return (err);
- headphys = headdbuf->db_data;
- err = zap_value_search(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
- dmu_buf_rele(headdbuf, FTAG);
- return (err);
-}
-
-int
-dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
- matchtype_t mt = 0;
- int err;
-
- if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
- mt = MT_NORMALIZE;
-
- err = zap_lookup_norm(mos, snapobj, name, 8, 1,
- value, mt, NULL, 0, NULL);
- if (err == ENOTSUP && (mt & MT_NORMALIZE))
- err = zap_lookup(mos, snapobj, name, 8, 1, value);
- return (err);
-}
-
-int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
- boolean_t adj_cnt)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
- matchtype_t mt = 0;
- int err;
-
- dsl_dir_snap_cmtime_update(ds->ds_dir);
-
- if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
- mt = MT_NORMALIZE;
-
- err = zap_remove_norm(mos, snapobj, name, mt, tx);
- if (err == ENOTSUP && (mt & MT_NORMALIZE))
- err = zap_remove(mos, snapobj, name, tx);
-
- if (err == 0 && adj_cnt)
- dsl_fs_ss_count_adjust(ds->ds_dir, -1,
- DD_FIELD_SNAPSHOT_COUNT, tx);
-
- return (err);
-}
-
-boolean_t
-dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
-{
- dmu_buf_t *dbuf = ds->ds_dbuf;
- boolean_t result = B_FALSE;
-
- if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
- ds->ds_object, DMU_BONUS_BLKID, tag)) {
-
- if (ds == dmu_buf_get_user(dbuf))
- result = B_TRUE;
- else
- dmu_buf_rele(dbuf, tag);
- }
-
- return (result);
-}
-
-int
-dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
- dsl_dataset_t **dsp)
-{
- objset_t *mos = dp->dp_meta_objset;
- dmu_buf_t *dbuf;
- dsl_dataset_t *ds;
- int err;
- dmu_object_info_t doi;
-
- ASSERT(dsl_pool_config_held(dp));
-
- err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
- if (err != 0)
- return (err);
-
- /* Make sure dsobj has the correct object type. */
- dmu_object_info_from_db(dbuf, &doi);
- if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
- dmu_buf_rele(dbuf, tag);
- return (SET_ERROR(EINVAL));
- }
-
- ds = dmu_buf_get_user(dbuf);
- if (ds == NULL) {
- dsl_dataset_t *winner = NULL;
-
- ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
- ds->ds_dbuf = dbuf;
- ds->ds_object = dsobj;
- ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
-
- err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
- NULL, ds, &ds->ds_dir);
- if (err != 0) {
- kmem_free(ds, sizeof (dsl_dataset_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
-
- mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_remap_deadlist_lock,
- NULL, MUTEX_DEFAULT, NULL);
- rrw_init(&ds->ds_bp_rwlock, B_FALSE);
- zfs_refcount_create(&ds->ds_longholds);
-
- bplist_create(&ds->ds_pending_deadlist);
-
- list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
- offsetof(dmu_sendarg_t, dsa_link));
-
- list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
- offsetof(dsl_prop_cb_record_t, cbr_ds_node));
-
- if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (!(spa_feature_table[f].fi_flags &
- ZFEATURE_FLAG_PER_DATASET))
- continue;
- err = zap_contains(mos, dsobj,
- spa_feature_table[f].fi_guid);
- if (err == 0) {
- ds->ds_feature_inuse[f] = B_TRUE;
- } else {
- ASSERT3U(err, ==, ENOENT);
- err = 0;
- }
- }
- }
-
- if (!ds->ds_is_snapshot) {
- ds->ds_snapname[0] = '\0';
- if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
- err = dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj,
- ds, &ds->ds_prev);
- }
- if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
- int zaperr = zap_lookup(mos, ds->ds_object,
- DS_FIELD_BOOKMARK_NAMES,
- sizeof (ds->ds_bookmarks), 1,
- &ds->ds_bookmarks);
- if (zaperr != ENOENT)
- VERIFY0(zaperr);
- }
- } else {
- if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
- err = dsl_dataset_get_snapname(ds);
- if (err == 0 &&
- dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
- err = zap_count(
- ds->ds_dir->dd_pool->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_userrefs_obj,
- &ds->ds_userrefs);
- }
- }
-
- if (err == 0 && !ds->ds_is_snapshot) {
- err = dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
- &ds->ds_reserved);
- if (err == 0) {
- err = dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_REFQUOTA),
- &ds->ds_quota);
- }
- } else {
- ds->ds_reserved = ds->ds_quota = 0;
- }
-
- dsl_deadlist_open(&ds->ds_deadlist,
- mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
- uint64_t remap_deadlist_obj =
- dsl_dataset_get_remap_deadlist_object(ds);
- if (remap_deadlist_obj != 0) {
- dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
- remap_deadlist_obj);
- }
-
- dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
- dsl_dataset_evict_async, &ds->ds_dbuf);
- if (err == 0)
- winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
-
- if (err != 0 || winner != NULL) {
- bplist_destroy(&ds->ds_pending_deadlist);
- dsl_deadlist_close(&ds->ds_deadlist);
- if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
- dsl_deadlist_close(&ds->ds_remap_deadlist);
- if (ds->ds_prev)
- dsl_dataset_rele(ds->ds_prev, ds);
- dsl_dir_rele(ds->ds_dir, ds);
- list_destroy(&ds->ds_prop_cbs);
- list_destroy(&ds->ds_sendstreams);
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_sendstream_lock);
- mutex_destroy(&ds->ds_remap_deadlist_lock);
- zfs_refcount_destroy(&ds->ds_longholds);
- rrw_destroy(&ds->ds_bp_rwlock);
- kmem_free(ds, sizeof (dsl_dataset_t));
- if (err != 0) {
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
- ds = winner;
- } else {
- ds->ds_fsid_guid =
- unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
- if (ds->ds_fsid_guid !=
- dsl_dataset_phys(ds)->ds_fsid_guid) {
- zfs_dbgmsg("ds_fsid_guid changed from "
- "%llx to %llx for pool %s dataset id %llu",
- (long long)
- dsl_dataset_phys(ds)->ds_fsid_guid,
- (long long)ds->ds_fsid_guid,
- spa_name(dp->dp_spa),
- dsobj);
- }
- }
- }
- ASSERT3P(ds->ds_dbuf, ==, dbuf);
- ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
- ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
- spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
- dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
- *dsp = ds;
- return (0);
-}
-
-int
-dsl_dataset_hold(dsl_pool_t *dp, const char *name,
- void *tag, dsl_dataset_t **dsp)
-{
- dsl_dir_t *dd;
- const char *snapname;
- uint64_t obj;
- int err = 0;
- dsl_dataset_t *ds;
-
- err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
- if (err != 0)
- return (err);
-
- ASSERT(dsl_pool_config_held(dp));
- obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
- if (obj != 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
- else
- err = SET_ERROR(ENOENT);
-
- /* we may be looking for a snapshot */
- if (err == 0 && snapname != NULL) {
- dsl_dataset_t *snap_ds;
-
- if (*snapname++ != '@') {
- dsl_dataset_rele(ds, tag);
- dsl_dir_rele(dd, FTAG);
- return (SET_ERROR(ENOENT));
- }
-
- dprintf("looking for snapshot '%s'\n", snapname);
- err = dsl_dataset_snap_lookup(ds, snapname, &obj);
- if (err == 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
- dsl_dataset_rele(ds, tag);
-
- if (err == 0) {
- mutex_enter(&snap_ds->ds_lock);
- if (snap_ds->ds_snapname[0] == 0)
- (void) strlcpy(snap_ds->ds_snapname, snapname,
- sizeof (snap_ds->ds_snapname));
- mutex_exit(&snap_ds->ds_lock);
- ds = snap_ds;
- }
- }
- if (err == 0)
- *dsp = ds;
- dsl_dir_rele(dd, FTAG);
- return (err);
-}
-
-int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
- void *tag, dsl_dataset_t **dsp)
-{
- int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
- if (err != 0)
- return (err);
- if (!dsl_dataset_tryown(*dsp, tag)) {
- dsl_dataset_rele(*dsp, tag);
- *dsp = NULL;
- return (SET_ERROR(EBUSY));
- }
- return (0);
-}
-
-int
-dsl_dataset_own(dsl_pool_t *dp, const char *name,
- void *tag, dsl_dataset_t **dsp)
-{
- int err = dsl_dataset_hold(dp, name, tag, dsp);
- if (err != 0)
- return (err);
- if (!dsl_dataset_tryown(*dsp, tag)) {
- dsl_dataset_rele(*dsp, tag);
- return (SET_ERROR(EBUSY));
- }
- return (0);
-}
-
-/*
- * See the comment above dsl_pool_hold() for details. In summary, a long
- * hold is used to prevent destruction of a dataset while the pool hold
- * is dropped, allowing other concurrent operations (e.g. spa_sync()).
- *
- * The dataset and pool must be held when this function is called. After it
- * is called, the pool hold may be released while the dataset is still held
- * and accessed.
- */
-void
-dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
-{
- ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
- (void) zfs_refcount_add(&ds->ds_longholds, tag);
-}
-
-void
-dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
-{
- (void) zfs_refcount_remove(&ds->ds_longholds, tag);
-}
-
-/* Return B_TRUE if there are any long holds on this dataset. */
-boolean_t
-dsl_dataset_long_held(dsl_dataset_t *ds)
-{
- return (!zfs_refcount_is_zero(&ds->ds_longholds));
-}
-
-void
-dsl_dataset_name(dsl_dataset_t *ds, char *name)
-{
- if (ds == NULL) {
- (void) strcpy(name, "mos");
- } else {
- dsl_dir_name(ds->ds_dir, name);
- VERIFY0(dsl_dataset_get_snapname(ds));
- if (ds->ds_snapname[0]) {
- VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
- <, ZFS_MAX_DATASET_NAME_LEN);
- /*
- * We use a "recursive" mutex so that we
- * can call dprintf_ds() with ds_lock held.
- */
- if (!MUTEX_HELD(&ds->ds_lock)) {
- mutex_enter(&ds->ds_lock);
- VERIFY3U(strlcat(name, ds->ds_snapname,
- ZFS_MAX_DATASET_NAME_LEN), <,
- ZFS_MAX_DATASET_NAME_LEN);
- mutex_exit(&ds->ds_lock);
- } else {
- VERIFY3U(strlcat(name, ds->ds_snapname,
- ZFS_MAX_DATASET_NAME_LEN), <,
- ZFS_MAX_DATASET_NAME_LEN);
- }
- }
- }
-}
-
-int
-dsl_dataset_namelen(dsl_dataset_t *ds)
-{
- VERIFY0(dsl_dataset_get_snapname(ds));
- mutex_enter(&ds->ds_lock);
- int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname);
- mutex_exit(&ds->ds_lock);
- return (len);
-}
-
-void
-dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
-{
- dmu_buf_rele(ds->ds_dbuf, tag);
-}
-
-void
-dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
-{
- ASSERT3P(ds->ds_owner, ==, tag);
- ASSERT(ds->ds_dbuf != NULL);
-
- mutex_enter(&ds->ds_lock);
- ds->ds_owner = NULL;
- mutex_exit(&ds->ds_lock);
- dsl_dataset_long_rele(ds, tag);
- dsl_dataset_rele(ds, tag);
-}
-
-boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
-{
- boolean_t gotit = FALSE;
-
- ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
- mutex_enter(&ds->ds_lock);
- if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
- ds->ds_owner = tag;
- dsl_dataset_long_hold(ds, tag);
- gotit = TRUE;
- }
- mutex_exit(&ds->ds_lock);
- return (gotit);
-}
-
-boolean_t
-dsl_dataset_has_owner(dsl_dataset_t *ds)
-{
- boolean_t rv;
- mutex_enter(&ds->ds_lock);
- rv = (ds->ds_owner != NULL);
- mutex_exit(&ds->ds_lock);
- return (rv);
-}
-
-static void
-dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
- uint64_t zero = 0;
-
- VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
-
- spa_feature_incr(spa, f, tx);
- dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
-
- VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
- sizeof (zero), 1, &zero, tx));
-}
-
-void
-dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
-
- VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
-
- VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
- spa_feature_decr(spa, f, tx);
-}
-
-uint64_t
-dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
- uint64_t flags, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dd->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- uint64_t dsobj;
- objset_t *mos = dp->dp_meta_objset;
-
- if (origin == NULL)
- origin = dp->dp_origin_snap;
-
- ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
- ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- bzero(dsphys, sizeof (dsl_dataset_phys_t));
- dsphys->ds_dir_obj = dd->dd_object;
- dsphys->ds_flags = flags;
- dsphys->ds_fsid_guid = unique_create();
- do {
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- } while (dsphys->ds_guid == 0);
- dsphys->ds_snapnames_zapobj =
- zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
- DMU_OT_NONE, 0, tx);
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
-
- if (origin == NULL) {
- dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
- } else {
- dsl_dataset_t *ohds; /* head of the origin snapshot */
-
- dsphys->ds_prev_snap_obj = origin->ds_object;
- dsphys->ds_prev_snap_txg =
- dsl_dataset_phys(origin)->ds_creation_txg;
- dsphys->ds_referenced_bytes =
- dsl_dataset_phys(origin)->ds_referenced_bytes;
- dsphys->ds_compressed_bytes =
- dsl_dataset_phys(origin)->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes =
- dsl_dataset_phys(origin)->ds_uncompressed_bytes;
- rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
- dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
- rrw_exit(&origin->ds_bp_rwlock, FTAG);
-
- /*
- * Inherit flags that describe the dataset's contents
- * (INCONSISTENT) or properties (Case Insensitive).
- */
- dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
- (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (origin->ds_feature_inuse[f])
- dsl_dataset_activate_feature(dsobj, f, tx);
- }
-
- dmu_buf_will_dirty(origin->ds_dbuf, tx);
- dsl_dataset_phys(origin)->ds_num_children++;
-
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
- FTAG, &ohds));
- dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
- dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
- dsl_dataset_rele(ohds, FTAG);
-
- if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
- if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
- dsl_dataset_phys(origin)->ds_next_clones_obj =
- zap_create(mos,
- DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
- }
- VERIFY0(zap_add_int(mos,
- dsl_dataset_phys(origin)->ds_next_clones_obj,
- dsobj, tx));
- }
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
- if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
- if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
- dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
- dsl_dir_phys(origin->ds_dir)->dd_clones =
- zap_create(mos,
- DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
- }
- VERIFY0(zap_add_int(mos,
- dsl_dir_phys(origin->ds_dir)->dd_clones,
- dsobj, tx));
- }
- }
-
- if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
- dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-
- dmu_buf_rele(dbuf, FTAG);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
-
- return (dsobj);
-}
-
-static void
-dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- objset_t *os;
-
- VERIFY0(dmu_objset_from_ds(ds, &os));
- if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- zio_t *zio;
-
- bzero(&os->os_zil_header, sizeof (os->os_zil_header));
-
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- dsl_dataset_sync(ds, zio, tx);
- VERIFY0(zio_wait(zio));
-
- /* dsl_dataset_sync_done will drop this reference. */
- dmu_buf_add_ref(ds->ds_dbuf, ds);
- dsl_dataset_sync_done(ds, tx);
- }
-}
-
-uint64_t
-dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
- dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = pdd->dd_pool;
- uint64_t dsobj, ddobj;
- dsl_dir_t *dd;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(lastname[0] != '@');
-
- ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
- VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
-
- dsobj = dsl_dataset_create_sync_dd(dd, origin,
- flags & ~DS_CREATE_FLAG_NODIRTY, tx);
-
- dsl_deleg_set_create_perms(dd, tx, cr);
-
- /*
- * Since we're creating a new node we know it's a leaf, so we can
- * initialize the counts if the limit feature is active.
- */
- if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
- uint64_t cnt = 0;
- objset_t *os = dd->dd_pool->dp_meta_objset;
-
- dsl_dir_zapify(dd, tx);
- VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
- sizeof (cnt), 1, &cnt, tx));
- VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
- sizeof (cnt), 1, &cnt, tx));
- }
-
- dsl_dir_rele(dd, FTAG);
-
- /*
- * If we are creating a clone, make sure we zero out any stale
- * data from the origin snapshots zil header.
- */
- if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- dsl_dataset_zero_zil(ds, tx);
- dsl_dataset_rele(ds, FTAG);
- }
-
- return (dsobj);
-}
-
-#ifdef __FreeBSD__
-/* FreeBSD ioctl compat begin */
-struct destroyarg {
- nvlist_t *nvl;
- const char *snapname;
-};
-
-static int
-dsl_check_snap_cb(const char *name, void *arg)
-{
- struct destroyarg *da = arg;
- dsl_dataset_t *ds;
- char *dsname;
-
- dsname = kmem_asprintf("%s@%s", name, da->snapname);
- fnvlist_add_boolean(da->nvl, dsname);
- kmem_free(dsname, strlen(dsname) + 1);
-
- return (0);
-}
-
-int
-dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
- nvlist_t *snaps)
-{
- struct destroyarg *da;
- int err;
-
- da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
- da->nvl = snaps;
- da->snapname = snapname;
- err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
- DS_FIND_CHILDREN);
- kmem_free(da, sizeof (struct destroyarg));
-
- return (err);
-}
-/* FreeBSD ioctl compat end */
-#endif /* __FreeBSD__ */
-
-/*
- * The unique space in the head dataset can be calculated by subtracting
- * the space used in the most recent snapshot, that is still being used
- * in this file system, from the space currently in use. To figure out
- * the space in the most recent snapshot still in use, we need to take
- * the total space used in the snapshot and subtract out the space that
- * has been freed up since the snapshot was taken.
- */
-void
-dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
-{
- uint64_t mrs_used;
- uint64_t dlused, dlcomp, dluncomp;
-
- ASSERT(!ds->ds_is_snapshot);
-
- if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
- mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
- else
- mrs_used = 0;
-
- dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
-
- ASSERT3U(dlused, <=, mrs_used);
- dsl_dataset_phys(ds)->ds_unique_bytes =
- dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
-
- if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
- SPA_VERSION_UNIQUE_ACCURATE)
- dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-}
-
-void
-dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
- dmu_tx_t *tx)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t count;
- int err;
-
- ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
- err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
- obj, tx);
- /*
- * The err should not be ENOENT, but a bug in a previous version
- * of the code could cause upgrade_clones_cb() to not set
- * ds_next_snap_obj when it should, leading to a missing entry.
- * If we knew that the pool was created after
- * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
- * ENOENT. However, at least we can check that we don't have
- * too many entries in the next_clones_obj even after failing to
- * remove this one.
- */
- if (err != ENOENT)
- VERIFY0(err);
- ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
- &count));
- ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
-}
-
-
-blkptr_t *
-dsl_dataset_get_blkptr(dsl_dataset_t *ds)
-{
- return (&dsl_dataset_phys(ds)->ds_bp);
-}
-
-spa_t *
-dsl_dataset_get_spa(dsl_dataset_t *ds)
-{
- return (ds->ds_dir->dd_pool->dp_spa);
-}
-
-void
-dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp;
-
- if (ds == NULL) /* this is the meta-objset */
- return;
-
- ASSERT(ds->ds_objset != NULL);
-
- if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
- panic("dirtying snapshot!");
-
- /* Must not dirty a dataset in the same txg where it got snapshotted. */
- ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
-
- dp = ds->ds_dir->dd_pool;
- if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
- /* up the hold count until we can be written out */
- dmu_buf_add_ref(ds->ds_dbuf, ds);
- }
-}
-
-boolean_t
-dsl_dataset_is_dirty(dsl_dataset_t *ds)
-{
- for (int t = 0; t < TXG_SIZE; t++) {
- if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
- ds, t))
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-static int
-dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- uint64_t asize;
-
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- /*
- * If there's an fs-only reservation, any blocks that might become
- * owned by the snapshot dataset must be accommodated by space
- * outside of the reservation.
- */
- ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
- asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
- if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
- return (SET_ERROR(ENOSPC));
-
- /*
- * Propagate any reserved space for this snapshot to other
- * snapshot checks in this sync group.
- */
- if (asize > 0)
- dsl_dir_willuse_space(ds->ds_dir, asize, tx);
-
- return (0);
-}
-
-int
-dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
-{
- int error;
- uint64_t value;
-
- ds->ds_trysnap_txg = tx->tx_txg;
-
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- /*
- * We don't allow multiple snapshots of the same txg. If there
- * is already one, try again.
- */
- if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
- return (SET_ERROR(EAGAIN));
-
- /*
- * Check for conflicting snapshot name.
- */
- error = dsl_dataset_snap_lookup(ds, snapname, &value);
- if (error == 0)
- return (SET_ERROR(EEXIST));
- if (error != ENOENT)
- return (error);
-
- /*
- * We don't allow taking snapshots of inconsistent datasets, such as
- * those into which we are currently receiving. However, if we are
- * creating this snapshot as part of a receive, this check will be
- * executed atomically with respect to the completion of the receive
- * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
- * case we ignore this, knowing it will be fixed up for us shortly in
- * dmu_recv_end_sync().
- */
- if (!recv && DS_IS_INCONSISTENT(ds))
- return (SET_ERROR(EBUSY));
-
- /*
- * Skip the check for temporary snapshots or if we have already checked
- * the counts in dsl_dataset_snapshot_check. This means we really only
- * check the count here when we're receiving a stream.
- */
- if (cnt != 0 && cr != NULL) {
- error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
- ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
- if (error != 0)
- return (error);
- }
-
- error = dsl_dataset_snapshot_reserve_space(ds, tx);
- if (error != 0)
- return (error);
-
- return (0);
-}
-
-int
-dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_snapshot_arg_t *ddsa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- nvpair_t *pair;
- int rv = 0;
-
- /*
- * Pre-compute how many total new snapshots will be created for each
- * level in the tree and below. This is needed for validating the
- * snapshot limit when either taking a recursive snapshot or when
- * taking multiple snapshots.
- *
- * The problem is that the counts are not actually adjusted when
- * we are checking, only when we finally sync. For a single snapshot,
- * this is easy, the count will increase by 1 at each node up the tree,
- * but its more complicated for the recursive/multiple snapshot case.
- *
- * The dsl_fs_ss_limit_check function does recursively check the count
- * at each level up the tree but since it is validating each snapshot
- * independently we need to be sure that we are validating the complete
- * count for the entire set of snapshots. We do this by rolling up the
- * counts for each component of the name into an nvlist and then
- * checking each of those cases with the aggregated count.
- *
- * This approach properly handles not only the recursive snapshot
- * case (where we get all of those on the ddsa_snaps list) but also
- * the sibling case (e.g. snapshot a/b and a/c so that we will also
- * validate the limit on 'a' using a count of 2).
- *
- * We validate the snapshot names in the third loop and only report
- * name errors once.
- */
- if (dmu_tx_is_syncing(tx)) {
- nvlist_t *cnt_track = NULL;
- cnt_track = fnvlist_alloc();
-
- /* Rollup aggregated counts into the cnt_track list */
- for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
- pair != NULL;
- pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
- char *pdelim;
- uint64_t val;
- char nm[MAXPATHLEN];
-
- (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
- pdelim = strchr(nm, '@');
- if (pdelim == NULL)
- continue;
- *pdelim = '\0';
-
- do {
- if (nvlist_lookup_uint64(cnt_track, nm,
- &val) == 0) {
- /* update existing entry */
- fnvlist_add_uint64(cnt_track, nm,
- val + 1);
- } else {
- /* add to list */
- fnvlist_add_uint64(cnt_track, nm, 1);
- }
-
- pdelim = strrchr(nm, '/');
- if (pdelim != NULL)
- *pdelim = '\0';
- } while (pdelim != NULL);
- }
-
- /* Check aggregated counts at each level */
- for (pair = nvlist_next_nvpair(cnt_track, NULL);
- pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
- int error = 0;
- char *name;
- uint64_t cnt = 0;
- dsl_dataset_t *ds;
-
- name = nvpair_name(pair);
- cnt = fnvpair_value_uint64(pair);
- ASSERT(cnt > 0);
-
- error = dsl_dataset_hold(dp, name, FTAG, &ds);
- if (error == 0) {
- error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
- ZFS_PROP_SNAPSHOT_LIMIT, NULL,
- ddsa->ddsa_cr);
- dsl_dataset_rele(ds, FTAG);
- }
-
- if (error != 0) {
- if (ddsa->ddsa_errors != NULL)
- fnvlist_add_int32(ddsa->ddsa_errors,
- name, error);
- rv = error;
- /* only report one error for this check */
- break;
- }
- }
- nvlist_free(cnt_track);
- }
-
- for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
- pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
- int error = 0;
- dsl_dataset_t *ds;
- char *name, *atp;
- char dsname[ZFS_MAX_DATASET_NAME_LEN];
-
- name = nvpair_name(pair);
- if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
- error = SET_ERROR(ENAMETOOLONG);
- if (error == 0) {
- atp = strchr(name, '@');
- if (atp == NULL)
- error = SET_ERROR(EINVAL);
- if (error == 0)
- (void) strlcpy(dsname, name, atp - name + 1);
- }
- if (error == 0)
- error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (error == 0) {
- /* passing 0/NULL skips dsl_fs_ss_limit_check */
- error = dsl_dataset_snapshot_check_impl(ds,
- atp + 1, tx, B_FALSE, 0, NULL);
- dsl_dataset_rele(ds, FTAG);
- }
-
- if (error != 0) {
- if (ddsa->ddsa_errors != NULL) {
- fnvlist_add_int32(ddsa->ddsa_errors,
- name, error);
- }
- rv = error;
- }
- }
-
- return (rv);
-}
-
-void
-dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- uint64_t dsobj, crtxg;
- objset_t *mos = dp->dp_meta_objset;
- objset_t *os;
-
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-
- /*
- * If we are on an old pool, the zil must not be active, in which
- * case it will be zeroed. Usually zil_suspend() accomplishes this.
- */
- ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
- dmu_objset_from_ds(ds, &os) != 0 ||
- bcmp(&os->os_phys->os_zil_header, &zero_zil,
- sizeof (zero_zil)) == 0);
-
- /* Should not snapshot a dirty dataset. */
- ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
- ds, tx->tx_txg));
-
- dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
-
- /*
- * The origin's ds_creation_txg has to be < TXG_INITIAL
- */
- if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
- crtxg = 1;
- else
- crtxg = tx->tx_txg;
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- bzero(dsphys, sizeof (dsl_dataset_phys_t));
- dsphys->ds_dir_obj = ds->ds_dir->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- do {
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- } while (dsphys->ds_guid == 0);
- dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
- dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
- dsphys->ds_next_snap_obj = ds->ds_object;
- dsphys->ds_num_children = 1;
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = crtxg;
- dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
- dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
- dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes =
- dsl_dataset_phys(ds)->ds_uncompressed_bytes;
- dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
- dmu_buf_rele(dbuf, FTAG);
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (ds->ds_feature_inuse[f])
- dsl_dataset_activate_feature(dsobj, f, tx);
- }
-
- ASSERT3U(ds->ds_prev != 0, ==,
- dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
- if (ds->ds_prev) {
- uint64_t next_clones_obj =
- dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
- ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
- ds->ds_object ||
- dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
- if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
- ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
- dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
- dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
- } else if (next_clones_obj != 0) {
- dsl_dataset_remove_from_next_clones(ds->ds_prev,
- dsphys->ds_next_snap_obj, tx);
- VERIFY0(zap_add_int(mos,
- next_clones_obj, dsobj, tx));
- }
- }
-
- /*
- * If we have a reference-reservation on this dataset, we will
- * need to increase the amount of refreservation being charged
- * since our unique space is going to zero.
- */
- if (ds->ds_reserved) {
- int64_t delta;
- ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
- delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
- ds->ds_reserved);
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
- delta, 0, 0, tx);
- }
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_deadlist_obj =
- dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
- dsl_deadlist_close(&ds->ds_deadlist);
- dsl_deadlist_open(&ds->ds_deadlist, mos,
- dsl_dataset_phys(ds)->ds_deadlist_obj);
- dsl_deadlist_add_key(&ds->ds_deadlist,
- dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
-
- if (dsl_dataset_remap_deadlist_exists(ds)) {
- uint64_t remap_deadlist_obj =
- dsl_dataset_get_remap_deadlist_object(ds);
- /*
- * Move the remap_deadlist to the snapshot. The head
- * will create a new remap deadlist on demand, from
- * dsl_dataset_block_remapped().
- */
- dsl_dataset_unset_remap_deadlist_object(ds, tx);
- dsl_deadlist_close(&ds->ds_remap_deadlist);
-
- dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
- sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
- }
-
- ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
- dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
- dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
- dsl_dataset_phys(ds)->ds_unique_bytes = 0;
-
- if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
- dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-
- VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
- snapname, 8, 1, &dsobj, tx));
-
- if (ds->ds_prev)
- dsl_dataset_rele(ds->ds_prev, ds);
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
-
- dsl_scan_ds_snapshotted(ds, tx);
-
- dsl_dir_snap_cmtime_update(ds->ds_dir);
-
- spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
-}
-
-void
-dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_snapshot_arg_t *ddsa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- nvpair_t *pair;
-
- for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
- pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
- dsl_dataset_t *ds;
- char *name, *atp;
- char dsname[ZFS_MAX_DATASET_NAME_LEN];
-
- name = nvpair_name(pair);
- atp = strchr(name, '@');
- (void) strlcpy(dsname, name, atp - name + 1);
- VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
-
- dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
- if (ddsa->ddsa_props != NULL) {
- dsl_props_set_sync_impl(ds->ds_prev,
- ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
- }
-#if defined(__FreeBSD__) && defined(_KERNEL)
- zvol_create_minors(dp->dp_spa, name);
-#endif
- dsl_dataset_rele(ds, FTAG);
- }
-}
-
-/*
- * The snapshots must all be in the same pool.
- * All-or-nothing: if there are any failures, nothing will be modified.
- */
-int
-dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
-{
- dsl_dataset_snapshot_arg_t ddsa;
- nvpair_t *pair;
- boolean_t needsuspend;
- int error;
- spa_t *spa;
- char *firstname;
- nvlist_t *suspended = NULL;
-
- pair = nvlist_next_nvpair(snaps, NULL);
- if (pair == NULL)
- return (0);
- firstname = nvpair_name(pair);
-
- error = spa_open(firstname, &spa, FTAG);
- if (error != 0)
- return (error);
- needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
- spa_close(spa, FTAG);
-
- if (needsuspend) {
- suspended = fnvlist_alloc();
- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
- pair = nvlist_next_nvpair(snaps, pair)) {
- char fsname[ZFS_MAX_DATASET_NAME_LEN];
- char *snapname = nvpair_name(pair);
- char *atp;
- void *cookie;
-
- atp = strchr(snapname, '@');
- if (atp == NULL) {
- error = SET_ERROR(EINVAL);
- break;
- }
- (void) strlcpy(fsname, snapname, atp - snapname + 1);
-
- error = zil_suspend(fsname, &cookie);
- if (error != 0)
- break;
- fnvlist_add_uint64(suspended, fsname,
- (uintptr_t)cookie);
- }
- }
-
- ddsa.ddsa_snaps = snaps;
- ddsa.ddsa_props = props;
- ddsa.ddsa_errors = errors;
- ddsa.ddsa_cr = CRED();
-
- if (error == 0) {
- error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
- dsl_dataset_snapshot_sync, &ddsa,
- fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
- }
-
- if (suspended != NULL) {
- for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
- pair = nvlist_next_nvpair(suspended, pair)) {
- zil_resume((void *)(uintptr_t)
- fnvpair_value_uint64(pair));
- }
- fnvlist_free(suspended);
- }
-
- return (error);
-}
-
-typedef struct dsl_dataset_snapshot_tmp_arg {
- const char *ddsta_fsname;
- const char *ddsta_snapname;
- minor_t ddsta_cleanup_minor;
- const char *ddsta_htag;
-} dsl_dataset_snapshot_tmp_arg_t;
-
-static int
-dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- int error;
-
- error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
- if (error != 0)
- return (error);
-
- /* NULL cred means no limit check for tmp snapshot */
- error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
- tx, B_FALSE, 0, NULL);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
- B_TRUE, tx);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
-
- dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
- dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
- ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
- dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
-
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
- minor_t cleanup_minor, const char *htag)
-{
- dsl_dataset_snapshot_tmp_arg_t ddsta;
- int error;
- spa_t *spa;
- boolean_t needsuspend;
- void *cookie;
-
- ddsta.ddsta_fsname = fsname;
- ddsta.ddsta_snapname = snapname;
- ddsta.ddsta_cleanup_minor = cleanup_minor;
- ddsta.ddsta_htag = htag;
-
- error = spa_open(fsname, &spa, FTAG);
- if (error != 0)
- return (error);
- needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
- spa_close(spa, FTAG);
-
- if (needsuspend) {
- error = zil_suspend(fsname, &cookie);
- if (error != 0)
- return (error);
- }
-
- error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
- dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
-
- if (needsuspend)
- zil_resume(cookie);
- return (error);
-}
-
-void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(ds->ds_objset != NULL);
- ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
-
- /*
- * in case we had to change ds_fsid_guid when we opened it,
- * sync it out now.
- */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
-
- if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
- VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
- ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
- &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
- VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
- ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
- &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
- VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
- ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
- &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
- ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
- ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
- ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
- }
-
- dmu_objset_sync(ds->ds_objset, zio, tx);
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (ds->ds_feature_activation_needed[f]) {
- if (ds->ds_feature_inuse[f])
- continue;
- dsl_dataset_activate_feature(ds->ds_object, f, tx);
- ds->ds_feature_inuse[f] = B_TRUE;
- }
- }
-}
-
-static int
-deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- dsl_deadlist_t *dl = arg;
- dsl_deadlist_insert(dl, bp, tx);
- return (0);
-}
-
-void
-dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- objset_t *os = ds->ds_objset;
-
- bplist_iterate(&ds->ds_pending_deadlist,
- deadlist_enqueue_cb, &ds->ds_deadlist, tx);
-
- if (os->os_synced_dnodes != NULL) {
- multilist_destroy(os->os_synced_dnodes);
- os->os_synced_dnodes = NULL;
- }
-
- ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
-
- dmu_buf_rele(ds->ds_dbuf, ds);
-}
-
-int
-get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
-{
- uint64_t count = 0;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- zap_cursor_t zc;
- zap_attribute_t za;
-
- ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
-
- /*
- * There may be missing entries in ds_next_clones_obj
- * due to a bug in a previous version of the code.
- * Only trust it if it has the right number of entries.
- */
- if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
- VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
- &count));
- }
- if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
- return (ENOENT);
- }
- for (zap_cursor_init(&zc, mos,
- dsl_dataset_phys(ds)->ds_next_clones_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- dsl_dataset_t *clone;
- char buf[ZFS_MAX_DATASET_NAME_LEN];
- VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
- za.za_first_integer, FTAG, &clone));
- dsl_dir_name(clone->ds_dir, buf);
- fnvlist_add_boolean(val, buf);
- dsl_dataset_rele(clone, FTAG);
- }
- zap_cursor_fini(&zc);
- return (0);
-}
-
-void
-get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
-{
- nvlist_t *propval = fnvlist_alloc();
- nvlist_t *val;
-
- /*
- * We use nvlist_alloc() instead of fnvlist_alloc() because the
- * latter would allocate the list with NV_UNIQUE_NAME flag.
- * As a result, every time a clone name is appended to the list
- * it would be (linearly) searched for for a duplicate name.
- * We already know that all clone names must be unique and we
- * want avoid the quadratic complexity of double-checking that
- * because we can have a large number of clones.
- */
- VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
-
- if (get_clones_stat_impl(ds, val) == 0) {
- fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
- fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
- propval);
- }
-
- nvlist_free(val);
- nvlist_free(propval);
-}
-
-/*
- * Returns a string that represents the receive resume stats token. It should
- * be freed with strfree().
- */
-char *
-get_receive_resume_stats_impl(dsl_dataset_t *ds)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- if (dsl_dataset_has_resume_receive_state(ds)) {
- char *str;
- void *packed;
- uint8_t *compressed;
- uint64_t val;
- nvlist_t *token_nv = fnvlist_alloc();
- size_t packed_size, compressed_size;
-
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "fromguid", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "object", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "offset", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "bytes", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "toguid", val);
- }
- char buf[256];
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
- fnvlist_add_string(token_nv, "toname", buf);
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_LARGEBLOCK) == 0) {
- fnvlist_add_boolean(token_nv, "largeblockok");
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_EMBEDOK) == 0) {
- fnvlist_add_boolean(token_nv, "embedok");
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_COMPRESSOK) == 0) {
- fnvlist_add_boolean(token_nv, "compressok");
- }
- packed = fnvlist_pack(token_nv, &packed_size);
- fnvlist_free(token_nv);
- compressed = kmem_alloc(packed_size, KM_SLEEP);
-
- compressed_size = gzip_compress(packed, compressed,
- packed_size, packed_size, 6);
-
- zio_cksum_t cksum;
- fletcher_4_native(compressed, compressed_size, NULL, &cksum);
-
- str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
- for (int i = 0; i < compressed_size; i++) {
- (void) sprintf(str + i * 2, "%02x", compressed[i]);
- }
- str[compressed_size * 2] = '\0';
- char *propval = kmem_asprintf("%u-%llx-%llx-%s",
- ZFS_SEND_RESUME_TOKEN_VERSION,
- (longlong_t)cksum.zc_word[0],
- (longlong_t)packed_size, str);
- kmem_free(packed, packed_size);
- kmem_free(str, compressed_size * 2 + 1);
- kmem_free(compressed, packed_size);
- return (propval);
- }
- return (spa_strdup(""));
-}
-
-/*
- * Returns a string that represents the receive resume stats token of the
- * dataset's child. It should be freed with strfree().
- */
-char *
-get_child_receive_stats(dsl_dataset_t *ds)
-{
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
- dsl_dataset_t *recv_ds;
- dsl_dataset_name(ds, recvname);
- if (strlcat(recvname, "/", sizeof (recvname)) <
- sizeof (recvname) &&
- strlcat(recvname, recv_clone_name, sizeof (recvname)) <
- sizeof (recvname) &&
- dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
- &recv_ds) == 0) {
- char *propval = get_receive_resume_stats_impl(recv_ds);
- dsl_dataset_rele(recv_ds, FTAG);
- return (propval);
- }
- return (spa_strdup(""));
-}
-
-static void
-get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
-{
- char *propval = get_receive_resume_stats_impl(ds);
- if (strcmp(propval, "") != 0) {
- dsl_prop_nvlist_add_string(nv,
- ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
- } else {
- char *childval = get_child_receive_stats(ds);
- if (strcmp(childval, "") != 0) {
- dsl_prop_nvlist_add_string(nv,
- ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
- }
- strfree(childval);
- }
- strfree(propval);
-}
-
-uint64_t
-dsl_get_refratio(dsl_dataset_t *ds)
-{
- uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
- (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
- dsl_dataset_phys(ds)->ds_compressed_bytes);
- return (ratio);
-}
-
-uint64_t
-dsl_get_logicalreferenced(dsl_dataset_t *ds)
-{
- return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);
-}
-
-uint64_t
-dsl_get_compressratio(dsl_dataset_t *ds)
-{
- if (ds->ds_is_snapshot) {
- return (dsl_get_refratio(ds));
- } else {
- dsl_dir_t *dd = ds->ds_dir;
- mutex_enter(&dd->dd_lock);
- uint64_t val = dsl_dir_get_compressratio(dd);
- mutex_exit(&dd->dd_lock);
- return (val);
- }
-}
-
-uint64_t
-dsl_get_used(dsl_dataset_t *ds)
-{
- if (ds->ds_is_snapshot) {
- return (dsl_dataset_phys(ds)->ds_unique_bytes);
- } else {
- dsl_dir_t *dd = ds->ds_dir;
- mutex_enter(&dd->dd_lock);
- uint64_t val = dsl_dir_get_used(dd);
- mutex_exit(&dd->dd_lock);
- return (val);
- }
-}
-
-uint64_t
-dsl_get_creation(dsl_dataset_t *ds)
-{
- return (dsl_dataset_phys(ds)->ds_creation_time);
-}
-
-uint64_t
-dsl_get_creationtxg(dsl_dataset_t *ds)
-{
- return (dsl_dataset_phys(ds)->ds_creation_txg);
-}
-
-uint64_t
-dsl_get_refquota(dsl_dataset_t *ds)
-{
- return (ds->ds_quota);
-}
-
-uint64_t
-dsl_get_refreservation(dsl_dataset_t *ds)
-{
- return (ds->ds_reserved);
-}
-
-uint64_t
-dsl_get_guid(dsl_dataset_t *ds)
-{
- return (dsl_dataset_phys(ds)->ds_guid);
-}
-
-uint64_t
-dsl_get_unique(dsl_dataset_t *ds)
-{
- return (dsl_dataset_phys(ds)->ds_unique_bytes);
-}
-
-uint64_t
-dsl_get_objsetid(dsl_dataset_t *ds)
-{
- return (ds->ds_object);
-}
-
-uint64_t
-dsl_get_userrefs(dsl_dataset_t *ds)
-{
- return (ds->ds_userrefs);
-}
-
-uint64_t
-dsl_get_defer_destroy(dsl_dataset_t *ds)
-{
- return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
-}
-
-uint64_t
-dsl_get_referenced(dsl_dataset_t *ds)
-{
- return (dsl_dataset_phys(ds)->ds_referenced_bytes);
-}
-
-uint64_t
-dsl_get_numclones(dsl_dataset_t *ds)
-{
- ASSERT(ds->ds_is_snapshot);
- return (dsl_dataset_phys(ds)->ds_num_children - 1);
-}
-
-uint64_t
-dsl_get_inconsistent(dsl_dataset_t *ds)
-{
- return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
- 1 : 0);
-}
-
-uint64_t
-dsl_get_available(dsl_dataset_t *ds)
-{
- uint64_t refdbytes = dsl_get_referenced(ds);
- uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
- NULL, 0, TRUE);
- if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
- availbytes +=
- ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
- }
- if (ds->ds_quota != 0) {
- /*
- * Adjust available bytes according to refquota
- */
- if (refdbytes < ds->ds_quota) {
- availbytes = MIN(availbytes,
- ds->ds_quota - refdbytes);
- } else {
- availbytes = 0;
- }
- }
- return (availbytes);
-}
-
-int
-dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- dsl_dataset_t *prev;
- int err = dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
- if (err == 0) {
- uint64_t comp, uncomp;
- err = dsl_dataset_space_written(prev, ds, written,
- &comp, &uncomp);
- dsl_dataset_rele(prev, FTAG);
- }
- return (err);
-}
-
-/*
- * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
- */
-int
-dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
- dsl_dataset_name(ds->ds_prev, snap);
- return (0);
- } else {
- return (ENOENT);
- }
-}
-
-/*
- * Returns the mountpoint property and source for the given dataset in the value
- * and source buffers. The value buffer must be at least as large as MAXPATHLEN
- * and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN.
- * Returns 0 on success and an error on failure.
- */
-int
-dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
- char *source)
-{
- int error;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- /* Retrieve the mountpoint value stored in the zap opbject */
- error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
- ZAP_MAXVALUELEN, value, source);
- if (error != 0) {
- return (error);
- }
-
- /*
- * Process the dsname and source to find the full mountpoint string.
- * Can be skipped for 'legacy' or 'none'.
- */
- if (value[0] == '/') {
- char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
- char *root = buf;
- const char *relpath;
-
- /*
- * If we inherit the mountpoint, even from a dataset
- * with a received value, the source will be the path of
- * the dataset we inherit from. If source is
- * ZPROP_SOURCE_VAL_RECVD, the received value is not
- * inherited.
- */
- if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
- relpath = "";
- } else {
- ASSERT0(strncmp(dsname, source, strlen(source)));
- relpath = dsname + strlen(source);
- if (relpath[0] == '/')
- relpath++;
- }
-
- spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
-
- /*
- * Special case an alternate root of '/'. This will
- * avoid having multiple leading slashes in the
- * mountpoint path.
- */
- if (strcmp(root, "/") == 0)
- root++;
-
- /*
- * If the mountpoint is '/' then skip over this
- * if we are obtaining either an alternate root or
- * an inherited mountpoint.
- */
- char *mnt = value;
- if (value[1] == '\0' && (root[0] != '\0' ||
- relpath[0] != '\0'))
- mnt = value + 1;
-
- if (relpath[0] == '\0') {
- (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
- root, mnt);
- } else {
- (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
- root, mnt, relpath[0] == '@' ? "" : "/",
- relpath);
- }
- kmem_free(buf, ZAP_MAXVALUELEN);
- }
-
- return (0);
-}
-
-void
-dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- ASSERT(dsl_pool_config_held(dp));
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
- dsl_get_refratio(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
- dsl_get_logicalreferenced(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
- dsl_get_compressratio(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
- dsl_get_used(ds));
-
- if (ds->ds_is_snapshot) {
- get_clones_stat(ds, nv);
- } else {
- char buf[ZFS_MAX_DATASET_NAME_LEN];
- if (dsl_get_prev_snap(ds, buf) == 0)
- dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
- buf);
- dsl_dir_stats(ds->ds_dir, nv);
- }
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
- dsl_get_available(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
- dsl_get_referenced(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
- dsl_get_creation(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
- dsl_get_creationtxg(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
- dsl_get_refquota(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
- dsl_get_refreservation(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
- dsl_get_guid(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
- dsl_get_unique(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
- dsl_get_objsetid(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
- dsl_get_userrefs(ds));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
- dsl_get_defer_destroy(ds));
-
- if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
- uint64_t written;
- if (dsl_get_written(ds, &written) == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
- written);
- }
- }
-
- if (!dsl_dataset_is_snapshot(ds)) {
- /*
- * A failed "newfs" (e.g. full) resumable receive leaves
- * the stats set on this dataset. Check here for the prop.
- */
- get_receive_resume_stats(ds, nv);
-
- /*
- * A failed incremental resumable receive leaves the
- * stats set on our child named "%recv". Check the child
- * for the prop.
- */
- /* 6 extra bytes for /%recv */
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
- dsl_dataset_t *recv_ds;
- dsl_dataset_name(ds, recvname);
- if (strlcat(recvname, "/", sizeof (recvname)) <
- sizeof (recvname) &&
- strlcat(recvname, recv_clone_name, sizeof (recvname)) <
- sizeof (recvname) &&
- dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
- get_receive_resume_stats(recv_ds, nv);
- dsl_dataset_rele(recv_ds, FTAG);
- }
- }
-}
-
-void
-dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- ASSERT(dsl_pool_config_held(dp));
-
- stat->dds_creation_txg = dsl_get_creationtxg(ds);
- stat->dds_inconsistent = dsl_get_inconsistent(ds);
- stat->dds_guid = dsl_get_guid(ds);
- stat->dds_origin[0] = '\0';
- if (ds->ds_is_snapshot) {
- stat->dds_is_snapshot = B_TRUE;
- stat->dds_num_clones = dsl_get_numclones(ds);
- } else {
- stat->dds_is_snapshot = B_FALSE;
- stat->dds_num_clones = 0;
-
- if (dsl_dir_is_clone(ds->ds_dir)) {
- dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
- }
- }
-}
-
-uint64_t
-dsl_dataset_fsid_guid(dsl_dataset_t *ds)
-{
- return (ds->ds_fsid_guid);
-}
-
-void
-dsl_dataset_space(dsl_dataset_t *ds,
- uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp)
-{
- *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
- *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
- if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
- *availbytesp +=
- ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
- if (ds->ds_quota != 0) {
- /*
- * Adjust available bytes according to refquota
- */
- if (*refdbytesp < ds->ds_quota)
- *availbytesp = MIN(*availbytesp,
- ds->ds_quota - *refdbytesp);
- else
- *availbytesp = 0;
- }
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
- *availobjsp = DN_MAX_OBJECT - *usedobjsp;
-}
-
-boolean_t
-dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- uint64_t birth;
-
- ASSERT(dsl_pool_config_held(dp));
- if (snap == NULL)
- return (B_FALSE);
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- birth = dsl_dataset_get_blkptr(ds)->blk_birth;
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
- if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
- objset_t *os, *os_snap;
- /*
- * It may be that only the ZIL differs, because it was
- * reset in the head. Don't count that as being
- * modified.
- */
- if (dmu_objset_from_ds(ds, &os) != 0)
- return (B_TRUE);
- if (dmu_objset_from_ds(snap, &os_snap) != 0)
- return (B_TRUE);
- return (bcmp(&os->os_phys->os_meta_dnode,
- &os_snap->os_phys->os_meta_dnode,
- sizeof (os->os_phys->os_meta_dnode)) != 0);
- }
- return (B_FALSE);
-}
-
-typedef struct dsl_dataset_rename_snapshot_arg {
- const char *ddrsa_fsname;
- const char *ddrsa_oldsnapname;
- const char *ddrsa_newsnapname;
- boolean_t ddrsa_recursive;
- dmu_tx_t *ddrsa_tx;
-} dsl_dataset_rename_snapshot_arg_t;
-
-/* ARGSUSED */
-static int
-dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
- dsl_dataset_t *hds, void *arg)
-{
- dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
- int error;
- uint64_t val;
-
- error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
- if (error != 0) {
- /* ignore nonexistent snapshots */
- return (error == ENOENT ? 0 : error);
- }
-
- /* new name should not exist */
- error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
- if (error == 0)
- error = SET_ERROR(EEXIST);
- else if (error == ENOENT)
- error = 0;
-
- /* dataset name + 1 for the "@" + the new snapshot name must fit */
- if (dsl_dir_namelen(hds->ds_dir) + 1 +
- strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
- error = SET_ERROR(ENAMETOOLONG);
-
- return (error);
-}
-
-static int
-dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *hds;
- int error;
-
- error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
- if (error != 0)
- return (error);
-
- if (ddrsa->ddrsa_recursive) {
- error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
- dsl_dataset_rename_snapshot_check_impl, ddrsa,
- DS_FIND_CHILDREN);
- } else {
- error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
- }
- dsl_dataset_rele(hds, FTAG);
- return (error);
-}
-
-static int
-dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
- dsl_dataset_t *hds, void *arg)
-{
-#ifdef __FreeBSD__
-#ifdef _KERNEL
- char *oldname, *newname;
-#endif
-#endif
- dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
- dsl_dataset_t *ds;
- uint64_t val;
- dmu_tx_t *tx = ddrsa->ddrsa_tx;
- int error;
-
- error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
- ASSERT(error == 0 || error == ENOENT);
- if (error == ENOENT) {
- /* ignore nonexistent snapshots */
- return (0);
- }
-
- VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
-
- /* log before we change the name */
- spa_history_log_internal_ds(ds, "rename", tx,
- "-> @%s", ddrsa->ddrsa_newsnapname);
-
- VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
- B_FALSE));
- mutex_enter(&ds->ds_lock);
- (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
- mutex_exit(&ds->ds_lock);
- VERIFY0(zap_add(dp->dp_meta_objset,
- dsl_dataset_phys(hds)->ds_snapnames_zapobj,
- ds->ds_snapname, 8, 1, &ds->ds_object, tx));
-
-#ifdef __FreeBSD__
-#ifdef _KERNEL
- oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- snprintf(oldname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s",
- ddrsa->ddrsa_fsname, ddrsa->ddrsa_oldsnapname);
- snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s",
- ddrsa->ddrsa_fsname, ddrsa->ddrsa_newsnapname);
- zfsvfs_update_fromname(oldname, newname);
- zvol_rename_minors(dp->dp_spa, oldname, newname);
- kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN);
- kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN);
-#endif
-#endif
- dsl_dataset_rele(ds, FTAG);
-
- return (0);
-}
-
-static void
-dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *hds;
-
- VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
- ddrsa->ddrsa_tx = tx;
- if (ddrsa->ddrsa_recursive) {
- VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
- dsl_dataset_rename_snapshot_sync_impl, ddrsa,
- DS_FIND_CHILDREN));
- } else {
- VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
- }
- dsl_dataset_rele(hds, FTAG);
-}
-
-int
-dsl_dataset_rename_snapshot(const char *fsname,
- const char *oldsnapname, const char *newsnapname, boolean_t recursive)
-{
- dsl_dataset_rename_snapshot_arg_t ddrsa;
-
- ddrsa.ddrsa_fsname = fsname;
- ddrsa.ddrsa_oldsnapname = oldsnapname;
- ddrsa.ddrsa_newsnapname = newsnapname;
- ddrsa.ddrsa_recursive = recursive;
-
- return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
- dsl_dataset_rename_snapshot_sync, &ddrsa,
- 1, ZFS_SPACE_CHECK_RESERVED));
-}
-
-/*
- * If we're doing an ownership handoff, we need to make sure that there is
- * only one long hold on the dataset. We're not allowed to change anything here
- * so we don't permanently release the long hold or regular hold here. We want
- * to do this only when syncing to avoid the dataset unexpectedly going away
- * when we release the long hold.
- */
-static int
-dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
-{
- boolean_t held;
-
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- if (owner != NULL) {
- VERIFY3P(ds->ds_owner, ==, owner);
- dsl_dataset_long_rele(ds, owner);
- }
-
- held = dsl_dataset_long_held(ds);
-
- if (owner != NULL)
- dsl_dataset_long_hold(ds, owner);
-
- if (held)
- return (SET_ERROR(EBUSY));
-
- return (0);
-}
-
-int
-dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_rollback_arg_t *ddra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- int64_t unused_refres_delta;
- int error;
-
- error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
- if (error != 0)
- return (error);
-
- /* must not be a snapshot */
- if (ds->ds_is_snapshot) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- /* must have a most recent snapshot */
- if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ESRCH));
- }
-
- /*
- * No rollback to a snapshot created in the current txg, because
- * the rollback may dirty the dataset and create blocks that are
- * not reachable from the rootbp while having a birth txg that
- * falls into the snapshot's range.
- */
- if (dmu_tx_is_syncing(tx) &&
- dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EAGAIN));
- }
-
- /*
- * If the expected target snapshot is specified, then check that
- * the latest snapshot is it.
- */
- if (ddra->ddra_tosnap != NULL) {
- dsl_dataset_t *snapds;
-
- /* Check if the target snapshot exists at all. */
- error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
- if (error != 0) {
- /*
- * ESRCH is used to signal that the target snapshot does
- * not exist, while ENOENT is used to report that
- * the rolled back dataset does not exist.
- * ESRCH is also used to cover other cases where the
- * target snapshot is not related to the dataset being
- * rolled back such as being in a different pool.
- */
- if (error == ENOENT || error == EXDEV)
- error = SET_ERROR(ESRCH);
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
- ASSERT(snapds->ds_is_snapshot);
-
- /* Check if the snapshot is the latest snapshot indeed. */
- if (snapds != ds->ds_prev) {
- /*
- * Distinguish between the case where the only problem
- * is intervening snapshots (EEXIST) vs the snapshot
- * not being a valid target for rollback (ESRCH).
- */
- if (snapds->ds_dir == ds->ds_dir ||
- (dsl_dir_is_clone(ds->ds_dir) &&
- dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
- snapds->ds_object)) {
- error = SET_ERROR(EEXIST);
- } else {
- error = SET_ERROR(ESRCH);
- }
- dsl_dataset_rele(snapds, FTAG);
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
- dsl_dataset_rele(snapds, FTAG);
- }
-
- /* must not have any bookmarks after the most recent snapshot */
- nvlist_t *proprequest = fnvlist_alloc();
- fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
- nvlist_t *bookmarks = fnvlist_alloc();
- error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
- fnvlist_free(proprequest);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
- for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
- pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
- nvlist_t *valuenv =
- fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
- zfs_prop_to_name(ZFS_PROP_CREATETXG));
- uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
- if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
- fnvlist_free(bookmarks);
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EEXIST));
- }
- }
- fnvlist_free(bookmarks);
-
- error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- /*
- * Check if the snap we are rolling back to uses more than
- * the refquota.
- */
- if (ds->ds_quota != 0 &&
- dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EDQUOT));
- }
-
- /*
- * When we do the clone swap, we will temporarily use more space
- * due to the refreservation (the head will no longer have any
- * unique space, so the entire amount of the refreservation will need
- * to be free). We will immediately destroy the clone, freeing
- * this space, but the freeing happens over many txg's.
- */
- unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
- dsl_dataset_phys(ds)->ds_unique_bytes);
-
- if (unused_refres_delta > 0 &&
- unused_refres_delta >
- dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-void
-dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_rollback_arg_t *ddra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds, *clone;
- uint64_t cloneobj;
- char namebuf[ZFS_MAX_DATASET_NAME_LEN];
-
- VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
-
- dsl_dataset_name(ds->ds_prev, namebuf);
- fnvlist_add_string(ddra->ddra_result, "target", namebuf);
-
- cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
- ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
-
- VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
-
- dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
- dsl_dataset_zero_zil(ds, tx);
-
- dsl_destroy_head_sync_impl(clone, tx);
-
- dsl_dataset_rele(clone, FTAG);
- dsl_dataset_rele(ds, FTAG);
-}
-
-/*
- * Rolls back the given filesystem or volume to the most recent snapshot.
- * The name of the most recent snapshot will be returned under key "target"
- * in the result nvlist.
- *
- * If owner != NULL:
- * - The existing dataset MUST be owned by the specified owner at entry
- * - Upon return, dataset will still be held by the same owner, whether we
- * succeed or not.
- *
- * This mode is required any time the existing filesystem is mounted. See
- * notes above zfs_suspend_fs() for further details.
- */
-int
-dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
- nvlist_t *result)
-{
- dsl_dataset_rollback_arg_t ddra;
-
- ddra.ddra_fsname = fsname;
- ddra.ddra_tosnap = tosnap;
- ddra.ddra_owner = owner;
- ddra.ddra_result = result;
-
- return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
- dsl_dataset_rollback_sync, &ddra,
- 1, ZFS_SPACE_CHECK_RESERVED));
-}
-
-struct promotenode {
- list_node_t link;
- dsl_dataset_t *ds;
-};
-
-static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
-static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
- void *tag);
-static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
-
-int
-dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_promote_arg_t *ddpa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *hds;
- struct promotenode *snap;
- dsl_dataset_t *origin_ds;
- int err;
- uint64_t unused;
- uint64_t ss_mv_cnt;
- size_t max_snap_len;
- boolean_t conflicting_snaps;
-
- err = promote_hold(ddpa, dp, FTAG);
- if (err != 0)
- return (err);
-
- hds = ddpa->ddpa_clone;
- snap = list_head(&ddpa->shared_snaps);
- origin_ds = snap->ds;
- max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
-
- snap = list_head(&ddpa->origin_snaps);
-
- if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
- promote_rele(ddpa, FTAG);
- return (SET_ERROR(EXDEV));
- }
-
- /*
- * Compute and check the amount of space to transfer. Since this is
- * so expensive, don't do the preliminary check.
- */
- if (!dmu_tx_is_syncing(tx)) {
- promote_rele(ddpa, FTAG);
- return (0);
- }
-
- /* compute origin's new unique space */
- snap = list_tail(&ddpa->clone_snaps);
- ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
- origin_ds->ds_object);
- dsl_deadlist_space_range(&snap->ds->ds_deadlist,
- dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
- &ddpa->unique, &unused, &unused);
-
- /*
- * Walk the snapshots that we are moving
- *
- * Compute space to transfer. Consider the incremental changes
- * to used by each snapshot:
- * (my used) = (prev's used) + (blocks born) - (blocks killed)
- * So each snapshot gave birth to:
- * (blocks born) = (my used) - (prev's used) + (blocks killed)
- * So a sequence would look like:
- * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
- * Which simplifies to:
- * uN + kN + kN-1 + ... + k1 + k0
- * Note however, if we stop before we reach the ORIGIN we get:
- * uN + kN + kN-1 + ... + kM - uM-1
- */
- conflicting_snaps = B_FALSE;
- ss_mv_cnt = 0;
- ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
- ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
- ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
- for (snap = list_head(&ddpa->shared_snaps); snap;
- snap = list_next(&ddpa->shared_snaps, snap)) {
- uint64_t val, dlused, dlcomp, dluncomp;
- dsl_dataset_t *ds = snap->ds;
-
- ss_mv_cnt++;
-
- /*
- * If there are long holds, we won't be able to evict
- * the objset.
- */
- if (dsl_dataset_long_held(ds)) {
- err = SET_ERROR(EBUSY);
- goto out;
- }
-
- /* Check that the snapshot name does not conflict */
- VERIFY0(dsl_dataset_get_snapname(ds));
- if (strlen(ds->ds_snapname) >= max_snap_len) {
- err = SET_ERROR(ENAMETOOLONG);
- goto out;
- }
- err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
- if (err == 0) {
- fnvlist_add_boolean(ddpa->err_ds,
- snap->ds->ds_snapname);
- conflicting_snaps = B_TRUE;
- } else if (err != ENOENT) {
- goto out;
- }
-
- /* The very first snapshot does not have a deadlist */
- if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
- continue;
-
- dsl_deadlist_space(&ds->ds_deadlist,
- &dlused, &dlcomp, &dluncomp);
- ddpa->used += dlused;
- ddpa->comp += dlcomp;
- ddpa->uncomp += dluncomp;
- }
-
- /*
- * In order to return the full list of conflicting snapshots, we check
- * whether there was a conflict after traversing all of them.
- */
- if (conflicting_snaps) {
- err = SET_ERROR(EEXIST);
- goto out;
- }
-
- /*
- * If we are a clone of a clone then we never reached ORIGIN,
- * so we need to subtract out the clone origin's used space.
- */
- if (ddpa->origin_origin) {
- ddpa->used -=
- dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
- ddpa->comp -=
- dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
- ddpa->uncomp -=
- dsl_dataset_phys(ddpa->origin_origin)->
- ds_uncompressed_bytes;
- }
-
- /* Check that there is enough space and limit headroom here */
- err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
- 0, ss_mv_cnt, ddpa->used, ddpa->cr);
- if (err != 0)
- goto out;
-
- /*
- * Compute the amounts of space that will be used by snapshots
- * after the promotion (for both origin and clone). For each,
- * it is the amount of space that will be on all of their
- * deadlists (that was not born before their new origin).
- */
- if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
- uint64_t space;
-
- /*
- * Note, typically this will not be a clone of a clone,
- * so dd_origin_txg will be < TXG_INITIAL, so
- * these snaplist_space() -> dsl_deadlist_space_range()
- * calls will be fast because they do not have to
- * iterate over all bps.
- */
- snap = list_head(&ddpa->origin_snaps);
- err = snaplist_space(&ddpa->shared_snaps,
- snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
- if (err != 0)
- goto out;
-
- err = snaplist_space(&ddpa->clone_snaps,
- snap->ds->ds_dir->dd_origin_txg, &space);
- if (err != 0)
- goto out;
- ddpa->cloneusedsnap += space;
- }
- if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
- DD_FLAG_USED_BREAKDOWN) {
- err = snaplist_space(&ddpa->origin_snaps,
- dsl_dataset_phys(origin_ds)->ds_creation_txg,
- &ddpa->originusedsnap);
- if (err != 0)
- goto out;
- }
-
-out:
- promote_rele(ddpa, FTAG);
- return (err);
-}
-
-void
-dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_promote_arg_t *ddpa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *hds;
- struct promotenode *snap;
- dsl_dataset_t *origin_ds;
- dsl_dataset_t *origin_head;
- dsl_dir_t *dd;
- dsl_dir_t *odd = NULL;
- uint64_t oldnext_obj;
- int64_t delta;
-#if defined(__FreeBSD__) && defined(_KERNEL)
- char *oldname, *newname;
-#endif
-
- VERIFY0(promote_hold(ddpa, dp, FTAG));
- hds = ddpa->ddpa_clone;
-
- ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
-
- snap = list_head(&ddpa->shared_snaps);
- origin_ds = snap->ds;
- dd = hds->ds_dir;
-
- snap = list_head(&ddpa->origin_snaps);
- origin_head = snap->ds;
-
- /*
- * We need to explicitly open odd, since origin_ds's dd will be
- * changing.
- */
- VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
- NULL, FTAG, &odd));
-
- /* change origin's next snap */
- dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
- oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
- snap = list_tail(&ddpa->clone_snaps);
- ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
- origin_ds->ds_object);
- dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
-
- /* change the origin's next clone */
- if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
- dsl_dataset_remove_from_next_clones(origin_ds,
- snap->ds->ds_object, tx);
- VERIFY0(zap_add_int(dp->dp_meta_objset,
- dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
- oldnext_obj, tx));
- }
-
- /* change origin */
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
- dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
- dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
- dmu_buf_will_dirty(odd->dd_dbuf, tx);
- dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
- origin_head->ds_dir->dd_origin_txg =
- dsl_dataset_phys(origin_ds)->ds_creation_txg;
-
- /* change dd_clone entries */
- if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
- VERIFY0(zap_remove_int(dp->dp_meta_objset,
- dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
- VERIFY0(zap_add_int(dp->dp_meta_objset,
- dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
- hds->ds_object, tx));
-
- VERIFY0(zap_remove_int(dp->dp_meta_objset,
- dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
- origin_head->ds_object, tx));
- if (dsl_dir_phys(dd)->dd_clones == 0) {
- dsl_dir_phys(dd)->dd_clones =
- zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
- DMU_OT_NONE, 0, tx);
- }
- VERIFY0(zap_add_int(dp->dp_meta_objset,
- dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
- }
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
-#endif
-
- /* move snapshots to this dir */
- for (snap = list_head(&ddpa->shared_snaps); snap;
- snap = list_next(&ddpa->shared_snaps, snap)) {
- dsl_dataset_t *ds = snap->ds;
-
- /*
- * Property callbacks are registered to a particular
- * dsl_dir. Since ours is changing, evict the objset
- * so that they will be unregistered from the old dsl_dir.
- */
- if (ds->ds_objset) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- dsl_dataset_name(ds, oldname);
-#endif
-
- /* move snap name entry */
- VERIFY0(dsl_dataset_get_snapname(ds));
- VERIFY0(dsl_dataset_snap_remove(origin_head,
- ds->ds_snapname, tx, B_TRUE));
- VERIFY0(zap_add(dp->dp_meta_objset,
- dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
- 8, 1, &ds->ds_object, tx));
- dsl_fs_ss_count_adjust(hds->ds_dir, 1,
- DD_FIELD_SNAPSHOT_COUNT, tx);
-
- /* change containing dsl_dir */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
- dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
- ASSERT3P(ds->ds_dir, ==, odd);
- dsl_dir_rele(ds->ds_dir, ds);
- VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
- NULL, ds, &ds->ds_dir));
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- dsl_dataset_name(ds, newname);
- zfsvfs_update_fromname(oldname, newname);
- zvol_rename_minors(dp->dp_spa, oldname, newname);
-#endif
-
- /* move any clone references */
- if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
- spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
- zap_cursor_t zc;
- zap_attribute_t za;
-
- for (zap_cursor_init(&zc, dp->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_next_clones_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- dsl_dataset_t *cnds;
- uint64_t o;
-
- if (za.za_first_integer == oldnext_obj) {
- /*
- * We've already moved the
- * origin's reference.
- */
- continue;
- }
-
- VERIFY0(dsl_dataset_hold_obj(dp,
- za.za_first_integer, FTAG, &cnds));
- o = dsl_dir_phys(cnds->ds_dir)->
- dd_head_dataset_obj;
-
- VERIFY0(zap_remove_int(dp->dp_meta_objset,
- dsl_dir_phys(odd)->dd_clones, o, tx));
- VERIFY0(zap_add_int(dp->dp_meta_objset,
- dsl_dir_phys(dd)->dd_clones, o, tx));
- dsl_dataset_rele(cnds, FTAG);
- }
- zap_cursor_fini(&zc);
- }
-
- ASSERT(!dsl_prop_hascb(ds));
- }
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN);
- kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN);
-#endif
- /*
- * Change space accounting.
- * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
- * both be valid, or both be 0 (resulting in delta == 0). This
- * is true for each of {clone,origin} independently.
- */
-
- delta = ddpa->cloneusedsnap -
- dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
- ASSERT3S(delta, >=, 0);
- ASSERT3U(ddpa->used, >=, delta);
- dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
- dsl_dir_diduse_space(dd, DD_USED_HEAD,
- ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
-
- delta = ddpa->originusedsnap -
- dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
- ASSERT3S(delta, <=, 0);
- ASSERT3U(ddpa->used, >=, -delta);
- dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
- dsl_dir_diduse_space(odd, DD_USED_HEAD,
- -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
-
- dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
-
- /* log history record */
- spa_history_log_internal_ds(hds, "promote", tx, "");
-
- dsl_dir_rele(odd, FTAG);
- promote_rele(ddpa, FTAG);
-}
-
-/*
- * Make a list of dsl_dataset_t's for the snapshots between first_obj
- * (exclusive) and last_obj (inclusive). The list will be in reverse
- * order (last_obj will be the list_head()). If first_obj == 0, do all
- * snapshots back to this dataset's origin.
- */
-static int
-snaplist_make(dsl_pool_t *dp,
- uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
-{
- uint64_t obj = last_obj;
-
- list_create(l, sizeof (struct promotenode),
- offsetof(struct promotenode, link));
-
- while (obj != first_obj) {
- dsl_dataset_t *ds;
- struct promotenode *snap;
- int err;
-
- err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
- ASSERT(err != ENOENT);
- if (err != 0)
- return (err);
-
- if (first_obj == 0)
- first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
-
- snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
- snap->ds = ds;
- list_insert_tail(l, snap);
- obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
- }
-
- return (0);
-}
-
-static int
-snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
-{
- struct promotenode *snap;
-
- *spacep = 0;
- for (snap = list_head(l); snap; snap = list_next(l, snap)) {
- uint64_t used, comp, uncomp;
- dsl_deadlist_space_range(&snap->ds->ds_deadlist,
- mintxg, UINT64_MAX, &used, &comp, &uncomp);
- *spacep += used;
- }
- return (0);
-}
-
-static void
-snaplist_destroy(list_t *l, void *tag)
-{
- struct promotenode *snap;
-
- if (l == NULL || !list_link_active(&l->list_head))
- return;
-
- while ((snap = list_tail(l)) != NULL) {
- list_remove(l, snap);
- dsl_dataset_rele(snap->ds, tag);
- kmem_free(snap, sizeof (*snap));
- }
- list_destroy(l);
-}
-
-static int
-promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
-{
- int error;
- dsl_dir_t *dd;
- struct promotenode *snap;
-
- error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
- &ddpa->ddpa_clone);
- if (error != 0)
- return (error);
- dd = ddpa->ddpa_clone->ds_dir;
-
- if (ddpa->ddpa_clone->ds_is_snapshot ||
- !dsl_dir_is_clone(dd)) {
- dsl_dataset_rele(ddpa->ddpa_clone, tag);
- return (SET_ERROR(EINVAL));
- }
-
- error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
- &ddpa->shared_snaps, tag);
- if (error != 0)
- goto out;
-
- error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
- &ddpa->clone_snaps, tag);
- if (error != 0)
- goto out;
-
- snap = list_head(&ddpa->shared_snaps);
- ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
- error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
- dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
- &ddpa->origin_snaps, tag);
- if (error != 0)
- goto out;
-
- if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
- error = dsl_dataset_hold_obj(dp,
- dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
- tag, &ddpa->origin_origin);
- if (error != 0)
- goto out;
- }
-out:
- if (error != 0)
- promote_rele(ddpa, tag);
- return (error);
-}
-
-static void
-promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
-{
- snaplist_destroy(&ddpa->shared_snaps, tag);
- snaplist_destroy(&ddpa->clone_snaps, tag);
- snaplist_destroy(&ddpa->origin_snaps, tag);
- if (ddpa->origin_origin != NULL)
- dsl_dataset_rele(ddpa->origin_origin, tag);
- dsl_dataset_rele(ddpa->ddpa_clone, tag);
-}
-
-/*
- * Promote a clone.
- *
- * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
- * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
- */
-int
-dsl_dataset_promote(const char *name, char *conflsnap)
-{
- dsl_dataset_promote_arg_t ddpa = { 0 };
- uint64_t numsnaps;
- int error;
- nvpair_t *snap_pair;
- objset_t *os;
-
- /*
- * We will modify space proportional to the number of
- * snapshots. Compute numsnaps.
- */
- error = dmu_objset_hold(name, FTAG, &os);
- if (error != 0)
- return (error);
- error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
- dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
- &numsnaps);
- dmu_objset_rele(os, FTAG);
- if (error != 0)
- return (error);
-
- ddpa.ddpa_clonename = name;
- ddpa.err_ds = fnvlist_alloc();
- ddpa.cr = CRED();
-
- error = dsl_sync_task(name, dsl_dataset_promote_check,
- dsl_dataset_promote_sync, &ddpa,
- 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);
-
- /*
- * Return the first conflicting snapshot found.
- */
- snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
- if (snap_pair != NULL && conflsnap != NULL)
- (void) strcpy(conflsnap, nvpair_name(snap_pair));
-
- fnvlist_free(ddpa.err_ds);
- return (error);
-}
-
-int
-dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
- dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
-{
- /*
- * "slack" factor for received datasets with refquota set on them.
- * See the bottom of this function for details on its use.
- */
- uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
- int64_t unused_refres_delta;
-
- /* they should both be heads */
- if (clone->ds_is_snapshot ||
- origin_head->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- /* if we are not forcing, the branch point should be just before them */
- if (!force && clone->ds_prev != origin_head->ds_prev)
- return (SET_ERROR(EINVAL));
-
- /* clone should be the clone (unless they are unrelated) */
- if (clone->ds_prev != NULL &&
- clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
- origin_head->ds_dir != clone->ds_prev->ds_dir)
- return (SET_ERROR(EINVAL));
-
- /* the clone should be a child of the origin */
- if (clone->ds_dir->dd_parent != origin_head->ds_dir)
- return (SET_ERROR(EINVAL));
-
- /* origin_head shouldn't be modified unless 'force' */
- if (!force &&
- dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
- return (SET_ERROR(ETXTBSY));
-
- /* origin_head should have no long holds (e.g. is not mounted) */
- if (dsl_dataset_handoff_check(origin_head, owner, tx))
- return (SET_ERROR(EBUSY));
-
- /* check amount of any unconsumed refreservation */
- unused_refres_delta =
- (int64_t)MIN(origin_head->ds_reserved,
- dsl_dataset_phys(origin_head)->ds_unique_bytes) -
- (int64_t)MIN(origin_head->ds_reserved,
- dsl_dataset_phys(clone)->ds_unique_bytes);
-
- if (unused_refres_delta > 0 &&
- unused_refres_delta >
- dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
- return (SET_ERROR(ENOSPC));
-
- /*
- * The clone can't be too much over the head's refquota.
- *
- * To ensure that the entire refquota can be used, we allow one
- * transaction to exceed the the refquota. Therefore, this check
- * needs to also allow for the space referenced to be more than the
- * refquota. The maximum amount of space that one transaction can use
- * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this
- * overage ensures that we are able to receive a filesystem that
- * exceeds the refquota on the source system.
- *
- * So that overage is the refquota_slack we use below.
- */
- if (origin_head->ds_quota != 0 &&
- dsl_dataset_phys(clone)->ds_referenced_bytes >
- origin_head->ds_quota + refquota_slack)
- return (SET_ERROR(EDQUOT));
-
- return (0);
-}
-
-static void
-dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
- dsl_dataset_t *origin, dmu_tx_t *tx)
-{
- uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
- dsl_pool_t *dp = dmu_tx_pool(tx);
-
- ASSERT(dsl_pool_sync_context(dp));
-
- clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
- origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
-
- if (clone_remap_dl_obj != 0) {
- dsl_deadlist_close(&clone->ds_remap_deadlist);
- dsl_dataset_unset_remap_deadlist_object(clone, tx);
- }
- if (origin_remap_dl_obj != 0) {
- dsl_deadlist_close(&origin->ds_remap_deadlist);
- dsl_dataset_unset_remap_deadlist_object(origin, tx);
- }
-
- if (clone_remap_dl_obj != 0) {
- dsl_dataset_set_remap_deadlist_object(origin,
- clone_remap_dl_obj, tx);
- dsl_deadlist_open(&origin->ds_remap_deadlist,
- dp->dp_meta_objset, clone_remap_dl_obj);
- }
- if (origin_remap_dl_obj != 0) {
- dsl_dataset_set_remap_deadlist_object(clone,
- origin_remap_dl_obj, tx);
- dsl_deadlist_open(&clone->ds_remap_deadlist,
- dp->dp_meta_objset, origin_remap_dl_obj);
- }
-}
-
-void
-dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
- dsl_dataset_t *origin_head, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dmu_tx_pool(tx);
- int64_t unused_refres_delta;
-
- ASSERT(clone->ds_reserved == 0);
- /*
- * NOTE: On DEBUG kernels there could be a race between this and
- * the check function if spa_asize_inflation is adjusted...
- */
- ASSERT(origin_head->ds_quota == 0 ||
- dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
- DMU_MAX_ACCESS * spa_asize_inflation);
- ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
-
- /*
- * Swap per-dataset feature flags.
- */
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (!(spa_feature_table[f].fi_flags &
- ZFEATURE_FLAG_PER_DATASET)) {
- ASSERT(!clone->ds_feature_inuse[f]);
- ASSERT(!origin_head->ds_feature_inuse[f]);
- continue;
- }
-
- boolean_t clone_inuse = clone->ds_feature_inuse[f];
- boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
-
- if (clone_inuse) {
- dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
- clone->ds_feature_inuse[f] = B_FALSE;
- }
- if (origin_head_inuse) {
- dsl_dataset_deactivate_feature(origin_head->ds_object,
- f, tx);
- origin_head->ds_feature_inuse[f] = B_FALSE;
- }
- if (clone_inuse) {
- dsl_dataset_activate_feature(origin_head->ds_object,
- f, tx);
- origin_head->ds_feature_inuse[f] = B_TRUE;
- }
- if (origin_head_inuse) {
- dsl_dataset_activate_feature(clone->ds_object, f, tx);
- clone->ds_feature_inuse[f] = B_TRUE;
- }
- }
-
- dmu_buf_will_dirty(clone->ds_dbuf, tx);
- dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
-
- if (clone->ds_objset != NULL) {
- dmu_objset_evict(clone->ds_objset);
- clone->ds_objset = NULL;
- }
-
- if (origin_head->ds_objset != NULL) {
- dmu_objset_evict(origin_head->ds_objset);
- origin_head->ds_objset = NULL;
- }
-
- unused_refres_delta =
- (int64_t)MIN(origin_head->ds_reserved,
- dsl_dataset_phys(origin_head)->ds_unique_bytes) -
- (int64_t)MIN(origin_head->ds_reserved,
- dsl_dataset_phys(clone)->ds_unique_bytes);
-
- /*
- * Reset origin's unique bytes, if it exists.
- */
- if (clone->ds_prev) {
- dsl_dataset_t *origin = clone->ds_prev;
- uint64_t comp, uncomp;
-
- dmu_buf_will_dirty(origin->ds_dbuf, tx);
- dsl_deadlist_space_range(&clone->ds_deadlist,
- dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
- &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
- }
-
- /* swap blkptrs */
- {
- rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
- rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
- blkptr_t tmp;
- tmp = dsl_dataset_phys(origin_head)->ds_bp;
- dsl_dataset_phys(origin_head)->ds_bp =
- dsl_dataset_phys(clone)->ds_bp;
- dsl_dataset_phys(clone)->ds_bp = tmp;
- rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
- rrw_exit(&clone->ds_bp_rwlock, FTAG);
- }
-
- /* set dd_*_bytes */
- {
- int64_t dused, dcomp, duncomp;
- uint64_t cdl_used, cdl_comp, cdl_uncomp;
- uint64_t odl_used, odl_comp, odl_uncomp;
-
- ASSERT3U(dsl_dir_phys(clone->ds_dir)->
- dd_used_breakdown[DD_USED_SNAP], ==, 0);
-
- dsl_deadlist_space(&clone->ds_deadlist,
- &cdl_used, &cdl_comp, &cdl_uncomp);
- dsl_deadlist_space(&origin_head->ds_deadlist,
- &odl_used, &odl_comp, &odl_uncomp);
-
- dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
- cdl_used -
- (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
- odl_used);
- dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
- cdl_comp -
- (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
- odl_comp);
- duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
- cdl_uncomp -
- (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
- odl_uncomp);
-
- dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
- dused, dcomp, duncomp, tx);
- dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
- -dused, -dcomp, -duncomp, tx);
-
- /*
- * The difference in the space used by snapshots is the
- * difference in snapshot space due to the head's
- * deadlist (since that's the only thing that's
- * changing that affects the snapused).
- */
- dsl_deadlist_space_range(&clone->ds_deadlist,
- origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
- &cdl_used, &cdl_comp, &cdl_uncomp);
- dsl_deadlist_space_range(&origin_head->ds_deadlist,
- origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
- &odl_used, &odl_comp, &odl_uncomp);
- dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
- DD_USED_HEAD, DD_USED_SNAP, NULL);
- }
-
- /* swap ds_*_bytes */
- SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
- dsl_dataset_phys(clone)->ds_referenced_bytes);
- SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
- dsl_dataset_phys(clone)->ds_compressed_bytes);
- SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
- dsl_dataset_phys(clone)->ds_uncompressed_bytes);
- SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
- dsl_dataset_phys(clone)->ds_unique_bytes);
-
- /* apply any parent delta for change in unconsumed refreservation */
- dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
- unused_refres_delta, 0, 0, tx);
-
- /*
- * Swap deadlists.
- */
- dsl_deadlist_close(&clone->ds_deadlist);
- dsl_deadlist_close(&origin_head->ds_deadlist);
- SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
- dsl_dataset_phys(clone)->ds_deadlist_obj);
- dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
- dsl_dataset_phys(clone)->ds_deadlist_obj);
- dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
- dsl_dataset_phys(origin_head)->ds_deadlist_obj);
- dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
-
- dsl_scan_ds_clone_swapped(origin_head, clone, tx);
-
- spa_history_log_internal_ds(clone, "clone swap", tx,
- "parent=%s", origin_head->ds_dir->dd_myname);
-}
-
-/*
- * Given a pool name and a dataset object number in that pool,
- * return the name of that dataset.
- */
-int
-dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- int error;
-
- error = dsl_pool_hold(pname, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
- if (error == 0) {
- dsl_dataset_name(ds, buf);
- dsl_dataset_rele(ds, FTAG);
- }
- dsl_pool_rele(dp, FTAG);
-
- return (error);
-}
-
-int
-dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
- uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
-{
- int error = 0;
-
- ASSERT3S(asize, >, 0);
-
- /*
- * *ref_rsrv is the portion of asize that will come from any
- * unconsumed refreservation space.
- */
- *ref_rsrv = 0;
-
- mutex_enter(&ds->ds_lock);
- /*
- * Make a space adjustment for reserved bytes.
- */
- if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
- ASSERT3U(*used, >=,
- ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
- *used -=
- (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
- *ref_rsrv =
- asize - MIN(asize, parent_delta(ds, asize + inflight));
- }
-
- if (!check_quota || ds->ds_quota == 0) {
- mutex_exit(&ds->ds_lock);
- return (0);
- }
- /*
- * If they are requesting more space, and our current estimate
- * is over quota, they get to try again unless the actual
- * on-disk is over quota and there are no pending changes (which
- * may free up space for us).
- */
- if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
- ds->ds_quota) {
- if (inflight > 0 ||
- dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
- error = SET_ERROR(ERESTART);
- else
- error = SET_ERROR(EDQUOT);
- }
- mutex_exit(&ds->ds_lock);
-
- return (error);
-}
-
-typedef struct dsl_dataset_set_qr_arg {
- const char *ddsqra_name;
- zprop_source_t ddsqra_source;
- uint64_t ddsqra_value;
-} dsl_dataset_set_qr_arg_t;
-
-
-/* ARGSUSED */
-static int
-dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- int error;
- uint64_t newval;
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
- return (SET_ERROR(ENOTSUP));
-
- error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
- if (error != 0)
- return (error);
-
- if (ds->ds_is_snapshot) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- error = dsl_prop_predict(ds->ds_dir,
- zfs_prop_to_name(ZFS_PROP_REFQUOTA),
- ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- if (newval == 0) {
- dsl_dataset_rele(ds, FTAG);
- return (0);
- }
-
- if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
- newval < ds->ds_reserved) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- uint64_t newval;
-
- VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
-
- dsl_prop_set_sync_impl(ds,
- zfs_prop_to_name(ZFS_PROP_REFQUOTA),
- ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
- &ddsqra->ddsqra_value, tx);
-
- VERIFY0(dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
-
- if (ds->ds_quota != newval) {
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_quota = newval;
- }
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
- uint64_t refquota)
-{
- dsl_dataset_set_qr_arg_t ddsqra;
-
- ddsqra.ddsqra_name = dsname;
- ddsqra.ddsqra_source = source;
- ddsqra.ddsqra_value = refquota;
-
- return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
- dsl_dataset_set_refquota_sync, &ddsqra, 0,
- ZFS_SPACE_CHECK_EXTRA_RESERVED));
-}
-
-static int
-dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- int error;
- uint64_t newval, unique;
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
- return (SET_ERROR(ENOTSUP));
-
- error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
- if (error != 0)
- return (error);
-
- if (ds->ds_is_snapshot) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- error = dsl_prop_predict(ds->ds_dir,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
- ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- /*
- * If we are doing the preliminary check in open context, the
- * space estimates may be inaccurate.
- */
- if (!dmu_tx_is_syncing(tx)) {
- dsl_dataset_rele(ds, FTAG);
- return (0);
- }
-
- mutex_enter(&ds->ds_lock);
- if (!DS_UNIQUE_IS_ACCURATE(ds))
- dsl_dataset_recalc_head_uniq(ds);
- unique = dsl_dataset_phys(ds)->ds_unique_bytes;
- mutex_exit(&ds->ds_lock);
-
- if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
- uint64_t delta = MAX(unique, newval) -
- MAX(unique, ds->ds_reserved);
-
- if (delta >
- dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
- (ds->ds_quota > 0 && newval > ds->ds_quota)) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENOSPC));
- }
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-void
-dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
- zprop_source_t source, uint64_t value, dmu_tx_t *tx)
-{
- uint64_t newval;
- uint64_t unique;
- int64_t delta;
-
- dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
- source, sizeof (value), 1, &value, tx);
-
- VERIFY0(dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- mutex_enter(&ds->ds_dir->dd_lock);
- mutex_enter(&ds->ds_lock);
- ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
- unique = dsl_dataset_phys(ds)->ds_unique_bytes;
- delta = MAX(0, (int64_t)(newval - unique)) -
- MAX(0, (int64_t)(ds->ds_reserved - unique));
- ds->ds_reserved = newval;
- mutex_exit(&ds->ds_lock);
-
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
- mutex_exit(&ds->ds_dir->dd_lock);
-}
-
-static void
-dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
- dsl_dataset_set_refreservation_sync_impl(ds,
- ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
- uint64_t refreservation)
-{
- dsl_dataset_set_qr_arg_t ddsqra;
-
- ddsqra.ddsqra_name = dsname;
- ddsqra.ddsqra_source = source;
- ddsqra.ddsqra_value = refreservation;
-
- return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
- dsl_dataset_set_refreservation_sync, &ddsqra, 0,
- ZFS_SPACE_CHECK_EXTRA_RESERVED));
-}
-
-/*
- * Return (in *usedp) the amount of space written in new that is not
- * present in oldsnap. New may be a snapshot or the head. Old must be
- * a snapshot before new, in new's filesystem (or its origin). If not then
- * fail and return EINVAL.
- *
- * The written space is calculated by considering two components: First, we
- * ignore any freed space, and calculate the written as new's used space
- * minus old's used space. Next, we add in the amount of space that was freed
- * between the two snapshots, thus reducing new's used space relative to old's.
- * Specifically, this is the space that was born before old->ds_creation_txg,
- * and freed before new (ie. on new's deadlist or a previous deadlist).
- *
- * space freed [---------------------]
- * snapshots ---O-------O--------O-------O------
- * oldsnap new
- */
-int
-dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- int err = 0;
- uint64_t snapobj;
- dsl_pool_t *dp = new->ds_dir->dd_pool;
-
- ASSERT(dsl_pool_config_held(dp));
-
- *usedp = 0;
- *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
- *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
-
- *compp = 0;
- *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
- *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
-
- *uncompp = 0;
- *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
- *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
-
- snapobj = new->ds_object;
- while (snapobj != oldsnap->ds_object) {
- dsl_dataset_t *snap;
- uint64_t used, comp, uncomp;
-
- if (snapobj == new->ds_object) {
- snap = new;
- } else {
- err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
- if (err != 0)
- break;
- }
-
- if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
- dsl_dataset_phys(oldsnap)->ds_creation_txg) {
- /*
- * The blocks in the deadlist can not be born after
- * ds_prev_snap_txg, so get the whole deadlist space,
- * which is more efficient (especially for old-format
- * deadlists). Unfortunately the deadlist code
- * doesn't have enough information to make this
- * optimization itself.
- */
- dsl_deadlist_space(&snap->ds_deadlist,
- &used, &comp, &uncomp);
- } else {
- dsl_deadlist_space_range(&snap->ds_deadlist,
- 0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
- &used, &comp, &uncomp);
- }
- *usedp += used;
- *compp += comp;
- *uncompp += uncomp;
-
- /*
- * If we get to the beginning of the chain of snapshots
- * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
- * was not a snapshot of/before new.
- */
- snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
- if (snap != new)
- dsl_dataset_rele(snap, FTAG);
- if (snapobj == 0) {
- err = SET_ERROR(EINVAL);
- break;
- }
-
- }
- return (err);
-}
-
-/*
- * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
- * lastsnap, and all snapshots in between are deleted.
- *
- * blocks that would be freed [---------------------------]
- * snapshots ---O-------O--------O-------O--------O
- * firstsnap lastsnap
- *
- * This is the set of blocks that were born after the snap before firstsnap,
- * (birth > firstsnap->prev_snap_txg) and died before the snap after the
- * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
- * We calculate this by iterating over the relevant deadlists (from the snap
- * after lastsnap, backward to the snap after firstsnap), summing up the
- * space on the deadlist that was born after the snap before firstsnap.
- */
-int
-dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
- dsl_dataset_t *lastsnap,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- int err = 0;
- uint64_t snapobj;
- dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
-
- ASSERT(firstsnap->ds_is_snapshot);
- ASSERT(lastsnap->ds_is_snapshot);
-
- /*
- * Check that the snapshots are in the same dsl_dir, and firstsnap
- * is before lastsnap.
- */
- if (firstsnap->ds_dir != lastsnap->ds_dir ||
- dsl_dataset_phys(firstsnap)->ds_creation_txg >
- dsl_dataset_phys(lastsnap)->ds_creation_txg)
- return (SET_ERROR(EINVAL));
-
- *usedp = *compp = *uncompp = 0;
-
- snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
- while (snapobj != firstsnap->ds_object) {
- dsl_dataset_t *ds;
- uint64_t used, comp, uncomp;
-
- err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
- if (err != 0)
- break;
-
- dsl_deadlist_space_range(&ds->ds_deadlist,
- dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
- &used, &comp, &uncomp);
- *usedp += used;
- *compp += comp;
- *uncompp += uncomp;
-
- snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
- ASSERT3U(snapobj, !=, 0);
- dsl_dataset_rele(ds, FTAG);
- }
- return (err);
-}
-
-/*
- * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
- * For example, they could both be snapshots of the same filesystem, and
- * 'earlier' is before 'later'. Or 'earlier' could be the origin of
- * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
- * filesystem. Or 'earlier' could be the origin's origin.
- *
- * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
- */
-boolean_t
-dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
- uint64_t earlier_txg)
-{
- dsl_pool_t *dp = later->ds_dir->dd_pool;
- int error;
- boolean_t ret;
-
- ASSERT(dsl_pool_config_held(dp));
- ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
-
- if (earlier_txg == 0)
- earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
-
- if (later->ds_is_snapshot &&
- earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
- return (B_FALSE);
-
- if (later->ds_dir == earlier->ds_dir)
- return (B_TRUE);
- if (!dsl_dir_is_clone(later->ds_dir))
- return (B_FALSE);
-
- if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
- return (B_TRUE);
- dsl_dataset_t *origin;
- error = dsl_dataset_hold_obj(dp,
- dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
- if (error != 0)
- return (B_FALSE);
- ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
- dsl_dataset_rele(origin, FTAG);
- return (ret);
-}
-
-void
-dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
-}
-
-boolean_t
-dsl_dataset_is_zapified(dsl_dataset_t *ds)
-{
- dmu_object_info_t doi;
-
- dmu_object_info_from_db(ds->ds_dbuf, &doi);
- return (doi.doi_type == DMU_OTN_ZAP_METADATA);
-}
-
-boolean_t
-dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
-{
- return (dsl_dataset_is_zapified(ds) &&
- zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
-}
-
-uint64_t
-dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
-{
- uint64_t remap_deadlist_obj;
- int err;
-
- if (!dsl_dataset_is_zapified(ds))
- return (0);
-
- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
- DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
- &remap_deadlist_obj);
-
- if (err != 0) {
- VERIFY3S(err, ==, ENOENT);
- return (0);
- }
-
- ASSERT(remap_deadlist_obj != 0);
- return (remap_deadlist_obj);
-}
-
-boolean_t
-dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
-{
- EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
- dsl_dataset_get_remap_deadlist_object(ds) != 0);
- return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
-}
-
-static void
-dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
- dmu_tx_t *tx)
-{
- ASSERT(obj != 0);
- dsl_dataset_zapify(ds, tx);
- VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
- DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
-}
-
-static void
-dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
-}
-
-void
-dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- uint64_t remap_deadlist_object;
- spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dsl_dataset_remap_deadlist_exists(ds));
-
- remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
- dsl_deadlist_close(&ds->ds_remap_deadlist);
- dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
- dsl_dataset_unset_remap_deadlist_object(ds, tx);
- spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
-}
-
-void
-dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- uint64_t remap_deadlist_obj;
- spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
- /*
- * Currently we only create remap deadlists when there are indirect
- * vdevs with referenced mappings.
- */
- ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
-
- remap_deadlist_obj = dsl_deadlist_clone(
- &ds->ds_deadlist, UINT64_MAX,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
- dsl_dataset_set_remap_deadlist_object(ds,
- remap_deadlist_obj, tx);
- dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
- remap_deadlist_obj);
- spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
@@ -1,561 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/dsl_dataset.h>
-#include <sys/dmu.h>
-#include <sys/refcount.h>
-#include <sys/zap.h>
-#include <sys/zfs_context.h>
-#include <sys/dsl_pool.h>
-
-/*
- * Deadlist concurrency:
- *
- * Deadlists can only be modified from the syncing thread.
- *
- * Except for dsl_deadlist_insert(), it can only be modified with the
- * dp_config_rwlock held with RW_WRITER.
- *
- * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
- * be called concurrently, from open context, with the dl_config_rwlock held
- * with RW_READER.
- *
- * Therefore, we only need to provide locking between dsl_deadlist_insert() and
- * the accessors, protecting:
- * dl_phys->dl_used,comp,uncomp
- * and protecting the dl_tree from being loaded.
- * The locking is provided by dl_lock. Note that locking on the bpobj_t
- * provides its own locking, and dl_oldfmt is immutable.
- */
-
-static int
-dsl_deadlist_compare(const void *arg1, const void *arg2)
-{
- const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1;
- const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2;
-
- return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
-}
-
-static void
-dsl_deadlist_load_tree(dsl_deadlist_t *dl)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
-
- ASSERT(MUTEX_HELD(&dl->dl_lock));
-
- ASSERT(!dl->dl_oldfmt);
- if (dl->dl_havetree)
- return;
-
- avl_create(&dl->dl_tree, dsl_deadlist_compare,
- sizeof (dsl_deadlist_entry_t),
- offsetof(dsl_deadlist_entry_t, dle_node));
- for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
- dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
- VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
- za.za_first_integer));
- avl_add(&dl->dl_tree, dle);
- }
- zap_cursor_fini(&zc);
- dl->dl_havetree = B_TRUE;
-}
-
-void
-dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
-{
- dmu_object_info_t doi;
-
- ASSERT(!dsl_deadlist_is_open(dl));
-
- mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
- dl->dl_os = os;
- dl->dl_object = object;
- VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
- dmu_object_info_from_db(dl->dl_dbuf, &doi);
- if (doi.doi_type == DMU_OT_BPOBJ) {
- dmu_buf_rele(dl->dl_dbuf, dl);
- dl->dl_dbuf = NULL;
- dl->dl_oldfmt = B_TRUE;
- VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
- return;
- }
-
- dl->dl_oldfmt = B_FALSE;
- dl->dl_phys = dl->dl_dbuf->db_data;
- dl->dl_havetree = B_FALSE;
-}
-
-boolean_t
-dsl_deadlist_is_open(dsl_deadlist_t *dl)
-{
- return (dl->dl_os != NULL);
-}
-
-void
-dsl_deadlist_close(dsl_deadlist_t *dl)
-{
- void *cookie = NULL;
- dsl_deadlist_entry_t *dle;
-
- ASSERT(dsl_deadlist_is_open(dl));
-
- if (dl->dl_oldfmt) {
- dl->dl_oldfmt = B_FALSE;
- bpobj_close(&dl->dl_bpobj);
- dl->dl_os = NULL;
- dl->dl_object = 0;
- return;
- }
-
- if (dl->dl_havetree) {
- while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
- != NULL) {
- bpobj_close(&dle->dle_bpobj);
- kmem_free(dle, sizeof (*dle));
- }
- avl_destroy(&dl->dl_tree);
- }
- dmu_buf_rele(dl->dl_dbuf, dl);
- mutex_destroy(&dl->dl_lock);
- dl->dl_dbuf = NULL;
- dl->dl_phys = NULL;
- dl->dl_os = NULL;
- dl->dl_object = 0;
-}
-
-uint64_t
-dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
-{
- if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
- return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
- return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
- sizeof (dsl_deadlist_phys_t), tx));
-}
-
-void
-dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
-{
- dmu_object_info_t doi;
- zap_cursor_t zc;
- zap_attribute_t za;
-
- VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
- if (doi.doi_type == DMU_OT_BPOBJ) {
- bpobj_free(os, dlobj, tx);
- return;
- }
-
- for (zap_cursor_init(&zc, os, dlobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- uint64_t obj = za.za_first_integer;
- if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
- bpobj_decr_empty(os, tx);
- else
- bpobj_free(os, obj, tx);
- }
- zap_cursor_fini(&zc);
- VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
-}
-
-static void
-dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
- const blkptr_t *bp, dmu_tx_t *tx)
-{
- ASSERT(MUTEX_HELD(&dl->dl_lock));
- if (dle->dle_bpobj.bpo_object ==
- dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
- uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
- bpobj_close(&dle->dle_bpobj);
- bpobj_decr_empty(dl->dl_os, tx);
- VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
- VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
- dle->dle_mintxg, obj, tx));
- }
- bpobj_enqueue(&dle->dle_bpobj, bp, tx);
-}
-
-static void
-dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
- uint64_t obj, dmu_tx_t *tx)
-{
- ASSERT(MUTEX_HELD(&dl->dl_lock));
- if (dle->dle_bpobj.bpo_object !=
- dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
- bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
- } else {
- bpobj_close(&dle->dle_bpobj);
- bpobj_decr_empty(dl->dl_os, tx);
- VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
- VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
- dle->dle_mintxg, obj, tx));
- }
-}
-
-void
-dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
-{
- dsl_deadlist_entry_t dle_tofind;
- dsl_deadlist_entry_t *dle;
- avl_index_t where;
-
- if (dl->dl_oldfmt) {
- bpobj_enqueue(&dl->dl_bpobj, bp, tx);
- return;
- }
-
- mutex_enter(&dl->dl_lock);
- dsl_deadlist_load_tree(dl);
-
- dmu_buf_will_dirty(dl->dl_dbuf, tx);
- dl->dl_phys->dl_used +=
- bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
- dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
- dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
-
- dle_tofind.dle_mintxg = bp->blk_birth;
- dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
- if (dle == NULL)
- dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
- else
- dle = AVL_PREV(&dl->dl_tree, dle);
- dle_enqueue(dl, dle, bp, tx);
- mutex_exit(&dl->dl_lock);
-}
-
-/*
- * Insert new key in deadlist, which must be > all current entries.
- * mintxg is not inclusive.
- */
-void
-dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
-{
- uint64_t obj;
- dsl_deadlist_entry_t *dle;
-
- if (dl->dl_oldfmt)
- return;
-
- dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
- dle->dle_mintxg = mintxg;
-
- mutex_enter(&dl->dl_lock);
- dsl_deadlist_load_tree(dl);
-
- obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
- VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
- avl_add(&dl->dl_tree, dle);
-
- VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
- mintxg, obj, tx));
- mutex_exit(&dl->dl_lock);
-}
-
-/*
- * Remove this key, merging its entries into the previous key.
- */
-void
-dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
-{
- dsl_deadlist_entry_t dle_tofind;
- dsl_deadlist_entry_t *dle, *dle_prev;
-
- if (dl->dl_oldfmt)
- return;
-
- mutex_enter(&dl->dl_lock);
- dsl_deadlist_load_tree(dl);
-
- dle_tofind.dle_mintxg = mintxg;
- dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
- dle_prev = AVL_PREV(&dl->dl_tree, dle);
-
- dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
-
- avl_remove(&dl->dl_tree, dle);
- bpobj_close(&dle->dle_bpobj);
- kmem_free(dle, sizeof (*dle));
-
- VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
- mutex_exit(&dl->dl_lock);
-}
-
-/*
- * Walk ds's snapshots to regenerate generate ZAP & AVL.
- */
-static void
-dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
- uint64_t mrs_obj, dmu_tx_t *tx)
-{
- dsl_deadlist_t dl = { 0 };
- dsl_pool_t *dp = dmu_objset_pool(os);
-
- dsl_deadlist_open(&dl, os, dlobj);
- if (dl.dl_oldfmt) {
- dsl_deadlist_close(&dl);
- return;
- }
-
- while (mrs_obj != 0) {
- dsl_dataset_t *ds;
- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
- dsl_deadlist_add_key(&dl,
- dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
- mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
- dsl_dataset_rele(ds, FTAG);
- }
- dsl_deadlist_close(&dl);
-}
-
-uint64_t
-dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
- uint64_t mrs_obj, dmu_tx_t *tx)
-{
- dsl_deadlist_entry_t *dle;
- uint64_t newobj;
-
- newobj = dsl_deadlist_alloc(dl->dl_os, tx);
-
- if (dl->dl_oldfmt) {
- dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
- return (newobj);
- }
-
- mutex_enter(&dl->dl_lock);
- dsl_deadlist_load_tree(dl);
-
- for (dle = avl_first(&dl->dl_tree); dle;
- dle = AVL_NEXT(&dl->dl_tree, dle)) {
- uint64_t obj;
-
- if (dle->dle_mintxg >= maxtxg)
- break;
-
- obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
- VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
- dle->dle_mintxg, obj, tx));
- }
- mutex_exit(&dl->dl_lock);
- return (newobj);
-}
-
-void
-dsl_deadlist_space(dsl_deadlist_t *dl,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- ASSERT(dsl_deadlist_is_open(dl));
- if (dl->dl_oldfmt) {
- VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
- usedp, compp, uncompp));
- return;
- }
-
- mutex_enter(&dl->dl_lock);
- *usedp = dl->dl_phys->dl_used;
- *compp = dl->dl_phys->dl_comp;
- *uncompp = dl->dl_phys->dl_uncomp;
- mutex_exit(&dl->dl_lock);
-}
-
-/*
- * return space used in the range (mintxg, maxtxg].
- * Includes maxtxg, does not include mintxg.
- * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
- * larger than any bp in the deadlist (eg. UINT64_MAX)).
- */
-void
-dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- dsl_deadlist_entry_t *dle;
- dsl_deadlist_entry_t dle_tofind;
- avl_index_t where;
-
- if (dl->dl_oldfmt) {
- VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
- mintxg, maxtxg, usedp, compp, uncompp));
- return;
- }
-
- *usedp = *compp = *uncompp = 0;
-
- mutex_enter(&dl->dl_lock);
- dsl_deadlist_load_tree(dl);
- dle_tofind.dle_mintxg = mintxg;
- dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
- /*
- * If we don't find this mintxg, there shouldn't be anything
- * after it either.
- */
- ASSERT(dle != NULL ||
- avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
-
- for (; dle && dle->dle_mintxg < maxtxg;
- dle = AVL_NEXT(&dl->dl_tree, dle)) {
- uint64_t used, comp, uncomp;
-
- VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
- &used, &comp, &uncomp));
-
- *usedp += used;
- *compp += comp;
- *uncompp += uncomp;
- }
- mutex_exit(&dl->dl_lock);
-}
-
-static void
-dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
- dmu_tx_t *tx)
-{
- dsl_deadlist_entry_t dle_tofind;
- dsl_deadlist_entry_t *dle;
- avl_index_t where;
- uint64_t used, comp, uncomp;
- bpobj_t bpo;
-
- ASSERT(MUTEX_HELD(&dl->dl_lock));
-
- VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
- VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
- bpobj_close(&bpo);
-
- dsl_deadlist_load_tree(dl);
-
- dmu_buf_will_dirty(dl->dl_dbuf, tx);
- dl->dl_phys->dl_used += used;
- dl->dl_phys->dl_comp += comp;
- dl->dl_phys->dl_uncomp += uncomp;
-
- dle_tofind.dle_mintxg = birth;
- dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
- if (dle == NULL)
- dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
- dle_enqueue_subobj(dl, dle, obj, tx);
-}
-
-static int
-dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- dsl_deadlist_t *dl = arg;
- dsl_deadlist_insert(dl, bp, tx);
- return (0);
-}
-
-/*
- * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
- * an empty deadlist.
- */
-void
-dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- dmu_buf_t *bonus;
- dsl_deadlist_phys_t *dlp;
- dmu_object_info_t doi;
-
- VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
- if (doi.doi_type == DMU_OT_BPOBJ) {
- bpobj_t bpo;
- VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
- VERIFY3U(0, ==, bpobj_iterate(&bpo,
- dsl_deadlist_insert_cb, dl, tx));
- bpobj_close(&bpo);
- return;
- }
-
- mutex_enter(&dl->dl_lock);
- for (zap_cursor_init(&zc, dl->dl_os, obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
- dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
- VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
- }
- zap_cursor_fini(&zc);
-
- VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
- dlp = bonus->db_data;
- dmu_buf_will_dirty(bonus, tx);
- bzero(dlp, sizeof (*dlp));
- dmu_buf_rele(bonus, FTAG);
- mutex_exit(&dl->dl_lock);
-}
-
-/*
- * Remove entries on dl that are >= mintxg, and put them on the bpobj.
- */
-void
-dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
- dmu_tx_t *tx)
-{
- dsl_deadlist_entry_t dle_tofind;
- dsl_deadlist_entry_t *dle;
- avl_index_t where;
-
- ASSERT(!dl->dl_oldfmt);
-
- mutex_enter(&dl->dl_lock);
- dmu_buf_will_dirty(dl->dl_dbuf, tx);
- dsl_deadlist_load_tree(dl);
-
- dle_tofind.dle_mintxg = mintxg;
- dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
- if (dle == NULL)
- dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
- while (dle) {
- uint64_t used, comp, uncomp;
- dsl_deadlist_entry_t *dle_next;
-
- bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
-
- VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
- &used, &comp, &uncomp));
- ASSERT3U(dl->dl_phys->dl_used, >=, used);
- ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
- ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
- dl->dl_phys->dl_used -= used;
- dl->dl_phys->dl_comp -= comp;
- dl->dl_phys->dl_uncomp -= uncomp;
-
- VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
- dle->dle_mintxg, tx));
-
- dle_next = AVL_NEXT(&dl->dl_tree, dle);
- avl_remove(&dl->dl_tree, dle);
- bpobj_close(&dle->dle_bpobj);
- kmem_free(dle, sizeof (*dle));
- dle = dle_next;
- }
- mutex_exit(&dl->dl_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
@@ -1,760 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- */
-
-/*
- * DSL permissions are stored in a two level zap attribute
- * mechanism. The first level identifies the "class" of
- * entry. The class is identified by the first 2 letters of
- * the attribute. The second letter "l" or "d" identifies whether
- * it is a local or descendent permission. The first letter
- * identifies the type of entry.
- *
- * ul$<id> identifies permissions granted locally for this userid.
- * ud$<id> identifies permissions granted on descendent datasets for
- * this userid.
- * Ul$<id> identifies permission sets granted locally for this userid.
- * Ud$<id> identifies permission sets granted on descendent datasets for
- * this userid.
- * gl$<id> identifies permissions granted locally for this groupid.
- * gd$<id> identifies permissions granted on descendent datasets for
- * this groupid.
- * Gl$<id> identifies permission sets granted locally for this groupid.
- * Gd$<id> identifies permission sets granted on descendent datasets for
- * this groupid.
- * el$ identifies permissions granted locally for everyone.
- * ed$ identifies permissions granted on descendent datasets
- * for everyone.
- * El$ identifies permission sets granted locally for everyone.
- * Ed$ identifies permission sets granted to descendent datasets for
- * everyone.
- * c-$ identifies permission to create at dataset creation time.
- * C-$ identifies permission sets to grant locally at dataset creation
- * time.
- * s-$@<name> permissions defined in specified set @<name>
- * S-$@<name> Sets defined in named set @<name>
- *
- * Each of the above entities points to another zap attribute that contains one
- * attribute for each allowed permission, such as create, destroy,...
- * All of the "upper" case class types will specify permission set names
- * rather than permissions.
- *
- * Basically it looks something like this:
- * ul$12 -> ZAP OBJ -> permissions...
- *
- * The ZAP OBJ is referred to as the jump object.
- */
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_deleg.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-#include <sys/cred.h>
-#include <sys/sunddi.h>
-
-#include "zfs_deleg.h"
-
-/*
- * Validate that user is allowed to delegate specified permissions.
- *
- * In order to delegate "create" you must have "create"
- * and "allow".
- */
-int
-dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
-{
- nvpair_t *whopair = NULL;
- int error;
-
- if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
- return (error);
-
- while (whopair = nvlist_next_nvpair(nvp, whopair)) {
- nvlist_t *perms;
- nvpair_t *permpair = NULL;
-
- VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
-
- while (permpair = nvlist_next_nvpair(perms, permpair)) {
- const char *perm = nvpair_name(permpair);
-
- if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
- return (SET_ERROR(EPERM));
-
- if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
- return (error);
- }
- }
- return (0);
-}
-
-/*
- * Validate that user is allowed to unallow specified permissions. They
- * must have the 'allow' permission, and even then can only unallow
- * perms for their uid.
- */
-int
-dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
-{
- nvpair_t *whopair = NULL;
- int error;
- char idstr[32];
-
- if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
- return (error);
-
- (void) snprintf(idstr, sizeof (idstr), "%lld",
- (longlong_t)crgetuid(cr));
-
- while (whopair = nvlist_next_nvpair(nvp, whopair)) {
- zfs_deleg_who_type_t type = nvpair_name(whopair)[0];
-
- if (type != ZFS_DELEG_USER &&
- type != ZFS_DELEG_USER_SETS)
- return (SET_ERROR(EPERM));
-
- if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
- return (SET_ERROR(EPERM));
- }
- return (0);
-}
-
-typedef struct dsl_deleg_arg {
- const char *dda_name;
- nvlist_t *dda_nvlist;
-} dsl_deleg_arg_t;
-
-static void
-dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_deleg_arg_t *dda = arg;
- dsl_dir_t *dd;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
- nvpair_t *whopair = NULL;
- uint64_t zapobj;
-
- VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
-
- zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
- if (zapobj == 0) {
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
- DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
- }
-
- while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
- const char *whokey = nvpair_name(whopair);
- nvlist_t *perms;
- nvpair_t *permpair = NULL;
- uint64_t jumpobj;
-
- perms = fnvpair_value_nvlist(whopair);
-
- if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
- jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
- zapobj, whokey, tx);
- }
-
- while (permpair = nvlist_next_nvpair(perms, permpair)) {
- const char *perm = nvpair_name(permpair);
- uint64_t n = 0;
-
- VERIFY(zap_update(mos, jumpobj,
- perm, 8, 1, &n, tx) == 0);
- spa_history_log_internal_dd(dd, "permission update", tx,
- "%s %s", whokey, perm);
- }
- }
- dsl_dir_rele(dd, FTAG);
-}
-
-static void
-dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_deleg_arg_t *dda = arg;
- dsl_dir_t *dd;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
- nvpair_t *whopair = NULL;
- uint64_t zapobj;
-
- VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
- zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
- if (zapobj == 0) {
- dsl_dir_rele(dd, FTAG);
- return;
- }
-
- while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
- const char *whokey = nvpair_name(whopair);
- nvlist_t *perms;
- nvpair_t *permpair = NULL;
- uint64_t jumpobj;
-
- if (nvpair_value_nvlist(whopair, &perms) != 0) {
- if (zap_lookup(mos, zapobj, whokey, 8,
- 1, &jumpobj) == 0) {
- (void) zap_remove(mos, zapobj, whokey, tx);
- VERIFY(0 == zap_destroy(mos, jumpobj, tx));
- }
- spa_history_log_internal_dd(dd, "permission who remove",
- tx, "%s", whokey);
- continue;
- }
-
- if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
- continue;
-
- while (permpair = nvlist_next_nvpair(perms, permpair)) {
- const char *perm = nvpair_name(permpair);
- uint64_t n = 0;
-
- (void) zap_remove(mos, jumpobj, perm, tx);
- if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
- (void) zap_remove(mos, zapobj,
- whokey, tx);
- VERIFY(0 == zap_destroy(mos,
- jumpobj, tx));
- }
- spa_history_log_internal_dd(dd, "permission remove", tx,
- "%s %s", whokey, perm);
- }
- }
- dsl_dir_rele(dd, FTAG);
-}
-
-static int
-dsl_deleg_check(void *arg, dmu_tx_t *tx)
-{
- dsl_deleg_arg_t *dda = arg;
- dsl_dir_t *dd;
- int error;
-
- if (spa_version(dmu_tx_pool(tx)->dp_spa) <
- SPA_VERSION_DELEGATED_PERMS) {
- return (SET_ERROR(ENOTSUP));
- }
-
- error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
- if (error == 0)
- dsl_dir_rele(dd, FTAG);
- return (error);
-}
-
-int
-dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
-{
- dsl_deleg_arg_t dda;
-
- /* nvp must already have been verified to be valid */
-
- dda.dda_name = ddname;
- dda.dda_nvlist = nvp;
-
- return (dsl_sync_task(ddname, dsl_deleg_check,
- unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
- &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
-}
-
-/*
- * Find all 'allow' permissions from a given point and then continue
- * traversing up to the root.
- *
- * This function constructs an nvlist of nvlists.
- * each setpoint is an nvlist composed of an nvlist of an nvlist
- * of the individual * users/groups/everyone/create
- * permissions.
- *
- * The nvlist will look like this.
- *
- * { source fsname -> { whokeys { permissions,...}, ...}}
- *
- * The fsname nvpairs will be arranged in a bottom up order. For example,
- * if we have the following structure a/b/c then the nvpairs for the fsnames
- * will be ordered a/b/c, a/b, a.
- */
-int
-dsl_deleg_get(const char *ddname, nvlist_t **nvp)
-{
- dsl_dir_t *dd, *startdd;
- dsl_pool_t *dp;
- int error;
- objset_t *mos;
-
- error = dsl_pool_hold(ddname, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- dp = startdd->dd_pool;
- mos = dp->dp_meta_objset;
-
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
- zap_cursor_t basezc;
- zap_attribute_t baseza;
- nvlist_t *sp_nvp;
- uint64_t n;
- char source[ZFS_MAX_DATASET_NAME_LEN];
-
- if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
- zap_count(mos,
- dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
- continue;
-
- sp_nvp = fnvlist_alloc();
- for (zap_cursor_init(&basezc, mos,
- dsl_dir_phys(dd)->dd_deleg_zapobj);
- zap_cursor_retrieve(&basezc, &baseza) == 0;
- zap_cursor_advance(&basezc)) {
- zap_cursor_t zc;
- zap_attribute_t za;
- nvlist_t *perms_nvp;
-
- ASSERT(baseza.za_integer_length == 8);
- ASSERT(baseza.za_num_integers == 1);
-
- perms_nvp = fnvlist_alloc();
- for (zap_cursor_init(&zc, mos, baseza.za_first_integer);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- fnvlist_add_boolean(perms_nvp, za.za_name);
- }
- zap_cursor_fini(&zc);
- fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp);
- fnvlist_free(perms_nvp);
- }
-
- zap_cursor_fini(&basezc);
-
- dsl_dir_name(dd, source);
- fnvlist_add_nvlist(*nvp, source, sp_nvp);
- nvlist_free(sp_nvp);
- }
-
- dsl_dir_rele(startdd, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (0);
-}
-
-/*
- * Routines for dsl_deleg_access() -- access checking.
- */
-typedef struct perm_set {
- avl_node_t p_node;
- boolean_t p_matched;
- char p_setname[ZFS_MAX_DELEG_NAME];
-} perm_set_t;
-
-static int
-perm_set_compare(const void *arg1, const void *arg2)
-{
- const perm_set_t *node1 = (const perm_set_t *)arg1;
- const perm_set_t *node2 = (const perm_set_t *)arg2;
- int val;
-
- val = strcmp(node1->p_setname, node2->p_setname);
-
- return (AVL_ISIGN(val));
-}
-
-/*
- * Determine whether a specified permission exists.
- *
- * First the base attribute has to be retrieved. i.e. ul$12
- * Once the base object has been retrieved the actual permission
- * is lookup up in the zap object the base object points to.
- *
- * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if
- * there is no perm in that jumpobj.
- */
-static int
-dsl_check_access(objset_t *mos, uint64_t zapobj,
- char type, char checkflag, void *valp, const char *perm)
-{
- int error;
- uint64_t jumpobj, zero;
- char whokey[ZFS_MAX_DELEG_NAME];
-
- zfs_deleg_whokey(whokey, type, checkflag, valp);
- error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
- if (error == 0) {
- error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
- if (error == ENOENT)
- error = SET_ERROR(EPERM);
- }
- return (error);
-}
-
-/*
- * check a specified user/group for a requested permission
- */
-static int
-dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
- int checkflag, cred_t *cr)
-{
- const gid_t *gids;
- int ngids;
- int i;
- uint64_t id;
-
- /* check for user */
- id = crgetuid(cr);
- if (dsl_check_access(mos, zapobj,
- ZFS_DELEG_USER, checkflag, &id, perm) == 0)
- return (0);
-
- /* check for users primary group */
- id = crgetgid(cr);
- if (dsl_check_access(mos, zapobj,
- ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
- return (0);
-
- /* check for everyone entry */
- id = -1;
- if (dsl_check_access(mos, zapobj,
- ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0)
- return (0);
-
- /* check each supplemental group user is a member of */
- ngids = crgetngroups(cr);
- gids = crgetgroups(cr);
- for (i = 0; i != ngids; i++) {
- id = gids[i];
- if (dsl_check_access(mos, zapobj,
- ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
- return (0);
- }
-
- return (SET_ERROR(EPERM));
-}
-
-/*
- * Iterate over the sets specified in the specified zapobj
- * and load them into the permsets avl tree.
- */
-static int
-dsl_load_sets(objset_t *mos, uint64_t zapobj,
- char type, char checkflag, void *valp, avl_tree_t *avl)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- perm_set_t *permnode;
- avl_index_t idx;
- uint64_t jumpobj;
- int error;
- char whokey[ZFS_MAX_DELEG_NAME];
-
- zfs_deleg_whokey(whokey, type, checkflag, valp);
-
- error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
- if (error != 0)
- return (error);
-
- for (zap_cursor_init(&zc, mos, jumpobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
- (void) strlcpy(permnode->p_setname, za.za_name,
- sizeof (permnode->p_setname));
- permnode->p_matched = B_FALSE;
-
- if (avl_find(avl, permnode, &idx) == NULL) {
- avl_insert(avl, permnode, idx);
- } else {
- kmem_free(permnode, sizeof (perm_set_t));
- }
- }
- zap_cursor_fini(&zc);
- return (0);
-}
-
-/*
- * Load all permissions user based on cred belongs to.
- */
-static void
-dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
- char checkflag, cred_t *cr)
-{
- const gid_t *gids;
- int ngids, i;
- uint64_t id;
-
- id = crgetuid(cr);
- (void) dsl_load_sets(mos, zapobj,
- ZFS_DELEG_USER_SETS, checkflag, &id, avl);
-
- id = crgetgid(cr);
- (void) dsl_load_sets(mos, zapobj,
- ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
-
- (void) dsl_load_sets(mos, zapobj,
- ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
-
- ngids = crgetngroups(cr);
- gids = crgetgroups(cr);
- for (i = 0; i != ngids; i++) {
- id = gids[i];
- (void) dsl_load_sets(mos, zapobj,
- ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
- }
-}
-
-/*
- * Check if user has requested permission.
- */
-int
-dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
-{
- dsl_dir_t *dd;
- dsl_pool_t *dp;
- void *cookie;
- int error;
- char checkflag;
- objset_t *mos;
- avl_tree_t permsets;
- perm_set_t *setnode;
-
- dp = ds->ds_dir->dd_pool;
- mos = dp->dp_meta_objset;
-
- if (dsl_delegation_on(mos) == B_FALSE)
- return (SET_ERROR(ECANCELED));
-
- if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
- SPA_VERSION_DELEGATED_PERMS)
- return (SET_ERROR(EPERM));
-
- if (ds->ds_is_snapshot) {
- /*
- * Snapshots are treated as descendents only,
- * local permissions do not apply.
- */
- checkflag = ZFS_DELEG_DESCENDENT;
- } else {
- checkflag = ZFS_DELEG_LOCAL;
- }
-
- avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
- offsetof(perm_set_t, p_node));
-
- ASSERT(dsl_pool_config_held(dp));
- for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
- checkflag = ZFS_DELEG_DESCENDENT) {
- uint64_t zapobj;
- boolean_t expanded;
-
- /*
- * If not in global zone then make sure
- * the zoned property is set
- */
- if (!INGLOBALZONE(curthread)) {
- uint64_t zoned;
-
- if (dsl_prop_get_dd(dd,
- zfs_prop_to_name(ZFS_PROP_ZONED),
- 8, 1, &zoned, NULL, B_FALSE) != 0)
- break;
- if (!zoned)
- break;
- }
- zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
-
- if (zapobj == 0)
- continue;
-
- dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr);
-again:
- expanded = B_FALSE;
- for (setnode = avl_first(&permsets); setnode;
- setnode = AVL_NEXT(&permsets, setnode)) {
- if (setnode->p_matched == B_TRUE)
- continue;
-
- /* See if this set directly grants this permission */
- error = dsl_check_access(mos, zapobj,
- ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm);
- if (error == 0)
- goto success;
- if (error == EPERM)
- setnode->p_matched = B_TRUE;
-
- /* See if this set includes other sets */
- error = dsl_load_sets(mos, zapobj,
- ZFS_DELEG_NAMED_SET_SETS, 0,
- setnode->p_setname, &permsets);
- if (error == 0)
- setnode->p_matched = expanded = B_TRUE;
- }
- /*
- * If we expanded any sets, that will define more sets,
- * which we need to check.
- */
- if (expanded)
- goto again;
-
- error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr);
- if (error == 0)
- goto success;
- }
- error = SET_ERROR(EPERM);
-success:
-
- cookie = NULL;
- while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
- kmem_free(setnode, sizeof (perm_set_t));
-
- return (error);
-}
-
-int
-dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- int error;
-
- error = dsl_pool_hold(dsname, FTAG, &dp);
- if (error != 0)
- return (error);
- error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (error == 0) {
- error = dsl_deleg_access_impl(ds, perm, cr);
- dsl_dataset_rele(ds, FTAG);
- }
- dsl_pool_rele(dp, FTAG);
-
- return (error);
-}
-
-/*
- * Other routines.
- */
-
-static void
-copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
- boolean_t dosets, uint64_t uid, dmu_tx_t *tx)
-{
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t jumpobj, pjumpobj;
- uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
- zap_cursor_t zc;
- zap_attribute_t za;
- char whokey[ZFS_MAX_DELEG_NAME];
-
- zfs_deleg_whokey(whokey,
- dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE,
- ZFS_DELEG_LOCAL, NULL);
- if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0)
- return;
-
- if (zapobj == 0) {
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
- DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
- }
-
- zfs_deleg_whokey(whokey,
- dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER,
- ZFS_DELEG_LOCAL, &uid);
- if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) {
- jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
- VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0);
- }
-
- for (zap_cursor_init(&zc, mos, pjumpobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- uint64_t zero = 0;
- ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
-
- VERIFY(zap_update(mos, jumpobj, za.za_name,
- 8, 1, &zero, tx) == 0);
- }
- zap_cursor_fini(&zc);
-}
-
-/*
- * set all create time permission on new dataset.
- */
-void
-dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
-{
- dsl_dir_t *dd;
- uint64_t uid = crgetuid(cr);
-
- if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) <
- SPA_VERSION_DELEGATED_PERMS)
- return;
-
- for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
- uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
-
- if (pzapobj == 0)
- continue;
-
- copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx);
- copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx);
- }
-}
-
-int
-dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
-
- if (zapobj == 0)
- return (0);
-
- for (zap_cursor_init(&zc, mos, zapobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
- VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx));
- }
- zap_cursor_fini(&zc);
- VERIFY(0 == zap_destroy(mos, zapobj, tx));
- return (0);
-}
-
-boolean_t
-dsl_delegation_on(objset_t *os)
-{
- return (!!spa_delegation(os->os_spa));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
@@ -1,1097 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dsl_userhold.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_scan.h>
-#include <sys/dmu_objset.h>
-#include <sys/zap.h>
-#include <sys/zfeature.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/dsl_deleg.h>
-#include <sys/dmu_impl.h>
-#include <sys/zcp.h>
-#if defined(__FreeBSD__) && defined(_KERNEL)
-#include <sys/zvol.h>
-#endif
-
-
-int
-dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
-{
- if (!ds->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- if (dsl_dataset_long_held(ds))
- return (SET_ERROR(EBUSY));
-
- /*
- * Only allow deferred destroy on pools that support it.
- * NOTE: deferred destroy is only supported on snapshots.
- */
- if (defer) {
- if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
- SPA_VERSION_USERREFS)
- return (SET_ERROR(ENOTSUP));
- return (0);
- }
-
- /*
- * If this snapshot has an elevated user reference count,
- * we can't destroy it yet.
- */
- if (ds->ds_userrefs > 0)
- return (SET_ERROR(EBUSY));
-
- /*
- * Can't delete a branch point.
- */
- if (dsl_dataset_phys(ds)->ds_num_children > 1)
- return (SET_ERROR(EEXIST));
-
- return (0);
-}
-
-int
-dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
-{
- dsl_destroy_snapshot_arg_t *ddsa = arg;
- const char *dsname = ddsa->ddsa_name;
- boolean_t defer = ddsa->ddsa_defer;
-
- dsl_pool_t *dp = dmu_tx_pool(tx);
- int error = 0;
- dsl_dataset_t *ds;
-
- error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
-
- /*
- * If the snapshot does not exist, silently ignore it, and
- * dsl_destroy_snapshot_sync() will be a no-op
- * (it's "already destroyed").
- */
- if (error == ENOENT)
- return (0);
-
- if (error == 0) {
- error = dsl_destroy_snapshot_check_impl(ds, defer);
- dsl_dataset_rele(ds, FTAG);
- }
-
- return (error);
-}
-
-struct process_old_arg {
- dsl_dataset_t *ds;
- dsl_dataset_t *ds_prev;
- boolean_t after_branch_point;
- zio_t *pio;
- uint64_t used, comp, uncomp;
-};
-
-static int
-process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- struct process_old_arg *poa = arg;
- dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
- dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
- if (poa->ds_prev && !poa->after_branch_point &&
- bp->blk_birth >
- dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
- dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
- bp_get_dsize_sync(dp->dp_spa, bp);
- }
- } else {
- poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
- poa->comp += BP_GET_PSIZE(bp);
- poa->uncomp += BP_GET_UCSIZE(bp);
- dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
- }
- return (0);
-}
-
-static void
-process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
- dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
-{
- struct process_old_arg poa = { 0 };
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- uint64_t deadlist_obj;
-
- ASSERT(ds->ds_deadlist.dl_oldfmt);
- ASSERT(ds_next->ds_deadlist.dl_oldfmt);
-
- poa.ds = ds;
- poa.ds_prev = ds_prev;
- poa.after_branch_point = after_branch_point;
- poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
- process_old_cb, &poa, tx));
- VERIFY0(zio_wait(poa.pio));
- ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
-
- /* change snapused */
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
- -poa.used, -poa.comp, -poa.uncomp, tx);
-
- /* swap next's deadlist to our deadlist */
- dsl_deadlist_close(&ds->ds_deadlist);
- dsl_deadlist_close(&ds_next->ds_deadlist);
- deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
- dsl_dataset_phys(ds)->ds_deadlist_obj =
- dsl_dataset_phys(ds_next)->ds_deadlist_obj;
- dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
- dsl_deadlist_open(&ds->ds_deadlist, mos,
- dsl_dataset_phys(ds)->ds_deadlist_obj);
- dsl_deadlist_open(&ds_next->ds_deadlist, mos,
- dsl_dataset_phys(ds_next)->ds_deadlist_obj);
-}
-
-static void
-dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- zap_cursor_t zc;
- zap_attribute_t za;
-
- /*
- * If it is the old version, dd_clones doesn't exist so we can't
- * find the clones, but dsl_deadlist_remove_key() is a no-op so it
- * doesn't matter.
- */
- if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
- return;
-
- for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- dsl_dataset_t *clone;
-
- VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
- za.za_first_integer, FTAG, &clone));
- if (clone->ds_dir->dd_origin_txg > mintxg) {
- dsl_deadlist_remove_key(&clone->ds_deadlist,
- mintxg, tx);
- if (dsl_dataset_remap_deadlist_exists(clone)) {
- dsl_deadlist_remove_key(
- &clone->ds_remap_deadlist, mintxg, tx);
- }
- dsl_dataset_remove_clones_key(clone, mintxg, tx);
- }
- dsl_dataset_rele(clone, FTAG);
- }
- zap_cursor_fini(&zc);
-}
-
-static void
-dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
- dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- /* Move blocks to be obsoleted to pool's obsolete list. */
- if (dsl_dataset_remap_deadlist_exists(ds_next)) {
- if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
- dsl_pool_create_obsolete_bpobj(dp, tx);
-
- dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
- &dp->dp_obsolete_bpobj,
- dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
- }
-
- /* Merge our deadlist into next's and free it. */
- if (dsl_dataset_remap_deadlist_exists(ds)) {
- uint64_t remap_deadlist_object =
- dsl_dataset_get_remap_deadlist_object(ds);
- ASSERT(remap_deadlist_object != 0);
-
- mutex_enter(&ds_next->ds_remap_deadlist_lock);
- if (!dsl_dataset_remap_deadlist_exists(ds_next))
- dsl_dataset_create_remap_deadlist(ds_next, tx);
- mutex_exit(&ds_next->ds_remap_deadlist_lock);
-
- dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
- remap_deadlist_object, tx);
- dsl_dataset_destroy_remap_deadlist(ds, tx);
- }
-}
-
-void
-dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
-{
- int err;
- int after_branch_point = FALSE;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- dsl_dataset_t *ds_prev = NULL;
- uint64_t obj;
-
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
- ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
-
- if (defer &&
- (ds->ds_userrefs > 0 ||
- dsl_dataset_phys(ds)->ds_num_children > 1)) {
- ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
- spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
- return;
- }
-
- ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
-
- /* We need to log before removing it from the namespace. */
- spa_history_log_internal_ds(ds, "destroy", tx, "");
-
- dsl_scan_ds_destroyed(ds, tx);
-
- obj = ds->ds_object;
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (ds->ds_feature_inuse[f]) {
- dsl_dataset_deactivate_feature(obj, f, tx);
- ds->ds_feature_inuse[f] = B_FALSE;
- }
- }
- if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
- ASSERT3P(ds->ds_prev, ==, NULL);
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
- after_branch_point =
- (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
-
- dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
- if (after_branch_point &&
- dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
- dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
- if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
- VERIFY0(zap_add_int(mos,
- dsl_dataset_phys(ds_prev)->
- ds_next_clones_obj,
- dsl_dataset_phys(ds)->ds_next_snap_obj,
- tx));
- }
- }
- if (!after_branch_point) {
- dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
- dsl_dataset_phys(ds)->ds_next_snap_obj;
- }
- }
-
- dsl_dataset_t *ds_next;
- uint64_t old_unique;
- uint64_t used = 0, comp = 0, uncomp = 0;
-
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
- ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
-
- old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
-
- dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
- dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
- dsl_dataset_phys(ds)->ds_prev_snap_obj;
- dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
- dsl_dataset_phys(ds)->ds_prev_snap_txg;
- ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
- ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
-
- if (ds_next->ds_deadlist.dl_oldfmt) {
- process_old_deadlist(ds, ds_prev, ds_next,
- after_branch_point, tx);
- } else {
- /* Adjust prev's unique space. */
- if (ds_prev && !after_branch_point) {
- dsl_deadlist_space_range(&ds_next->ds_deadlist,
- dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
- dsl_dataset_phys(ds)->ds_prev_snap_txg,
- &used, &comp, &uncomp);
- dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
- }
-
- /* Adjust snapused. */
- dsl_deadlist_space_range(&ds_next->ds_deadlist,
- dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
- &used, &comp, &uncomp);
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
- -used, -comp, -uncomp, tx);
-
- /* Move blocks to be freed to pool's free list. */
- dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
- &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
- tx);
- dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
- DD_USED_HEAD, used, comp, uncomp, tx);
-
- /* Merge our deadlist into next's and free it. */
- dsl_deadlist_merge(&ds_next->ds_deadlist,
- dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
- }
-
- dsl_deadlist_close(&ds->ds_deadlist);
- dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
-
- dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
-
- /* Collapse range in clone heads */
- dsl_dataset_remove_clones_key(ds,
- dsl_dataset_phys(ds)->ds_creation_txg, tx);
-
- if (ds_next->ds_is_snapshot) {
- dsl_dataset_t *ds_nextnext;
-
- /*
- * Update next's unique to include blocks which
- * were previously shared by only this snapshot
- * and it. Those blocks will be born after the
- * prev snap and before this snap, and will have
- * died after the next snap and before the one
- * after that (ie. be on the snap after next's
- * deadlist).
- */
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds_next)->ds_next_snap_obj,
- FTAG, &ds_nextnext));
- dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
- dsl_dataset_phys(ds)->ds_prev_snap_txg,
- dsl_dataset_phys(ds)->ds_creation_txg,
- &used, &comp, &uncomp);
- dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
- dsl_dataset_rele(ds_nextnext, FTAG);
- ASSERT3P(ds_next->ds_prev, ==, NULL);
-
- /* Collapse range in this head. */
- dsl_dataset_t *hds;
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
- dsl_deadlist_remove_key(&hds->ds_deadlist,
- dsl_dataset_phys(ds)->ds_creation_txg, tx);
- if (dsl_dataset_remap_deadlist_exists(hds)) {
- dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
- dsl_dataset_phys(ds)->ds_creation_txg, tx);
- }
- dsl_dataset_rele(hds, FTAG);
-
- } else {
- ASSERT3P(ds_next->ds_prev, ==, ds);
- dsl_dataset_rele(ds_next->ds_prev, ds_next);
- ds_next->ds_prev = NULL;
- if (ds_prev) {
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj,
- ds_next, &ds_next->ds_prev));
- }
-
- dsl_dataset_recalc_head_uniq(ds_next);
-
- /*
- * Reduce the amount of our unconsumed refreservation
- * being charged to our parent by the amount of
- * new unique data we have gained.
- */
- if (old_unique < ds_next->ds_reserved) {
- int64_t mrsdelta;
- uint64_t new_unique =
- dsl_dataset_phys(ds_next)->ds_unique_bytes;
-
- ASSERT(old_unique <= new_unique);
- mrsdelta = MIN(new_unique - old_unique,
- ds_next->ds_reserved - old_unique);
- dsl_dir_diduse_space(ds->ds_dir,
- DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
- }
- }
- dsl_dataset_rele(ds_next, FTAG);
-
- /*
- * This must be done after the dsl_traverse(), because it will
- * re-open the objset.
- */
- if (ds->ds_objset) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
-
- /* remove from snapshot namespace */
- dsl_dataset_t *ds_head;
- ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
- VERIFY0(dsl_dataset_get_snapname(ds));
-#ifdef ZFS_DEBUG
- {
- uint64_t val;
-
- err = dsl_dataset_snap_lookup(ds_head,
- ds->ds_snapname, &val);
- ASSERT0(err);
- ASSERT3U(val, ==, obj);
- }
-#endif
- VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
- dsl_dataset_rele(ds_head, FTAG);
-
- if (ds_prev != NULL)
- dsl_dataset_rele(ds_prev, FTAG);
-
- spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
-
- if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
- uint64_t count;
- ASSERT0(zap_count(mos,
- dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
- count == 0);
- VERIFY0(dmu_object_free(mos,
- dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
- }
- if (dsl_dataset_phys(ds)->ds_props_obj != 0)
- VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
- tx));
- if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
- VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
- tx));
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- char dsname[ZFS_MAX_DATASET_NAME_LEN];
-
- dsl_dataset_name(ds, dsname);
- zvol_remove_minors(dp->dp_spa, dsname);
-#endif
-
- dsl_dir_rele(ds->ds_dir, ds);
- ds->ds_dir = NULL;
- dmu_object_free_zapified(mos, obj, tx);
-}
-
-void
-dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_destroy_snapshot_arg_t *ddsa = arg;
- const char *dsname = ddsa->ddsa_name;
- boolean_t defer = ddsa->ddsa_defer;
-
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
-
- int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (error == ENOENT)
- return;
- ASSERT0(error);
- dsl_destroy_snapshot_sync_impl(ds, defer, tx);
- dsl_dataset_rele(ds, FTAG);
-}
-
-/*
- * The semantics of this function are described in the comment above
- * lzc_destroy_snaps(). To summarize:
- *
- * The snapshots must all be in the same pool.
- *
- * Snapshots that don't exist will be silently ignored (considered to be
- * "already deleted").
- *
- * On success, all snaps will be destroyed and this will return 0.
- * On failure, no snaps will be destroyed, the errlist will be filled in,
- * and this will return an errno.
- */
-int
-dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
- nvlist_t *errlist)
-{
- if (nvlist_next_nvpair(snaps, NULL) == NULL)
- return (0);
-
- /*
- * lzc_destroy_snaps() is documented to take an nvlist whose
- * values "don't matter". We need to convert that nvlist to
- * one that we know can be converted to LUA. We also don't
- * care about any duplicate entries because the nvlist will
- * be converted to a LUA table which should take care of this.
- */
- nvlist_t *snaps_normalized;
- VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP));
- for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
- pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
- fnvlist_add_boolean_value(snaps_normalized,
- nvpair_name(pair), B_TRUE);
- }
-
- nvlist_t *arg;
- VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP));
- fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
- fnvlist_free(snaps_normalized);
- fnvlist_add_boolean_value(arg, "defer", defer);
-
- nvlist_t *wrapper;
- VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP));
- fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
- fnvlist_free(arg);
-
- const char *program =
- "arg = ...\n"
- "snaps = arg['snaps']\n"
- "defer = arg['defer']\n"
- "errors = { }\n"
- "has_errors = false\n"
- "for snap, v in pairs(snaps) do\n"
- " errno = zfs.check.destroy{snap, defer=defer}\n"
- " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
- " if errno == ENOENT then\n"
- " snaps[snap] = nil\n"
- " elseif errno ~= 0 then\n"
- " errors[snap] = errno\n"
- " has_errors = true\n"
- " end\n"
- "end\n"
- "if has_errors then\n"
- " return errors\n"
- "end\n"
- "for snap, v in pairs(snaps) do\n"
- " errno = zfs.sync.destroy{snap, defer=defer}\n"
- " assert(errno == 0)\n"
- "end\n"
- "return { }\n";
-
- nvlist_t *result = fnvlist_alloc();
- int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
- program,
- B_TRUE,
- 0,
- zfs_lua_max_memlimit,
- nvlist_next_nvpair(wrapper, NULL), result);
- if (error != 0) {
- char *errorstr = NULL;
- (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
- if (errorstr != NULL) {
- zfs_dbgmsg(errorstr);
- }
- return (error);
- }
- fnvlist_free(wrapper);
-
- /*
- * lzc_destroy_snaps() is documented to fill the errlist with
- * int32 values, so we need to covert the int64 values that are
- * returned from LUA.
- */
- int rv = 0;
- nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
- for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
- pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
- int32_t val = (int32_t)fnvpair_value_int64(pair);
- if (rv == 0)
- rv = val;
- fnvlist_add_int32(errlist, nvpair_name(pair), val);
- }
- fnvlist_free(result);
- return (rv);
-}
-
-int
-dsl_destroy_snapshot(const char *name, boolean_t defer)
-{
- int error;
- nvlist_t *nvl = fnvlist_alloc();
- nvlist_t *errlist = fnvlist_alloc();
-
- fnvlist_add_boolean(nvl, name);
- error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
- fnvlist_free(errlist);
- fnvlist_free(nvl);
- return (error);
-}
-
-struct killarg {
- dsl_dataset_t *ds;
- dmu_tx_t *tx;
-};
-
-/* ARGSUSED */
-static int
-kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- struct killarg *ka = arg;
- dmu_tx_t *tx = ka->tx;
-
- if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
- return (0);
-
- if (zb->zb_level == ZB_ZIL_LEVEL) {
- ASSERT(zilog != NULL);
- /*
- * It's a block in the intent log. It has no
- * accounting, so just free it.
- */
- dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
- } else {
- ASSERT(zilog == NULL);
- ASSERT3U(bp->blk_birth, >,
- dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
- (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
- }
-
- return (0);
-}
-
-static void
-old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- struct killarg ka;
-
- /*
- * Free everything that we point to (that's born after
- * the previous snapshot, if we are a clone)
- *
- * NB: this should be very quick, because we already
- * freed all the objects in open context.
- */
- ka.ds = ds;
- ka.tx = tx;
- VERIFY0(traverse_dataset(ds,
- dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
- kill_blkptr, &ka));
- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
- dsl_dataset_phys(ds)->ds_unique_bytes == 0);
-}
-
-int
-dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
-{
- int error;
- uint64_t count;
- objset_t *mos;
-
- ASSERT(!ds->ds_is_snapshot);
- if (ds->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- if (zfs_refcount_count(&ds->ds_longholds) != expected_holds)
- return (SET_ERROR(EBUSY));
-
- mos = ds->ds_dir->dd_pool->dp_meta_objset;
-
- /*
- * Can't delete a head dataset if there are snapshots of it.
- * (Except if the only snapshots are from the branch we cloned
- * from.)
- */
- if (ds->ds_prev != NULL &&
- dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
- return (SET_ERROR(EBUSY));
-
- /*
- * Can't delete if there are children of this fs.
- */
- error = zap_count(mos,
- dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
- if (error != 0)
- return (error);
- if (count != 0)
- return (SET_ERROR(EEXIST));
-
- if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
- dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
- ds->ds_prev->ds_userrefs == 0) {
- /* We need to remove the origin snapshot as well. */
- if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds))
- return (SET_ERROR(EBUSY));
- }
- return (0);
-}
-
-int
-dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
-{
- dsl_destroy_head_arg_t *ddha = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- int error;
-
- error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
- if (error != 0)
- return (error);
-
- error = dsl_destroy_head_check_impl(ds, 0);
- dsl_dataset_rele(ds, FTAG);
- return (error);
-}
-
-static void
-dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
-{
- dsl_dir_t *dd;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
- dd_used_t t;
-
- ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
-
- VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
-
- ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
-
- /*
- * Decrement the filesystem count for all parent filesystems.
- *
- * When we receive an incremental stream into a filesystem that already
- * exists, a temporary clone is created. We never count this temporary
- * clone, whose name begins with a '%'.
- */
- if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
- dsl_fs_ss_count_adjust(dd->dd_parent, -1,
- DD_FIELD_FILESYSTEM_COUNT, tx);
-
- /*
- * Remove our reservation. The impl() routine avoids setting the
- * actual property, which would require the (already destroyed) ds.
- */
- dsl_dir_set_reservation_sync_impl(dd, 0, tx);
-
- ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
- ASSERT0(dsl_dir_phys(dd)->dd_reserved);
- for (t = 0; t < DD_USED_NUM; t++)
- ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
-
- VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
- VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
- if (dsl_dir_phys(dd)->dd_clones != 0)
- VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx));
- VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
- VERIFY0(zap_remove(mos,
- dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
- dd->dd_myname, tx));
-
- dsl_dir_rele(dd, FTAG);
- dmu_object_free_zapified(mos, ddobj, tx);
-}
-
-void
-dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
- uint64_t obj, ddobj, prevobj = 0;
- boolean_t rmorigin;
-
- ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
- ASSERT(ds->ds_prev == NULL ||
- dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-
- /* We need to log before removing it from the namespace. */
- spa_history_log_internal_ds(ds, "destroy", tx, "");
-
- rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
- DS_IS_DEFER_DESTROY(ds->ds_prev) &&
- dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
- ds->ds_prev->ds_userrefs == 0);
-
- /* Remove our reservation. */
- if (ds->ds_reserved != 0) {
- dsl_dataset_set_refreservation_sync_impl(ds,
- (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
- 0, tx);
- ASSERT0(ds->ds_reserved);
- }
-
- obj = ds->ds_object;
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (ds->ds_feature_inuse[f]) {
- dsl_dataset_deactivate_feature(obj, f, tx);
- ds->ds_feature_inuse[f] = B_FALSE;
- }
- }
-
- dsl_scan_ds_destroyed(ds, tx);
-
- if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
- /* This is a clone */
- ASSERT(ds->ds_prev != NULL);
- ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
- obj);
- ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
-
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
- dsl_dataset_remove_from_next_clones(ds->ds_prev,
- obj, tx);
- }
-
- ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
- dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
- }
-
- /*
- * Destroy the deadlist. Unless it's a clone, the
- * deadlist should be empty since the dataset has no snapshots.
- * (If it's a clone, it's safe to ignore the deadlist contents
- * since they are still referenced by the origin snapshot.)
- */
- dsl_deadlist_close(&ds->ds_deadlist);
- dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
-
- if (dsl_dataset_remap_deadlist_exists(ds))
- dsl_dataset_destroy_remap_deadlist(ds, tx);
-
- objset_t *os;
- VERIFY0(dmu_objset_from_ds(ds, &os));
-
- if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
- old_synchronous_dataset_destroy(ds, tx);
- } else {
- /*
- * Move the bptree into the pool's list of trees to
- * clean up and update space accounting information.
- */
- uint64_t used, comp, uncomp;
-
- zil_destroy_sync(dmu_objset_zil(os), tx);
-
- if (!spa_feature_is_active(dp->dp_spa,
- SPA_FEATURE_ASYNC_DESTROY)) {
- dsl_scan_t *scn = dp->dp_scan;
- spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
- tx);
- dp->dp_bptree_obj = bptree_alloc(mos, tx);
- VERIFY0(zap_add(mos,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
- &dp->dp_bptree_obj, tx));
- ASSERT(!scn->scn_async_destroying);
- scn->scn_async_destroying = B_TRUE;
- }
-
- used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
- comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
- uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
-
- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
- dsl_dataset_phys(ds)->ds_unique_bytes == used);
-
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- bptree_add(mos, dp->dp_bptree_obj,
- &dsl_dataset_phys(ds)->ds_bp,
- dsl_dataset_phys(ds)->ds_prev_snap_txg,
- used, comp, uncomp, tx);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
- -used, -comp, -uncomp, tx);
- dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
- used, comp, uncomp, tx);
- }
-
- if (ds->ds_prev != NULL) {
- if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
- VERIFY0(zap_remove_int(mos,
- dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
- ds->ds_object, tx));
- }
- prevobj = ds->ds_prev->ds_object;
- dsl_dataset_rele(ds->ds_prev, ds);
- ds->ds_prev = NULL;
- }
-
- /*
- * This must be done after the dsl_traverse(), because it will
- * re-open the objset.
- */
- if (ds->ds_objset) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
-
- /* Erase the link in the dir */
- dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
- dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
- ddobj = ds->ds_dir->dd_object;
- ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
- VERIFY0(zap_destroy(mos,
- dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
-
- if (ds->ds_bookmarks != 0) {
- VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
- spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
- }
-
- spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
-
- ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
- ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
- ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
- dsl_dir_rele(ds->ds_dir, ds);
- ds->ds_dir = NULL;
- dmu_object_free_zapified(mos, obj, tx);
-
- dsl_dir_destroy_sync(ddobj, tx);
-
- if (rmorigin) {
- dsl_dataset_t *prev;
- VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
- dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
- dsl_dataset_rele(prev, FTAG);
- }
-}
-
-void
-dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_destroy_head_arg_t *ddha = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
- dsl_destroy_head_sync_impl(ds, tx);
-#if defined(__FreeBSD__) && defined(_KERNEL)
- zvol_remove_minors(dp->dp_spa, ddha->ddha_name);
-#endif
- dsl_dataset_rele(ds, FTAG);
-}
-
-static void
-dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_destroy_head_arg_t *ddha = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
-
- /* Mark it as inconsistent on-disk, in case we crash */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
-
- spa_history_log_internal_ds(ds, "destroy begin", tx, "");
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_destroy_head(const char *name)
-{
- dsl_destroy_head_arg_t ddha;
- int error;
- spa_t *spa;
- boolean_t isenabled;
-
-#ifdef _KERNEL
- zfs_destroy_unmount_origin(name);
-#endif
-
- error = spa_open(name, &spa, FTAG);
- if (error != 0)
- return (error);
- isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
- spa_close(spa, FTAG);
-
- ddha.ddha_name = name;
-
- if (!isenabled) {
- objset_t *os;
-
- error = dsl_sync_task(name, dsl_destroy_head_check,
- dsl_destroy_head_begin_sync, &ddha,
- 0, ZFS_SPACE_CHECK_DESTROY);
- if (error != 0)
- return (error);
-
- /*
- * Head deletion is processed in one txg on old pools;
- * remove the objects from open context so that the txg sync
- * is not too long.
- */
- error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
- if (error == 0) {
- uint64_t prev_snap_txg =
- dsl_dataset_phys(dmu_objset_ds(os))->
- ds_prev_snap_txg;
- for (uint64_t obj = 0; error == 0;
- error = dmu_object_next(os, &obj, FALSE,
- prev_snap_txg))
- (void) dmu_free_long_object(os, obj);
- /* sync out all frees */
- txg_wait_synced(dmu_objset_pool(os), 0);
- dmu_objset_disown(os, FTAG);
- }
- }
-
- return (dsl_sync_task(name, dsl_destroy_head_check,
- dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
-}
-
-/*
- * Note, this function is used as the callback for dmu_objset_find(). We
- * always return 0 so that we will continue to find and process
- * inconsistent datasets, even if we encounter an error trying to
- * process one of them.
- */
-/* ARGSUSED */
-int
-dsl_destroy_inconsistent(const char *dsname, void *arg)
-{
- objset_t *os;
-
- if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
- boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
-
- /*
- * If the dataset is inconsistent because a resumable receive
- * has failed, then do not destroy it.
- */
- if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
- need_destroy = B_FALSE;
-
- dmu_objset_rele(os, FTAG);
- if (need_destroy)
- (void) dsl_destroy_head(dsname);
- }
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -1,2184 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014 Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
- */
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_deleg.h>
-#include <sys/dmu_impl.h>
-#include <sys/spa.h>
-#include <sys/metaslab.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/arc.h>
-#include <sys/sunddi.h>
-#include <sys/zvol.h>
-#ifdef _KERNEL
-#include <sys/zfs_vfsops.h>
-#endif
-#include <sys/zfeature.h>
-#include <sys/policy.h>
-#include <sys/zfs_znode.h>
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-
-/*
- * Filesystem and Snapshot Limits
- * ------------------------------
- *
- * These limits are used to restrict the number of filesystems and/or snapshots
- * that can be created at a given level in the tree or below. A typical
- * use-case is with a delegated dataset where the administrator wants to ensure
- * that a user within the zone is not creating too many additional filesystems
- * or snapshots, even though they're not exceeding their space quota.
- *
- * The filesystem and snapshot counts are stored as extensible properties. This
- * capability is controlled by a feature flag and must be enabled to be used.
- * Once enabled, the feature is not active until the first limit is set. At
- * that point, future operations to create/destroy filesystems or snapshots
- * will validate and update the counts.
- *
- * Because the count properties will not exist before the feature is active,
- * the counts are updated when a limit is first set on an uninitialized
- * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
- * all of the nested filesystems/snapshots. Thus, a new leaf node has a
- * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
- * snapshot count properties on a node indicate uninitialized counts on that
- * node.) When first setting a limit on an uninitialized node, the code starts
- * at the filesystem with the new limit and descends into all sub-filesystems
- * to add the count properties.
- *
- * In practice this is lightweight since a limit is typically set when the
- * filesystem is created and thus has no children. Once valid, changing the
- * limit value won't require a re-traversal since the counts are already valid.
- * When recursively fixing the counts, if a node with a limit is encountered
- * during the descent, the counts are known to be valid and there is no need to
- * descend into that filesystem's children. The counts on filesystems above the
- * one with the new limit will still be uninitialized, unless a limit is
- * eventually set on one of those filesystems. The counts are always recursively
- * updated when a limit is set on a dataset, unless there is already a limit.
- * When a new limit value is set on a filesystem with an existing limit, it is
- * possible for the new limit to be less than the current count at that level
- * since a user who can change the limit is also allowed to exceed the limit.
- *
- * Once the feature is active, then whenever a filesystem or snapshot is
- * created, the code recurses up the tree, validating the new count against the
- * limit at each initialized level. In practice, most levels will not have a
- * limit set. If there is a limit at any initialized level up the tree, the
- * check must pass or the creation will fail. Likewise, when a filesystem or
- * snapshot is destroyed, the counts are recursively adjusted all the way up
- * the initizized nodes in the tree. Renaming a filesystem into different point
- * in the tree will first validate, then update the counts on each branch up to
- * the common ancestor. A receive will also validate the counts and then update
- * them.
- *
- * An exception to the above behavior is that the limit is not enforced if the
- * user has permission to modify the limit. This is primarily so that
- * recursive snapshots in the global zone always work. We want to prevent a
- * denial-of-service in which a lower level delegated dataset could max out its
- * limit and thus block recursive snapshots from being taken in the global zone.
- * Because of this, it is possible for the snapshot count to be over the limit
- * and snapshots taken in the global zone could cause a lower level dataset to
- * hit or exceed its limit. The administrator taking the global zone recursive
- * snapshot should be aware of this side-effect and behave accordingly.
- * For consistency, the filesystem limit is also not enforced if the user can
- * modify the limit.
- *
- * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
- * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
- * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
- * dsl_dir_init_fs_ss_count().
- *
- * There is a special case when we receive a filesystem that already exists. In
- * this case a temporary clone name of %X is created (see dmu_recv_begin). We
- * never update the filesystem counts for temporary clones.
- *
- * Likewise, we do not update the snapshot counts for temporary snapshots,
- * such as those created by zfs diff.
- */
-
-extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
-
-static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-
-typedef struct ddulrt_arg {
- dsl_dir_t *ddulrta_dd;
- uint64_t ddlrta_txg;
-} ddulrt_arg_t;
-
-static void
-dsl_dir_evict_async(void *dbu)
-{
- dsl_dir_t *dd = dbu;
- dsl_pool_t *dp = dd->dd_pool;
- int t;
-
- dd->dd_dbuf = NULL;
-
- for (t = 0; t < TXG_SIZE; t++) {
- ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
- ASSERT(dd->dd_tempreserved[t] == 0);
- ASSERT(dd->dd_space_towrite[t] == 0);
- }
-
- if (dd->dd_parent)
- dsl_dir_async_rele(dd->dd_parent, dd);
-
- spa_async_close(dd->dd_pool->dp_spa, dd);
-
- dsl_prop_fini(dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
-}
-
-int
-dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag, dsl_dir_t **ddp)
-{
- dmu_buf_t *dbuf;
- dsl_dir_t *dd;
- int err;
-
- ASSERT(dsl_pool_config_held(dp));
-
- err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
- if (err != 0)
- return (err);
- dd = dmu_buf_get_user(dbuf);
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbuf, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
- ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
- }
-#endif
- if (dd == NULL) {
- dsl_dir_t *winner;
-
- dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
- dd->dd_object = ddobj;
- dd->dd_dbuf = dbuf;
- dd->dd_pool = dp;
- mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
- dsl_prop_init(dd);
-
- dsl_dir_snap_cmtime_update(dd);
-
- if (dsl_dir_phys(dd)->dd_parent_obj) {
- err = dsl_dir_hold_obj(dp,
- dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
- &dd->dd_parent);
- if (err != 0)
- goto errout;
- if (tail) {
-#ifdef ZFS_DEBUG
- uint64_t foundobj;
-
- err = zap_lookup(dp->dp_meta_objset,
- dsl_dir_phys(dd->dd_parent)->
- dd_child_dir_zapobj, tail,
- sizeof (foundobj), 1, &foundobj);
- ASSERT(err || foundobj == ddobj);
-#endif
- (void) strcpy(dd->dd_myname, tail);
- } else {
- err = zap_value_search(dp->dp_meta_objset,
- dsl_dir_phys(dd->dd_parent)->
- dd_child_dir_zapobj,
- ddobj, 0, dd->dd_myname);
- }
- if (err != 0)
- goto errout;
- } else {
- (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
- }
-
- if (dsl_dir_is_clone(dd)) {
- dmu_buf_t *origin_bonus;
- dsl_dataset_phys_t *origin_phys;
-
- /*
- * We can't open the origin dataset, because
- * that would require opening this dsl_dir.
- * Just look at its phys directly instead.
- */
- err = dmu_bonus_hold(dp->dp_meta_objset,
- dsl_dir_phys(dd)->dd_origin_obj, FTAG,
- &origin_bonus);
- if (err != 0)
- goto errout;
- origin_phys = origin_bonus->db_data;
- dd->dd_origin_txg =
- origin_phys->ds_creation_txg;
- dmu_buf_rele(origin_bonus, FTAG);
- }
-
- dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
- &dd->dd_dbuf);
- winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
- if (winner != NULL) {
- if (dd->dd_parent)
- dsl_dir_rele(dd->dd_parent, dd);
- dsl_prop_fini(dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dd = winner;
- } else {
- spa_open_ref(dp->dp_spa, dd);
- }
- }
-
- /*
- * The dsl_dir_t has both open-to-close and instantiate-to-evict
- * holds on the spa. We need the open-to-close holds because
- * otherwise the spa_refcnt wouldn't change when we open a
- * dir which the spa also has open, so we could incorrectly
- * think it was OK to unload/export/destroy the pool. We need
- * the instantiate-to-evict hold because the dsl_dir_t has a
- * pointer to the dd_pool, which has a pointer to the spa_t.
- */
- spa_open_ref(dp->dp_spa, tag);
- ASSERT3P(dd->dd_pool, ==, dp);
- ASSERT3U(dd->dd_object, ==, ddobj);
- ASSERT3P(dd->dd_dbuf, ==, dbuf);
- *ddp = dd;
- return (0);
-
-errout:
- if (dd->dd_parent)
- dsl_dir_rele(dd->dd_parent, dd);
- dsl_prop_fini(dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
-}
-
-void
-dsl_dir_rele(dsl_dir_t *dd, void *tag)
-{
- dprintf_dd(dd, "%s\n", "");
- spa_close(dd->dd_pool->dp_spa, tag);
- dmu_buf_rele(dd->dd_dbuf, tag);
-}
-
-/*
- * Remove a reference to the given dsl dir that is being asynchronously
- * released. Async releases occur from a taskq performing eviction of
- * dsl datasets and dirs. This process is identical to a normal release
- * with the exception of using the async API for releasing the reference on
- * the spa.
- */
-void
-dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
-{
- dprintf_dd(dd, "%s\n", "");
- spa_async_close(dd->dd_pool->dp_spa, tag);
- dmu_buf_rele(dd->dd_dbuf, tag);
-}
-
-/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
-void
-dsl_dir_name(dsl_dir_t *dd, char *buf)
-{
- if (dd->dd_parent) {
- dsl_dir_name(dd->dd_parent, buf);
- VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
- ZFS_MAX_DATASET_NAME_LEN);
- } else {
- buf[0] = '\0';
- }
- if (!MUTEX_HELD(&dd->dd_lock)) {
- /*
- * recursive mutex so that we can use
- * dprintf_dd() with dd_lock held
- */
- mutex_enter(&dd->dd_lock);
- VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
- <, ZFS_MAX_DATASET_NAME_LEN);
- mutex_exit(&dd->dd_lock);
- } else {
- VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
- <, ZFS_MAX_DATASET_NAME_LEN);
- }
-}
-
-/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
-int
-dsl_dir_namelen(dsl_dir_t *dd)
-{
- int result = 0;
-
- if (dd->dd_parent) {
- /* parent's name + 1 for the "/" */
- result = dsl_dir_namelen(dd->dd_parent) + 1;
- }
-
- if (!MUTEX_HELD(&dd->dd_lock)) {
- /* see dsl_dir_name */
- mutex_enter(&dd->dd_lock);
- result += strlen(dd->dd_myname);
- mutex_exit(&dd->dd_lock);
- } else {
- result += strlen(dd->dd_myname);
- }
-
- return (result);
-}
-
-static int
-getcomponent(const char *path, char *component, const char **nextp)
-{
- char *p;
-
- if ((path == NULL) || (path[0] == '\0'))
- return (SET_ERROR(ENOENT));
- /* This would be a good place to reserve some namespace... */
- p = strpbrk(path, "/@");
- if (p && (p[1] == '/' || p[1] == '@')) {
- /* two separators in a row */
- return (SET_ERROR(EINVAL));
- }
- if (p == NULL || p == path) {
- /*
- * if the first thing is an @ or /, it had better be an
- * @ and it had better not have any more ats or slashes,
- * and it had better have something after the @.
- */
- if (p != NULL &&
- (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
- return (SET_ERROR(EINVAL));
- if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
- (void) strcpy(component, path);
- p = NULL;
- } else if (p[0] == '/') {
- if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
- (void) strncpy(component, path, p - path);
- component[p - path] = '\0';
- p++;
- } else if (p[0] == '@') {
- /*
- * if the next separator is an @, there better not be
- * any more slashes.
- */
- if (strchr(path, '/'))
- return (SET_ERROR(EINVAL));
- if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
- (void) strncpy(component, path, p - path);
- component[p - path] = '\0';
- } else {
- panic("invalid p=%p", (void *)p);
- }
- *nextp = p;
- return (0);
-}
-
-/*
- * Return the dsl_dir_t, and possibly the last component which couldn't
- * be found in *tail. The name must be in the specified dsl_pool_t. This
- * thread must hold the dp_config_rwlock for the pool. Returns NULL if the
- * path is bogus, or if tail==NULL and we couldn't parse the whole name.
- * (*tail)[0] == '@' means that the last component is a snapshot.
- */
-int
-dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
- dsl_dir_t **ddp, const char **tailp)
-{
- char buf[ZFS_MAX_DATASET_NAME_LEN];
- const char *spaname, *next, *nextnext = NULL;
- int err;
- dsl_dir_t *dd;
- uint64_t ddobj;
-
- err = getcomponent(name, buf, &next);
- if (err != 0)
- return (err);
-
- /* Make sure the name is in the specified pool. */
- spaname = spa_name(dp->dp_spa);
- if (strcmp(buf, spaname) != 0)
- return (SET_ERROR(EXDEV));
-
- ASSERT(dsl_pool_config_held(dp));
-
- err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
- if (err != 0) {
- return (err);
- }
-
- while (next != NULL) {
- dsl_dir_t *child_dd;
- err = getcomponent(next, buf, &nextnext);
- if (err != 0)
- break;
- ASSERT(next[0] != '\0');
- if (next[0] == '@')
- break;
- dprintf("looking up %s in obj%lld\n",
- buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
-
- err = zap_lookup(dp->dp_meta_objset,
- dsl_dir_phys(dd)->dd_child_dir_zapobj,
- buf, sizeof (ddobj), 1, &ddobj);
- if (err != 0) {
- if (err == ENOENT)
- err = 0;
- break;
- }
-
- err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
- if (err != 0)
- break;
- dsl_dir_rele(dd, tag);
- dd = child_dd;
- next = nextnext;
- }
-
- if (err != 0) {
- dsl_dir_rele(dd, tag);
- return (err);
- }
-
- /*
- * It's an error if there's more than one component left, or
- * tailp==NULL and there's any component left.
- */
- if (next != NULL &&
- (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
- /* bad path name */
- dsl_dir_rele(dd, tag);
- dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
- err = SET_ERROR(ENOENT);
- }
- if (tailp != NULL)
- *tailp = next;
- *ddp = dd;
- return (err);
-}
-
-/*
- * If the counts are already initialized for this filesystem and its
- * descendants then do nothing, otherwise initialize the counts.
- *
- * The counts on this filesystem, and those below, may be uninitialized due to
- * either the use of a pre-existing pool which did not support the
- * filesystem/snapshot limit feature, or one in which the feature had not yet
- * been enabled.
- *
- * Recursively descend the filesystem tree and update the filesystem/snapshot
- * counts on each filesystem below, then update the cumulative count on the
- * current filesystem. If the filesystem already has a count set on it,
- * then we know that its counts, and the counts on the filesystems below it,
- * are already correct, so we don't have to update this filesystem.
- */
-static void
-dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- uint64_t my_fs_cnt = 0;
- uint64_t my_ss_cnt = 0;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *os = dp->dp_meta_objset;
- zap_cursor_t *zc;
- zap_attribute_t *za;
- dsl_dataset_t *ds;
-
- ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
- ASSERT(dsl_pool_config_held(dp));
- ASSERT(dmu_tx_is_syncing(tx));
-
- dsl_dir_zapify(dd, tx);
-
- /*
- * If the filesystem count has already been initialized then we
- * don't need to recurse down any further.
- */
- if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
- return;
-
- zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
- za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-
- /* Iterate my child dirs */
- for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
- zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
- dsl_dir_t *chld_dd;
- uint64_t count;
-
- VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
- &chld_dd));
-
- /*
- * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
- * temporary datasets.
- */
- if (chld_dd->dd_myname[0] == '$' ||
- chld_dd->dd_myname[0] == '%') {
- dsl_dir_rele(chld_dd, FTAG);
- continue;
- }
-
- my_fs_cnt++; /* count this child */
-
- dsl_dir_init_fs_ss_count(chld_dd, tx);
-
- VERIFY0(zap_lookup(os, chld_dd->dd_object,
- DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
- my_fs_cnt += count;
- VERIFY0(zap_lookup(os, chld_dd->dd_object,
- DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
- my_ss_cnt += count;
-
- dsl_dir_rele(chld_dd, FTAG);
- }
- zap_cursor_fini(zc);
- /* Count my snapshots (we counted children's snapshots above) */
- VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
- dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
-
- for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
- zap_cursor_retrieve(zc, za) == 0;
- zap_cursor_advance(zc)) {
- /* Don't count temporary snapshots */
- if (za->za_name[0] != '%')
- my_ss_cnt++;
- }
- zap_cursor_fini(zc);
-
- dsl_dataset_rele(ds, FTAG);
-
- kmem_free(zc, sizeof (zap_cursor_t));
- kmem_free(za, sizeof (zap_attribute_t));
-
- /* we're in a sync task, update counts */
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
- sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
- VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
- sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
-}
-
-static int
-dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
-{
- char *ddname = (char *)arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- dsl_dir_t *dd;
- int error;
-
- error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
- if (error != 0)
- return (error);
-
- if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
-
- dd = ds->ds_dir;
- if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
- dsl_dir_is_zapified(dd) &&
- zap_contains(dp->dp_meta_objset, dd->dd_object,
- DD_FIELD_FILESYSTEM_COUNT) == 0) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EALREADY));
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
-{
- char *ddname = (char *)arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- spa_t *spa;
-
- VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
-
- spa = dsl_dataset_get_spa(ds);
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
- /*
- * Since the feature was not active and we're now setting a
- * limit, increment the feature-active counter so that the
- * feature becomes active for the first time.
- *
- * We are already in a sync task so we can update the MOS.
- */
- spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
- }
-
- /*
- * Since we are now setting a non-UINT64_MAX limit on the filesystem,
- * we need to ensure the counts are correct. Descend down the tree from
- * this point and update all of the counts to be accurate.
- */
- dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
-
- dsl_dataset_rele(ds, FTAG);
-}
-
-/*
- * Make sure the feature is enabled and activate it if necessary.
- * Since we're setting a limit, ensure the on-disk counts are valid.
- * This is only called by the ioctl path when setting a limit value.
- *
- * We do not need to validate the new limit, since users who can change the
- * limit are also allowed to exceed the limit.
- */
-int
-dsl_dir_activate_fs_ss_limit(const char *ddname)
-{
- int error;
-
- error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
- dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
- ZFS_SPACE_CHECK_RESERVED);
-
- if (error == EALREADY)
- error = 0;
-
- return (error);
-}
-
-/*
- * Used to determine if the filesystem_limit or snapshot_limit should be
- * enforced. We allow the limit to be exceeded if the user has permission to
- * write the property value. We pass in the creds that we got in the open
- * context since we will always be the GZ root in syncing context. We also have
- * to handle the case where we are allowed to change the limit on the current
- * dataset, but there may be another limit in the tree above.
- *
- * We can never modify these two properties within a non-global zone. In
- * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
- * can't use that function since we are already holding the dp_config_rwlock.
- * In addition, we already have the dd and dealing with snapshots is simplified
- * in this code.
- */
-
-typedef enum {
- ENFORCE_ALWAYS,
- ENFORCE_NEVER,
- ENFORCE_ABOVE
-} enforce_res_t;
-
-static enforce_res_t
-dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
-{
- enforce_res_t enforce = ENFORCE_ALWAYS;
- uint64_t obj;
- dsl_dataset_t *ds;
- uint64_t zoned;
-
- ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
- prop == ZFS_PROP_SNAPSHOT_LIMIT);
-
-#ifdef _KERNEL
-#ifdef __FreeBSD__
- if (jailed(cr))
-#else
- if (crgetzoneid(cr) != GLOBAL_ZONEID)
-#endif
- return (ENFORCE_ALWAYS);
-
- if (secpolicy_zfs(cr) == 0)
- return (ENFORCE_NEVER);
-#endif
-
- if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
- return (ENFORCE_ALWAYS);
-
- ASSERT(dsl_pool_config_held(dd->dd_pool));
-
- if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
- return (ENFORCE_ALWAYS);
-
- if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
- /* Only root can access zoned fs's from the GZ */
- enforce = ENFORCE_ALWAYS;
- } else {
- if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
- enforce = ENFORCE_ABOVE;
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (enforce);
-}
-
-static void
-dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx)
-{
- ddulrt_arg_t *arg = varg;
- uint64_t last_remap_txg;
- dsl_dir_t *dd = arg->ddulrta_dd;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
-
- dsl_dir_zapify(dd, tx);
- if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
- sizeof (last_remap_txg), 1, &last_remap_txg) != 0 ||
- last_remap_txg < arg->ddlrta_txg) {
- VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
- sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx));
- }
-}
-
-int
-dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg)
-{
- ddulrt_arg_t arg;
- arg.ddulrta_dd = dd;
- arg.ddlrta_txg = txg;
-
- return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa),
- NULL, dsl_dir_update_last_remap_txg_sync, &arg,
- 1, ZFS_SPACE_CHECK_RESERVED));
-}
-
-/*
- * Check if adding additional child filesystem(s) would exceed any filesystem
- * limits or adding additional snapshot(s) would exceed any snapshot limits.
- * The prop argument indicates which limit to check.
- *
- * Note that all filesystem limits up to the root (or the highest
- * initialized) filesystem or the given ancestor must be satisfied.
- */
-int
-dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
- dsl_dir_t *ancestor, cred_t *cr)
-{
- objset_t *os = dd->dd_pool->dp_meta_objset;
- uint64_t limit, count;
- char *count_prop;
- enforce_res_t enforce;
- int err = 0;
-
- ASSERT(dsl_pool_config_held(dd->dd_pool));
- ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
- prop == ZFS_PROP_SNAPSHOT_LIMIT);
-
- /*
- * If we're allowed to change the limit, don't enforce the limit
- * e.g. this can happen if a snapshot is taken by an administrative
- * user in the global zone (i.e. a recursive snapshot by root).
- * However, we must handle the case of delegated permissions where we
- * are allowed to change the limit on the current dataset, but there
- * is another limit in the tree above.
- */
- enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
- if (enforce == ENFORCE_NEVER)
- return (0);
-
- /*
- * e.g. if renaming a dataset with no snapshots, count adjustment
- * is 0.
- */
- if (delta == 0)
- return (0);
-
- if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
- /*
- * We don't enforce the limit for temporary snapshots. This is
- * indicated by a NULL cred_t argument.
- */
- if (cr == NULL)
- return (0);
-
- count_prop = DD_FIELD_SNAPSHOT_COUNT;
- } else {
- count_prop = DD_FIELD_FILESYSTEM_COUNT;
- }
-
- /*
- * If an ancestor has been provided, stop checking the limit once we
- * hit that dir. We need this during rename so that we don't overcount
- * the check once we recurse up to the common ancestor.
- */
- if (ancestor == dd)
- return (0);
-
- /*
- * If we hit an uninitialized node while recursing up the tree, we can
- * stop since we know there is no limit here (or above). The counts are
- * not valid on this node and we know we won't touch this node's counts.
- */
- if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
- count_prop, sizeof (count), 1, &count) == ENOENT)
- return (0);
-
- err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
- B_FALSE);
- if (err != 0)
- return (err);
-
- /* Is there a limit which we've hit? */
- if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
- return (SET_ERROR(EDQUOT));
-
- if (dd->dd_parent != NULL)
- err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
- ancestor, cr);
-
- return (err);
-}
-
-/*
- * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
- * parents. When a new filesystem/snapshot is created, increment the count on
- * all parents, and when a filesystem/snapshot is destroyed, decrement the
- * count.
- */
-void
-dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
- dmu_tx_t *tx)
-{
- int err;
- objset_t *os = dd->dd_pool->dp_meta_objset;
- uint64_t count;
-
- ASSERT(dsl_pool_config_held(dd->dd_pool));
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
- strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
-
- /*
- * When we receive an incremental stream into a filesystem that already
- * exists, a temporary clone is created. We don't count this temporary
- * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
- * $MOS & $ORIGIN) objsets.
- */
- if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
- strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
- return;
-
- /*
- * e.g. if renaming a dataset with no snapshots, count adjustment is 0
- */
- if (delta == 0)
- return;
-
- /*
- * If we hit an uninitialized node while recursing up the tree, we can
- * stop since we know the counts are not valid on this node and we
- * know we shouldn't touch this node's counts. An uninitialized count
- * on the node indicates that either the feature has not yet been
- * activated or there are no limits on this part of the tree.
- */
- if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
- prop, sizeof (count), 1, &count)) == ENOENT)
- return;
- VERIFY0(err);
-
- count += delta;
- /* Use a signed verify to make sure we're not neg. */
- VERIFY3S(count, >=, 0);
-
- VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
- tx));
-
- /* Roll up this additional count into our ancestors */
- if (dd->dd_parent != NULL)
- dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
-}
-
-uint64_t
-dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
- dmu_tx_t *tx)
-{
- objset_t *mos = dp->dp_meta_objset;
- uint64_t ddobj;
- dsl_dir_phys_t *ddphys;
- dmu_buf_t *dbuf;
-
- ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
- DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
- if (pds) {
- VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
- name, sizeof (uint64_t), 1, &ddobj, tx));
- } else {
- /* it's the root dir */
- VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
- }
- VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- ddphys = dbuf->db_data;
-
- ddphys->dd_creation_time = gethrestime_sec();
- if (pds) {
- ddphys->dd_parent_obj = pds->dd_object;
-
- /* update the filesystem counts */
- dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
- }
- ddphys->dd_props_zapobj = zap_create(mos,
- DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- ddphys->dd_child_dir_zapobj = zap_create(mos,
- DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
- if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
- ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
- dmu_buf_rele(dbuf, FTAG);
-
- return (ddobj);
-}
-
-boolean_t
-dsl_dir_is_clone(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_origin_obj &&
- (dd->dd_pool->dp_origin_snap == NULL ||
- dsl_dir_phys(dd)->dd_origin_obj !=
- dd->dd_pool->dp_origin_snap->ds_object));
-}
-
-
-uint64_t
-dsl_dir_get_used(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_used_bytes);
-}
-
-uint64_t
-dsl_dir_get_compressed(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_compressed_bytes);
-}
-
-uint64_t
-dsl_dir_get_quota(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_quota);
-}
-
-uint64_t
-dsl_dir_get_reservation(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_reserved);
-}
-
-uint64_t
-dsl_dir_get_compressratio(dsl_dir_t *dd)
-{
- /* a fixed point number, 100x the ratio */
- return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
- (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
- dsl_dir_phys(dd)->dd_compressed_bytes));
-}
-
-uint64_t
-dsl_dir_get_logicalused(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
-}
-
-uint64_t
-dsl_dir_get_usedsnap(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
-}
-
-uint64_t
-dsl_dir_get_usedds(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
-}
-
-uint64_t
-dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
-}
-
-uint64_t
-dsl_dir_get_usedchild(dsl_dir_t *dd)
-{
- return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
- dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
-}
-
-void
-dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
-{
- dsl_dataset_t *ds;
- VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
- dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
-
- dsl_dataset_name(ds, buf);
-
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
-{
- if (dsl_dir_is_zapified(dd)) {
- objset_t *os = dd->dd_pool->dp_meta_objset;
- return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
- sizeof (*count), 1, count));
- } else {
- return (ENOENT);
- }
-}
-
-int
-dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
-{
- if (dsl_dir_is_zapified(dd)) {
- objset_t *os = dd->dd_pool->dp_meta_objset;
- return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
- sizeof (*count), 1, count));
- } else {
- return (ENOENT);
- }
-}
-
-int
-dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count)
-{
- if (dsl_dir_is_zapified(dd)) {
- objset_t *os = dd->dd_pool->dp_meta_objset;
- return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
- sizeof (*count), 1, count));
- } else {
- return (ENOENT);
- }
-}
-
-void
-dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
-{
- mutex_enter(&dd->dd_lock);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
- dsl_dir_get_quota(dd));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
- dsl_dir_get_reservation(dd));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
- dsl_dir_get_logicalused(dd));
- if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
- dsl_dir_get_usedsnap(dd));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
- dsl_dir_get_usedds(dd));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
- dsl_dir_get_usedrefreserv(dd));
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
- dsl_dir_get_usedchild(dd));
- }
- mutex_exit(&dd->dd_lock);
-
- uint64_t count;
- if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
- count);
- }
- if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
- count);
- }
- if (dsl_dir_get_remaptxg(dd, &count) == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG,
- count);
- }
-
- if (dsl_dir_is_clone(dd)) {
- char buf[ZFS_MAX_DATASET_NAME_LEN];
- dsl_dir_get_origin(dd, buf);
- dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
- }
-
-}
-
-void
-dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dd->dd_pool;
-
- ASSERT(dsl_dir_phys(dd));
-
- if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
- /* up the hold count until we can be written out */
- dmu_buf_add_ref(dd->dd_dbuf, dd);
- }
-}
-
-static int64_t
-parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
-{
- uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
- uint64_t new_accounted =
- MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
- return (new_accounted - old_accounted);
-}
-
-void
-dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
-
- mutex_enter(&dd->dd_lock);
- ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
- dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
- dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
- dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
- mutex_exit(&dd->dd_lock);
-
- /* release the hold from dsl_dir_dirty */
- dmu_buf_rele(dd->dd_dbuf, dd);
-}
-
-static uint64_t
-dsl_dir_space_towrite(dsl_dir_t *dd)
-{
- uint64_t space = 0;
-
- ASSERT(MUTEX_HELD(&dd->dd_lock));
-
- for (int i = 0; i < TXG_SIZE; i++) {
- space += dd->dd_space_towrite[i & TXG_MASK];
- ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
- }
- return (space);
-}
-
-/*
- * How much space would dd have available if ancestor had delta applied
- * to it? If ondiskonly is set, we're only interested in what's
- * on-disk, not estimated pending changes.
- */
-uint64_t
-dsl_dir_space_available(dsl_dir_t *dd,
- dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
-{
- uint64_t parentspace, myspace, quota, used;
-
- /*
- * If there are no restrictions otherwise, assume we have
- * unlimited space available.
- */
- quota = UINT64_MAX;
- parentspace = UINT64_MAX;
-
- if (dd->dd_parent != NULL) {
- parentspace = dsl_dir_space_available(dd->dd_parent,
- ancestor, delta, ondiskonly);
- }
-
- mutex_enter(&dd->dd_lock);
- if (dsl_dir_phys(dd)->dd_quota != 0)
- quota = dsl_dir_phys(dd)->dd_quota;
- used = dsl_dir_phys(dd)->dd_used_bytes;
- if (!ondiskonly)
- used += dsl_dir_space_towrite(dd);
-
- if (dd->dd_parent == NULL) {
- uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
- ZFS_SPACE_CHECK_NORMAL);
- quota = MIN(quota, poolsize);
- }
-
- if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
- /*
- * We have some space reserved, in addition to what our
- * parent gave us.
- */
- parentspace += dsl_dir_phys(dd)->dd_reserved - used;
- }
-
- if (dd == ancestor) {
- ASSERT(delta <= 0);
- ASSERT(used >= -delta);
- used += delta;
- if (parentspace != UINT64_MAX)
- parentspace -= delta;
- }
-
- if (used > quota) {
- /* over quota */
- myspace = 0;
- } else {
- /*
- * the lesser of the space provided by our parent and
- * the space left in our quota
- */
- myspace = MIN(parentspace, quota - used);
- }
-
- mutex_exit(&dd->dd_lock);
-
- return (myspace);
-}
-
-struct tempreserve {
- list_node_t tr_node;
- dsl_dir_t *tr_ds;
- uint64_t tr_size;
-};
-
-static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
- boolean_t ignorequota, list_t *tr_list,
- dmu_tx_t *tx, boolean_t first)
-{
- uint64_t txg = tx->tx_txg;
- uint64_t quota;
- struct tempreserve *tr;
- int retval = EDQUOT;
- uint64_t ref_rsrv = 0;
-
- ASSERT3U(txg, !=, 0);
- ASSERT3S(asize, >, 0);
-
- mutex_enter(&dd->dd_lock);
-
- /*
- * Check against the dsl_dir's quota. We don't add in the delta
- * when checking for over-quota because they get one free hit.
- */
- uint64_t est_inflight = dsl_dir_space_towrite(dd);
- for (int i = 0; i < TXG_SIZE; i++)
- est_inflight += dd->dd_tempreserved[i];
- uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
-
- /*
- * On the first iteration, fetch the dataset's used-on-disk and
- * refreservation values. Also, if checkrefquota is set, test if
- * allocating this space would exceed the dataset's refquota.
- */
- if (first && tx->tx_objset) {
- int error;
- dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
-
- error = dsl_dataset_check_quota(ds, !netfree,
- asize, est_inflight, &used_on_disk, &ref_rsrv);
- if (error != 0) {
- mutex_exit(&dd->dd_lock);
- return (error);
- }
- }
-
- /*
- * If this transaction will result in a net free of space,
- * we want to let it through.
- */
- if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
- quota = UINT64_MAX;
- else
- quota = dsl_dir_phys(dd)->dd_quota;
-
- /*
- * Adjust the quota against the actual pool size at the root
- * minus any outstanding deferred frees.
- * To ensure that it's possible to remove files from a full
- * pool without inducing transient overcommits, we throttle
- * netfree transactions against a quota that is slightly larger,
- * but still within the pool's allocation slop. In cases where
- * we're very close to full, this will allow a steady trickle of
- * removes to get through.
- */
- uint64_t deferred = 0;
- if (dd->dd_parent == NULL) {
- uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
- (netfree) ?
- ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
-
- if (avail < quota) {
- quota = avail;
- retval = ENOSPC;
- }
- }
-
- /*
- * If they are requesting more space, and our current estimate
- * is over quota, they get to try again unless the actual
- * on-disk is over quota and there are no pending changes (which
- * may free up space for us).
- */
- if (used_on_disk + est_inflight >= quota) {
- if (est_inflight > 0 || used_on_disk < quota ||
- (retval == ENOSPC && used_on_disk < quota + deferred))
- retval = ERESTART;
- dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
- "quota=%lluK tr=%lluK err=%d\n",
- used_on_disk>>10, est_inflight>>10,
- quota>>10, asize>>10, retval);
- mutex_exit(&dd->dd_lock);
- return (SET_ERROR(retval));
- }
-
- /* We need to up our estimated delta before dropping dd_lock */
- dd->dd_tempreserved[txg & TXG_MASK] += asize;
-
- uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
- asize - ref_rsrv);
- mutex_exit(&dd->dd_lock);
-
- tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_ds = dd;
- tr->tr_size = asize;
- list_insert_tail(tr_list, tr);
-
- /* see if it's OK with our parent */
- if (dd->dd_parent != NULL && parent_rsrv != 0) {
- boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
-
- return (dsl_dir_tempreserve_impl(dd->dd_parent,
- parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE));
- } else {
- return (0);
- }
-}
-
-/*
- * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and dsl_dir_willuse_space()
- * has been called), the reservation should be canceled, using
- * dsl_dir_tempreserve_clear().
- */
-int
-dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
- boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
-{
- int err;
- list_t *tr_list;
-
- if (asize == 0) {
- *tr_cookiep = NULL;
- return (0);
- }
-
- tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
- list_create(tr_list, sizeof (struct tempreserve),
- offsetof(struct tempreserve, tr_node));
- ASSERT3S(asize, >, 0);
-
- err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
- if (err == 0) {
- struct tempreserve *tr;
-
- tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_size = lsize;
- list_insert_tail(tr_list, tr);
- } else {
- if (err == EAGAIN) {
- /*
- * If arc_memory_throttle() detected that pageout
- * is running and we are low on memory, we delay new
- * non-pageout transactions to give pageout an
- * advantage.
- *
- * It is unfortunate to be delaying while the caller's
- * locks are held.
- */
- txg_delay(dd->dd_pool, tx->tx_txg,
- MSEC2NSEC(10), MSEC2NSEC(10));
- err = SET_ERROR(ERESTART);
- }
- }
-
- if (err == 0) {
- err = dsl_dir_tempreserve_impl(dd, asize, netfree,
- B_FALSE, tr_list, tx, B_TRUE);
- }
-
- if (err != 0)
- dsl_dir_tempreserve_clear(tr_list, tx);
- else
- *tr_cookiep = tr_list;
-
- return (err);
-}
-
-/*
- * Clear a temporary reservation that we previously made with
- * dsl_dir_tempreserve_space().
- */
-void
-dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
-{
- int txgidx = tx->tx_txg & TXG_MASK;
- list_t *tr_list = tr_cookie;
- struct tempreserve *tr;
-
- ASSERT3U(tx->tx_txg, !=, 0);
-
- if (tr_cookie == NULL)
- return;
-
- while ((tr = list_head(tr_list)) != NULL) {
- if (tr->tr_ds) {
- mutex_enter(&tr->tr_ds->dd_lock);
- ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
- tr->tr_size);
- tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
- mutex_exit(&tr->tr_ds->dd_lock);
- } else {
- arc_tempreserve_clear(tr->tr_size);
- }
- list_remove(tr_list, tr);
- kmem_free(tr, sizeof (struct tempreserve));
- }
-
- kmem_free(tr_list, sizeof (list_t));
-}
-
-/*
- * This should be called from open context when we think we're going to write
- * or free space, for example when dirtying data. Be conservative; it's okay
- * to write less space or free more, but we don't want to write more or free
- * less than the amount specified.
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
-{
- int64_t parent_space;
- uint64_t est_used;
-
- mutex_enter(&dd->dd_lock);
- if (space > 0)
- dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
-
- est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
- parent_space = parent_delta(dd, est_used, space);
- mutex_exit(&dd->dd_lock);
-
- /* Make sure that we clean up dd_space_to* */
- dsl_dir_dirty(dd, tx);
-
- /* XXX this is potentially expensive and unnecessary... */
- if (parent_space && dd->dd_parent)
- dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
-}
-
-/* call from syncing context when we actually write/free space for this dd */
-void
-dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
- int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
-{
- int64_t accounted_delta;
-
- /*
- * dsl_dataset_set_refreservation_sync_impl() calls this with
- * dd_lock held, so that it can atomically update
- * ds->ds_reserved and the dsl_dir accounting, so that
- * dsl_dataset_check_quota() can see dataset and dir accounting
- * consistently.
- */
- boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(type < DD_USED_NUM);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- if (needlock)
- mutex_enter(&dd->dd_lock);
- accounted_delta =
- parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
- ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
- ASSERT(compressed >= 0 ||
- dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
- ASSERT(uncompressed >= 0 ||
- dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
- dsl_dir_phys(dd)->dd_used_bytes += used;
- dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
- dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
-
- if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
- ASSERT(used > 0 ||
- dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
- dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
-#ifdef DEBUG
- dd_used_t t;
- uint64_t u = 0;
- for (t = 0; t < DD_USED_NUM; t++)
- u += dsl_dir_phys(dd)->dd_used_breakdown[t];
- ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
-#endif
- }
- if (needlock)
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_parent != NULL) {
- dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
- accounted_delta, compressed, uncompressed, tx);
- dsl_dir_transfer_space(dd->dd_parent,
- used - accounted_delta,
- DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
- }
-}
-
-void
-dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
- dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
-{
- ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
- ASSERT(oldtype < DD_USED_NUM);
- ASSERT(newtype < DD_USED_NUM);
-
- if (delta == 0 ||
- !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
- return;
-
- if (tx != NULL)
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- mutex_enter(&dd->dd_lock);
- ASSERT(delta > 0 ?
- dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
- dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
- ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
- dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
- dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
- mutex_exit(&dd->dd_lock);
-}
-
-typedef struct dsl_dir_set_qr_arg {
- const char *ddsqra_name;
- zprop_source_t ddsqra_source;
- uint64_t ddsqra_value;
-} dsl_dir_set_qr_arg_t;
-
-static int
-dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dir_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- int error;
- uint64_t towrite, newval;
-
- error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
- if (error != 0)
- return (error);
-
- error = dsl_prop_predict(ds->ds_dir, "quota",
- ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- if (newval == 0) {
- dsl_dataset_rele(ds, FTAG);
- return (0);
- }
-
- mutex_enter(&ds->ds_dir->dd_lock);
- /*
- * If we are doing the preliminary check in open context, and
- * there are pending changes, then don't fail it, since the
- * pending changes could under-estimate the amount of space to be
- * freed up.
- */
- towrite = dsl_dir_space_towrite(ds->ds_dir);
- if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
- (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
- newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
- error = SET_ERROR(ENOSPC);
- }
- mutex_exit(&ds->ds_dir->dd_lock);
- dsl_dataset_rele(ds, FTAG);
- return (error);
-}
-
-static void
-dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dir_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- uint64_t newval;
-
- VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
-
- if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
- dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
- ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
- &ddsqra->ddsqra_value, tx);
-
- VERIFY0(dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
- } else {
- newval = ddsqra->ddsqra_value;
- spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
- zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
- }
-
- dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
- mutex_enter(&ds->ds_dir->dd_lock);
- dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
- mutex_exit(&ds->ds_dir->dd_lock);
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
-{
- dsl_dir_set_qr_arg_t ddsqra;
-
- ddsqra.ddsqra_name = ddname;
- ddsqra.ddsqra_source = source;
- ddsqra.ddsqra_value = quota;
-
- return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, &ddsqra, 0,
- ZFS_SPACE_CHECK_EXTRA_RESERVED));
-}
-
-int
-dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dir_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- dsl_dir_t *dd;
- uint64_t newval, used, avail;
- int error;
-
- error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
- if (error != 0)
- return (error);
- dd = ds->ds_dir;
-
- /*
- * If we are doing the preliminary check in open context, the
- * space estimates may be inaccurate.
- */
- if (!dmu_tx_is_syncing(tx)) {
- dsl_dataset_rele(ds, FTAG);
- return (0);
- }
-
- error = dsl_prop_predict(ds->ds_dir,
- zfs_prop_to_name(ZFS_PROP_RESERVATION),
- ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- return (error);
- }
-
- mutex_enter(&dd->dd_lock);
- used = dsl_dir_phys(dd)->dd_used_bytes;
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_parent) {
- avail = dsl_dir_space_available(dd->dd_parent,
- NULL, 0, FALSE);
- } else {
- avail = dsl_pool_adjustedsize(dd->dd_pool,
- ZFS_SPACE_CHECK_NORMAL) - used;
- }
-
- if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
- uint64_t delta = MAX(used, newval) -
- MAX(used, dsl_dir_phys(dd)->dd_reserved);
-
- if (delta > avail ||
- (dsl_dir_phys(dd)->dd_quota > 0 &&
- newval > dsl_dir_phys(dd)->dd_quota))
- error = SET_ERROR(ENOSPC);
- }
-
- dsl_dataset_rele(ds, FTAG);
- return (error);
-}
-
-void
-dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
-{
- uint64_t used;
- int64_t delta;
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- mutex_enter(&dd->dd_lock);
- used = dsl_dir_phys(dd)->dd_used_bytes;
- delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
- dsl_dir_phys(dd)->dd_reserved = value;
-
- if (dd->dd_parent != NULL) {
- /* Roll up this additional usage into our ancestors */
- dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
- delta, 0, 0, tx);
- }
- mutex_exit(&dd->dd_lock);
-}
-
-static void
-dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dir_set_qr_arg_t *ddsqra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- uint64_t newval;
-
- VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
-
- if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
- dsl_prop_set_sync_impl(ds,
- zfs_prop_to_name(ZFS_PROP_RESERVATION),
- ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
- &ddsqra->ddsqra_value, tx);
-
- VERIFY0(dsl_prop_get_int_ds(ds,
- zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
- } else {
- newval = ddsqra->ddsqra_value;
- spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
- zfs_prop_to_name(ZFS_PROP_RESERVATION),
- (longlong_t)newval);
- }
-
- dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
- uint64_t reservation)
-{
- dsl_dir_set_qr_arg_t ddsqra;
-
- ddsqra.ddsqra_name = ddname;
- ddsqra.ddsqra_source = source;
- ddsqra.ddsqra_value = reservation;
-
- return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
- dsl_dir_set_reservation_sync, &ddsqra, 0,
- ZFS_SPACE_CHECK_EXTRA_RESERVED));
-}
-
-static dsl_dir_t *
-closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
-{
- for (; ds1; ds1 = ds1->dd_parent) {
- dsl_dir_t *dd;
- for (dd = ds2; dd; dd = dd->dd_parent) {
- if (ds1 == dd)
- return (dd);
- }
- }
- return (NULL);
-}
-
-/*
- * If delta is applied to dd, how much of that delta would be applied to
- * ancestor? Syncing context only.
- */
-static int64_t
-would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
-{
- if (dd == ancestor)
- return (delta);
-
- mutex_enter(&dd->dd_lock);
- delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
- mutex_exit(&dd->dd_lock);
- return (would_change(dd->dd_parent, delta, ancestor));
-}
-
-typedef struct dsl_dir_rename_arg {
- const char *ddra_oldname;
- const char *ddra_newname;
- cred_t *ddra_cred;
-} dsl_dir_rename_arg_t;
-
-typedef struct dsl_valid_rename_arg {
- int char_delta;
- int nest_delta;
-} dsl_valid_rename_arg_t;
-
-/* ARGSUSED */
-static int
-dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
-{
- dsl_valid_rename_arg_t *dvra = arg;
- char namebuf[ZFS_MAX_DATASET_NAME_LEN];
-
- dsl_dataset_name(ds, namebuf);
-
- ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
- <, ZFS_MAX_DATASET_NAME_LEN);
- int namelen = strlen(namebuf) + dvra->char_delta;
- int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
-
- if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
- if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
- return (SET_ERROR(ENAMETOOLONG));
- return (0);
-}
-
-static int
-dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dir_rename_arg_t *ddra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dir_t *dd, *newparent;
- dsl_valid_rename_arg_t dvra;
- dsl_dataset_t *parentds;
- objset_t *parentos;
- const char *mynewname;
- int error;
-
- /* target dir should exist */
- error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
- if (error != 0)
- return (error);
-
- /* new parent should exist */
- error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
- &newparent, &mynewname);
- if (error != 0) {
- dsl_dir_rele(dd, FTAG);
- return (error);
- }
-
- /* can't rename to different pool */
- if (dd->dd_pool != newparent->dd_pool) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (SET_ERROR(EXDEV));
- }
-
- /* new name should not already exist */
- if (mynewname == NULL) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (SET_ERROR(EEXIST));
- }
-
- /* can't rename below anything but filesystems (eg. no ZVOLs) */
- error = dsl_dataset_hold_obj(newparent->dd_pool,
- dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
- if (error != 0) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (error);
- }
- error = dmu_objset_from_ds(parentds, &parentos);
- if (error != 0) {
- dsl_dataset_rele(parentds, FTAG);
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (error);
- }
- if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
- dsl_dataset_rele(parentds, FTAG);
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (error);
- }
- dsl_dataset_rele(parentds, FTAG);
-
- ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
- <, ZFS_MAX_DATASET_NAME_LEN);
- ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
- <, ZFS_MAX_DATASET_NAME_LEN);
- dvra.char_delta = strlen(ddra->ddra_newname)
- - strlen(ddra->ddra_oldname);
- dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
- - get_dataset_depth(ddra->ddra_oldname);
-
- /* if the name length is growing, validate child name lengths */
- if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
- error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
- &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
- if (error != 0) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (error);
- }
- }
-
- if (dmu_tx_is_syncing(tx)) {
- if (spa_feature_is_active(dp->dp_spa,
- SPA_FEATURE_FS_SS_LIMIT)) {
- /*
- * Although this is the check function and we don't
- * normally make on-disk changes in check functions,
- * we need to do that here.
- *
- * Ensure this portion of the tree's counts have been
- * initialized in case the new parent has limits set.
- */
- dsl_dir_init_fs_ss_count(dd, tx);
- }
- }
-
- if (newparent != dd->dd_parent) {
- /* is there enough space? */
- uint64_t myspace =
- MAX(dsl_dir_phys(dd)->dd_used_bytes,
- dsl_dir_phys(dd)->dd_reserved);
- objset_t *os = dd->dd_pool->dp_meta_objset;
- uint64_t fs_cnt = 0;
- uint64_t ss_cnt = 0;
-
- if (dsl_dir_is_zapified(dd)) {
- int err;
-
- err = zap_lookup(os, dd->dd_object,
- DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
- &fs_cnt);
- if (err != ENOENT && err != 0) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (err);
- }
-
- /*
- * have to add 1 for the filesystem itself that we're
- * moving
- */
- fs_cnt++;
-
- err = zap_lookup(os, dd->dd_object,
- DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
- &ss_cnt);
- if (err != ENOENT && err != 0) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (err);
- }
- }
-
- /* no rename into our descendant */
- if (closest_common_ancestor(dd, newparent) == dd) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- error = dsl_dir_transfer_possible(dd->dd_parent,
- newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
- if (error != 0) {
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (error);
- }
- }
-
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
- return (0);
-}
-
-static void
-dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dir_rename_arg_t *ddra = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dir_t *dd, *newparent;
- const char *mynewname;
- int error;
- objset_t *mos = dp->dp_meta_objset;
-
- VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
- VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
- &mynewname));
-
- /* Log this before we change the name. */
- spa_history_log_internal_dd(dd, "rename", tx,
- "-> %s", ddra->ddra_newname);
-
- if (newparent != dd->dd_parent) {
- objset_t *os = dd->dd_pool->dp_meta_objset;
- uint64_t fs_cnt = 0;
- uint64_t ss_cnt = 0;
-
- /*
- * We already made sure the dd counts were initialized in the
- * check function.
- */
- if (spa_feature_is_active(dp->dp_spa,
- SPA_FEATURE_FS_SS_LIMIT)) {
- VERIFY0(zap_lookup(os, dd->dd_object,
- DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
- &fs_cnt));
- /* add 1 for the filesystem itself that we're moving */
- fs_cnt++;
-
- VERIFY0(zap_lookup(os, dd->dd_object,
- DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
- &ss_cnt));
- }
-
- dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
- DD_FIELD_FILESYSTEM_COUNT, tx);
- dsl_fs_ss_count_adjust(newparent, fs_cnt,
- DD_FIELD_FILESYSTEM_COUNT, tx);
-
- dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
- DD_FIELD_SNAPSHOT_COUNT, tx);
- dsl_fs_ss_count_adjust(newparent, ss_cnt,
- DD_FIELD_SNAPSHOT_COUNT, tx);
-
- dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
- -dsl_dir_phys(dd)->dd_used_bytes,
- -dsl_dir_phys(dd)->dd_compressed_bytes,
- -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
- dsl_dir_diduse_space(newparent, DD_USED_CHILD,
- dsl_dir_phys(dd)->dd_used_bytes,
- dsl_dir_phys(dd)->dd_compressed_bytes,
- dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
-
- if (dsl_dir_phys(dd)->dd_reserved >
- dsl_dir_phys(dd)->dd_used_bytes) {
- uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
- dsl_dir_phys(dd)->dd_used_bytes;
-
- dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
- -unused_rsrv, 0, 0, tx);
- dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
- unused_rsrv, 0, 0, tx);
- }
- }
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- /* remove from old parent zapobj */
- error = zap_remove(mos,
- dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
- dd->dd_myname, tx);
- ASSERT0(error);
-
- (void) strcpy(dd->dd_myname, mynewname);
- dsl_dir_rele(dd->dd_parent, dd);
- dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
- VERIFY0(dsl_dir_hold_obj(dp,
- newparent->dd_object, NULL, dd, &dd->dd_parent));
-
- /* add to new parent zapobj */
- VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
- dd->dd_myname, 8, 1, &dd->dd_object, tx));
-
-#ifdef __FreeBSD__
-#ifdef _KERNEL
- zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
- zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname);
-#endif
-#endif
-
- dsl_prop_notify_all(dd);
-
- dsl_dir_rele(newparent, FTAG);
- dsl_dir_rele(dd, FTAG);
-}
-
-int
-dsl_dir_rename(const char *oldname, const char *newname)
-{
- dsl_dir_rename_arg_t ddra;
-
- ddra.ddra_oldname = oldname;
- ddra.ddra_newname = newname;
- ddra.ddra_cred = CRED();
-
- return (dsl_sync_task(oldname,
- dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
- 3, ZFS_SPACE_CHECK_RESERVED));
-}
-
-int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
- uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
-{
- dsl_dir_t *ancestor;
- int64_t adelta;
- uint64_t avail;
- int err;
-
- ancestor = closest_common_ancestor(sdd, tdd);
- adelta = would_change(sdd, -space, ancestor);
- avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
- if (avail < space)
- return (SET_ERROR(ENOSPC));
-
- err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
- ancestor, cr);
- if (err != 0)
- return (err);
- err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
- ancestor, cr);
- if (err != 0)
- return (err);
-
- return (0);
-}
-
-timestruc_t
-dsl_dir_snap_cmtime(dsl_dir_t *dd)
-{
- timestruc_t t;
-
- mutex_enter(&dd->dd_lock);
- t = dd->dd_snap_cmtime;
- mutex_exit(&dd->dd_lock);
-
- return (t);
-}
-
-void
-dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
-{
- timestruc_t t;
-
- gethrestime(&t);
- mutex_enter(&dd->dd_lock);
- dd->dd_snap_cmtime = t;
- mutex_exit(&dd->dd_lock);
-}
-
-void
-dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
-}
-
-boolean_t
-dsl_dir_is_zapified(dsl_dir_t *dd)
-{
- dmu_object_info_t doi;
-
- dmu_object_info_from_db(dd->dd_dbuf, &doi);
- return (doi.doi_type == DMU_OTN_ZAP_METADATA);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -1,1372 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- */
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_scan.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/dsl_deadlist.h>
-#include <sys/vdev_impl.h>
-#include <sys/metaslab_impl.h>
-#include <sys/bptree.h>
-#include <sys/zfeature.h>
-#include <sys/zil_impl.h>
-#include <sys/dsl_userhold.h>
-#include <sys/mmp.h>
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-
-/*
- * ZFS Write Throttle
- * ------------------
- *
- * ZFS must limit the rate of incoming writes to the rate at which it is able
- * to sync data modifications to the backend storage. Throttling by too much
- * creates an artificial limit; throttling by too little can only be sustained
- * for short periods and would lead to highly lumpy performance. On a per-pool
- * basis, ZFS tracks the amount of modified (dirty) data. As operations change
- * data, the amount of dirty data increases; as ZFS syncs out data, the amount
- * of dirty data decreases. When the amount of dirty data exceeds a
- * predetermined threshold further modifications are blocked until the amount
- * of dirty data decreases (as data is synced out).
- *
- * The limit on dirty data is tunable, and should be adjusted according to
- * both the IO capacity and available memory of the system. The larger the
- * window, the more ZFS is able to aggregate and amortize metadata (and data)
- * changes. However, memory is a limited resource, and allowing for more dirty
- * data comes at the cost of keeping other useful data in memory (for example
- * ZFS data cached by the ARC).
- *
- * Implementation
- *
- * As buffers are modified dsl_pool_willuse_space() increments both the per-
- * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
- * dirty space used; dsl_pool_dirty_space() decrements those values as data
- * is synced out from dsl_pool_sync(). While only the poolwide value is
- * relevant, the per-txg value is useful for debugging. The tunable
- * zfs_dirty_data_max determines the dirty space limit. Once that value is
- * exceeded, new writes are halted until space frees up.
- *
- * The zfs_dirty_data_sync tunable dictates the threshold at which we
- * ensure that there is a txg syncing (see the comment in txg.c for a full
- * description of transaction group stages).
- *
- * The IO scheduler uses both the dirty space limit and current amount of
- * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
- * issues. See the comment in vdev_queue.c for details of the IO scheduler.
- *
- * The delay is also calculated based on the amount of dirty data. See the
- * comment above dmu_tx_delay() for details.
- */
-
-/*
- * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
- * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
- */
-uint64_t zfs_dirty_data_max;
-uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
-int zfs_dirty_data_max_percent = 10;
-
-/*
- * If there's at least this much dirty data (as a percentage of
- * zfs_dirty_data_max), push out a txg. This should be less than
- * zfs_vdev_async_write_active_min_dirty_percent.
- */
-uint64_t zfs_dirty_data_sync_pct = 20;
-
-/*
- * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
- * and delay each transaction.
- * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
- */
-int zfs_delay_min_dirty_percent = 60;
-
-/*
- * This controls how quickly the delay approaches infinity.
- * Larger values cause it to delay more for a given amount of dirty data.
- * Therefore larger values will cause there to be less dirty data for a
- * given throughput.
- *
- * For the smoothest delay, this value should be about 1 billion divided
- * by the maximum number of operations per second. This will smoothly
- * handle between 10x and 1/10th this number.
- *
- * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
- * multiply in dmu_tx_delay().
- */
-uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
-
-/*
- * This determines the number of threads used by the dp_sync_taskq.
- */
-int zfs_sync_taskq_batch_pct = 75;
-
-/*
- * These tunables determine the behavior of how zil_itxg_clean() is
- * called via zil_clean() in the context of spa_sync(). When an itxg
- * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
- * If the dispatch fails, the call to zil_itxg_clean() will occur
- * synchronously in the context of spa_sync(), which can negatively
- * impact the performance of spa_sync() (e.g. in the case of the itxg
- * list having a large number of itxs that needs to be cleaned).
- *
- * Thus, these tunables can be used to manipulate the behavior of the
- * taskq used by zil_clean(); they determine the number of taskq entries
- * that are pre-populated when the taskq is first created (via the
- * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
- * taskq entries that are cached after an on-demand allocation (via the
- * "zfs_zil_clean_taskq_maxalloc").
- *
- * The idea being, we want to try reasonably hard to ensure there will
- * already be a taskq entry pre-allocated by the time that it is needed
- * by zil_clean(). This way, we can avoid the possibility of an
- * on-demand allocation of a new taskq entry from failing, which would
- * result in zil_itxg_clean() being called synchronously from zil_clean()
- * (which can adversely affect performance of spa_sync()).
- *
- * Additionally, the number of threads used by the taskq can be
- * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
- */
-int zfs_zil_clean_taskq_nthr_pct = 100;
-int zfs_zil_clean_taskq_minalloc = 1024;
-int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-
-extern int zfs_vdev_async_write_active_max_dirty_percent;
-
-SYSCTL_DECL(_vfs_zfs);
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
- &zfs_dirty_data_max, 0,
- "The maximum amount of dirty data in bytes after which new writes are "
- "halted until space becomes available");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
- &zfs_dirty_data_max_max, 0,
- "The absolute cap on dirty_data_max when auto calculating");
-
-static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
- CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
- sysctl_zfs_dirty_data_max_percent, "I",
- "The percent of physical memory used to auto calculate dirty_data_max");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync_pct, CTLFLAG_RWTUN,
- &zfs_dirty_data_sync_pct, 0,
- "Force a txg if the percent of dirty buffer bytes exceed this value");
-
-static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
-/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
- CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
- sysctl_zfs_delay_min_dirty_percent, "I",
- "The limit of outstanding dirty data before transactions are delayed");
-
-static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
-/* No zfs_delay_scale tunable due to limit requirements */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
- sysctl_zfs_delay_scale, "QU",
- "Controls how quickly the delay approaches infinity");
-
-static int
-sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
-{
- int val, err;
-
- val = zfs_dirty_data_max_percent;
- err = sysctl_handle_int(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val < 0 || val > 100)
- return (EINVAL);
-
- zfs_dirty_data_max_percent = val;
-
- return (0);
-}
-
-static int
-sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
-{
- int val, err;
-
- val = zfs_delay_min_dirty_percent;
- err = sysctl_handle_int(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val < zfs_vdev_async_write_active_max_dirty_percent)
- return (EINVAL);
-
- zfs_delay_min_dirty_percent = val;
-
- return (0);
-}
-
-static int
-sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = zfs_delay_scale;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val > UINT64_MAX / zfs_dirty_data_max)
- return (EINVAL);
-
- zfs_delay_scale = val;
-
- return (0);
-}
-#endif
-
-int
-dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
-{
- uint64_t obj;
- int err;
-
- err = zap_lookup(dp->dp_meta_objset,
- dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
- name, sizeof (obj), 1, &obj);
- if (err)
- return (err);
-
- return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
-}
-
-static dsl_pool_t *
-dsl_pool_open_impl(spa_t *spa, uint64_t txg)
-{
- dsl_pool_t *dp;
- blkptr_t *bp = spa_get_rootblkptr(spa);
-
- dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
- dp->dp_spa = spa;
- dp->dp_meta_rootbp = *bp;
- rrw_init(&dp->dp_config_rwlock, B_TRUE);
- txg_init(dp, txg);
- mmp_init(spa);
-
- txg_list_create(&dp->dp_dirty_datasets, spa,
- offsetof(dsl_dataset_t, ds_dirty_link));
- txg_list_create(&dp->dp_dirty_zilogs, spa,
- offsetof(zilog_t, zl_dirty_link));
- txg_list_create(&dp->dp_dirty_dirs, spa,
- offsetof(dsl_dir_t, dd_dirty_link));
- txg_list_create(&dp->dp_sync_tasks, spa,
- offsetof(dsl_sync_task_t, dst_node));
- txg_list_create(&dp->dp_early_sync_tasks, spa,
- offsetof(dsl_sync_task_t, dst_node));
-
- dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
- zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
- TASKQ_THREADS_CPU_PCT);
-
- dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
- zfs_zil_clean_taskq_nthr_pct, minclsyspri,
- zfs_zil_clean_taskq_minalloc,
- zfs_zil_clean_taskq_maxalloc,
- TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
-
- mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
-
- dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
- 1, 4, 0);
-
- return (dp);
-}
-
-int
-dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
-{
- int err;
- dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
-
- err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
- &dp->dp_meta_objset);
- if (err != 0)
- dsl_pool_close(dp);
- else
- *dpp = dp;
-
- return (err);
-}
-
-int
-dsl_pool_open(dsl_pool_t *dp)
-{
- int err;
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- uint64_t obj;
-
- rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
- &dp->dp_root_dir_obj);
- if (err)
- goto out;
-
- err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
- NULL, dp, &dp->dp_root_dir);
- if (err)
- goto out;
-
- err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
- if (err)
- goto out;
-
- if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
- err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
- if (err)
- goto out;
- err = dsl_dataset_hold_obj(dp,
- dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
- if (err == 0) {
- err = dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
- &dp->dp_origin_snap);
- dsl_dataset_rele(ds, FTAG);
- }
- dsl_dir_rele(dd, dp);
- if (err)
- goto out;
- }
-
- if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
- err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
- &dp->dp_free_dir);
- if (err)
- goto out;
-
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
- if (err)
- goto out;
- VERIFY0(bpobj_open(&dp->dp_free_bpobj,
- dp->dp_meta_objset, obj));
- }
-
- if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
- if (err == 0) {
- VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
- dp->dp_meta_objset, obj));
- } else if (err == ENOENT) {
- /*
- * We might not have created the remap bpobj yet.
- */
- err = 0;
- } else {
- goto out;
- }
- }
-
- /*
- * Note: errors ignored, because the these special dirs, used for
- * space accounting, are only created on demand.
- */
- (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
- &dp->dp_leak_dir);
-
- if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
- &dp->dp_bptree_obj);
- if (err != 0)
- goto out;
- }
-
- if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
- &dp->dp_empty_bpobj);
- if (err != 0)
- goto out;
- }
-
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
- &dp->dp_tmp_userrefs_obj);
- if (err == ENOENT)
- err = 0;
- if (err)
- goto out;
-
- err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
-
-out:
- rrw_exit(&dp->dp_config_rwlock, FTAG);
- return (err);
-}
-
-void
-dsl_pool_close(dsl_pool_t *dp)
-{
- /*
- * Drop our references from dsl_pool_open().
- *
- * Since we held the origin_snap from "syncing" context (which
- * includes pool-opening context), it actually only got a "ref"
- * and not a hold, so just drop that here.
- */
- if (dp->dp_origin_snap != NULL)
- dsl_dataset_rele(dp->dp_origin_snap, dp);
- if (dp->dp_mos_dir != NULL)
- dsl_dir_rele(dp->dp_mos_dir, dp);
- if (dp->dp_free_dir != NULL)
- dsl_dir_rele(dp->dp_free_dir, dp);
- if (dp->dp_leak_dir != NULL)
- dsl_dir_rele(dp->dp_leak_dir, dp);
- if (dp->dp_root_dir != NULL)
- dsl_dir_rele(dp->dp_root_dir, dp);
-
- bpobj_close(&dp->dp_free_bpobj);
- bpobj_close(&dp->dp_obsolete_bpobj);
-
- /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
- if (dp->dp_meta_objset != NULL)
- dmu_objset_evict(dp->dp_meta_objset);
-
- txg_list_destroy(&dp->dp_dirty_datasets);
- txg_list_destroy(&dp->dp_dirty_zilogs);
- txg_list_destroy(&dp->dp_sync_tasks);
- txg_list_destroy(&dp->dp_early_sync_tasks);
- txg_list_destroy(&dp->dp_dirty_dirs);
-
- taskq_destroy(dp->dp_zil_clean_taskq);
- taskq_destroy(dp->dp_sync_taskq);
-
- /*
- * We can't set retry to TRUE since we're explicitly specifying
- * a spa to flush. This is good enough; any missed buffers for
- * this spa won't cause trouble, and they'll eventually fall
- * out of the ARC just like any other unused buffer.
- */
- arc_flush(dp->dp_spa, FALSE);
-
- mmp_fini(dp->dp_spa);
- txg_fini(dp);
- dsl_scan_fini(dp);
- dmu_buf_user_evict_wait();
-
- rrw_destroy(&dp->dp_config_rwlock);
- mutex_destroy(&dp->dp_lock);
- taskq_destroy(dp->dp_vnrele_taskq);
- if (dp->dp_blkstats != NULL) {
- mutex_destroy(&dp->dp_blkstats->zab_lock);
- kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
- }
- kmem_free(dp, sizeof (dsl_pool_t));
-}
-
-void
-dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- uint64_t obj;
- /*
- * Currently, we only create the obsolete_bpobj where there are
- * indirect vdevs with referenced mappings.
- */
- ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
- /* create and open the obsolete_bpobj */
- obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
- VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
- VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
- spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
-}
-
-void
-dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- VERIFY0(zap_remove(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_OBSOLETE_BPOBJ, tx));
- bpobj_free(dp->dp_meta_objset,
- dp->dp_obsolete_bpobj.bpo_object, tx);
- bpobj_close(&dp->dp_obsolete_bpobj);
-}
-
-dsl_pool_t *
-dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
-{
- int err;
- dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
- dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
- dsl_dataset_t *ds;
- uint64_t obj;
-
- rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
-
- /* create and open the MOS (meta-objset) */
- dp->dp_meta_objset = dmu_objset_create_impl(spa,
- NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
-
- /* create the pool directory */
- err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
- ASSERT0(err);
-
- /* Initialize scan structures */
- VERIFY0(dsl_scan_init(dp, txg));
-
- /* create and open the root dir */
- dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
- VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
- NULL, dp, &dp->dp_root_dir));
-
- /* create and open the meta-objset dir */
- (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
- VERIFY0(dsl_pool_open_special_dir(dp,
- MOS_DIR_NAME, &dp->dp_mos_dir));
-
- if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
- /* create and open the free dir */
- (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
- FREE_DIR_NAME, tx);
- VERIFY0(dsl_pool_open_special_dir(dp,
- FREE_DIR_NAME, &dp->dp_free_dir));
-
- /* create and open the free_bplist */
- obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
- VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
- VERIFY0(bpobj_open(&dp->dp_free_bpobj,
- dp->dp_meta_objset, obj));
- }
-
- if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
- dsl_pool_create_origin(dp, tx);
-
- /* create the root dataset */
- obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
-
- /* create the root objset */
- VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
-#ifdef _KERNEL
- {
- objset_t *os;
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- os = dmu_objset_create_impl(dp->dp_spa, ds,
- dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
- zfs_create_fs(os, kcred, zplprops, tx);
- }
-#endif
- dsl_dataset_rele(ds, FTAG);
-
- dmu_tx_commit(tx);
-
- rrw_exit(&dp->dp_config_rwlock, FTAG);
-
- return (dp);
-}
-
-/*
- * Account for the meta-objset space in its placeholder dsl_dir.
- */
-void
-dsl_pool_mos_diduse_space(dsl_pool_t *dp,
- int64_t used, int64_t comp, int64_t uncomp)
-{
- ASSERT3U(comp, ==, uncomp); /* it's all metadata */
- mutex_enter(&dp->dp_lock);
- dp->dp_mos_used_delta += used;
- dp->dp_mos_compressed_delta += comp;
- dp->dp_mos_uncompressed_delta += uncomp;
- mutex_exit(&dp->dp_lock);
-}
-
-static void
-dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- dmu_objset_sync(dp->dp_meta_objset, zio, tx);
- VERIFY0(zio_wait(zio));
- dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
-}
-
-static void
-dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
-{
- ASSERT(MUTEX_HELD(&dp->dp_lock));
-
- if (delta < 0)
- ASSERT3U(-delta, <=, dp->dp_dirty_total);
-
- dp->dp_dirty_total += delta;
-
- /*
- * Note: we signal even when increasing dp_dirty_total.
- * This ensures forward progress -- each thread wakes the next waiter.
- */
- if (dp->dp_dirty_total < zfs_dirty_data_max)
- cv_signal(&dp->dp_spaceavail_cv);
-}
-
-static boolean_t
-dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
-{
- spa_t *spa = dp->dp_spa;
- vdev_t *rvd = spa->spa_root_vdev;
-
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- txg_list_t *tl = &vd->vdev_ms_list;
- metaslab_t *ms;
-
- for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
- ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
- VERIFY(range_tree_is_empty(ms->ms_freeing));
- VERIFY(range_tree_is_empty(ms->ms_checkpointing));
- }
- }
-
- return (B_TRUE);
-}
-
-void
-dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
-{
- zio_t *zio;
- dmu_tx_t *tx;
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- objset_t *mos = dp->dp_meta_objset;
- list_t synced_datasets;
-
- list_create(&synced_datasets, sizeof (dsl_dataset_t),
- offsetof(dsl_dataset_t, ds_synced_link));
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- /*
- * Run all early sync tasks before writing out any dirty blocks.
- * For more info on early sync tasks see block comment in
- * dsl_early_sync_task().
- */
- if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
- dsl_sync_task_t *dst;
-
- ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
- while ((dst =
- txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
- ASSERT(dsl_early_sync_task_verify(dp, txg));
- dsl_sync_task_sync(dst, tx);
- }
- ASSERT(dsl_early_sync_task_verify(dp, txg));
- }
-
- /*
- * Write out all dirty blocks of dirty datasets.
- */
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
- /*
- * We must not sync any non-MOS datasets twice, because
- * we may have taken a snapshot of them. However, we
- * may sync newly-created datasets on pass 2.
- */
- ASSERT(!list_link_active(&ds->ds_synced_link));
- list_insert_tail(&synced_datasets, ds);
- dsl_dataset_sync(ds, zio, tx);
- }
- VERIFY0(zio_wait(zio));
-
- /*
- * We have written all of the accounted dirty data, so our
- * dp_space_towrite should now be zero. However, some seldom-used
- * code paths do not adhere to this (e.g. dbuf_undirty(), also
- * rounding error in dbuf_write_physdone).
- * Shore up the accounting of any dirtied space now.
- */
- dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
-
- /*
- * Update the long range free counter after
- * we're done syncing user data
- */
- mutex_enter(&dp->dp_lock);
- ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
- dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
- dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
- mutex_exit(&dp->dp_lock);
-
- /*
- * After the data blocks have been written (ensured by the zio_wait()
- * above), update the user/group space accounting. This happens
- * in tasks dispatched to dp_sync_taskq, so wait for them before
- * continuing.
- */
- for (ds = list_head(&synced_datasets); ds != NULL;
- ds = list_next(&synced_datasets, ds)) {
- dmu_objset_do_userquota_updates(ds->ds_objset, tx);
- }
- taskq_wait(dp->dp_sync_taskq);
-
- /*
- * Sync the datasets again to push out the changes due to
- * userspace updates. This must be done before we process the
- * sync tasks, so that any snapshots will have the correct
- * user accounting information (and we won't get confused
- * about which blocks are part of the snapshot).
- */
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
- ASSERT(list_link_active(&ds->ds_synced_link));
- dmu_buf_rele(ds->ds_dbuf, ds);
- dsl_dataset_sync(ds, zio, tx);
- }
- VERIFY0(zio_wait(zio));
-
- /*
- * Now that the datasets have been completely synced, we can
- * clean up our in-memory structures accumulated while syncing:
- *
- * - move dead blocks from the pending deadlist to the on-disk deadlist
- * - release hold from dsl_dataset_dirty()
- */
- while ((ds = list_remove_head(&synced_datasets)) != NULL) {
- dsl_dataset_sync_done(ds, tx);
- }
- while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
- dsl_dir_sync(dd, tx);
- }
-
- /*
- * The MOS's space is accounted for in the pool/$MOS
- * (dp_mos_dir). We can't modify the mos while we're syncing
- * it, so we remember the deltas and apply them here.
- */
- if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
- dp->dp_mos_uncompressed_delta != 0) {
- dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
- dp->dp_mos_used_delta,
- dp->dp_mos_compressed_delta,
- dp->dp_mos_uncompressed_delta, tx);
- dp->dp_mos_used_delta = 0;
- dp->dp_mos_compressed_delta = 0;
- dp->dp_mos_uncompressed_delta = 0;
- }
-
- if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
- dsl_pool_sync_mos(dp, tx);
- }
-
- /*
- * If we modify a dataset in the same txg that we want to destroy it,
- * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
- * dsl_dir_destroy_check() will fail if there are unexpected holds.
- * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
- * and clearing the hold on it) before we process the sync_tasks.
- * The MOS data dirtied by the sync_tasks will be synced on the next
- * pass.
- */
- if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
- dsl_sync_task_t *dst;
- /*
- * No more sync tasks should have been added while we
- * were syncing.
- */
- ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
- while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
- dsl_sync_task_sync(dst, tx);
- }
-
- dmu_tx_commit(tx);
-
- DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
-}
-
-void
-dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
-{
- zilog_t *zilog;
-
- while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) {
- dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
- /*
- * We don't remove the zilog from the dp_dirty_zilogs
- * list until after we've cleaned it. This ensures that
- * callers of zilog_is_dirty() receive an accurate
- * answer when they are racing with the spa sync thread.
- */
- zil_clean(zilog, txg);
- (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
- ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
- dmu_buf_rele(ds->ds_dbuf, zilog);
- }
- ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
-}
-
-/*
- * TRUE if the current thread is the tx_sync_thread or if we
- * are being called from SPA context during pool initialization.
- */
-int
-dsl_pool_sync_context(dsl_pool_t *dp)
-{
- return (curthread == dp->dp_tx.tx_sync_thread ||
- spa_is_initializing(dp->dp_spa) ||
- taskq_member(dp->dp_sync_taskq, curthread));
-}
-
-/*
- * This function returns the amount of allocatable space in the pool
- * minus whatever space is currently reserved by ZFS for specific
- * purposes. Specifically:
- *
- * 1] Any reserved SLOP space
- * 2] Any space used by the checkpoint
- * 3] Any space used for deferred frees
- *
- * The latter 2 are especially important because they are needed to
- * rectify the SPA's and DMU's different understanding of how much space
- * is used. Now the DMU is aware of that extra space tracked by the SPA
- * without having to maintain a separate special dir (e.g similar to
- * $MOS, $FREEING, and $LEAKED).
- *
- * Note: By deferred frees here, we mean the frees that were deferred
- * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
- * segments placed in ms_defer trees during metaslab_sync_done().
- */
-uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
-{
- spa_t *spa = dp->dp_spa;
- uint64_t space, resv, adjustedsize;
- uint64_t spa_deferred_frees =
- spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
-
- space = spa_get_dspace(spa)
- - spa_get_checkpoint_space(spa) - spa_deferred_frees;
- resv = spa_get_slop_space(spa);
-
- switch (slop_policy) {
- case ZFS_SPACE_CHECK_NORMAL:
- break;
- case ZFS_SPACE_CHECK_RESERVED:
- resv >>= 1;
- break;
- case ZFS_SPACE_CHECK_EXTRA_RESERVED:
- resv >>= 2;
- break;
- case ZFS_SPACE_CHECK_NONE:
- resv = 0;
- break;
- default:
- panic("invalid slop policy value: %d", slop_policy);
- break;
- }
- adjustedsize = (space >= resv) ? (space - resv) : 0;
-
- return (adjustedsize);
-}
-
-uint64_t
-dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
-{
- uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
- uint64_t deferred =
- metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
- uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
- return (quota);
-}
-
-boolean_t
-dsl_pool_need_dirty_delay(dsl_pool_t *dp)
-{
- uint64_t delay_min_bytes =
- zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
- uint64_t dirty_min_bytes =
- zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;
- boolean_t rv;
-
- mutex_enter(&dp->dp_lock);
- if (dp->dp_dirty_total > dirty_min_bytes)
- txg_kick(dp);
- rv = (dp->dp_dirty_total > delay_min_bytes);
- mutex_exit(&dp->dp_lock);
- return (rv);
-}
-
-void
-dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
-{
- if (space > 0) {
- mutex_enter(&dp->dp_lock);
- dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
- dsl_pool_dirty_delta(dp, space);
- mutex_exit(&dp->dp_lock);
- }
-}
-
-void
-dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
-{
- ASSERT3S(space, >=, 0);
- if (space == 0)
- return;
- mutex_enter(&dp->dp_lock);
- if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
- /* XXX writing something we didn't dirty? */
- space = dp->dp_dirty_pertxg[txg & TXG_MASK];
- }
- ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
- dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
- ASSERT3U(dp->dp_dirty_total, >=, space);
- dsl_pool_dirty_delta(dp, -space);
- mutex_exit(&dp->dp_lock);
-}
-
-/* ARGSUSED */
-static int
-upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
-{
- dmu_tx_t *tx = arg;
- dsl_dataset_t *ds, *prev = NULL;
- int err;
-
- err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
- if (err)
- return (err);
-
- while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
- err = dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
- if (err) {
- dsl_dataset_rele(ds, FTAG);
- return (err);
- }
-
- if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
- break;
- dsl_dataset_rele(ds, FTAG);
- ds = prev;
- prev = NULL;
- }
-
- if (prev == NULL) {
- prev = dp->dp_origin_snap;
-
- /*
- * The $ORIGIN can't have any data, or the accounting
- * will be wrong.
- */
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
-
- /* The origin doesn't get attached to itself */
- if (ds->ds_object == prev->ds_object) {
- dsl_dataset_rele(ds, FTAG);
- return (0);
- }
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
- dsl_dataset_phys(ds)->ds_prev_snap_txg =
- dsl_dataset_phys(prev)->ds_creation_txg;
-
- dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
- dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
-
- dmu_buf_will_dirty(prev->ds_dbuf, tx);
- dsl_dataset_phys(prev)->ds_num_children++;
-
- if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
- ASSERT(ds->ds_prev == NULL);
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj,
- ds, &ds->ds_prev));
- }
- }
-
- ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
- ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
-
- if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
- dmu_buf_will_dirty(prev->ds_dbuf, tx);
- dsl_dataset_phys(prev)->ds_next_clones_obj =
- zap_create(dp->dp_meta_objset,
- DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
- }
- VERIFY0(zap_add_int(dp->dp_meta_objset,
- dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
-
- dsl_dataset_rele(ds, FTAG);
- if (prev != dp->dp_origin_snap)
- dsl_dataset_rele(prev, FTAG);
- return (0);
-}
-
-void
-dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dp->dp_origin_snap != NULL);
-
- VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
- tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
-}
-
-/* ARGSUSED */
-static int
-upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
-{
- dmu_tx_t *tx = arg;
- objset_t *mos = dp->dp_meta_objset;
-
- if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
- dsl_dataset_t *origin;
-
- VERIFY0(dsl_dataset_hold_obj(dp,
- dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
-
- if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
- dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
- dsl_dir_phys(origin->ds_dir)->dd_clones =
- zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
- 0, tx);
- }
-
- VERIFY0(zap_add_int(dp->dp_meta_objset,
- dsl_dir_phys(origin->ds_dir)->dd_clones,
- ds->ds_object, tx));
-
- dsl_dataset_rele(origin, FTAG);
- }
- return (0);
-}
-
-void
-dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- uint64_t obj;
-
- (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
- VERIFY0(dsl_pool_open_special_dir(dp,
- FREE_DIR_NAME, &dp->dp_free_dir));
-
- /*
- * We can't use bpobj_alloc(), because spa_version() still
- * returns the old version, and we need a new-version bpobj with
- * subobj support. So call dmu_object_alloc() directly.
- */
- obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
- SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
- VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
- VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
-
- VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
-}
-
-void
-dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- uint64_t dsobj;
- dsl_dataset_t *ds;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dp->dp_origin_snap == NULL);
- ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
-
- /* create the origin dir, ds, & snap-ds */
- dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
- NULL, 0, kcred, tx);
- VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
- VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
- dp, &dp->dp_origin_snap));
- dsl_dataset_rele(ds, FTAG);
-}
-
-taskq_t *
-dsl_pool_vnrele_taskq(dsl_pool_t *dp)
-{
- return (dp->dp_vnrele_taskq);
-}
-
-/*
- * Walk through the pool-wide zap object of temporary snapshot user holds
- * and release them.
- */
-void
-dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
-{
- zap_attribute_t za;
- zap_cursor_t zc;
- objset_t *mos = dp->dp_meta_objset;
- uint64_t zapobj = dp->dp_tmp_userrefs_obj;
- nvlist_t *holds;
-
- if (zapobj == 0)
- return;
- ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
-
- holds = fnvlist_alloc();
-
- for (zap_cursor_init(&zc, mos, zapobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- char *htag;
- nvlist_t *tags;
-
- htag = strchr(za.za_name, '-');
- *htag = '\0';
- ++htag;
- if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
- tags = fnvlist_alloc();
- fnvlist_add_boolean(tags, htag);
- fnvlist_add_nvlist(holds, za.za_name, tags);
- fnvlist_free(tags);
- } else {
- fnvlist_add_boolean(tags, htag);
- }
- }
- dsl_dataset_user_release_tmp(dp, holds);
- fnvlist_free(holds);
- zap_cursor_fini(&zc);
-}
-
-/*
- * Create the pool-wide zap object for storing temporary snapshot holds.
- */
-void
-dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- objset_t *mos = dp->dp_meta_objset;
-
- ASSERT(dp->dp_tmp_userrefs_obj == 0);
- ASSERT(dmu_tx_is_syncing(tx));
-
- dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
-}
-
-static int
-dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
- const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
-{
- objset_t *mos = dp->dp_meta_objset;
- uint64_t zapobj = dp->dp_tmp_userrefs_obj;
- char *name;
- int error;
-
- ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
- ASSERT(dmu_tx_is_syncing(tx));
-
- /*
- * If the pool was created prior to SPA_VERSION_USERREFS, the
- * zap object for temporary holds might not exist yet.
- */
- if (zapobj == 0) {
- if (holding) {
- dsl_pool_user_hold_create_obj(dp, tx);
- zapobj = dp->dp_tmp_userrefs_obj;
- } else {
- return (SET_ERROR(ENOENT));
- }
- }
-
- name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
- if (holding)
- error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
- else
- error = zap_remove(mos, zapobj, name, tx);
- strfree(name);
-
- return (error);
-}
-
-/*
- * Add a temporary hold for the given dataset object and tag.
- */
-int
-dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
- uint64_t now, dmu_tx_t *tx)
-{
- return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
-}
-
-/*
- * Release a temporary hold for the given dataset object and tag.
- */
-int
-dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
- dmu_tx_t *tx)
-{
- return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE));
-}
-
-/*
- * DSL Pool Configuration Lock
- *
- * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
- * creation / destruction / rename / property setting). It must be held for
- * read to hold a dataset or dsl_dir. I.e. you must call
- * dsl_pool_config_enter() or dsl_pool_hold() before calling
- * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
- * must be held continuously until all datasets and dsl_dirs are released.
- *
- * The only exception to this rule is that if a "long hold" is placed on
- * a dataset, then the dp_config_rwlock may be dropped while the dataset
- * is still held. The long hold will prevent the dataset from being
- * destroyed -- the destroy will fail with EBUSY. A long hold can be
- * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
- * (by calling dsl_{dataset,objset}_{try}own{_obj}).
- *
- * Legitimate long-holders (including owners) should be long-running, cancelable
- * tasks that should cause "zfs destroy" to fail. This includes DMU
- * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
- * "zfs send", and "zfs diff". There are several other long-holders whose
- * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
- *
- * The usual formula for long-holding would be:
- * dsl_pool_hold()
- * dsl_dataset_hold()
- * ... perform checks ...
- * dsl_dataset_long_hold()
- * dsl_pool_rele()
- * ... perform long-running task ...
- * dsl_dataset_long_rele()
- * dsl_dataset_rele()
- *
- * Note that when the long hold is released, the dataset is still held but
- * the pool is not held. The dataset may change arbitrarily during this time
- * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
- * dataset except release it.
- *
- * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
- * or modifying operations.
- *
- * Modifying operations should generally use dsl_sync_task(). The synctask
- * infrastructure enforces proper locking strategy with respect to the
- * dp_config_rwlock. See the comment above dsl_sync_task() for details.
- *
- * Read-only operations will manually hold the pool, then the dataset, obtain
- * information from the dataset, then release the pool and dataset.
- * dmu_objset_{hold,rele}() are convenience routines that also do the pool
- * hold/rele.
- */
-
-int
-dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(name, &spa, tag);
- if (error == 0) {
- *dp = spa_get_dsl(spa);
- dsl_pool_config_enter(*dp, tag);
- }
- return (error);
-}
-
-void
-dsl_pool_rele(dsl_pool_t *dp, void *tag)
-{
- dsl_pool_config_exit(dp, tag);
- spa_close(dp->dp_spa, tag);
-}
-
-void
-dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
-{
- /*
- * We use a "reentrant" reader-writer lock, but not reentrantly.
- *
- * The rrwlock can (with the track_all flag) track all reading threads,
- * which is very useful for debugging which code path failed to release
- * the lock, and for verifying that the *current* thread does hold
- * the lock.
- *
- * (Unlike a rwlock, which knows that N threads hold it for
- * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
- * if any thread holds it for read, even if this thread doesn't).
- */
- ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
- rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
-}
-
-void
-dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
-{
- ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
- rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
-}
-
-void
-dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
-{
- rrw_exit(&dp->dp_config_rwlock, tag);
-}
-
-boolean_t
-dsl_pool_config_held(dsl_pool_t *dp)
-{
- return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
-}
-
-boolean_t
-dsl_pool_config_held_writer(dsl_pool_t *dp)
-{
- return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -1,1211 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- * Copyright 2015, Joyent, Inc.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-
-#include "zfs_prop.h"
-
-#define ZPROP_INHERIT_SUFFIX "$inherit"
-#define ZPROP_RECVD_SUFFIX "$recvd"
-
-static int
-dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
-{
- /*
- * The setonce properties are read-only, BUT they still
- * have a default value that can be used as the initial
- * value.
- */
- if (prop == ZPROP_INVAL ||
- (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
- return (SET_ERROR(ENOENT));
-
- if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
- if (zfs_prop_default_string(prop) == NULL)
- return (SET_ERROR(ENOENT));
- if (intsz != 1)
- return (SET_ERROR(EOVERFLOW));
- (void) strncpy(buf, zfs_prop_default_string(prop),
- numints);
- } else {
- if (intsz != 8 || numints < 1)
- return (SET_ERROR(EOVERFLOW));
-
- *(uint64_t *)buf = zfs_prop_default_numeric(prop);
- }
-
- return (0);
-}
-
-int
-dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
- int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
-{
- int err = ENOENT;
- dsl_dir_t *target = dd;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- zfs_prop_t prop;
- boolean_t inheritable;
- boolean_t inheriting = B_FALSE;
- char *inheritstr;
- char *recvdstr;
-
- ASSERT(dsl_pool_config_held(dd->dd_pool));
-
- if (setpoint)
- setpoint[0] = '\0';
-
- prop = zfs_name_to_prop(propname);
- inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
- inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
- recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
-
- /*
- * Note: dd may become NULL, therefore we shouldn't dereference it
- * after this loop.
- */
- for (; dd != NULL; dd = dd->dd_parent) {
- if (dd != target || snapshot) {
- if (!inheritable)
- break;
- inheriting = B_TRUE;
- }
-
- /* Check for a local value. */
- err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
- propname, intsz, numints, buf);
- if (err != ENOENT) {
- if (setpoint != NULL && err == 0)
- dsl_dir_name(dd, setpoint);
- break;
- }
-
- /*
- * Skip the check for a received value if there is an explicit
- * inheritance entry.
- */
- err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
- inheritstr);
- if (err != 0 && err != ENOENT)
- break;
-
- if (err == ENOENT) {
- /* Check for a received value. */
- err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
- recvdstr, intsz, numints, buf);
- if (err != ENOENT) {
- if (setpoint != NULL && err == 0) {
- if (inheriting) {
- dsl_dir_name(dd, setpoint);
- } else {
- (void) strcpy(setpoint,
- ZPROP_SOURCE_VAL_RECVD);
- }
- }
- break;
- }
- }
-
- /*
- * If we found an explicit inheritance entry, err is zero even
- * though we haven't yet found the value, so reinitializing err
- * at the end of the loop (instead of at the beginning) ensures
- * that err has a valid post-loop value.
- */
- err = SET_ERROR(ENOENT);
- }
-
- if (err == ENOENT)
- err = dodefault(prop, intsz, numints, buf);
-
- strfree(inheritstr);
- strfree(recvdstr);
-
- return (err);
-}
-
-int
-dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
- int intsz, int numints, void *buf, char *setpoint)
-{
- zfs_prop_t prop = zfs_name_to_prop(propname);
- boolean_t inheritable;
- uint64_t zapobj;
-
- ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
- inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
- zapobj = dsl_dataset_phys(ds)->ds_props_obj;
-
- if (zapobj != 0) {
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- int err;
-
- ASSERT(ds->ds_is_snapshot);
-
- /* Check for a local value. */
- err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
- if (err != ENOENT) {
- if (setpoint != NULL && err == 0)
- dsl_dataset_name(ds, setpoint);
- return (err);
- }
-
- /*
- * Skip the check for a received value if there is an explicit
- * inheritance entry.
- */
- if (inheritable) {
- char *inheritstr = kmem_asprintf("%s%s", propname,
- ZPROP_INHERIT_SUFFIX);
- err = zap_contains(mos, zapobj, inheritstr);
- strfree(inheritstr);
- if (err != 0 && err != ENOENT)
- return (err);
- }
-
- if (err == ENOENT) {
- /* Check for a received value. */
- char *recvdstr = kmem_asprintf("%s%s", propname,
- ZPROP_RECVD_SUFFIX);
- err = zap_lookup(mos, zapobj, recvdstr,
- intsz, numints, buf);
- strfree(recvdstr);
- if (err != ENOENT) {
- if (setpoint != NULL && err == 0)
- (void) strcpy(setpoint,
- ZPROP_SOURCE_VAL_RECVD);
- return (err);
- }
- }
- }
-
- return (dsl_prop_get_dd(ds->ds_dir, propname,
- intsz, numints, buf, setpoint, ds->ds_is_snapshot));
-}
-
-static dsl_prop_record_t *
-dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
-{
- dsl_prop_record_t *pr = NULL;
-
- ASSERT(MUTEX_HELD(&dd->dd_lock));
-
- for (pr = list_head(&dd->dd_props);
- pr != NULL; pr = list_next(&dd->dd_props, pr)) {
- if (strcmp(pr->pr_propname, propname) == 0)
- break;
- }
-
- return (pr);
-}
-
-static dsl_prop_record_t *
-dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
-{
- dsl_prop_record_t *pr;
-
- ASSERT(MUTEX_HELD(&dd->dd_lock));
-
- pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
- pr->pr_propname = spa_strdup(propname);
- list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
- offsetof(dsl_prop_cb_record_t, cbr_pr_node));
- list_insert_head(&dd->dd_props, pr);
-
- return (pr);
-}
-
-void
-dsl_prop_init(dsl_dir_t *dd)
-{
- list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
- offsetof(dsl_prop_record_t, pr_node));
-}
-
-void
-dsl_prop_fini(dsl_dir_t *dd)
-{
- dsl_prop_record_t *pr;
-
- while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
- list_destroy(&pr->pr_cbs);
- strfree((char *)pr->pr_propname);
- kmem_free(pr, sizeof (dsl_prop_record_t));
- }
- list_destroy(&dd->dd_props);
-}
-
-/*
- * Register interest in the named property. We'll call the callback
- * once to notify it of the current property value, and again each time
- * the property changes, until this callback is unregistered.
- *
- * Return 0 on success, errno if the prop is not an integer value.
- */
-int
-dsl_prop_register(dsl_dataset_t *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg)
-{
- dsl_dir_t *dd = ds->ds_dir;
- dsl_pool_t *dp = dd->dd_pool;
- uint64_t value;
- dsl_prop_record_t *pr;
- dsl_prop_cb_record_t *cbr;
- int err;
-
- ASSERT(dsl_pool_config_held(dp));
-
- err = dsl_prop_get_int_ds(ds, propname, &value);
- if (err != 0)
- return (err);
-
- cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
- cbr->cbr_ds = ds;
- cbr->cbr_func = callback;
- cbr->cbr_arg = cbarg;
-
- mutex_enter(&dd->dd_lock);
- pr = dsl_prop_record_find(dd, propname);
- if (pr == NULL)
- pr = dsl_prop_record_create(dd, propname);
- cbr->cbr_pr = pr;
- list_insert_head(&pr->pr_cbs, cbr);
- list_insert_head(&ds->ds_prop_cbs, cbr);
- mutex_exit(&dd->dd_lock);
-
- cbr->cbr_func(cbr->cbr_arg, value);
- return (0);
-}
-
-int
-dsl_prop_get(const char *dsname, const char *propname,
- int intsz, int numints, void *buf, char *setpoint)
-{
- objset_t *os;
- int error;
-
- error = dmu_objset_hold(dsname, FTAG, &os);
- if (error != 0)
- return (error);
-
- error = dsl_prop_get_ds(dmu_objset_ds(os), propname,
- intsz, numints, buf, setpoint);
-
- dmu_objset_rele(os, FTAG);
- return (error);
-}
-
-/*
- * Get the current property value. It may have changed by the time this
- * function returns, so it is NOT safe to follow up with
- * dsl_prop_register() and assume that the value has not changed in
- * between.
- *
- * Return 0 on success, ENOENT if ddname is invalid.
- */
-int
-dsl_prop_get_integer(const char *ddname, const char *propname,
- uint64_t *valuep, char *setpoint)
-{
- return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
-}
-
-int
-dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname,
- uint64_t *valuep)
-{
- return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL));
-}
-
-/*
- * Predict the effective value of the given special property if it were set with
- * the given value and source. This is not a general purpose function. It exists
- * only to handle the special requirements of the quota and reservation
- * properties. The fact that these properties are non-inheritable greatly
- * simplifies the prediction logic.
- *
- * Returns 0 on success, a positive error code on failure, or -1 if called with
- * a property not handled by this function.
- */
-int
-dsl_prop_predict(dsl_dir_t *dd, const char *propname,
- zprop_source_t source, uint64_t value, uint64_t *newvalp)
-{
- zfs_prop_t prop = zfs_name_to_prop(propname);
- objset_t *mos;
- uint64_t zapobj;
- uint64_t version;
- char *recvdstr;
- int err = 0;
-
- switch (prop) {
- case ZFS_PROP_QUOTA:
- case ZFS_PROP_RESERVATION:
- case ZFS_PROP_REFQUOTA:
- case ZFS_PROP_REFRESERVATION:
- break;
- default:
- return (-1);
- }
-
- mos = dd->dd_pool->dp_meta_objset;
- zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
- recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
-
- version = spa_version(dd->dd_pool->dp_spa);
- if (version < SPA_VERSION_RECVD_PROPS) {
- if (source & ZPROP_SRC_NONE)
- source = ZPROP_SRC_NONE;
- else if (source & ZPROP_SRC_RECEIVED)
- source = ZPROP_SRC_LOCAL;
- }
-
- switch (source) {
- case ZPROP_SRC_NONE:
- /* Revert to the received value, if any. */
- err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp);
- if (err == ENOENT)
- *newvalp = 0;
- break;
- case ZPROP_SRC_LOCAL:
- *newvalp = value;
- break;
- case ZPROP_SRC_RECEIVED:
- /*
- * If there's no local setting, then the new received value will
- * be the effective value.
- */
- err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
- if (err == ENOENT)
- *newvalp = value;
- break;
- case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
- /*
- * We're clearing the received value, so the local setting (if
- * it exists) remains the effective value.
- */
- err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
- if (err == ENOENT)
- *newvalp = 0;
- break;
- default:
- panic("unexpected property source: %d", source);
- }
-
- strfree(recvdstr);
-
- if (err == ENOENT)
- return (0);
-
- return (err);
-}
-
-/*
- * Unregister all callbacks that are registered with the
- * given callback argument.
- */
-void
-dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
-{
- dsl_prop_cb_record_t *cbr, *next_cbr;
-
- dsl_dir_t *dd = ds->ds_dir;
-
- mutex_enter(&dd->dd_lock);
- next_cbr = list_head(&ds->ds_prop_cbs);
- while (next_cbr != NULL) {
- cbr = next_cbr;
- next_cbr = list_next(&ds->ds_prop_cbs, cbr);
- if (cbr->cbr_arg == cbarg) {
- list_remove(&ds->ds_prop_cbs, cbr);
- list_remove(&cbr->cbr_pr->pr_cbs, cbr);
- kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
- }
- }
- mutex_exit(&dd->dd_lock);
-}
-
-boolean_t
-dsl_prop_hascb(dsl_dataset_t *ds)
-{
- return (!list_is_empty(&ds->ds_prop_cbs));
-}
-
-/* ARGSUSED */
-static int
-dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
-{
- dsl_dir_t *dd = ds->ds_dir;
- dsl_prop_record_t *pr;
- dsl_prop_cb_record_t *cbr;
-
- mutex_enter(&dd->dd_lock);
- for (pr = list_head(&dd->dd_props);
- pr; pr = list_next(&dd->dd_props, pr)) {
- for (cbr = list_head(&pr->pr_cbs); cbr;
- cbr = list_next(&pr->pr_cbs, cbr)) {
- uint64_t value;
-
- /*
- * Callback entries do not have holds on their
- * datasets so that datasets with registered
- * callbacks are still eligible for eviction.
- * Unlike operations to update properties on a
- * single dataset, we are performing a recursive
- * descent of related head datasets. The caller
- * of this function only has a dataset hold on
- * the passed in head dataset, not the snapshots
- * associated with this dataset. Without a hold,
- * the dataset pointer within callback records
- * for snapshots can be invalidated by eviction
- * at any time.
- *
- * Use dsl_dataset_try_add_ref() to verify
- * that the dataset for a snapshot has not
- * begun eviction processing and to prevent
- * eviction from occurring for the duration of
- * the callback. If the hold attempt fails,
- * this object is already being evicted and the
- * callback can be safely ignored.
- */
- if (ds != cbr->cbr_ds &&
- !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
- continue;
-
- if (dsl_prop_get_ds(cbr->cbr_ds,
- cbr->cbr_pr->pr_propname, sizeof (value), 1,
- &value, NULL) == 0)
- cbr->cbr_func(cbr->cbr_arg, value);
-
- if (ds != cbr->cbr_ds)
- dsl_dataset_rele(cbr->cbr_ds, FTAG);
- }
- }
- mutex_exit(&dd->dd_lock);
-
- return (0);
-}
-
-/*
- * Update all property values for ddobj & its descendants. This is used
- * when renaming the dir.
- */
-void
-dsl_prop_notify_all(dsl_dir_t *dd)
-{
- dsl_pool_t *dp = dd->dd_pool;
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
- (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb,
- NULL, DS_FIND_CHILDREN);
-}
-
-static void
-dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
- const char *propname, uint64_t value, int first)
-{
- dsl_dir_t *dd;
- dsl_prop_record_t *pr;
- dsl_prop_cb_record_t *cbr;
- objset_t *mos = dp->dp_meta_objset;
- zap_cursor_t zc;
- zap_attribute_t *za;
- int err;
-
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
- err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
- if (err)
- return;
-
- if (!first) {
- /*
- * If the prop is set here, then this change is not
- * being inherited here or below; stop the recursion.
- */
- err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
- propname);
- if (err == 0) {
- dsl_dir_rele(dd, FTAG);
- return;
- }
- ASSERT3U(err, ==, ENOENT);
- }
-
- mutex_enter(&dd->dd_lock);
- pr = dsl_prop_record_find(dd, propname);
- if (pr != NULL) {
- for (cbr = list_head(&pr->pr_cbs); cbr;
- cbr = list_next(&pr->pr_cbs, cbr)) {
- uint64_t propobj;
-
- /*
- * cbr->cbr_ds may be invalidated due to eviction,
- * requiring the use of dsl_dataset_try_add_ref().
- * See comment block in dsl_prop_notify_all_cb()
- * for details.
- */
- if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
- continue;
-
- propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
-
- /*
- * If the property is not set on this ds, then it is
- * inherited here; call the callback.
- */
- if (propobj == 0 ||
- zap_contains(mos, propobj, propname) != 0)
- cbr->cbr_func(cbr->cbr_arg, value);
-
- dsl_dataset_rele(cbr->cbr_ds, FTAG);
- }
- }
- mutex_exit(&dd->dd_lock);
-
- za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
- for (zap_cursor_init(&zc, mos,
- dsl_dir_phys(dd)->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, za) == 0;
- zap_cursor_advance(&zc)) {
- dsl_prop_changed_notify(dp, za->za_first_integer,
- propname, value, FALSE);
- }
- kmem_free(za, sizeof (zap_attribute_t));
- zap_cursor_fini(&zc);
- dsl_dir_rele(dd, FTAG);
-}
-
-void
-dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
- zprop_source_t source, int intsz, int numints, const void *value,
- dmu_tx_t *tx)
-{
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t zapobj, intval, dummy;
- int isint;
- char valbuf[32];
- const char *valstr = NULL;
- char *inheritstr;
- char *recvdstr;
- char *tbuf = NULL;
- int err;
- uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-
- isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0);
-
- if (ds->ds_is_snapshot) {
- ASSERT(version >= SPA_VERSION_SNAP_PROPS);
- if (dsl_dataset_phys(ds)->ds_props_obj == 0) {
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_dataset_phys(ds)->ds_props_obj =
- zap_create(mos,
- DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- }
- zapobj = dsl_dataset_phys(ds)->ds_props_obj;
- } else {
- zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
- }
-
- if (version < SPA_VERSION_RECVD_PROPS) {
- if (source & ZPROP_SRC_NONE)
- source = ZPROP_SRC_NONE;
- else if (source & ZPROP_SRC_RECEIVED)
- source = ZPROP_SRC_LOCAL;
- }
-
- inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
- recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
-
- switch (source) {
- case ZPROP_SRC_NONE:
- /*
- * revert to received value, if any (inherit -S)
- * - remove propname
- * - remove propname$inherit
- */
- err = zap_remove(mos, zapobj, propname, tx);
- ASSERT(err == 0 || err == ENOENT);
- err = zap_remove(mos, zapobj, inheritstr, tx);
- ASSERT(err == 0 || err == ENOENT);
- break;
- case ZPROP_SRC_LOCAL:
- /*
- * remove propname$inherit
- * set propname -> value
- */
- err = zap_remove(mos, zapobj, inheritstr, tx);
- ASSERT(err == 0 || err == ENOENT);
- VERIFY0(zap_update(mos, zapobj, propname,
- intsz, numints, value, tx));
- break;
- case ZPROP_SRC_INHERITED:
- /*
- * explicitly inherit
- * - remove propname
- * - set propname$inherit
- */
- err = zap_remove(mos, zapobj, propname, tx);
- ASSERT(err == 0 || err == ENOENT);
- if (version >= SPA_VERSION_RECVD_PROPS &&
- dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
- dummy = 0;
- VERIFY0(zap_update(mos, zapobj, inheritstr,
- 8, 1, &dummy, tx));
- }
- break;
- case ZPROP_SRC_RECEIVED:
- /*
- * set propname$recvd -> value
- */
- err = zap_update(mos, zapobj, recvdstr,
- intsz, numints, value, tx);
- ASSERT(err == 0);
- break;
- case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
- /*
- * clear local and received settings
- * - remove propname
- * - remove propname$inherit
- * - remove propname$recvd
- */
- err = zap_remove(mos, zapobj, propname, tx);
- ASSERT(err == 0 || err == ENOENT);
- err = zap_remove(mos, zapobj, inheritstr, tx);
- ASSERT(err == 0 || err == ENOENT);
- /* FALLTHRU */
- case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
- /*
- * remove propname$recvd
- */
- err = zap_remove(mos, zapobj, recvdstr, tx);
- ASSERT(err == 0 || err == ENOENT);
- break;
- default:
- cmn_err(CE_PANIC, "unexpected property source: %d", source);
- }
-
- strfree(inheritstr);
- strfree(recvdstr);
-
- if (isint) {
- VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
-
- if (ds->ds_is_snapshot) {
- dsl_prop_cb_record_t *cbr;
- /*
- * It's a snapshot; nothing can inherit this
- * property, so just look for callbacks on this
- * ds here.
- */
- mutex_enter(&ds->ds_dir->dd_lock);
- for (cbr = list_head(&ds->ds_prop_cbs); cbr;
- cbr = list_next(&ds->ds_prop_cbs, cbr)) {
- if (strcmp(cbr->cbr_pr->pr_propname,
- propname) == 0)
- cbr->cbr_func(cbr->cbr_arg, intval);
- }
- mutex_exit(&ds->ds_dir->dd_lock);
- } else {
- dsl_prop_changed_notify(ds->ds_dir->dd_pool,
- ds->ds_dir->dd_object, propname, intval, TRUE);
- }
-
- (void) snprintf(valbuf, sizeof (valbuf),
- "%lld", (longlong_t)intval);
- valstr = valbuf;
- } else {
- if (source == ZPROP_SRC_LOCAL) {
- valstr = value;
- } else {
- tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
- if (dsl_prop_get_ds(ds, propname, 1,
- ZAP_MAXVALUELEN, tbuf, NULL) == 0)
- valstr = tbuf;
- }
- }
-
- spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE ||
- source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx,
- "%s=%s", propname, (valstr == NULL ? "" : valstr));
-
- if (tbuf != NULL)
- kmem_free(tbuf, ZAP_MAXVALUELEN);
-}
-
-int
-dsl_prop_set_int(const char *dsname, const char *propname,
- zprop_source_t source, uint64_t value)
-{
- nvlist_t *nvl = fnvlist_alloc();
- int error;
-
- fnvlist_add_uint64(nvl, propname, value);
- error = dsl_props_set(dsname, source, nvl);
- fnvlist_free(nvl);
- return (error);
-}
-
-int
-dsl_prop_set_string(const char *dsname, const char *propname,
- zprop_source_t source, const char *value)
-{
- nvlist_t *nvl = fnvlist_alloc();
- int error;
-
- fnvlist_add_string(nvl, propname, value);
- error = dsl_props_set(dsname, source, nvl);
- fnvlist_free(nvl);
- return (error);
-}
-
-int
-dsl_prop_inherit(const char *dsname, const char *propname,
- zprop_source_t source)
-{
- nvlist_t *nvl = fnvlist_alloc();
- int error;
-
- fnvlist_add_boolean(nvl, propname);
- error = dsl_props_set(dsname, source, nvl);
- fnvlist_free(nvl);
- return (error);
-}
-
-typedef struct dsl_props_set_arg {
- const char *dpsa_dsname;
- zprop_source_t dpsa_source;
- nvlist_t *dpsa_props;
-} dsl_props_set_arg_t;
-
-static int
-dsl_props_set_check(void *arg, dmu_tx_t *tx)
-{
- dsl_props_set_arg_t *dpsa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
- uint64_t version;
- nvpair_t *elem = NULL;
- int err;
-
- err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds);
- if (err != 0)
- return (err);
-
- version = spa_version(ds->ds_dir->dd_pool->dp_spa);
- while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) {
- if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENAMETOOLONG));
- }
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- char *valstr = fnvpair_value_string(elem);
- if (strlen(valstr) >= (version <
- SPA_VERSION_STMF_PROP ?
- ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
- dsl_dataset_rele(ds, FTAG);
- return (E2BIG);
- }
- }
- }
-
- if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
- dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-void
-dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source,
- nvlist_t *props, dmu_tx_t *tx)
-{
- nvpair_t *elem = NULL;
-
- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
- nvpair_t *pair = elem;
- const char *name = nvpair_name(pair);
-
- if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
- /*
- * This usually happens when we reuse the nvlist_t data
- * returned by the counterpart dsl_prop_get_all_impl().
- * For instance we do this to restore the original
- * received properties when an error occurs in the
- * zfs_ioc_recv() codepath.
- */
- nvlist_t *attrs = fnvpair_value_nvlist(pair);
- pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
- }
-
- if (nvpair_type(pair) == DATA_TYPE_STRING) {
- const char *value = fnvpair_value_string(pair);
- dsl_prop_set_sync_impl(ds, name,
- source, 1, strlen(value) + 1, value, tx);
- } else if (nvpair_type(pair) == DATA_TYPE_UINT64) {
- uint64_t intval = fnvpair_value_uint64(pair);
- dsl_prop_set_sync_impl(ds, name,
- source, sizeof (intval), 1, &intval, tx);
- } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) {
- dsl_prop_set_sync_impl(ds, name,
- source, 0, 0, NULL, tx);
- } else {
- panic("invalid nvpair type");
- }
- }
-}
-
-static void
-dsl_props_set_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_props_set_arg_t *dpsa = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds));
- dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx);
- dsl_dataset_rele(ds, FTAG);
-}
-
-/*
- * All-or-nothing; if any prop can't be set, nothing will be modified.
- */
-int
-dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
-{
- dsl_props_set_arg_t dpsa;
- int nblks = 0;
-
- dpsa.dpsa_dsname = dsname;
- dpsa.dpsa_source = source;
- dpsa.dpsa_props = props;
-
- /*
- * If the source includes NONE, then we will only be removing entries
- * from the ZAP object. In that case don't check for ENOSPC.
- */
- if ((source & ZPROP_SRC_NONE) == 0)
- nblks = 2 * fnvlist_num_pairs(props);
-
- return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
- &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
-}
-
-typedef enum dsl_prop_getflags {
- DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
- DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
- DSL_PROP_GET_LOCAL = 0x4, /* local properties */
- DSL_PROP_GET_RECEIVED = 0x8 /* received properties */
-} dsl_prop_getflags_t;
-
-static int
-dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
- const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- int err = 0;
-
- for (zap_cursor_init(&zc, mos, propobj);
- (err = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- nvlist_t *propval;
- zfs_prop_t prop;
- char buf[ZAP_MAXNAMELEN];
- char *valstr;
- const char *suffix;
- const char *propname;
- const char *source;
-
- suffix = strchr(za.za_name, '$');
-
- if (suffix == NULL) {
- /*
- * Skip local properties if we only want received
- * properties.
- */
- if (flags & DSL_PROP_GET_RECEIVED)
- continue;
-
- propname = za.za_name;
- source = setpoint;
- } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
- /* Skip explicitly inherited entries. */
- continue;
- } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
- if (flags & DSL_PROP_GET_LOCAL)
- continue;
-
- (void) strncpy(buf, za.za_name, (suffix - za.za_name));
- buf[suffix - za.za_name] = '\0';
- propname = buf;
-
- if (!(flags & DSL_PROP_GET_RECEIVED)) {
- /* Skip if locally overridden. */
- err = zap_contains(mos, propobj, propname);
- if (err == 0)
- continue;
- if (err != ENOENT)
- break;
-
- /* Skip if explicitly inherited. */
- valstr = kmem_asprintf("%s%s", propname,
- ZPROP_INHERIT_SUFFIX);
- err = zap_contains(mos, propobj, valstr);
- strfree(valstr);
- if (err == 0)
- continue;
- if (err != ENOENT)
- break;
- }
-
- source = ((flags & DSL_PROP_GET_INHERITING) ?
- setpoint : ZPROP_SOURCE_VAL_RECVD);
- } else {
- /*
- * For backward compatibility, skip suffixes we don't
- * recognize.
- */
- continue;
- }
-
- prop = zfs_name_to_prop(propname);
-
- /* Skip non-inheritable properties. */
- if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
- !zfs_prop_inheritable(prop))
- continue;
-
- /* Skip properties not valid for this type. */
- if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
- !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
- continue;
-
- /* Skip properties already defined. */
- if (nvlist_exists(nv, propname))
- continue;
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (za.za_integer_length == 1) {
- /*
- * String property
- */
- char *tmp = kmem_alloc(za.za_num_integers,
- KM_SLEEP);
- err = zap_lookup(mos, propobj,
- za.za_name, 1, za.za_num_integers, tmp);
- if (err != 0) {
- kmem_free(tmp, za.za_num_integers);
- break;
- }
- VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
- tmp) == 0);
- kmem_free(tmp, za.za_num_integers);
- } else {
- /*
- * Integer property
- */
- ASSERT(za.za_integer_length == 8);
- (void) nvlist_add_uint64(propval, ZPROP_VALUE,
- za.za_first_integer);
- }
-
- VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
- VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
- nvlist_free(propval);
- }
- zap_cursor_fini(&zc);
- if (err == ENOENT)
- err = 0;
- return (err);
-}
-
-/*
- * Iterate over all properties for this dataset and return them in an nvlist.
- */
-static int
-dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
- dsl_prop_getflags_t flags)
-{
- dsl_dir_t *dd = ds->ds_dir;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- int err = 0;
- char setpoint[ZFS_MAX_DATASET_NAME_LEN];
-
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- if (ds->ds_is_snapshot)
- flags |= DSL_PROP_GET_SNAPSHOT;
-
- ASSERT(dsl_pool_config_held(dp));
-
- if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
- ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
- dsl_dataset_name(ds, setpoint);
- err = dsl_prop_get_all_impl(mos,
- dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
- if (err)
- goto out;
- }
-
- for (; dd != NULL; dd = dd->dd_parent) {
- if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
- if (flags & (DSL_PROP_GET_LOCAL |
- DSL_PROP_GET_RECEIVED))
- break;
- flags |= DSL_PROP_GET_INHERITING;
- }
- dsl_dir_name(dd, setpoint);
- err = dsl_prop_get_all_impl(mos,
- dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
- if (err)
- break;
- }
-out:
- return (err);
-}
-
-boolean_t
-dsl_prop_get_hasrecvd(const char *dsname)
-{
- uint64_t dummy;
-
- return (0 ==
- dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL));
-}
-
-static int
-dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source)
-{
- uint64_t version;
- spa_t *spa;
- int error = 0;
-
- VERIFY0(spa_open(dsname, &spa, FTAG));
- version = spa_version(spa);
- spa_close(spa, FTAG);
-
- if (version >= SPA_VERSION_RECVD_PROPS)
- error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0);
- return (error);
-}
-
-/*
- * Call after successfully receiving properties to ensure that only the first
- * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
- */
-int
-dsl_prop_set_hasrecvd(const char *dsname)
-{
- int error = 0;
- if (!dsl_prop_get_hasrecvd(dsname))
- error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL);
- return (error);
-}
-
-void
-dsl_prop_unset_hasrecvd(const char *dsname)
-{
- VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE));
-}
-
-int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
-{
- return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0));
-}
-
-int
-dsl_prop_get_received(const char *dsname, nvlist_t **nvp)
-{
- objset_t *os;
- int error;
-
- /*
- * Received properties are not distinguishable from local properties
- * until the dataset has received properties on or after
- * SPA_VERSION_RECVD_PROPS.
- */
- dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ?
- DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
-
- error = dmu_objset_hold(dsname, FTAG, &os);
- if (error != 0)
- return (error);
- error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags);
- dmu_objset_rele(os, FTAG);
- return (error);
-}
-
-void
-dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
-{
- nvlist_t *propval;
- const char *propname = zfs_prop_to_name(prop);
- uint64_t default_value;
-
- if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
- VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
- return;
- }
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
- /* Indicate the default source if we can. */
- if (dodefault(prop, 8, 1, &default_value) == 0 &&
- value == default_value) {
- VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
- }
- VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
- nvlist_free(propval);
-}
-
-void
-dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
-{
- nvlist_t *propval;
- const char *propname = zfs_prop_to_name(prop);
-
- if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
- VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
- return;
- }
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
- VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
- nvlist_free(propval);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -1,4001 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2016 Gary Mills
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright 2017 Joyent, Inc.
- * Copyright (c) 2017 Datto Inc.
- */
-
-#include <sys/dsl_scan.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zil_impl.h>
-#include <sys/zio_checksum.h>
-#include <sys/ddt.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/zfeature.h>
-#include <sys/abd.h>
-#include <sys/range_tree.h>
-#ifdef _KERNEL
-#include <sys/zfs_vfsops.h>
-#endif
-
-/*
- * Grand theory statement on scan queue sorting
- *
- * Scanning is implemented by recursively traversing all indirection levels
- * in an object and reading all blocks referenced from said objects. This
- * results in us approximately traversing the object from lowest logical
- * offset to the highest. For best performance, we would want the logical
- * blocks to be physically contiguous. However, this is frequently not the
- * case with pools given the allocation patterns of copy-on-write filesystems.
- * So instead, we put the I/Os into a reordering queue and issue them in a
- * way that will most benefit physical disks (LBA-order).
- *
- * Queue management:
- *
- * Ideally, we would want to scan all metadata and queue up all block I/O
- * prior to starting to issue it, because that allows us to do an optimal
- * sorting job. This can however consume large amounts of memory. Therefore
- * we continuously monitor the size of the queues and constrain them to 5%
- * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
- * limit, we clear out a few of the largest extents at the head of the queues
- * to make room for more scanning. Hopefully, these extents will be fairly
- * large and contiguous, allowing us to approach sequential I/O throughput
- * even without a fully sorted tree.
- *
- * Metadata scanning takes place in dsl_scan_visit(), which is called from
- * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
- * metadata on the pool, or we need to make room in memory because our
- * queues are too large, dsl_scan_visit() is postponed and
- * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
- * that metadata scanning and queued I/O issuing are mutually exclusive. This
- * allows us to provide maximum sequential I/O throughput for the majority of
- * I/O's issued since sequential I/O performance is significantly negatively
- * impacted if it is interleaved with random I/O.
- *
- * Implementation Notes
- *
- * One side effect of the queued scanning algorithm is that the scanning code
- * needs to be notified whenever a block is freed. This is needed to allow
- * the scanning code to remove these I/Os from the issuing queue. Additionally,
- * we do not attempt to queue gang blocks to be issued sequentially since this
- * is very hard to do and would have an extremely limitted performance benefit.
- * Instead, we simply issue gang I/Os as soon as we find them using the legacy
- * algorithm.
- *
- * Backwards compatibility
- *
- * This new algorithm is backwards compatible with the legacy on-disk data
- * structures (and therefore does not require a new feature flag).
- * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
- * will stop scanning metadata (in logical order) and wait for all outstanding
- * sorted I/O to complete. Once this is done, we write out a checkpoint
- * bookmark, indicating that we have scanned everything logically before it.
- * If the pool is imported on a machine without the new sorting algorithm,
- * the scan simply resumes from the last checkpoint using the legacy algorithm.
- */
-
-typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
- const zbookmark_phys_t *);
-
-static scan_cb_t dsl_scan_scrub_cb;
-
-static int scan_ds_queue_compare(const void *a, const void *b);
-static int scan_prefetch_queue_compare(const void *a, const void *b);
-static void scan_ds_queue_clear(dsl_scan_t *scn);
-static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
- uint64_t *txg);
-static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
-static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
-static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_leaves(vdev_t *vd);
-
-extern int zfs_vdev_async_write_active_min_dirty_percent;
-
-/*
- * By default zfs will check to ensure it is not over the hard memory
- * limit before each txg. If finer-grained control of this is needed
- * this value can be set to 1 to enable checking before scanning each
- * block.
- */
-int zfs_scan_strict_mem_lim = B_FALSE;
-
-unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */
-unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */
-unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */
-
-/*
- * Maximum number of parallelly executed bytes per leaf vdev. We attempt
- * to strike a balance here between keeping the vdev queues full of I/Os
- * at all times and not overflowing the queues to cause long latency,
- * which would cause long txg sync times. No matter what, we will not
- * overload the drives with I/O, since that is protected by
- * zfs_vdev_scrub_max_active.
- */
-unsigned long zfs_scan_vdev_limit = 4 << 20;
-
-int zfs_scan_issue_strategy = 0;
-int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
-uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
-
-unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */
-#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval)
-
-/*
- * fill_weight is non-tunable at runtime, so we copy it at module init from
- * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
- * break queue sorting.
- */
-uint64_t zfs_scan_fill_weight = 3;
-static uint64_t fill_weight;
-
-/* See dsl_scan_should_clear() for details on the memory limit tunables */
-uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
-uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
-int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */
-int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */
-
-unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
-unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
-unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
-unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN,
- &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN,
- &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
- &zfs_scan_idle, 0, "Idle scan window in clock ticks");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
- &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
- &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
- &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN,
- &zfs_no_scrub_io, 0, "Disable scrub I/O");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
- &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN,
- &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN,
- &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval");
-
-enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
-/* max number of blocks to free in a single TXG */
-uint64_t zfs_async_block_max_blocks = UINT64_MAX;
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
- &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG");
-
-/*
- * We wait a few txgs after importing a pool to begin scanning so that
- * the import / mounting code isn't held up by scrub / resilver IO.
- * Unfortunately, it is a bit difficult to determine exactly how long
- * this will take since userspace will trigger fs mounts asynchronously
- * and the kernel will create zvol minors asynchronously. As a result,
- * the value provided here is a bit arbitrary, but represents a
- * reasonable estimate of how many txgs it will take to finish fully
- * importing a pool
- */
-#define SCAN_IMPORT_WAIT_TXGS 5
-
-
-#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
- ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
- (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
-
-extern int zfs_txg_timeout;
-
-/*
- * Enable/disable the processing of the free_bpobj object.
- */
-boolean_t zfs_free_bpobj_enabled = B_TRUE;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
- &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
-
-/* the order has to match pool_scan_type */
-static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
- NULL,
- dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
- dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
-};
-
-/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
-typedef struct {
- uint64_t sds_dsobj;
- uint64_t sds_txg;
- avl_node_t sds_node;
-} scan_ds_t;
-
-/*
- * This controls what conditions are placed on dsl_scan_sync_state():
- * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
- * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
- * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
- * write out the scn_phys_cached version.
- * See dsl_scan_sync_state for details.
- */
-typedef enum {
- SYNC_OPTIONAL,
- SYNC_MANDATORY,
- SYNC_CACHED
-} state_sync_type_t;
-
-/*
- * This struct represents the minimum information needed to reconstruct a
- * zio for sequential scanning. This is useful because many of these will
- * accumulate in the sequential IO queues before being issued, so saving
- * memory matters here.
- */
-typedef struct scan_io {
- /* fields from blkptr_t */
- uint64_t sio_offset;
- uint64_t sio_blk_prop;
- uint64_t sio_phys_birth;
- uint64_t sio_birth;
- zio_cksum_t sio_cksum;
- uint32_t sio_asize;
-
- /* fields from zio_t */
- int sio_flags;
- zbookmark_phys_t sio_zb;
-
- /* members for queue sorting */
- union {
- avl_node_t sio_addr_node; /* link into issueing queue */
- list_node_t sio_list_node; /* link for issuing to disk */
- } sio_nodes;
-} scan_io_t;
-
-struct dsl_scan_io_queue {
- dsl_scan_t *q_scn; /* associated dsl_scan_t */
- vdev_t *q_vd; /* top-level vdev that this queue represents */
-
- /* trees used for sorting I/Os and extents of I/Os */
- range_tree_t *q_exts_by_addr;
- avl_tree_t q_exts_by_size;
- avl_tree_t q_sios_by_addr;
-
- /* members for zio rate limiting */
- uint64_t q_maxinflight_bytes;
- uint64_t q_inflight_bytes;
- kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
-
- /* per txg statistics */
- uint64_t q_total_seg_size_this_txg;
- uint64_t q_segs_this_txg;
- uint64_t q_total_zio_size_this_txg;
- uint64_t q_zios_this_txg;
-};
-
-/* private data for dsl_scan_prefetch_cb() */
-typedef struct scan_prefetch_ctx {
- zfs_refcount_t spc_refcnt; /* refcount for memory management */
- dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
- boolean_t spc_root; /* is this prefetch for an objset? */
- uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
- uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */
-} scan_prefetch_ctx_t;
-
-/* private data for dsl_scan_prefetch() */
-typedef struct scan_prefetch_issue_ctx {
- avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */
- scan_prefetch_ctx_t *spic_spc; /* spc for the callback */
- blkptr_t spic_bp; /* bp to prefetch */
- zbookmark_phys_t spic_zb; /* bookmark to prefetch */
-} scan_prefetch_issue_ctx_t;
-
-static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
- const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
-static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
- scan_io_t *sio);
-
-static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
-static void scan_io_queues_destroy(dsl_scan_t *scn);
-
-static kmem_cache_t *sio_cache;
-
-void
-scan_init(void)
-{
- /*
- * This is used in ext_size_compare() to weight segments
- * based on how sparse they are. This cannot be changed
- * mid-scan and the tree comparison functions don't currently
- * have a mechansim for passing additional context to the
- * compare functions. Thus we store this value globally and
- * we only allow it to be set at module intiailization time
- */
- fill_weight = zfs_scan_fill_weight;
-
- sio_cache = kmem_cache_create("sio_cache",
- sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-scan_fini(void)
-{
- kmem_cache_destroy(sio_cache);
-}
-
-static inline boolean_t
-dsl_scan_is_running(const dsl_scan_t *scn)
-{
- return (scn->scn_phys.scn_state == DSS_SCANNING);
-}
-
-boolean_t
-dsl_scan_resilvering(dsl_pool_t *dp)
-{
- return (dsl_scan_is_running(dp->dp_scan) &&
- dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
-}
-
-static inline void
-sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
-{
- bzero(bp, sizeof (*bp));
- DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
- DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
- DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
- bp->blk_prop = sio->sio_blk_prop;
- bp->blk_phys_birth = sio->sio_phys_birth;
- bp->blk_birth = sio->sio_birth;
- bp->blk_fill = 1; /* we always only work with data pointers */
- bp->blk_cksum = sio->sio_cksum;
-}
-
-static inline void
-bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
-{
- /* we discard the vdev id, since we can deduce it from the queue */
- sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
- sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
- sio->sio_blk_prop = bp->blk_prop;
- sio->sio_phys_birth = bp->blk_phys_birth;
- sio->sio_birth = bp->blk_birth;
- sio->sio_cksum = bp->blk_cksum;
-}
-
-void
-dsl_scan_global_init(void)
-{
- /*
- * This is used in ext_size_compare() to weight segments
- * based on how sparse they are. This cannot be changed
- * mid-scan and the tree comparison functions don't currently
- * have a mechansim for passing additional context to the
- * compare functions. Thus we store this value globally and
- * we only allow it to be set at module intiailization time
- */
- fill_weight = zfs_scan_fill_weight;
-}
-
-int
-dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
-{
- int err;
- dsl_scan_t *scn;
- spa_t *spa = dp->dp_spa;
- uint64_t f;
-
- scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
- scn->scn_dp = dp;
-
- /*
- * It's possible that we're resuming a scan after a reboot so
- * make sure that the scan_async_destroying flag is initialized
- * appropriately.
- */
- ASSERT(!scn->scn_async_destroying);
- scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
- SPA_FEATURE_ASYNC_DESTROY);
-
- /*
- * Calculate the max number of in-flight bytes for pool-wide
- * scanning operations (minimum 1MB). Limits for the issuing
- * phase are done per top-level vdev and are handled separately.
- */
- scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
- dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
-
- avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
- offsetof(scan_ds_t, sds_node));
- avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
- sizeof (scan_prefetch_issue_ctx_t),
- offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
-
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- "scrub_func", sizeof (uint64_t), 1, &f);
- if (err == 0) {
- /*
- * There was an old-style scrub in progress. Restart a
- * new-style scrub from the beginning.
- */
- scn->scn_restart_txg = txg;
- zfs_dbgmsg("old-style scrub was in progress; "
- "restarting new-style scrub in txg %llu",
- (longlong_t)scn->scn_restart_txg);
-
- /*
- * Load the queue obj from the old location so that it
- * can be freed by dsl_scan_done().
- */
- (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- "scrub_queue", sizeof (uint64_t), 1,
- &scn->scn_phys.scn_queue_obj);
- } else {
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
- &scn->scn_phys);
- if (err == ENOENT)
- return (0);
- else if (err)
- return (err);
-
- /*
- * We might be restarting after a reboot, so jump the issued
- * counter to how far we've scanned. We know we're consistent
- * up to here.
- */
- scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
-
- if (dsl_scan_is_running(scn) &&
- spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
- /*
- * A new-type scrub was in progress on an old
- * pool, and the pool was accessed by old
- * software. Restart from the beginning, since
- * the old software may have changed the pool in
- * the meantime.
- */
- scn->scn_restart_txg = txg;
- zfs_dbgmsg("new-style scrub was modified "
- "by old software; restarting in txg %llu",
- (longlong_t)scn->scn_restart_txg);
- }
- }
-
- bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
-
- /* reload the queue into the in-core state */
- if (scn->scn_phys.scn_queue_obj != 0) {
- zap_cursor_t zc;
- zap_attribute_t za;
-
- for (zap_cursor_init(&zc, dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- (void) zap_cursor_advance(&zc)) {
- scan_ds_queue_insert(scn,
- zfs_strtonum(za.za_name, NULL),
- za.za_first_integer);
- }
- zap_cursor_fini(&zc);
- }
-
- spa_scan_stat_init(spa);
- return (0);
-}
-
-void
-dsl_scan_fini(dsl_pool_t *dp)
-{
- if (dp->dp_scan != NULL) {
- dsl_scan_t *scn = dp->dp_scan;
-
- if (scn->scn_taskq != NULL)
- taskq_destroy(scn->scn_taskq);
- scan_ds_queue_clear(scn);
- avl_destroy(&scn->scn_queue);
- avl_destroy(&scn->scn_prefetch_queue);
-
- kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
- dp->dp_scan = NULL;
- }
-}
-
-static boolean_t
-dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
-{
- return (scn->scn_restart_txg != 0 &&
- scn->scn_restart_txg <= tx->tx_txg);
-}
-
-boolean_t
-dsl_scan_scrubbing(const dsl_pool_t *dp)
-{
- dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
-
- return (scn_phys->scn_state == DSS_SCANNING &&
- scn_phys->scn_func == POOL_SCAN_SCRUB);
-}
-
-boolean_t
-dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
-{
- return (dsl_scan_scrubbing(scn->scn_dp) &&
- scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
-}
-
-/*
- * Writes out a persistent dsl_scan_phys_t record to the pool directory.
- * Because we can be running in the block sorting algorithm, we do not always
- * want to write out the record, only when it is "safe" to do so. This safety
- * condition is achieved by making sure that the sorting queues are empty
- * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
- * is inconsistent with how much actual scanning progress has been made. The
- * kind of sync to be performed is specified by the sync_type argument. If the
- * sync is optional, we only sync if the queues are empty. If the sync is
- * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
- * third possible state is a "cached" sync. This is done in response to:
- * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
- * destroyed, so we wouldn't be able to restart scanning from it.
- * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
- * superseded by a newer snapshot.
- * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
- * swapped with its clone.
- * In all cases, a cached sync simply rewrites the last record we've written,
- * just slightly modified. For the modifications that are performed to the
- * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
- * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
- */
-static void
-dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
-{
- int i;
- spa_t *spa = scn->scn_dp->dp_spa;
-
- ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
- if (scn->scn_bytes_pending == 0) {
- for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
- vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
- dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
-
- if (q == NULL)
- continue;
-
- mutex_enter(&vd->vdev_scan_io_queue_lock);
- ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
- ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL);
- ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
- mutex_exit(&vd->vdev_scan_io_queue_lock);
- }
-
- if (scn->scn_phys.scn_queue_obj != 0)
- scan_ds_queue_sync(scn, tx);
- VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
- &scn->scn_phys, tx));
- bcopy(&scn->scn_phys, &scn->scn_phys_cached,
- sizeof (scn->scn_phys));
-
- if (scn->scn_checkpointing)
- zfs_dbgmsg("finish scan checkpoint");
-
- scn->scn_checkpointing = B_FALSE;
- scn->scn_last_checkpoint = ddi_get_lbolt();
- } else if (sync_type == SYNC_CACHED) {
- VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
- &scn->scn_phys_cached, tx));
- }
-}
-
-/* ARGSUSED */
-static int
-dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
-{
- dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
-
- if (dsl_scan_is_running(scn))
- return (SET_ERROR(EBUSY));
-
- return (0);
-}
-
-static void
-dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
- pool_scan_func_t *funcp = arg;
- dmu_object_type_t ot = 0;
- dsl_pool_t *dp = scn->scn_dp;
- spa_t *spa = dp->dp_spa;
-
- ASSERT(!dsl_scan_is_running(scn));
- ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
- bzero(&scn->scn_phys, sizeof (scn->scn_phys));
- scn->scn_phys.scn_func = *funcp;
- scn->scn_phys.scn_state = DSS_SCANNING;
- scn->scn_phys.scn_min_txg = 0;
- scn->scn_phys.scn_max_txg = tx->tx_txg;
- scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
- scn->scn_phys.scn_start_time = gethrestime_sec();
- scn->scn_phys.scn_errors = 0;
- scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
- scn->scn_issued_before_pass = 0;
- scn->scn_restart_txg = 0;
- scn->scn_done_txg = 0;
- scn->scn_last_checkpoint = 0;
- scn->scn_checkpointing = B_FALSE;
- spa_scan_stat_init(spa);
-
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
- scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
-
- /* rewrite all disk labels */
- vdev_config_dirty(spa->spa_root_vdev);
-
- if (vdev_resilver_needed(spa->spa_root_vdev,
- &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
- spa_event_notify(spa, NULL, NULL,
- ESC_ZFS_RESILVER_START);
- } else {
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
- }
-
- spa->spa_scrub_started = B_TRUE;
- /*
- * If this is an incremental scrub, limit the DDT scrub phase
- * to just the auto-ditto class (for correctness); the rest
- * of the scrub should go faster using top-down pruning.
- */
- if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
- scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
-
- }
-
- /* back to the generic stuff */
-
- if (dp->dp_blkstats == NULL) {
- dp->dp_blkstats =
- kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
- mutex_init(&dp->dp_blkstats->zab_lock, NULL,
- MUTEX_DEFAULT, NULL);
- }
- bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
-
- if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
- ot = DMU_OT_ZAP_OTHER;
-
- scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
- ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
-
- bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
-
- dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
-
- spa_history_log_internal(spa, "scan setup", tx,
- "func=%u mintxg=%llu maxtxg=%llu",
- *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
-}
-
-/*
- * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
- * Can also be called to resume a paused scrub.
- */
-int
-dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
-{
- spa_t *spa = dp->dp_spa;
- dsl_scan_t *scn = dp->dp_scan;
-
- /*
- * Purge all vdev caches and probe all devices. We do this here
- * rather than in sync context because this requires a writer lock
- * on the spa_config lock, which we can't do from sync context. The
- * spa_scrub_reopen flag indicates that vdev_open() should not
- * attempt to start another scrub.
- */
- spa_vdev_state_enter(spa, SCL_NONE);
- spa->spa_scrub_reopen = B_TRUE;
- vdev_reopen(spa->spa_root_vdev);
- spa->spa_scrub_reopen = B_FALSE;
- (void) spa_vdev_state_exit(spa, NULL, 0);
-
- if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
- /* got scrub start cmd, resume paused scrub */
- int err = dsl_scrub_set_pause_resume(scn->scn_dp,
- POOL_SCRUB_NORMAL);
- if (err == 0) {
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
- return (ECANCELED);
- }
- return (SET_ERROR(err));
- }
-
- return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
- dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
-}
-
-/* ARGSUSED */
-static void
-dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
-{
- static const char *old_names[] = {
- "scrub_bookmark",
- "scrub_ddt_bookmark",
- "scrub_ddt_class_max",
- "scrub_queue",
- "scrub_min_txg",
- "scrub_max_txg",
- "scrub_func",
- "scrub_errors",
- NULL
- };
-
- dsl_pool_t *dp = scn->scn_dp;
- spa_t *spa = dp->dp_spa;
- int i;
-
- /* Remove any remnants of an old-style scrub. */
- for (i = 0; old_names[i]; i++) {
- (void) zap_remove(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
- }
-
- if (scn->scn_phys.scn_queue_obj != 0) {
- VERIFY0(dmu_object_free(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, tx));
- scn->scn_phys.scn_queue_obj = 0;
- }
- scan_ds_queue_clear(scn);
-
- scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
-
- /*
- * If we were "restarted" from a stopped state, don't bother
- * with anything else.
- */
- if (!dsl_scan_is_running(scn)) {
- ASSERT(!scn->scn_is_sorted);
- return;
- }
-
- if (scn->scn_is_sorted) {
- scan_io_queues_destroy(scn);
- scn->scn_is_sorted = B_FALSE;
-
- if (scn->scn_taskq != NULL) {
- taskq_destroy(scn->scn_taskq);
- scn->scn_taskq = NULL;
- }
- }
-
- scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
-
- if (dsl_scan_restarting(scn, tx))
- spa_history_log_internal(spa, "scan aborted, restarting", tx,
- "errors=%llu", spa_get_errlog_size(spa));
- else if (!complete)
- spa_history_log_internal(spa, "scan cancelled", tx,
- "errors=%llu", spa_get_errlog_size(spa));
- else
- spa_history_log_internal(spa, "scan done", tx,
- "errors=%llu", spa_get_errlog_size(spa));
-
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
- spa->spa_scrub_started = B_FALSE;
- spa->spa_scrub_active = B_FALSE;
-
- /*
- * If the scrub/resilver completed, update all DTLs to
- * reflect this. Whether it succeeded or not, vacate
- * all temporary scrub DTLs.
- *
- * As the scrub does not currently support traversing
- * data that have been freed but are part of a checkpoint,
- * we don't mark the scrub as done in the DTLs as faults
- * may still exist in those vdevs.
- */
- if (complete &&
- !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
- vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
- scn->scn_phys.scn_max_txg, B_TRUE);
-
- spa_event_notify(spa, NULL, NULL,
- scn->scn_phys.scn_min_txg ?
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
- } else {
- vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
- 0, B_TRUE);
- }
- spa_errlog_rotate(spa);
-
- /*
- * We may have finished replacing a device.
- * Let the async thread assess this and handle the detach.
- */
- spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
- }
-
- scn->scn_phys.scn_end_time = gethrestime_sec();
-
- ASSERT(!dsl_scan_is_running(scn));
-}
-
-/* ARGSUSED */
-static int
-dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
-{
- dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
-
- if (!dsl_scan_is_running(scn))
- return (SET_ERROR(ENOENT));
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
-
- dsl_scan_done(scn, B_FALSE, tx);
- dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
- spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
-}
-
-int
-dsl_scan_cancel(dsl_pool_t *dp)
-{
- return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
- dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
-}
-
-static int
-dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
-{
- pool_scrub_cmd_t *cmd = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_scan_t *scn = dp->dp_scan;
-
- if (*cmd == POOL_SCRUB_PAUSE) {
- /* can't pause a scrub when there is no in-progress scrub */
- if (!dsl_scan_scrubbing(dp))
- return (SET_ERROR(ENOENT));
-
- /* can't pause a paused scrub */
- if (dsl_scan_is_paused_scrub(scn))
- return (SET_ERROR(EBUSY));
- } else if (*cmd != POOL_SCRUB_NORMAL) {
- return (SET_ERROR(ENOTSUP));
- }
-
- return (0);
-}
-
-static void
-dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
-{
- pool_scrub_cmd_t *cmd = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- spa_t *spa = dp->dp_spa;
- dsl_scan_t *scn = dp->dp_scan;
-
- if (*cmd == POOL_SCRUB_PAUSE) {
- /* can't pause a scrub when there is no in-progress scrub */
- spa->spa_scan_pass_scrub_pause = gethrestime_sec();
- scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
- scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
- dsl_scan_sync_state(scn, tx, SYNC_CACHED);
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
- } else {
- ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
- if (dsl_scan_is_paused_scrub(scn)) {
- /*
- * We need to keep track of how much time we spend
- * paused per pass so that we can adjust the scrub rate
- * shown in the output of 'zpool status'
- */
- spa->spa_scan_pass_scrub_spent_paused +=
- gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
- spa->spa_scan_pass_scrub_pause = 0;
- scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
- scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
- dsl_scan_sync_state(scn, tx, SYNC_CACHED);
- }
- }
-}
-
-/*
- * Set scrub pause/resume state if it makes sense to do so
- */
-int
-dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
-{
- return (dsl_sync_task(spa_name(dp->dp_spa),
- dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
- ZFS_SPACE_CHECK_RESERVED));
-}
-
-
-/* start a new scan, or restart an existing one. */
-void
-dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
-{
- if (txg == 0) {
- dmu_tx_t *tx;
- tx = dmu_tx_create_dd(dp->dp_mos_dir);
- VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
-
- txg = dmu_tx_get_txg(tx);
- dp->dp_scan->scn_restart_txg = txg;
- dmu_tx_commit(tx);
- } else {
- dp->dp_scan->scn_restart_txg = txg;
- }
- zfs_dbgmsg("restarting resilver txg=%llu", txg);
-}
-
-void
-dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
-{
- zio_free(dp->dp_spa, txg, bp);
-}
-
-void
-dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
-{
- ASSERT(dsl_pool_sync_context(dp));
- zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
- pio->io_flags));
-}
-
-static int
-scan_ds_queue_compare(const void *a, const void *b)
-{
- const scan_ds_t *sds_a = a, *sds_b = b;
-
- if (sds_a->sds_dsobj < sds_b->sds_dsobj)
- return (-1);
- if (sds_a->sds_dsobj == sds_b->sds_dsobj)
- return (0);
- return (1);
-}
-
-static void
-scan_ds_queue_clear(dsl_scan_t *scn)
-{
- void *cookie = NULL;
- scan_ds_t *sds;
- while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
- kmem_free(sds, sizeof (*sds));
- }
-}
-
-static boolean_t
-scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
-{
- scan_ds_t srch, *sds;
-
- srch.sds_dsobj = dsobj;
- sds = avl_find(&scn->scn_queue, &srch, NULL);
- if (sds != NULL && txg != NULL)
- *txg = sds->sds_txg;
- return (sds != NULL);
-}
-
-static void
-scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
-{
- scan_ds_t *sds;
- avl_index_t where;
-
- sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
- sds->sds_dsobj = dsobj;
- sds->sds_txg = txg;
-
- VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
- avl_insert(&scn->scn_queue, sds, where);
-}
-
-static void
-scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
-{
- scan_ds_t srch, *sds;
-
- srch.sds_dsobj = dsobj;
-
- sds = avl_find(&scn->scn_queue, &srch, NULL);
- VERIFY(sds != NULL);
- avl_remove(&scn->scn_queue, sds);
- kmem_free(sds, sizeof (*sds));
-}
-
-static void
-scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = scn->scn_dp;
- spa_t *spa = dp->dp_spa;
- dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
- DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
-
- ASSERT0(scn->scn_bytes_pending);
- ASSERT(scn->scn_phys.scn_queue_obj != 0);
-
- VERIFY0(dmu_object_free(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, tx));
- scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
- DMU_OT_NONE, 0, tx);
- for (scan_ds_t *sds = avl_first(&scn->scn_queue);
- sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
- VERIFY0(zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
- sds->sds_txg, tx));
- }
-}
-
-/*
- * Computes the memory limit state that we're currently in. A sorted scan
- * needs quite a bit of memory to hold the sorting queue, so we need to
- * reasonably constrain the size so it doesn't impact overall system
- * performance. We compute two limits:
- * 1) Hard memory limit: if the amount of memory used by the sorting
- * queues on a pool gets above this value, we stop the metadata
- * scanning portion and start issuing the queued up and sorted
- * I/Os to reduce memory usage.
- * This limit is calculated as a fraction of physmem (by default 5%).
- * We constrain the lower bound of the hard limit to an absolute
- * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
- * the upper bound to 5% of the total pool size - no chance we'll
- * ever need that much memory, but just to keep the value in check.
- * 2) Soft memory limit: once we hit the hard memory limit, we start
- * issuing I/O to reduce queue memory usage, but we don't want to
- * completely empty out the queues, since we might be able to find I/Os
- * that will fill in the gaps of our non-sequential IOs at some point
- * in the future. So we stop the issuing of I/Os once the amount of
- * memory used drops below the soft limit (at which point we stop issuing
- * I/O and start scanning metadata again).
- *
- * This limit is calculated by subtracting a fraction of the hard
- * limit from the hard limit. By default this fraction is 5%, so
- * the soft limit is 95% of the hard limit. We cap the size of the
- * difference between the hard and soft limits at an absolute
- * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
- * sufficient to not cause too frequent switching between the
- * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
- * worth of queues is about 1.2 GiB of on-pool data, so scanning
- * that should take at least a decent fraction of a second).
- */
-static boolean_t
-dsl_scan_should_clear(dsl_scan_t *scn)
-{
- spa_t *spa = scn->scn_dp->dp_spa;
- vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
- uint64_t alloc, mlim_hard, mlim_soft, mused;
-
- alloc = metaslab_class_get_alloc(spa_normal_class(spa));
- alloc += metaslab_class_get_alloc(spa_special_class(spa));
- alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
-
- mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
- zfs_scan_mem_lim_min);
- mlim_hard = MIN(mlim_hard, alloc / 20);
- mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
- zfs_scan_mem_lim_soft_max);
- mused = 0;
- for (uint64_t i = 0; i < rvd->vdev_children; i++) {
- vdev_t *tvd = rvd->vdev_child[i];
- dsl_scan_io_queue_t *queue;
-
- mutex_enter(&tvd->vdev_scan_io_queue_lock);
- queue = tvd->vdev_scan_io_queue;
- if (queue != NULL) {
- /* #extents in exts_by_size = # in exts_by_addr */
- mused += avl_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_t) +
- avl_numnodes(&queue->q_sios_by_addr) *
- sizeof (scan_io_t);
- }
- mutex_exit(&tvd->vdev_scan_io_queue_lock);
- }
-
- dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
-
- if (mused == 0)
- ASSERT0(scn->scn_bytes_pending);
-
- /*
- * If we are above our hard limit, we need to clear out memory.
- * If we are below our soft limit, we need to accumulate sequential IOs.
- * Otherwise, we should keep doing whatever we are currently doing.
- */
- if (mused >= mlim_hard)
- return (B_TRUE);
- else if (mused < mlim_soft)
- return (B_FALSE);
- else
- return (scn->scn_clearing);
-}
-
-static boolean_t
-dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
-{
- /* we never skip user/group accounting objects */
- if (zb && (int64_t)zb->zb_object < 0)
- return (B_FALSE);
-
- if (scn->scn_suspending)
- return (B_TRUE); /* we're already suspending */
-
- if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
- return (B_FALSE); /* we're resuming */
-
- /* We only know how to resume from level-0 blocks. */
- if (zb && zb->zb_level != 0)
- return (B_FALSE);
-
- /*
- * We suspend if:
- * - we have scanned for at least the minimum time (default 1 sec
- * for scrub, 3 sec for resilver), and either we have sufficient
- * dirty data that we are starting to write more quickly
- * (default 30%), or someone is explicitly waiting for this txg
- * to complete.
- * or
- * - the spa is shutting down because this pool is being exported
- * or the machine is rebooting.
- * or
- * - the scan queue has reached its memory use limit
- */
- uint64_t elapsed_nanosecs = gethrtime();
- uint64_t curr_time_ns = gethrtime();
- uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
- uint64_t sync_time_ns = curr_time_ns -
- scn->scn_dp->dp_spa->spa_sync_starttime;
-
- int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
- int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
- zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
-
- if ((NSEC2MSEC(scan_time_ns) > mintime &&
- (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
- txg_sync_waiting(scn->scn_dp) ||
- NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
- spa_shutting_down(scn->scn_dp->dp_spa) ||
- (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
- if (zb) {
- dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset,
- (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level,
- (longlong_t)zb->zb_blkid);
- scn->scn_phys.scn_bookmark = *zb;
- } else {
- dsl_scan_phys_t *scnp = &scn->scn_phys;
-
- dprintf("suspending at at DDT bookmark "
- "%llx/%llx/%llx/%llx\n",
- (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
- (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
- (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
- (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
- }
- scn->scn_suspending = B_TRUE;
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-typedef struct zil_scan_arg {
- dsl_pool_t *zsa_dp;
- zil_header_t *zsa_zh;
-} zil_scan_arg_t;
-
-/* ARGSUSED */
-static int
-dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- zil_scan_arg_t *zsa = arg;
- dsl_pool_t *dp = zsa->zsa_dp;
- dsl_scan_t *scn = dp->dp_scan;
- zil_header_t *zh = zsa->zsa_zh;
- zbookmark_phys_t zb;
-
- if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
- return (0);
-
- /*
- * One block ("stubby") can be allocated a long time ago; we
- * want to visit that one because it has been allocated
- * (on-disk) even if it hasn't been claimed (even though for
- * scrub there's nothing to do to it).
- */
- if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
- return (0);
-
- SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
- ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
- VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
- return (0);
-}
-
-/* ARGSUSED */
-static int
-dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
- if (lrc->lrc_txtype == TX_WRITE) {
- zil_scan_arg_t *zsa = arg;
- dsl_pool_t *dp = zsa->zsa_dp;
- dsl_scan_t *scn = dp->dp_scan;
- zil_header_t *zh = zsa->zsa_zh;
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_phys_t zb;
-
- if (BP_IS_HOLE(bp) ||
- bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
- return (0);
-
- /*
- * birth can be < claim_txg if this record's txg is
- * already txg sync'ed (but this log block contains
- * other records that are not synced)
- */
- if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return (0);
-
- SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
- lr->lr_foid, ZB_ZIL_LEVEL,
- lr->lr_offset / BP_GET_LSIZE(bp));
-
- VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
- }
- return (0);
-}
-
-static void
-dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
-{
- uint64_t claim_txg = zh->zh_claim_txg;
- zil_scan_arg_t zsa = { dp, zh };
- zilog_t *zilog;
-
- ASSERT(spa_writeable(dp->dp_spa));
-
- /*
- * We only want to visit blocks that have been claimed
- * but not yet replayed.
- */
- if (claim_txg == 0)
- return;
-
- zilog = zil_alloc(dp->dp_meta_objset, zh);
-
- (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
- claim_txg);
-
- zil_free(zilog);
-}
-
-/*
- * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
- * here is to sort the AVL tree by the order each block will be needed.
- */
-static int
-scan_prefetch_queue_compare(const void *a, const void *b)
-{
- const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
- const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
- const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
-
- return (zbookmark_compare(spc_a->spc_datablkszsec,
- spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
- spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
-}
-
-static void
-scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
-{
- if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
- zfs_refcount_destroy(&spc->spc_refcnt);
- kmem_free(spc, sizeof (scan_prefetch_ctx_t));
- }
-}
-
-static scan_prefetch_ctx_t *
-scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
-{
- scan_prefetch_ctx_t *spc;
-
- spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
- zfs_refcount_create(&spc->spc_refcnt);
- zfs_refcount_add(&spc->spc_refcnt, tag);
- spc->spc_scn = scn;
- if (dnp != NULL) {
- spc->spc_datablkszsec = dnp->dn_datablkszsec;
- spc->spc_indblkshift = dnp->dn_indblkshift;
- spc->spc_root = B_FALSE;
- } else {
- spc->spc_datablkszsec = 0;
- spc->spc_indblkshift = 0;
- spc->spc_root = B_TRUE;
- }
-
- return (spc);
-}
-
-static void
-scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
-{
- zfs_refcount_add(&spc->spc_refcnt, tag);
-}
-
-static boolean_t
-dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
- const zbookmark_phys_t *zb)
-{
- zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
- dnode_phys_t tmp_dnp;
- dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
-
- if (zb->zb_objset != last_zb->zb_objset)
- return (B_TRUE);
- if ((int64_t)zb->zb_object < 0)
- return (B_FALSE);
-
- tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
- tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
-
- if (zbookmark_subtree_completed(dnp, zb, last_zb))
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-static void
-dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
-{
- avl_index_t idx;
- dsl_scan_t *scn = spc->spc_scn;
- spa_t *spa = scn->scn_dp->dp_spa;
- scan_prefetch_issue_ctx_t *spic;
-
- if (zfs_no_scrub_prefetch)
- return;
-
- if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
- (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
- BP_GET_TYPE(bp) != DMU_OT_OBJSET))
- return;
-
- if (dsl_scan_check_prefetch_resume(spc, zb))
- return;
-
- scan_prefetch_ctx_add_ref(spc, scn);
- spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
- spic->spic_spc = spc;
- spic->spic_bp = *bp;
- spic->spic_zb = *zb;
-
- /*
- * Add the IO to the queue of blocks to prefetch. This allows us to
- * prioritize blocks that we will need first for the main traversal
- * thread.
- */
- mutex_enter(&spa->spa_scrub_lock);
- if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
- /* this block is already queued for prefetch */
- kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
- scan_prefetch_ctx_rele(spc, scn);
- mutex_exit(&spa->spa_scrub_lock);
- return;
- }
-
- avl_insert(&scn->scn_prefetch_queue, spic, idx);
- cv_broadcast(&spa->spa_scrub_io_cv);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static void
-dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
- uint64_t objset, uint64_t object)
-{
- int i;
- zbookmark_phys_t zb;
- scan_prefetch_ctx_t *spc;
-
- if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
- return;
-
- SET_BOOKMARK(&zb, objset, object, 0, 0);
-
- spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
-
- for (i = 0; i < dnp->dn_nblkptr; i++) {
- zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
- zb.zb_blkid = i;
- dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
- }
-
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- zb.zb_level = 0;
- zb.zb_blkid = DMU_SPILL_BLKID;
- dsl_scan_prefetch(spc, &dnp->dn_spill, &zb);
- }
-
- scan_prefetch_ctx_rele(spc, FTAG);
-}
-
-void
-dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
- arc_buf_t *buf, void *private)
-{
- scan_prefetch_ctx_t *spc = private;
- dsl_scan_t *scn = spc->spc_scn;
- spa_t *spa = scn->scn_dp->dp_spa;
-
- /* broadcast that the IO has completed for rate limitting purposes */
- mutex_enter(&spa->spa_scrub_lock);
- ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
- spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
- cv_broadcast(&spa->spa_scrub_io_cv);
- mutex_exit(&spa->spa_scrub_lock);
-
- /* if there was an error or we are done prefetching, just cleanup */
- if (buf == NULL || scn->scn_suspending)
- goto out;
-
- if (BP_GET_LEVEL(bp) > 0) {
- int i;
- blkptr_t *cbp;
- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
- zbookmark_phys_t czb;
-
- for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1, zb->zb_blkid * epb + i);
- dsl_scan_prefetch(spc, cbp, &czb);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- dnode_phys_t *cdnp = buf->b_data;
- int i;
- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
- for (i = 0, cdnp = buf->b_data; i < epb;
- i += cdnp->dn_extra_slots + 1,
- cdnp += cdnp->dn_extra_slots + 1) {
- dsl_scan_prefetch_dnode(scn, cdnp,
- zb->zb_objset, zb->zb_blkid * epb + i);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- objset_phys_t *osp = buf->b_data;
-
- dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
- zb->zb_objset, DMU_META_DNODE_OBJECT);
-
- if (OBJSET_BUF_HAS_USERUSED(buf)) {
- dsl_scan_prefetch_dnode(scn,
- &osp->os_groupused_dnode, zb->zb_objset,
- DMU_GROUPUSED_OBJECT);
- dsl_scan_prefetch_dnode(scn,
- &osp->os_userused_dnode, zb->zb_objset,
- DMU_USERUSED_OBJECT);
- }
- }
-
-out:
- if (buf != NULL)
- arc_buf_destroy(buf, private);
- scan_prefetch_ctx_rele(spc, scn);
-}
-
-/* ARGSUSED */
-static void
-dsl_scan_prefetch_thread(void *arg)
-{
- dsl_scan_t *scn = arg;
- spa_t *spa = scn->scn_dp->dp_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- scan_prefetch_issue_ctx_t *spic;
-
- /* loop until we are told to stop */
- while (!scn->scn_prefetch_stop) {
- arc_flags_t flags = ARC_FLAG_NOWAIT |
- ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
- int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
-
- mutex_enter(&spa->spa_scrub_lock);
-
- /*
- * Wait until we have an IO to issue and are not above our
- * maximum in flight limit.
- */
- while (!scn->scn_prefetch_stop &&
- (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
- spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- }
-
- /* recheck if we should stop since we waited for the cv */
- if (scn->scn_prefetch_stop) {
- mutex_exit(&spa->spa_scrub_lock);
- break;
- }
-
- /* remove the prefetch IO from the tree */
- spic = avl_first(&scn->scn_prefetch_queue);
- spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
- avl_remove(&scn->scn_prefetch_queue, spic);
-
- mutex_exit(&spa->spa_scrub_lock);
-
- /* issue the prefetch asynchronously */
- (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
- &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
- ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
-
- kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
- }
-
- ASSERT(scn->scn_prefetch_stop);
-
- /* free any prefetches we didn't get to complete */
- mutex_enter(&spa->spa_scrub_lock);
- while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
- avl_remove(&scn->scn_prefetch_queue, spic);
- scan_prefetch_ctx_rele(spic->spic_spc, scn);
- kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
- }
- ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static boolean_t
-dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
- const zbookmark_phys_t *zb)
-{
- /*
- * We never skip over user/group accounting objects (obj<0)
- */
- if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
- (int64_t)zb->zb_object >= 0) {
- /*
- * If we already visited this bp & everything below (in
- * a prior txg sync), don't bother doing it again.
- */
- if (zbookmark_subtree_completed(dnp, zb,
- &scn->scn_phys.scn_bookmark))
- return (B_TRUE);
-
- /*
- * If we found the block we're trying to resume from, or
- * we went past it to a different object, zero it out to
- * indicate that it's OK to start checking for suspending
- * again.
- */
- if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
- zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
- dprintf("resuming at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset,
- (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level,
- (longlong_t)zb->zb_blkid);
- bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
- }
- }
- return (B_FALSE);
-}
-
-static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
- dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
- dmu_objset_type_t ostype, dmu_tx_t *tx);
-static void dsl_scan_visitdnode(
- dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
- dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
-
-/*
- * Return nonzero on i/o error.
- * Return new buf to write out in *bufp.
- */
-static int
-dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
- dnode_phys_t *dnp, const blkptr_t *bp,
- const zbookmark_phys_t *zb, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = scn->scn_dp;
- int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
- int err;
-
- if (BP_GET_LEVEL(bp) > 0) {
- arc_flags_t flags = ARC_FLAG_WAIT;
- int i;
- blkptr_t *cbp;
- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
- arc_buf_t *buf;
-
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
- if (err) {
- scn->scn_phys.scn_errors++;
- return (err);
- }
- for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
- zbookmark_phys_t czb;
-
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1,
- zb->zb_blkid * epb + i);
- dsl_scan_visitbp(cbp, &czb, dnp,
- ds, scn, ostype, tx);
- }
- arc_buf_destroy(buf, &buf);
- } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- arc_flags_t flags = ARC_FLAG_WAIT;
- dnode_phys_t *cdnp;
- int i;
- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
- arc_buf_t *buf;
-
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
- if (err) {
- scn->scn_phys.scn_errors++;
- return (err);
- }
- for (i = 0, cdnp = buf->b_data; i < epb;
- i += cdnp->dn_extra_slots + 1,
- cdnp += cdnp->dn_extra_slots + 1) {
- dsl_scan_visitdnode(scn, ds, ostype,
- cdnp, zb->zb_blkid * epb + i, tx);
- }
-
- arc_buf_destroy(buf, &buf);
- } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- arc_flags_t flags = ARC_FLAG_WAIT;
- objset_phys_t *osp;
- arc_buf_t *buf;
-
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
- if (err) {
- scn->scn_phys.scn_errors++;
- return (err);
- }
-
- osp = buf->b_data;
-
- dsl_scan_visitdnode(scn, ds, osp->os_type,
- &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
-
- if (OBJSET_BUF_HAS_USERUSED(buf)) {
- /*
- * We also always visit user/group accounting
- * objects, and never skip them, even if we are
- * suspending. This is necessary so that the space
- * deltas from this txg get integrated.
- */
- dsl_scan_visitdnode(scn, ds, osp->os_type,
- &osp->os_groupused_dnode,
- DMU_GROUPUSED_OBJECT, tx);
- dsl_scan_visitdnode(scn, ds, osp->os_type,
- &osp->os_userused_dnode,
- DMU_USERUSED_OBJECT, tx);
- }
- arc_buf_destroy(buf, &buf);
- }
-
- return (0);
-}
-
-static void
-dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
- dmu_objset_type_t ostype, dnode_phys_t *dnp,
- uint64_t object, dmu_tx_t *tx)
-{
- int j;
-
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- zbookmark_phys_t czb;
-
- SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
- dnp->dn_nlevels - 1, j);
- dsl_scan_visitbp(&dnp->dn_blkptr[j],
- &czb, dnp, ds, scn, ostype, tx);
- }
-
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- zbookmark_phys_t czb;
- SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
- 0, DMU_SPILL_BLKID);
- dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
- &czb, dnp, ds, scn, ostype, tx);
- }
-}
-
-/*
- * The arguments are in this order because mdb can only print the
- * first 5; we want them to be useful.
- */
-static void
-dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
- dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
- dmu_objset_type_t ostype, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = scn->scn_dp;
- blkptr_t *bp_toread = NULL;
-
- if (dsl_scan_check_suspend(scn, zb))
- return;
-
- if (dsl_scan_check_resume(scn, dnp, zb))
- return;
-
- scn->scn_visited_this_txg++;
-
- dprintf_bp(bp,
- "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
- ds, ds ? ds->ds_object : 0,
- zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
- bp);
-
- if (BP_IS_HOLE(bp)) {
- scn->scn_holes_this_txg++;
- return;
- }
-
- if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
- scn->scn_lt_min_this_txg++;
- return;
- }
-
- bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- *bp_toread = *bp;
-
- if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
- goto out;
-
- /*
- * If dsl_scan_ddt() has already visited this block, it will have
- * already done any translations or scrubbing, so don't call the
- * callback again.
- */
- if (ddt_class_contains(dp->dp_spa,
- scn->scn_phys.scn_ddt_class_max, bp)) {
- scn->scn_ddt_contained_this_txg++;
- goto out;
- }
-
- /*
- * If this block is from the future (after cur_max_txg), then we
- * are doing this on behalf of a deleted snapshot, and we will
- * revisit the future block on the next pass of this dataset.
- * Don't scan it now unless we need to because something
- * under it was modified.
- */
- if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
- scn->scn_gt_max_this_txg++;
- goto out;
- }
-
- scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
-out:
- kmem_free(bp_toread, sizeof (blkptr_t));
-}
-
-static void
-dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
- dmu_tx_t *tx)
-{
- zbookmark_phys_t zb;
- scan_prefetch_ctx_t *spc;
-
- SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-
- if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
- SET_BOOKMARK(&scn->scn_prefetch_bookmark,
- zb.zb_objset, 0, 0, 0);
- } else {
- scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
- }
-
- scn->scn_objsets_visited_this_txg++;
-
- spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
- dsl_scan_prefetch(spc, bp, &zb);
- scan_prefetch_ctx_rele(spc, FTAG);
-
- dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
-
- dprintf_ds(ds, "finished scan%s", "");
-}
-
-static void
-ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
-{
- if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
- if (ds->ds_is_snapshot) {
- /*
- * Note:
- * - scn_cur_{min,max}_txg stays the same.
- * - Setting the flag is not really necessary if
- * scn_cur_max_txg == scn_max_txg, because there
- * is nothing after this snapshot that we care
- * about. However, we set it anyway and then
- * ignore it when we retraverse it in
- * dsl_scan_visitds().
- */
- scn_phys->scn_bookmark.zb_objset =
- dsl_dataset_phys(ds)->ds_next_snap_obj;
- zfs_dbgmsg("destroying ds %llu; currently traversing; "
- "reset zb_objset to %llu",
- (u_longlong_t)ds->ds_object,
- (u_longlong_t)dsl_dataset_phys(ds)->
- ds_next_snap_obj);
- scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
- } else {
- SET_BOOKMARK(&scn_phys->scn_bookmark,
- ZB_DESTROYED_OBJSET, 0, 0, 0);
- zfs_dbgmsg("destroying ds %llu; currently traversing; "
- "reset bookmark to -1,0,0,0",
- (u_longlong_t)ds->ds_object);
- }
- }
-}
-
-/*
- * Invoked when a dataset is destroyed. We need to make sure that:
- *
- * 1) If it is the dataset that was currently being scanned, we write
- * a new dsl_scan_phys_t and marking the objset reference in it
- * as destroyed.
- * 2) Remove it from the work queue, if it was present.
- *
- * If the dataset was actually a snapshot, instead of marking the dataset
- * as destroyed, we instead substitute the next snapshot in line.
- */
-void
-dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- dsl_scan_t *scn = dp->dp_scan;
- uint64_t mintxg;
-
- if (!dsl_scan_is_running(scn))
- return;
-
- ds_destroyed_scn_phys(ds, &scn->scn_phys);
- ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
-
- if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
- scan_ds_queue_remove(scn, ds->ds_object);
- if (ds->ds_is_snapshot)
- scan_ds_queue_insert(scn,
- dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
- }
-
- if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
- ds->ds_object, &mintxg) == 0) {
- ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
- VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
- if (ds->ds_is_snapshot) {
- /*
- * We keep the same mintxg; it could be >
- * ds_creation_txg if the previous snapshot was
- * deleted too.
- */
- VERIFY(zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj,
- dsl_dataset_phys(ds)->ds_next_snap_obj,
- mintxg, tx) == 0);
- zfs_dbgmsg("destroying ds %llu; in queue; "
- "replacing with %llu",
- (u_longlong_t)ds->ds_object,
- (u_longlong_t)dsl_dataset_phys(ds)->
- ds_next_snap_obj);
- } else {
- zfs_dbgmsg("destroying ds %llu; in queue; removing",
- (u_longlong_t)ds->ds_object);
- }
- }
-
- /*
- * dsl_scan_sync() should be called after this, and should sync
- * out our changed state, but just to be safe, do it here.
- */
- dsl_scan_sync_state(scn, tx, SYNC_CACHED);
-}
-
-static void
-ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
-{
- if (scn_bookmark->zb_objset == ds->ds_object) {
- scn_bookmark->zb_objset =
- dsl_dataset_phys(ds)->ds_prev_snap_obj;
- zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
- "reset zb_objset to %llu",
- (u_longlong_t)ds->ds_object,
- (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
- }
-}
-
-/*
- * Called when a dataset is snapshotted. If we were currently traversing
- * this snapshot, we reset our bookmark to point at the newly created
- * snapshot. We also modify our work queue to remove the old snapshot and
- * replace with the new one.
- */
-void
-dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- dsl_scan_t *scn = dp->dp_scan;
- uint64_t mintxg;
-
- if (!dsl_scan_is_running(scn))
- return;
-
- ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
-
- ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
- ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
-
- if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
- scan_ds_queue_remove(scn, ds->ds_object);
- scan_ds_queue_insert(scn,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
- }
-
- if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
- ds->ds_object, &mintxg) == 0) {
- VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
- VERIFY(zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
- zfs_dbgmsg("snapshotting ds %llu; in queue; "
- "replacing with %llu",
- (u_longlong_t)ds->ds_object,
- (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
- }
-
- dsl_scan_sync_state(scn, tx, SYNC_CACHED);
-}
-
-static void
-ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
- zbookmark_phys_t *scn_bookmark)
-{
- if (scn_bookmark->zb_objset == ds1->ds_object) {
- scn_bookmark->zb_objset = ds2->ds_object;
- zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
- "reset zb_objset to %llu",
- (u_longlong_t)ds1->ds_object,
- (u_longlong_t)ds2->ds_object);
- } else if (scn_bookmark->zb_objset == ds2->ds_object) {
- scn_bookmark->zb_objset = ds1->ds_object;
- zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
- "reset zb_objset to %llu",
- (u_longlong_t)ds2->ds_object,
- (u_longlong_t)ds1->ds_object);
- }
-}
-
-/*
- * Called when an origin dataset and its clone are swapped. If we were
- * currently traversing the dataset, we need to switch to traversing the
- * newly promoted clone.
- */
-void
-dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds1->ds_dir->dd_pool;
- dsl_scan_t *scn = dp->dp_scan;
- uint64_t mintxg1, mintxg2;
- boolean_t ds1_queued, ds2_queued;
-
- if (!dsl_scan_is_running(scn))
- return;
-
- ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
- ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
-
- /*
- * Handle the in-memory scan queue.
- */
- ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
- ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
-
- /* Sanity checking. */
- if (ds1_queued) {
- ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
- ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
- }
- if (ds2_queued) {
- ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
- ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
- }
-
- if (ds1_queued && ds2_queued) {
- /*
- * If both are queued, we don't need to do anything.
- * The swapping code below would not handle this case correctly,
- * since we can't insert ds2 if it is already there. That's
- * because scan_ds_queue_insert() prohibits a duplicate insert
- * and panics.
- */
- } else if (ds1_queued) {
- scan_ds_queue_remove(scn, ds1->ds_object);
- scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
- } else if (ds2_queued) {
- scan_ds_queue_remove(scn, ds2->ds_object);
- scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
- }
-
- /*
- * Handle the on-disk scan queue.
- * The on-disk state is an out-of-date version of the in-memory state,
- * so the in-memory and on-disk values for ds1_queued and ds2_queued may
- * be different. Therefore we need to apply the swap logic to the
- * on-disk state independently of the in-memory state.
- */
- ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
- ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
-
- /* Sanity checking. */
- if (ds1_queued) {
- ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
- ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
- }
- if (ds2_queued) {
- ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
- ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
- }
-
- if (ds1_queued && ds2_queued) {
- /*
- * If both are queued, we don't need to do anything.
- * Alternatively, we could check for EEXIST from
- * zap_add_int_key() and back out to the original state, but
- * that would be more work than checking for this case upfront.
- */
- } else if (ds1_queued) {
- VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
- VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
- zfs_dbgmsg("clone_swap ds %llu; in queue; "
- "replacing with %llu",
- (u_longlong_t)ds1->ds_object,
- (u_longlong_t)ds2->ds_object);
- } else if (ds2_queued) {
- VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
- VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
- zfs_dbgmsg("clone_swap ds %llu; in queue; "
- "replacing with %llu",
- (u_longlong_t)ds2->ds_object,
- (u_longlong_t)ds1->ds_object);
- }
-
- dsl_scan_sync_state(scn, tx, SYNC_CACHED);
-}
-
-/* ARGSUSED */
-static int
-enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
-{
- uint64_t originobj = *(uint64_t *)arg;
- dsl_dataset_t *ds;
- int err;
- dsl_scan_t *scn = dp->dp_scan;
-
- if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
- return (0);
-
- err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
- if (err)
- return (err);
-
- while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
-
- dsl_dataset_rele(ds, FTAG);
- if (err)
- return (err);
- ds = prev;
- }
- scan_ds_queue_insert(scn, ds->ds_object,
- dsl_dataset_phys(ds)->ds_prev_snap_txg);
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = scn->scn_dp;
- dsl_dataset_t *ds;
-
- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
- if (scn->scn_phys.scn_cur_min_txg >=
- scn->scn_phys.scn_max_txg) {
- /*
- * This can happen if this snapshot was created after the
- * scan started, and we already completed a previous snapshot
- * that was created after the scan started. This snapshot
- * only references blocks with:
- *
- * birth < our ds_creation_txg
- * cur_min_txg is no less than ds_creation_txg.
- * We have already visited these blocks.
- * or
- * birth > scn_max_txg
- * The scan requested not to visit these blocks.
- *
- * Subsequent snapshots (and clones) can reference our
- * blocks, or blocks with even higher birth times.
- * Therefore we do not need to visit them either,
- * so we do not add them to the work queue.
- *
- * Note that checking for cur_min_txg >= cur_max_txg
- * is not sufficient, because in that case we may need to
- * visit subsequent snapshots. This happens when min_txg > 0,
- * which raises cur_min_txg. In this case we will visit
- * this dataset but skip all of its blocks, because the
- * rootbp's birth time is < cur_min_txg. Then we will
- * add the next snapshots/clones to the work queue.
- */
- char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- dsl_dataset_name(ds, dsname);
- zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
- "cur_min_txg (%llu) >= max_txg (%llu)",
- (longlong_t)dsobj, dsname,
- (longlong_t)scn->scn_phys.scn_cur_min_txg,
- (longlong_t)scn->scn_phys.scn_max_txg);
- kmem_free(dsname, MAXNAMELEN);
-
- goto out;
- }
-
- /*
- * Only the ZIL in the head (non-snapshot) is valid. Even though
- * snapshots can have ZIL block pointers (which may be the same
- * BP as in the head), they must be ignored. In addition, $ORIGIN
- * doesn't have a objset (i.e. its ds_bp is a hole) so we don't
- * need to look for a ZIL in it either. So we traverse the ZIL here,
- * rather than in scan_recurse(), because the regular snapshot
- * block-sharing rules don't apply to it.
- */
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) &&
- (dp->dp_origin_snap == NULL ||
- ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
- objset_t *os;
- if (dmu_objset_from_ds(ds, &os) != 0) {
- goto out;
- }
- dsl_scan_zil(dp, &os->os_zil_header);
- }
-
- /*
- * Iterate over the bps in this ds.
- */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
- rrw_exit(&ds->ds_bp_rwlock, FTAG);
-
- char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- dsl_dataset_name(ds, dsname);
- zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
- "suspending=%u",
- (longlong_t)dsobj, dsname,
- (longlong_t)scn->scn_phys.scn_cur_min_txg,
- (longlong_t)scn->scn_phys.scn_cur_max_txg,
- (int)scn->scn_suspending);
- kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
-
- if (scn->scn_suspending)
- goto out;
-
- /*
- * We've finished this pass over this dataset.
- */
-
- /*
- * If we did not completely visit this dataset, do another pass.
- */
- if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
- zfs_dbgmsg("incomplete pass; visiting again");
- scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
- scan_ds_queue_insert(scn, ds->ds_object,
- scn->scn_phys.scn_cur_max_txg);
- goto out;
- }
-
- /*
- * Add descendent datasets to work queue.
- */
- if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
- scan_ds_queue_insert(scn,
- dsl_dataset_phys(ds)->ds_next_snap_obj,
- dsl_dataset_phys(ds)->ds_creation_txg);
- }
- if (dsl_dataset_phys(ds)->ds_num_children > 1) {
- boolean_t usenext = B_FALSE;
- if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
- uint64_t count;
- /*
- * A bug in a previous version of the code could
- * cause upgrade_clones_cb() to not set
- * ds_next_snap_obj when it should, leading to a
- * missing entry. Therefore we can only use the
- * next_clones_obj when its count is correct.
- */
- int err = zap_count(dp->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
- if (err == 0 &&
- count == dsl_dataset_phys(ds)->ds_num_children - 1)
- usenext = B_TRUE;
- }
-
- if (usenext) {
- zap_cursor_t zc;
- zap_attribute_t za;
- for (zap_cursor_init(&zc, dp->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_next_clones_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- (void) zap_cursor_advance(&zc)) {
- scan_ds_queue_insert(scn,
- zfs_strtonum(za.za_name, NULL),
- dsl_dataset_phys(ds)->ds_creation_txg);
- }
- zap_cursor_fini(&zc);
- } else {
- VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- enqueue_clones_cb, &ds->ds_object,
- DS_FIND_CHILDREN));
- }
- }
-
-out:
- dsl_dataset_rele(ds, FTAG);
-}
-
-/* ARGSUSED */
-static int
-enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
-{
- dsl_dataset_t *ds;
- int err;
- dsl_scan_t *scn = dp->dp_scan;
-
- err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
- if (err)
- return (err);
-
- while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
- if (err) {
- dsl_dataset_rele(ds, FTAG);
- return (err);
- }
-
- /*
- * If this is a clone, we don't need to worry about it for now.
- */
- if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_rele(ds, FTAG);
- dsl_dataset_rele(prev, FTAG);
- return (0);
- }
- dsl_dataset_rele(ds, FTAG);
- ds = prev;
- }
-
- scan_ds_queue_insert(scn, ds->ds_object,
- dsl_dataset_phys(ds)->ds_prev_snap_txg);
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-/* ARGSUSED */
-void
-dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
- ddt_entry_t *dde, dmu_tx_t *tx)
-{
- const ddt_key_t *ddk = &dde->dde_key;
- ddt_phys_t *ddp = dde->dde_phys;
- blkptr_t bp;
- zbookmark_phys_t zb = { 0 };
- int p;
-
- if (!dsl_scan_is_running(scn))
- return;
-
- /*
- * This function is special because it is the only thing
- * that can add scan_io_t's to the vdev scan queues from
- * outside dsl_scan_sync(). For the most part this is ok
- * as long as it is called from within syncing context.
- * However, dsl_scan_sync() expects that no new sio's will
- * be added between when all the work for a scan is done
- * and the next txg when the scan is actually marked as
- * completed. This check ensures we do not issue new sio's
- * during this period.
- */
- if (scn->scn_done_txg != 0)
- return;
-
- for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (ddp->ddp_phys_birth == 0 ||
- ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
- continue;
- ddt_bp_create(checksum, ddk, ddp, &bp);
-
- scn->scn_visited_this_txg++;
- scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
- }
-}
-
-/*
- * Scrub/dedup interaction.
- *
- * If there are N references to a deduped block, we don't want to scrub it
- * N times -- ideally, we should scrub it exactly once.
- *
- * We leverage the fact that the dde's replication class (enum ddt_class)
- * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
- * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
- *
- * To prevent excess scrubbing, the scrub begins by walking the DDT
- * to find all blocks with refcnt > 1, and scrubs each of these once.
- * Since there are two replication classes which contain blocks with
- * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
- * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
- *
- * There would be nothing more to say if a block's refcnt couldn't change
- * during a scrub, but of course it can so we must account for changes
- * in a block's replication class.
- *
- * Here's an example of what can occur:
- *
- * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
- * when visited during the top-down scrub phase, it will be scrubbed twice.
- * This negates our scrub optimization, but is otherwise harmless.
- *
- * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
- * on each visit during the top-down scrub phase, it will never be scrubbed.
- * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
- * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
- * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
- * while a scrub is in progress, it scrubs the block right then.
- */
-static void
-dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
-{
- ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
- ddt_entry_t dde = { 0 };
- int error;
- uint64_t n = 0;
-
- while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
- ddt_t *ddt;
-
- if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
- break;
- dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
- (longlong_t)ddb->ddb_class,
- (longlong_t)ddb->ddb_type,
- (longlong_t)ddb->ddb_checksum,
- (longlong_t)ddb->ddb_cursor);
-
- /* There should be no pending changes to the dedup table */
- ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
- ASSERT(avl_first(&ddt->ddt_tree) == NULL);
-
- dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
- n++;
-
- if (dsl_scan_check_suspend(scn, NULL))
- break;
- }
-
- zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
- "suspending=%u", (longlong_t)n,
- (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
-
- ASSERT(error == 0 || error == ENOENT);
- ASSERT(error != ENOENT ||
- ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
-}
-
-static uint64_t
-dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
-{
- uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
- if (ds->ds_is_snapshot)
- return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
- return (smt);
-}
-
-static void
-dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
-{
- scan_ds_t *sds;
- dsl_pool_t *dp = scn->scn_dp;
-
- if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
- scn->scn_phys.scn_ddt_class_max) {
- scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
- scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
- dsl_scan_ddt(scn, tx);
- if (scn->scn_suspending)
- return;
- }
-
- if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
- /* First do the MOS & ORIGIN */
-
- scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
- scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
- dsl_scan_visit_rootbp(scn, NULL,
- &dp->dp_meta_rootbp, tx);
- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
- if (scn->scn_suspending)
- return;
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
- VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- enqueue_cb, NULL, DS_FIND_CHILDREN));
- } else {
- dsl_scan_visitds(scn,
- dp->dp_origin_snap->ds_object, tx);
- }
- ASSERT(!scn->scn_suspending);
- } else if (scn->scn_phys.scn_bookmark.zb_objset !=
- ZB_DESTROYED_OBJSET) {
- uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
- /*
- * If we were suspended, continue from here. Note if the
- * ds we were suspended on was deleted, the zb_objset may
- * be -1, so we will skip this and find a new objset
- * below.
- */
- dsl_scan_visitds(scn, dsobj, tx);
- if (scn->scn_suspending)
- return;
- }
-
- /*
- * In case we suspended right at the end of the ds, zero the
- * bookmark so we don't think that we're still trying to resume.
- */
- bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
-
- /*
- * Keep pulling things out of the dataset avl queue. Updates to the
- * persistent zap-object-as-queue happen only at checkpoints.
- */
- while ((sds = avl_first(&scn->scn_queue)) != NULL) {
- dsl_dataset_t *ds;
- uint64_t dsobj = sds->sds_dsobj;
- uint64_t txg = sds->sds_txg;
-
- /* dequeue and free the ds from the queue */
- scan_ds_queue_remove(scn, dsobj);
- sds = NULL; /* must not be touched after removal */
-
- /* Set up min / max txg */
- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- if (txg != 0) {
- scn->scn_phys.scn_cur_min_txg =
- MAX(scn->scn_phys.scn_min_txg, txg);
- } else {
- scn->scn_phys.scn_cur_min_txg =
- MAX(scn->scn_phys.scn_min_txg,
- dsl_dataset_phys(ds)->ds_prev_snap_txg);
- }
- scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
- dsl_dataset_rele(ds, FTAG);
-
- dsl_scan_visitds(scn, dsobj, tx);
- if (scn->scn_suspending)
- return;
- }
- /* No more objsets to fetch, we're done */
- scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
- ASSERT0(scn->scn_suspending);
-}
-
-static uint64_t
-dsl_scan_count_leaves(vdev_t *vd)
-{
- uint64_t i, leaves = 0;
-
- /* we only count leaves that belong to the main pool and are readable */
- if (vd->vdev_islog || vd->vdev_isspare ||
- vd->vdev_isl2cache || !vdev_readable(vd))
- return (0);
-
- if (vd->vdev_ops->vdev_op_leaf)
- return (1);
-
- for (i = 0; i < vd->vdev_children; i++) {
- leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
- }
-
- return (leaves);
-}
-
-
-static void
-scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
-{
- int i;
- uint64_t cur_size = 0;
-
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
- cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
- }
-
- q->q_total_zio_size_this_txg += cur_size;
- q->q_zios_this_txg++;
-}
-
-static void
-scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
- uint64_t end)
-{
- q->q_total_seg_size_this_txg += end - start;
- q->q_segs_this_txg++;
-}
-
-static boolean_t
-scan_io_queue_check_suspend(dsl_scan_t *scn)
-{
- /* See comment in dsl_scan_check_suspend() */
- uint64_t curr_time_ns = gethrtime();
- uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
- uint64_t sync_time_ns = curr_time_ns -
- scn->scn_dp->dp_spa->spa_sync_starttime;
- int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
- int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
- zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
-
- return ((NSEC2MSEC(scan_time_ns) > mintime &&
- (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
- txg_sync_waiting(scn->scn_dp) ||
- NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
- spa_shutting_down(scn->scn_dp->dp_spa));
-}
-
-/*
- * Given a list of scan_io_t's in io_list, this issues the io's out to
- * disk. This consumes the io_list and frees the scan_io_t's. This is
- * called when emptying queues, either when we're up against the memory
- * limit or when we have finished scanning. Returns B_TRUE if we stopped
- * processing the list before we finished. Any zios that were not issued
- * will remain in the io_list.
- */
-static boolean_t
-scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
-{
- dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio;
- int64_t bytes_issued = 0;
- boolean_t suspended = B_FALSE;
-
- while ((sio = list_head(io_list)) != NULL) {
- blkptr_t bp;
-
- if (scan_io_queue_check_suspend(scn)) {
- suspended = B_TRUE;
- break;
- }
-
- sio2bp(sio, &bp, queue->q_vd->vdev_id);
- bytes_issued += sio->sio_asize;
- scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
- &sio->sio_zb, queue);
- (void) list_remove_head(io_list);
- scan_io_queues_update_zio_stats(queue, &bp);
- kmem_free(sio, sizeof (*sio));
- }
-
- atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
-
- return (suspended);
-}
-
-/*
- * Given a range_seg_t (extent) and a list, this function passes over a
- * scan queue and gathers up the appropriate ios which fit into that
- * scan seg (starting from lowest LBA). At the end, we remove the segment
- * from the q_exts_by_addr range tree.
- */
-static boolean_t
-scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
-{
- scan_io_t srch_sio, *sio, *next_sio;
- avl_index_t idx;
- uint_t num_sios = 0;
- int64_t bytes_issued = 0;
-
- ASSERT(rs != NULL);
- ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
-
- srch_sio.sio_offset = rs->rs_start;
-
- /*
- * The exact start of the extent might not contain any matching zios,
- * so if that's the case, examine the next one in the tree.
- */
- sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
- if (sio == NULL)
- sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
-
- while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
- ASSERT3U(sio->sio_offset, >=, rs->rs_start);
- ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
-
- next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
- avl_remove(&queue->q_sios_by_addr, sio);
-
- bytes_issued += sio->sio_asize;
- num_sios++;
- list_insert_tail(list, sio);
- sio = next_sio;
- }
-
- /*
- * We limit the number of sios we process at once to 32 to avoid
- * biting off more than we can chew. If we didn't take everything
- * in the segment we update it to reflect the work we were able to
- * complete. Otherwise, we remove it from the range tree entirely.
- */
- if (sio != NULL && sio->sio_offset < rs->rs_end) {
- range_tree_adjust_fill(queue->q_exts_by_addr, rs,
- -bytes_issued);
- range_tree_resize_segment(queue->q_exts_by_addr, rs,
- sio->sio_offset, rs->rs_end - sio->sio_offset);
-
- return (B_TRUE);
- } else {
- range_tree_remove(queue->q_exts_by_addr, rs->rs_start,
- rs->rs_end - rs->rs_start);
- return (B_FALSE);
- }
-}
-
-
-/*
- * This is called from the queue emptying thread and selects the next
- * extent from which we are to issue io's. The behavior of this function
- * depends on the state of the scan, the current memory consumption and
- * whether or not we are performing a scan shutdown.
- * 1) We select extents in an elevator algorithm (LBA-order) if the scan
- * needs to perform a checkpoint
- * 2) We select the largest available extent if we are up against the
- * memory limit.
- * 3) Otherwise we don't select any extents.
- */
-static const range_seg_t *
-scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
-{
- dsl_scan_t *scn = queue->q_scn;
-
- ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
- ASSERT(scn->scn_is_sorted);
-
- /* handle tunable overrides */
- if (scn->scn_checkpointing || scn->scn_clearing) {
- if (zfs_scan_issue_strategy == 1) {
- return (range_tree_first(queue->q_exts_by_addr));
- } else if (zfs_scan_issue_strategy == 2) {
- return (avl_first(&queue->q_exts_by_size));
- }
- }
-
- /*
- * During normal clearing, we want to issue our largest segments
- * first, keeping IO as sequential as possible, and leaving the
- * smaller extents for later with the hope that they might eventually
- * grow to larger sequential segments. However, when the scan is
- * checkpointing, no new extents will be added to the sorting queue,
- * so the way we are sorted now is as good as it will ever get.
- * In this case, we instead switch to issuing extents in LBA order.
- */
- if (scn->scn_checkpointing) {
- return (range_tree_first(queue->q_exts_by_addr));
- } else if (scn->scn_clearing) {
- return (avl_first(&queue->q_exts_by_size));
- } else {
- return (NULL);
- }
-}
-
-static void
-scan_io_queues_run_one(void *arg)
-{
- dsl_scan_io_queue_t *queue = arg;
- kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
- boolean_t suspended = B_FALSE;
- range_seg_t *rs = NULL;
- scan_io_t *sio = NULL;
- list_t sio_list;
- uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
- uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
-
- ASSERT(queue->q_scn->scn_is_sorted);
-
- list_create(&sio_list, sizeof (scan_io_t),
- offsetof(scan_io_t, sio_nodes.sio_list_node));
- mutex_enter(q_lock);
-
- /* calculate maximum in-flight bytes for this txg (min 1MB) */
- queue->q_maxinflight_bytes =
- MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
-
- /* reset per-queue scan statistics for this txg */
- queue->q_total_seg_size_this_txg = 0;
- queue->q_segs_this_txg = 0;
- queue->q_total_zio_size_this_txg = 0;
- queue->q_zios_this_txg = 0;
-
- /* loop until we have run out of time or sios */
- while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) {
- uint64_t seg_start = 0, seg_end = 0;
- boolean_t more_left = B_TRUE;
-
- ASSERT(list_is_empty(&sio_list));
-
- /* loop while we still have sios left to process in this rs */
- while (more_left) {
- scan_io_t *first_sio, *last_sio;
-
- /*
- * We have selected which extent needs to be
- * processed next. Gather up the corresponding sios.
- */
- more_left = scan_io_queue_gather(queue, rs, &sio_list);
- ASSERT(!list_is_empty(&sio_list));
- first_sio = list_head(&sio_list);
- last_sio = list_tail(&sio_list);
-
- seg_end = last_sio->sio_offset + last_sio->sio_asize;
- if (seg_start == 0)
- seg_start = first_sio->sio_offset;
-
- /*
- * Issuing sios can take a long time so drop the
- * queue lock. The sio queue won't be updated by
- * other threads since we're in syncing context so
- * we can be sure that our trees will remain exactly
- * as we left them.
- */
- mutex_exit(q_lock);
- suspended = scan_io_queue_issue(queue, &sio_list);
- mutex_enter(q_lock);
-
- if (suspended)
- break;
- }
- /* update statistics for debugging purposes */
- scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
-
- if (suspended)
- break;
- }
-
-
- /* If we were suspended in the middle of processing,
- * requeue any unfinished sios and exit.
- */
- while ((sio = list_head(&sio_list)) != NULL) {
- list_remove(&sio_list, sio);
- scan_io_queue_insert_impl(queue, sio);
- }
-
- mutex_exit(q_lock);
- list_destroy(&sio_list);
-}
-
-/*
- * Performs an emptying run on all scan queues in the pool. This just
- * punches out one thread per top-level vdev, each of which processes
- * only that vdev's scan queue. We can parallelize the I/O here because
- * we know that each queue's io's only affect its own top-level vdev.
- *
- * This function waits for the queue runs to complete, and must be
- * called from dsl_scan_sync (or in general, syncing context).
- */
-static void
-scan_io_queues_run(dsl_scan_t *scn)
-{
- spa_t *spa = scn->scn_dp->dp_spa;
-
- ASSERT(scn->scn_is_sorted);
- ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
-
- if (scn->scn_bytes_pending == 0)
- return;
-
- if (scn->scn_taskq == NULL) {
- char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16,
- KM_SLEEP);
- int nthreads = spa->spa_root_vdev->vdev_children;
-
- /*
- * We need to make this taskq *always* execute as many
- * threads in parallel as we have top-level vdevs and no
- * less, otherwise strange serialization of the calls to
- * scan_io_queues_run_one can occur during spa_sync runs
- * and that significantly impacts performance.
- */
- (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16,
- "dsl_scan_tq_%s", spa->spa_name);
- scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri,
- nthreads, nthreads, TASKQ_PREPOPULATE);
- kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16);
- }
-
- for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
- vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
-
- mutex_enter(&vd->vdev_scan_io_queue_lock);
- if (vd->vdev_scan_io_queue != NULL) {
- VERIFY(taskq_dispatch(scn->scn_taskq,
- scan_io_queues_run_one, vd->vdev_scan_io_queue,
- TQ_SLEEP) != TASKQID_INVALID);
- }
- mutex_exit(&vd->vdev_scan_io_queue_lock);
- }
-
- /*
- * Wait for the queues to finish issuing thir IOs for this run
- * before we return. There may still be IOs in flight at this
- * point.
- */
- taskq_wait(scn->scn_taskq);
-}
-
-static boolean_t
-dsl_scan_async_block_should_pause(dsl_scan_t *scn)
-{
- uint64_t elapsed_nanosecs;
-
- if (zfs_recover)
- return (B_FALSE);
-
- if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks)
- return (B_TRUE);
-
- elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
- return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
- (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
- txg_sync_waiting(scn->scn_dp)) ||
- spa_shutting_down(scn->scn_dp->dp_spa));
-}
-
-static int
-dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- dsl_scan_t *scn = arg;
-
- if (!scn->scn_is_bptree ||
- (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
- if (dsl_scan_async_block_should_pause(scn))
- return (SET_ERROR(ERESTART));
- }
-
- zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
- dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
- dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
- -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
- -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
- scn->scn_visited_this_txg++;
- return (0);
-}
-
-static void
-dsl_scan_update_stats(dsl_scan_t *scn)
-{
- spa_t *spa = scn->scn_dp->dp_spa;
- uint64_t i;
- uint64_t seg_size_total = 0, zio_size_total = 0;
- uint64_t seg_count_total = 0, zio_count_total = 0;
-
- for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
- vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
- dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
-
- if (queue == NULL)
- continue;
-
- seg_size_total += queue->q_total_seg_size_this_txg;
- zio_size_total += queue->q_total_zio_size_this_txg;
- seg_count_total += queue->q_segs_this_txg;
- zio_count_total += queue->q_zios_this_txg;
- }
-
- if (seg_count_total == 0 || zio_count_total == 0) {
- scn->scn_avg_seg_size_this_txg = 0;
- scn->scn_avg_zio_size_this_txg = 0;
- scn->scn_segs_this_txg = 0;
- scn->scn_zios_this_txg = 0;
- return;
- }
-
- scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
- scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
- scn->scn_segs_this_txg = seg_count_total;
- scn->scn_zios_this_txg = zio_count_total;
-}
-
-static int
-dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- dsl_scan_t *scn = arg;
- const dva_t *dva = &bp->blk_dva[0];
-
- if (dsl_scan_async_block_should_pause(scn))
- return (SET_ERROR(ERESTART));
-
- spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
- DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
- DVA_GET_ASIZE(dva), tx);
- scn->scn_visited_this_txg++;
- return (0);
-}
-
-boolean_t
-dsl_scan_active(dsl_scan_t *scn)
-{
- spa_t *spa = scn->scn_dp->dp_spa;
- uint64_t used = 0, comp, uncomp;
-
- if (spa->spa_load_state != SPA_LOAD_NONE)
- return (B_FALSE);
- if (spa_shutting_down(spa))
- return (B_FALSE);
- if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
- (scn->scn_async_destroying && !scn->scn_async_stalled))
- return (B_TRUE);
-
- if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
- (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
- &used, &comp, &uncomp);
- }
- return (used != 0);
-}
-
-static boolean_t
-dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
- uint64_t phys_birth)
-{
- vdev_t *vd;
-
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
-
- if (vd->vdev_ops == &vdev_indirect_ops) {
- /*
- * The indirect vdev can point to multiple
- * vdevs. For simplicity, always create
- * the resilver zio_t. zio_vdev_io_start()
- * will bypass the child resilver i/o's if
- * they are on vdevs that don't have DTL's.
- */
- return (B_TRUE);
- }
-
- if (DVA_GET_GANG(dva)) {
- /*
- * Gang members may be spread across multiple
- * vdevs, so the best estimate we have is the
- * scrub range, which has already been checked.
- * XXX -- it would be better to change our
- * allocation policy to ensure that all
- * gang members reside on the same vdev.
- */
- return (B_TRUE);
- }
-
- /*
- * Check if the txg falls within the range which must be
- * resilvered. DVAs outside this range can always be skipped.
- */
- if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
- return (B_FALSE);
-
- /*
- * Check if the top-level vdev must resilver this offset.
- * When the offset does not intersect with a dirty leaf DTL
- * then it may be possible to skip the resilver IO. The psize
- * is provided instead of asize to simplify the check for RAIDZ.
- */
- if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-static int
-dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- int err = 0;
- dsl_scan_t *scn = dp->dp_scan;
- spa_t *spa = dp->dp_spa;
-
- if (spa_suspend_async_destroy(spa))
- return (0);
-
- if (zfs_free_bpobj_enabled &&
- spa_version(spa) >= SPA_VERSION_DEADLISTS) {
- scn->scn_is_bptree = B_FALSE;
- scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
- scn->scn_zio_root = zio_root(spa, NULL,
- NULL, ZIO_FLAG_MUSTSUCCEED);
- err = bpobj_iterate(&dp->dp_free_bpobj,
- dsl_scan_free_block_cb, scn, tx);
- VERIFY0(zio_wait(scn->scn_zio_root));
- scn->scn_zio_root = NULL;
-
- if (err != 0 && err != ERESTART)
- zfs_panic_recover("error %u from bpobj_iterate()", err);
- }
-
- if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
- ASSERT(scn->scn_async_destroying);
- scn->scn_is_bptree = B_TRUE;
- scn->scn_zio_root = zio_root(spa, NULL,
- NULL, ZIO_FLAG_MUSTSUCCEED);
- err = bptree_iterate(dp->dp_meta_objset,
- dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
- VERIFY0(zio_wait(scn->scn_zio_root));
- scn->scn_zio_root = NULL;
-
- if (err == EIO || err == ECKSUM) {
- err = 0;
- } else if (err != 0 && err != ERESTART) {
- zfs_panic_recover("error %u from "
- "traverse_dataset_destroyed()", err);
- }
-
- if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
- /* finished; deactivate async destroy feature */
- spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
- ASSERT(!spa_feature_is_active(spa,
- SPA_FEATURE_ASYNC_DESTROY));
- VERIFY0(zap_remove(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_BPTREE_OBJ, tx));
- VERIFY0(bptree_free(dp->dp_meta_objset,
- dp->dp_bptree_obj, tx));
- dp->dp_bptree_obj = 0;
- scn->scn_async_destroying = B_FALSE;
- scn->scn_async_stalled = B_FALSE;
- } else {
- /*
- * If we didn't make progress, mark the async
- * destroy as stalled, so that we will not initiate
- * a spa_sync() on its behalf. Note that we only
- * check this if we are not finished, because if the
- * bptree had no blocks for us to visit, we can
- * finish without "making progress".
- */
- scn->scn_async_stalled =
- (scn->scn_visited_this_txg == 0);
- }
- }
- if (scn->scn_visited_this_txg) {
- zfs_dbgmsg("freed %llu blocks in %llums from "
- "free_bpobj/bptree txg %llu; err=%d",
- (longlong_t)scn->scn_visited_this_txg,
- (longlong_t)
- NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
- (longlong_t)tx->tx_txg, err);
- scn->scn_visited_this_txg = 0;
-
- /*
- * Write out changes to the DDT that may be required as a
- * result of the blocks freed. This ensures that the DDT
- * is clean when a scrub/resilver runs.
- */
- ddt_sync(spa, tx->tx_txg);
- }
- if (err != 0)
- return (err);
- if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
- zfs_free_leak_on_eio &&
- (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
- dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
- dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
- /*
- * We have finished background destroying, but there is still
- * some space left in the dp_free_dir. Transfer this leaked
- * space to the dp_leak_dir.
- */
- if (dp->dp_leak_dir == NULL) {
- rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
- (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
- LEAK_DIR_NAME, tx);
- VERIFY0(dsl_pool_open_special_dir(dp,
- LEAK_DIR_NAME, &dp->dp_leak_dir));
- rrw_exit(&dp->dp_config_rwlock, FTAG);
- }
- dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
- dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
- dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
- dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
- dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
- -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
- -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
- -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
- }
-
- if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
- /* finished; verify that space accounting went to zero */
- ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
- ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
- ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
- }
-
- EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
- 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_OBSOLETE_BPOBJ));
- if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
- ASSERT(spa_feature_is_active(dp->dp_spa,
- SPA_FEATURE_OBSOLETE_COUNTS));
-
- scn->scn_is_bptree = B_FALSE;
- scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
- err = bpobj_iterate(&dp->dp_obsolete_bpobj,
- dsl_scan_obsolete_block_cb, scn, tx);
- if (err != 0 && err != ERESTART)
- zfs_panic_recover("error %u from bpobj_iterate()", err);
-
- if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
- dsl_pool_destroy_obsolete_bpobj(dp, tx);
- }
-
- return (0);
-}
-
-/*
- * This is the primary entry point for scans that is called from syncing
- * context. Scans must happen entirely during syncing context so that we
- * cna guarantee that blocks we are currently scanning will not change out
- * from under us. While a scan is active, this funciton controls how quickly
- * transaction groups proceed, instead of the normal handling provided by
- * txg_sync_thread().
- */
-void
-dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- dsl_scan_t *scn = dp->dp_scan;
- spa_t *spa = dp->dp_spa;
- int err = 0;
- state_sync_type_t sync_type = SYNC_OPTIONAL;
-
- /*
- * Check for scn_restart_txg before checking spa_load_state, so
- * that we can restart an old-style scan while the pool is being
- * imported (see dsl_scan_init).
- */
- if (dsl_scan_restarting(scn, tx)) {
- pool_scan_func_t func = POOL_SCAN_SCRUB;
- dsl_scan_done(scn, B_FALSE, tx);
- if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
- func = POOL_SCAN_RESILVER;
- zfs_dbgmsg("restarting scan func=%u txg=%llu",
- func, (longlong_t)tx->tx_txg);
- dsl_scan_setup_sync(&func, tx);
- }
-
- /*
- * Only process scans in sync pass 1.
- */
- if (spa_sync_pass(dp->dp_spa) > 1)
- return;
-
- /*
- * If the spa is shutting down, then stop scanning. This will
- * ensure that the scan does not dirty any new data during the
- * shutdown phase.
- */
- if (spa_shutting_down(spa))
- return;
-
- /*
- * If the scan is inactive due to a stalled async destroy, try again.
- */
- if (!scn->scn_async_stalled && !dsl_scan_active(scn))
- return;
-
- /* reset scan statistics */
- scn->scn_visited_this_txg = 0;
- scn->scn_holes_this_txg = 0;
- scn->scn_lt_min_this_txg = 0;
- scn->scn_gt_max_this_txg = 0;
- scn->scn_ddt_contained_this_txg = 0;
- scn->scn_objsets_visited_this_txg = 0;
- scn->scn_avg_seg_size_this_txg = 0;
- scn->scn_segs_this_txg = 0;
- scn->scn_avg_zio_size_this_txg = 0;
- scn->scn_zios_this_txg = 0;
- scn->scn_suspending = B_FALSE;
- scn->scn_sync_start_time = gethrtime();
- spa->spa_scrub_active = B_TRUE;
-
- /*
- * First process the async destroys. If we pause, don't do
- * any scrubbing or resilvering. This ensures that there are no
- * async destroys while we are scanning, so the scan code doesn't
- * have to worry about traversing it. It is also faster to free the
- * blocks than to scrub them.
- */
- err = dsl_process_async_destroys(dp, tx);
- if (err != 0)
- return;
-
- if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
- return;
-
- /*
- * Wait a few txgs after importing to begin scanning so that
- * we can get the pool imported quickly.
- */
- if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
- return;
-
- /*
- * It is possible to switch from unsorted to sorted at any time,
- * but afterwards the scan will remain sorted unless reloaded from
- * a checkpoint after a reboot.
- */
- if (!zfs_scan_legacy) {
- scn->scn_is_sorted = B_TRUE;
- if (scn->scn_last_checkpoint == 0)
- scn->scn_last_checkpoint = ddi_get_lbolt();
- }
-
- /*
- * For sorted scans, determine what kind of work we will be doing
- * this txg based on our memory limitations and whether or not we
- * need to perform a checkpoint.
- */
- if (scn->scn_is_sorted) {
- /*
- * If we are over our checkpoint interval, set scn_clearing
- * so that we can begin checkpointing immediately. The
- * checkpoint allows us to save a consisent bookmark
- * representing how much data we have scrubbed so far.
- * Otherwise, use the memory limit to determine if we should
- * scan for metadata or start issue scrub IOs. We accumulate
- * metadata until we hit our hard memory limit at which point
- * we issue scrub IOs until we are at our soft memory limit.
- */
- if (scn->scn_checkpointing ||
- ddi_get_lbolt() - scn->scn_last_checkpoint >
- SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
- if (!scn->scn_checkpointing)
- zfs_dbgmsg("begin scan checkpoint");
-
- scn->scn_checkpointing = B_TRUE;
- scn->scn_clearing = B_TRUE;
- } else {
- boolean_t should_clear = dsl_scan_should_clear(scn);
- if (should_clear && !scn->scn_clearing) {
- zfs_dbgmsg("begin scan clearing");
- scn->scn_clearing = B_TRUE;
- } else if (!should_clear && scn->scn_clearing) {
- zfs_dbgmsg("finish scan clearing");
- scn->scn_clearing = B_FALSE;
- }
- }
- } else {
- ASSERT0(scn->scn_checkpointing);
- ASSERT0(scn->scn_clearing);
- }
-
- if (!scn->scn_clearing && scn->scn_done_txg == 0) {
- /* Need to scan metadata for more blocks to scrub */
- dsl_scan_phys_t *scnp = &scn->scn_phys;
- taskqid_t prefetch_tqid;
- uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
- uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
-
- /*
- * Recalculate the max number of in-flight bytes for pool-wide
- * scanning operations (minimum 1MB). Limits for the issuing
- * phase are done per top-level vdev and are handled separately.
- */
- scn->scn_maxinflight_bytes =
- MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
-
- if (scnp->scn_ddt_bookmark.ddb_class <=
- scnp->scn_ddt_class_max) {
- ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
- zfs_dbgmsg("doing scan sync txg %llu; "
- "ddt bm=%llu/%llu/%llu/%llx",
- (longlong_t)tx->tx_txg,
- (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
- (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
- (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
- (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
- } else {
- zfs_dbgmsg("doing scan sync txg %llu; "
- "bm=%llu/%llu/%llu/%llu",
- (longlong_t)tx->tx_txg,
- (longlong_t)scnp->scn_bookmark.zb_objset,
- (longlong_t)scnp->scn_bookmark.zb_object,
- (longlong_t)scnp->scn_bookmark.zb_level,
- (longlong_t)scnp->scn_bookmark.zb_blkid);
- }
-
- scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
- NULL, ZIO_FLAG_CANFAIL);
-
- scn->scn_prefetch_stop = B_FALSE;
- prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
- dsl_scan_prefetch_thread, scn, TQ_SLEEP);
- ASSERT(prefetch_tqid != TASKQID_INVALID);
-
- dsl_pool_config_enter(dp, FTAG);
- dsl_scan_visit(scn, tx);
- dsl_pool_config_exit(dp, FTAG);
-
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- scn->scn_prefetch_stop = B_TRUE;
- cv_broadcast(&spa->spa_scrub_io_cv);
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
-
- taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
- (void) zio_wait(scn->scn_zio_root);
- scn->scn_zio_root = NULL;
-
- zfs_dbgmsg("scan visited %llu blocks in %llums "
- "(%llu os's, %llu holes, %llu < mintxg, "
- "%llu in ddt, %llu > maxtxg)",
- (longlong_t)scn->scn_visited_this_txg,
- (longlong_t)NSEC2MSEC(gethrtime() -
- scn->scn_sync_start_time),
- (longlong_t)scn->scn_objsets_visited_this_txg,
- (longlong_t)scn->scn_holes_this_txg,
- (longlong_t)scn->scn_lt_min_this_txg,
- (longlong_t)scn->scn_ddt_contained_this_txg,
- (longlong_t)scn->scn_gt_max_this_txg);
-
- if (!scn->scn_suspending) {
- ASSERT0(avl_numnodes(&scn->scn_queue));
- scn->scn_done_txg = tx->tx_txg + 1;
- if (scn->scn_is_sorted) {
- scn->scn_checkpointing = B_TRUE;
- scn->scn_clearing = B_TRUE;
- }
- zfs_dbgmsg("scan complete txg %llu",
- (longlong_t)tx->tx_txg);
- }
- } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
- ASSERT(scn->scn_clearing);
-
- /* need to issue scrubbing IOs from per-vdev queues */
- scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
- NULL, ZIO_FLAG_CANFAIL);
- scan_io_queues_run(scn);
- (void) zio_wait(scn->scn_zio_root);
- scn->scn_zio_root = NULL;
-
- /* calculate and dprintf the current memory usage */
- (void) dsl_scan_should_clear(scn);
- dsl_scan_update_stats(scn);
-
- zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums "
- "(avg_block_size = %llu, avg_seg_size = %llu)",
- (longlong_t)scn->scn_zios_this_txg,
- (longlong_t)scn->scn_segs_this_txg,
- (longlong_t)NSEC2MSEC(gethrtime() -
- scn->scn_sync_start_time),
- (longlong_t)scn->scn_avg_zio_size_this_txg,
- (longlong_t)scn->scn_avg_seg_size_this_txg);
- } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
- /* Finished with everything. Mark the scrub as complete */
- zfs_dbgmsg("scan issuing complete txg %llu",
- (longlong_t)tx->tx_txg);
- ASSERT3U(scn->scn_done_txg, !=, 0);
- ASSERT0(spa->spa_scrub_inflight);
- ASSERT0(scn->scn_bytes_pending);
- dsl_scan_done(scn, B_TRUE, tx);
- sync_type = SYNC_MANDATORY;
- }
-
- dsl_scan_sync_state(scn, tx, sync_type);
-}
-
-static void
-count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
-{
- int i;
-
- /* update the spa's stats on how many bytes we have issued */
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
- atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[i]));
- }
-
- /*
- * If we resume after a reboot, zab will be NULL; don't record
- * incomplete stats in that case.
- */
- if (zab == NULL)
- return;
-
- mutex_enter(&zab->zab_lock);
-
- for (i = 0; i < 4; i++) {
- int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
- int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
- if (t & DMU_OT_NEWTYPE)
- t = DMU_OT_OTHER;
- zfs_blkstat_t *zb = &zab->zab_type[l][t];
- int equal;
-
- zb->zb_count++;
- zb->zb_asize += BP_GET_ASIZE(bp);
- zb->zb_lsize += BP_GET_LSIZE(bp);
- zb->zb_psize += BP_GET_PSIZE(bp);
- zb->zb_gangs += BP_COUNT_GANG(bp);
-
- switch (BP_GET_NDVAS(bp)) {
- case 2:
- if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1]))
- zb->zb_ditto_2_of_2_samevdev++;
- break;
- case 3:
- equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1])) +
- (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[2])) +
- (DVA_GET_VDEV(&bp->blk_dva[1]) ==
- DVA_GET_VDEV(&bp->blk_dva[2]));
- if (equal == 1)
- zb->zb_ditto_2_of_3_samevdev++;
- else if (equal == 3)
- zb->zb_ditto_3_of_3_samevdev++;
- break;
- }
- }
-
- mutex_exit(&zab->zab_lock);
-}
-
-static void
-scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
-{
- avl_index_t idx;
- int64_t asize = sio->sio_asize;
- dsl_scan_t *scn = queue->q_scn;
-
- ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
-
- if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
- /* block is already scheduled for reading */
- atomic_add_64(&scn->scn_bytes_pending, -asize);
- kmem_free(sio, sizeof (*sio));
- return;
- }
- avl_insert(&queue->q_sios_by_addr, sio, idx);
- range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
-}
-
-/*
- * Given all the info we got from our metadata scanning process, we
- * construct a scan_io_t and insert it into the scan sorting queue. The
- * I/O must already be suitable for us to process. This is controlled
- * by dsl_scan_enqueue().
- */
-static void
-scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
- int zio_flags, const zbookmark_phys_t *zb)
-{
- dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);
-
- ASSERT0(BP_IS_GANG(bp));
- ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
-
- bp2sio(bp, sio, dva_i);
- sio->sio_flags = zio_flags;
- sio->sio_zb = *zb;
-
- /*
- * Increment the bytes pending counter now so that we can't
- * get an integer underflow in case the worker processes the
- * zio before we get to incrementing this counter.
- */
- atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
-
- scan_io_queue_insert_impl(queue, sio);
-}
-
-/*
- * Given a set of I/O parameters as discovered by the metadata traversal
- * process, attempts to place the I/O into the sorted queues (if allowed),
- * or immediately executes the I/O.
- */
-static void
-dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
- const zbookmark_phys_t *zb)
-{
- spa_t *spa = dp->dp_spa;
-
- ASSERT(!BP_IS_EMBEDDED(bp));
-
- /*
- * Gang blocks are hard to issue sequentially, so we just issue them
- * here immediately instead of queuing them.
- */
- if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
- scan_exec_io(dp, bp, zio_flags, zb, NULL);
- return;
- }
- for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
- dva_t dva;
- vdev_t *vdev;
-
- dva = bp->blk_dva[i];
- vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
- ASSERT(vdev != NULL);
-
- mutex_enter(&vdev->vdev_scan_io_queue_lock);
- if (vdev->vdev_scan_io_queue == NULL)
- vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
- ASSERT(dp->dp_scan != NULL);
- scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
- i, zio_flags, zb);
- mutex_exit(&vdev->vdev_scan_io_queue_lock);
- }
-}
-
-static int
-dsl_scan_scrub_cb(dsl_pool_t *dp,
- const blkptr_t *bp, const zbookmark_phys_t *zb)
-{
- dsl_scan_t *scn = dp->dp_scan;
- spa_t *spa = dp->dp_spa;
- uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
- size_t psize = BP_GET_PSIZE(bp);
- boolean_t needs_io;
- int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
- int d;
-
- if (phys_birth <= scn->scn_phys.scn_min_txg ||
- phys_birth >= scn->scn_phys.scn_max_txg) {
- count_block(scn, dp->dp_blkstats, bp);
- return (0);
- }
-
- /* Embedded BP's have phys_birth==0, so we reject them above. */
- ASSERT(!BP_IS_EMBEDDED(bp));
-
- ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
- if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
- zio_flags |= ZIO_FLAG_SCRUB;
- needs_io = B_TRUE;
- } else {
- ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
- zio_flags |= ZIO_FLAG_RESILVER;
- needs_io = B_FALSE;
- }
-
- /* If it's an intent log block, failure is expected. */
- if (zb->zb_level == ZB_ZIL_LEVEL)
- zio_flags |= ZIO_FLAG_SPECULATIVE;
-
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- const dva_t *dva = &bp->blk_dva[d];
-
- /*
- * Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
- */
- scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
- spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
-
- /* if it's a resilver, this may not be in the target range */
- if (!needs_io)
- needs_io = dsl_scan_need_resilver(spa, dva, psize,
- phys_birth);
- }
-
- if (needs_io && !zfs_no_scrub_io) {
- dsl_scan_enqueue(dp, bp, zio_flags, zb);
- } else {
- count_block(scn, dp->dp_blkstats, bp);
- }
-
- /* do not relocate this block */
- return (0);
-}
-
-static void
-dsl_scan_scrub_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- blkptr_t *bp = zio->io_bp;
- dsl_scan_io_queue_t *queue = zio->io_private;
-
- abd_free(zio->io_abd);
-
- if (queue == NULL) {
- mutex_enter(&spa->spa_scrub_lock);
- ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
- spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
- cv_broadcast(&spa->spa_scrub_io_cv);
- mutex_exit(&spa->spa_scrub_lock);
- } else {
- mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
- ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
- queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
- cv_broadcast(&queue->q_zio_cv);
- mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
- }
-
- if (zio->io_error && (zio->io_error != ECKSUM ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
- atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
- }
-}
-
-/*
- * Given a scanning zio's information, executes the zio. The zio need
- * not necessarily be only sortable, this function simply executes the
- * zio, no matter what it is. The optional queue argument allows the
- * caller to specify that they want per top level vdev IO rate limiting
- * instead of the legacy global limiting.
- */
-static void
-scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
- const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
-{
- spa_t *spa = dp->dp_spa;
- dsl_scan_t *scn = dp->dp_scan;
- size_t size = BP_GET_PSIZE(bp);
- abd_t *data = abd_alloc_for_io(size, B_FALSE);
- unsigned int scan_delay = 0;
-
- ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
-
- if (queue == NULL) {
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
- mutex_exit(&spa->spa_scrub_lock);
- } else {
- kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
-
- mutex_enter(q_lock);
- while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
- cv_wait(&queue->q_zio_cv, q_lock);
- queue->q_inflight_bytes += BP_GET_PSIZE(bp);
- mutex_exit(q_lock);
- }
-
- if (zio_flags & ZIO_FLAG_RESILVER)
- scan_delay = zfs_resilver_delay;
- else {
- ASSERT(zio_flags & ZIO_FLAG_SCRUB);
- scan_delay = zfs_scrub_delay;
- }
-
- if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle))
- delay(MAX((int)scan_delay, 0));
-
- count_block(dp->dp_scan, dp->dp_blkstats, bp);
- zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size,
- dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
-}
-
-/*
- * This is the primary extent sorting algorithm. We balance two parameters:
- * 1) how many bytes of I/O are in an extent
- * 2) how well the extent is filled with I/O (as a fraction of its total size)
- * Since we allow extents to have gaps between their constituent I/Os, it's
- * possible to have a fairly large extent that contains the same amount of
- * I/O bytes than a much smaller extent, which just packs the I/O more tightly.
- * The algorithm sorts based on a score calculated from the extent's size,
- * the relative fill volume (in %) and a "fill weight" parameter that controls
- * the split between whether we prefer larger extents or more well populated
- * extents:
- *
- * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
- *
- * Example:
- * 1) assume extsz = 64 MiB
- * 2) assume fill = 32 MiB (extent is half full)
- * 3) assume fill_weight = 3
- * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
- * SCORE = 32M + (50 * 3 * 32M) / 100
- * SCORE = 32M + (4800M / 100)
- * SCORE = 32M + 48M
- * ^ ^
- * | +--- final total relative fill-based score
- * +--------- final total fill-based score
- * SCORE = 80M
- *
- * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards
- * extents that are more completely filled (in a 3:2 ratio) vs just larger.
- * Note that as an optimization, we replace multiplication and division by
- * 100 with bitshifting by 7 (which effecitvely multiplies and divides by 128).
- */
-static int
-ext_size_compare(const void *x, const void *y)
-{
- const range_seg_t *rsa = x, *rsb = y;
- uint64_t sa = rsa->rs_end - rsa->rs_start,
- sb = rsb->rs_end - rsb->rs_start;
- uint64_t score_a, score_b;
-
- score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
- fill_weight * rsa->rs_fill) >> 7);
- score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
- fill_weight * rsb->rs_fill) >> 7);
-
- if (score_a > score_b)
- return (-1);
- if (score_a == score_b) {
- if (rsa->rs_start < rsb->rs_start)
- return (-1);
- if (rsa->rs_start == rsb->rs_start)
- return (0);
- return (1);
- }
- return (1);
-}
-
-/*
- * Comparator for the q_sios_by_addr tree. Sorting is simply performed
- * based on LBA-order (from lowest to highest).
- */
-static int
-io_addr_compare(const void *x, const void *y)
-{
- const scan_io_t *a = x, *b = y;
-
- if (a->sio_offset < b->sio_offset)
- return (-1);
- if (a->sio_offset == b->sio_offset)
- return (0);
- return (1);
-}
-
-/* IO queues are created on demand when they are needed. */
-static dsl_scan_io_queue_t *
-scan_io_queue_create(vdev_t *vd)
-{
- dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
- dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
-
- q->q_scn = scn;
- q->q_vd = vd;
- cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
- q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
- &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
- avl_create(&q->q_sios_by_addr, io_addr_compare,
- sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
-
- return (q);
-}
-
-/*
- * Destroys a scan queue and all segments and scan_io_t's contained in it.
- * No further execution of I/O occurs, anything pending in the queue is
- * simply freed without being executed.
- */
-void
-dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
-{
- dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio;
- void *cookie = NULL;
- int64_t bytes_dequeued = 0;
-
- ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
-
- while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
- NULL) {
- ASSERT(range_tree_contains(queue->q_exts_by_addr,
- sio->sio_offset, sio->sio_asize));
- bytes_dequeued += sio->sio_asize;
- kmem_free(sio, sizeof (*sio));
- }
-
- atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
- range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
- range_tree_destroy(queue->q_exts_by_addr);
- avl_destroy(&queue->q_sios_by_addr);
- cv_destroy(&queue->q_zio_cv);
-
- kmem_free(queue, sizeof (*queue));
-}
-
-/*
- * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is
- * called on behalf of vdev_top_transfer when creating or destroying
- * a mirror vdev due to zpool attach/detach.
- */
-void
-dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
-{
- mutex_enter(&svd->vdev_scan_io_queue_lock);
- mutex_enter(&tvd->vdev_scan_io_queue_lock);
-
- VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
- tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
- svd->vdev_scan_io_queue = NULL;
- if (tvd->vdev_scan_io_queue != NULL)
- tvd->vdev_scan_io_queue->q_vd = tvd;
-
- mutex_exit(&tvd->vdev_scan_io_queue_lock);
- mutex_exit(&svd->vdev_scan_io_queue_lock);
-}
-
-static void
-scan_io_queues_destroy(dsl_scan_t *scn)
-{
- vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
-
- for (uint64_t i = 0; i < rvd->vdev_children; i++) {
- vdev_t *tvd = rvd->vdev_child[i];
-
- mutex_enter(&tvd->vdev_scan_io_queue_lock);
- if (tvd->vdev_scan_io_queue != NULL)
- dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
- tvd->vdev_scan_io_queue = NULL;
- mutex_exit(&tvd->vdev_scan_io_queue_lock);
- }
-}
-
-static void
-dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
-{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- dsl_scan_t *scn = dp->dp_scan;
- vdev_t *vdev;
- kmutex_t *q_lock;
- dsl_scan_io_queue_t *queue;
- scan_io_t srch, *sio;
- avl_index_t idx;
- uint64_t start, size;
-
- vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
- ASSERT(vdev != NULL);
- q_lock = &vdev->vdev_scan_io_queue_lock;
- queue = vdev->vdev_scan_io_queue;
-
- mutex_enter(q_lock);
- if (queue == NULL) {
- mutex_exit(q_lock);
- return;
- }
-
- bp2sio(bp, &srch, dva_i);
- start = srch.sio_offset;
- size = srch.sio_asize;
-
- /*
- * We can find the zio in two states:
- * 1) Cold, just sitting in the queue of zio's to be issued at
- * some point in the future. In this case, all we do is
- * remove the zio from the q_sios_by_addr tree, decrement
- * its data volume from the containing range_seg_t and
- * resort the q_exts_by_size tree to reflect that the
- * range_seg_t has lost some of its 'fill'. We don't shorten
- * the range_seg_t - this is usually rare enough not to be
- * worth the extra hassle of trying keep track of precise
- * extent boundaries.
- * 2) Hot, where the zio is currently in-flight in
- * dsl_scan_issue_ios. In this case, we can't simply
- * reach in and stop the in-flight zio's, so we instead
- * block the caller. Eventually, dsl_scan_issue_ios will
- * be done with issuing the zio's it gathered and will
- * signal us.
- */
- sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
- if (sio != NULL) {
- int64_t asize = sio->sio_asize;
- blkptr_t tmpbp;
-
- /* Got it while it was cold in the queue */
- ASSERT3U(start, ==, sio->sio_offset);
- ASSERT3U(size, ==, asize);
- avl_remove(&queue->q_sios_by_addr, sio);
-
- ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
- range_tree_remove_fill(queue->q_exts_by_addr, start, size);
-
- /*
- * We only update scn_bytes_pending in the cold path,
- * otherwise it will already have been accounted for as
- * part of the zio's execution.
- */
- atomic_add_64(&scn->scn_bytes_pending, -asize);
-
- /* count the block as though we issued it */
- sio2bp(sio, &tmpbp, dva_i);
- count_block(scn, dp->dp_blkstats, &tmpbp);
-
- kmem_free(sio, sizeof (*sio));
- }
- mutex_exit(q_lock);
-}
-
-/*
- * Callback invoked when a zio_free() zio is executing. This needs to be
- * intercepted to prevent the zio from deallocating a particular portion
- * of disk space and it then getting reallocated and written to, while we
- * still have it queued up for processing.
- */
-void
-dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
-{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- dsl_scan_t *scn = dp->dp_scan;
-
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT(scn != NULL);
- if (!dsl_scan_is_running(scn))
- return;
-
- for (int i = 0; i < BP_GET_NDVAS(bp); i++)
- dsl_scan_freed_dva(spa, bp, i);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
@@ -1,256 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/metaslab.h>
-
-#define DST_AVG_BLKSHIFT 14
-
-/* ARGSUSED */
-static int
-dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
-{
- return (0);
-}
-
-static int
-dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
- dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, boolean_t early)
-{
- spa_t *spa;
- dmu_tx_t *tx;
- int err;
- dsl_sync_task_t dst = { 0 };
- dsl_pool_t *dp;
-
- err = spa_open(pool, &spa, FTAG);
- if (err != 0)
- return (err);
- dp = spa_get_dsl(spa);
-
-top:
- tx = dmu_tx_create_dd(dp->dp_mos_dir);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
-
- dst.dst_pool = dp;
- dst.dst_txg = dmu_tx_get_txg(tx);
- dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
- dst.dst_space_check = space_check;
- dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
- dst.dst_syncfunc = syncfunc;
- dst.dst_arg = arg;
- dst.dst_error = 0;
- dst.dst_nowaiter = B_FALSE;
-
- dsl_pool_config_enter(dp, FTAG);
- err = dst.dst_checkfunc(arg, tx);
- dsl_pool_config_exit(dp, FTAG);
-
- if (err != 0) {
- dmu_tx_commit(tx);
- spa_close(spa, FTAG);
- return (err);
- }
-
- txg_list_t *task_list = (early) ?
- &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
- VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
-
- dmu_tx_commit(tx);
-
- if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) {
- /* current contract is to call func once */
- sigfunc(arg, tx);
- sigfunc = NULL; /* in case of an EAGAIN retry */
- }
- txg_wait_synced(dp, dst.dst_txg);
-
- if (dst.dst_error == EAGAIN) {
- txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
- goto top;
- }
-
- spa_close(spa, FTAG);
- return (dst.dst_error);
-}
-
-/*
- * Called from open context to perform a callback in syncing context. Waits
- * for the operation to complete.
- *
- * The checkfunc will be called from open context as a preliminary check
- * which can quickly fail. If it succeeds, it will be called again from
- * syncing context. The checkfunc should generally be designed to work
- * properly in either context, but if necessary it can check
- * dmu_tx_is_syncing(tx).
- *
- * The synctask infrastructure enforces proper locking strategy with respect
- * to the dp_config_rwlock -- the lock will always be held when the callbacks
- * are called. It will be held for read during the open-context (preliminary)
- * call to the checkfunc, and then held for write from syncing context during
- * the calls to the check and sync funcs.
- *
- * A dataset or pool name can be passed as the first argument. Typically,
- * the check func will hold, check the return value of the hold, and then
- * release the dataset. The sync func will VERIFYO(hold()) the dataset.
- * This is safe because no changes can be made between the check and sync funcs,
- * and the sync func will only be called if the check func successfully opened
- * the dataset.
- */
-int
-dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
- dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check)
-{
- return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
- blocks_modified, space_check, B_FALSE));
-}
-
-/*
- * An early synctask works exactly as a standard synctask with one important
- * difference on the way it is handled during syncing context. Standard
- * synctasks run after we've written out all the dirty blocks of dirty
- * datasets. Early synctasks are executed before writing out any dirty data,
- * and thus before standard synctasks.
- *
- * For that reason, early synctasks can affect the process of writing dirty
- * changes to disk for the txg that they run and should be used with caution.
- * In addition, early synctasks should not dirty any metaslabs as this would
- * invalidate the precodition/invariant for subsequent early synctasks.
- * [see dsl_pool_sync() and dsl_early_sync_task_verify()]
- */
-int
-dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
- dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check)
-{
- return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
- blocks_modified, space_check, B_TRUE));
-}
-
-/*
- * A standard synctask that can be interrupted from a signal. The sigfunc
- * is called once if a signal occurred while waiting for the task to sync.
- */
-int
-dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
- dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check)
-{
- return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg,
- blocks_modified, space_check, B_FALSE));
-}
-
-static void
-dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
- boolean_t early)
-{
- dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
-
- dst->dst_pool = dp;
- dst->dst_txg = dmu_tx_get_txg(tx);
- dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
- dst->dst_space_check = space_check;
- dst->dst_checkfunc = dsl_null_checkfunc;
- dst->dst_syncfunc = syncfunc;
- dst->dst_arg = arg;
- dst->dst_error = 0;
- dst->dst_nowaiter = B_TRUE;
-
- txg_list_t *task_list = (early) ?
- &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
- VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
-}
-
-void
-dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
-{
- dsl_sync_task_nowait_common(dp, syncfunc, arg,
- blocks_modified, space_check, tx, B_FALSE);
-}
-
-void
-dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
-{
- dsl_sync_task_nowait_common(dp, syncfunc, arg,
- blocks_modified, space_check, tx, B_TRUE);
-}
-
-/*
- * Called in syncing context to execute the synctask.
- */
-void
-dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dst->dst_pool;
-
- ASSERT0(dst->dst_error);
-
- /*
- * Check for sufficient space.
- *
- * When the sync task was created, the caller specified the
- * type of space checking required. See the comment in
- * zfs_space_check_t for details on the semantics of each
- * type of space checking.
- *
- * We just check against what's on-disk; we don't want any
- * in-flight accounting to get in our way, because open context
- * may have already used up various in-core limits
- * (arc_tempreserve, dsl_pool_tempreserve).
- */
- if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
- uint64_t quota = dsl_pool_unreserved_space(dp,
- dst->dst_space_check);
- uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
-
- /* MOS space is triple-dittoed, so we multiply by 3. */
- if (used + dst->dst_space * 3 > quota) {
- dst->dst_error = SET_ERROR(ENOSPC);
- if (dst->dst_nowaiter)
- kmem_free(dst, sizeof (*dst));
- return;
- }
- }
-
- /*
- * Check for errors by calling checkfunc.
- */
- rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
- dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
- if (dst->dst_error == 0)
- dst->dst_syncfunc(dst->dst_arg, tx);
- rrw_exit(&dp->dp_config_rwlock, FTAG);
- if (dst->dst_nowaiter)
- kmem_free(dst, sizeof (*dst));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
@@ -1,667 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dsl_userhold.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_tx.h>
-#include <sys/zfs_onexit.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-
-typedef struct dsl_dataset_user_hold_arg {
- nvlist_t *dduha_holds;
- nvlist_t *dduha_chkholds;
- nvlist_t *dduha_errlist;
- minor_t dduha_minor;
-} dsl_dataset_user_hold_arg_t;
-
-/*
- * If you add new checks here, you may need to add additional checks to the
- * "temporary" case in snapshot_check() in dmu_objset.c.
- */
-int
-dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
- boolean_t temphold, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dmu_tx_pool(tx);
- objset_t *mos = dp->dp_meta_objset;
- int error = 0;
-
- ASSERT(dsl_pool_config_held(dp));
-
- if (strlen(htag) > MAXNAMELEN)
- return (SET_ERROR(E2BIG));
- /* Tempholds have a more restricted length */
- if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
- return (SET_ERROR(E2BIG));
-
- /* tags must be unique (if ds already exists) */
- if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
- uint64_t value;
-
- error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
- htag, 8, 1, &value);
- if (error == 0)
- error = SET_ERROR(EEXIST);
- else if (error == ENOENT)
- error = 0;
- }
-
- return (error);
-}
-
-static int
-dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_user_hold_arg_t *dduha = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
- return (SET_ERROR(ENOTSUP));
-
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
- pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
- dsl_dataset_t *ds;
- int error = 0;
- char *htag, *name;
-
- /* must be a snapshot */
- name = nvpair_name(pair);
- if (strchr(name, '@') == NULL)
- error = SET_ERROR(EINVAL);
-
- if (error == 0)
- error = nvpair_value_string(pair, &htag);
-
- if (error == 0)
- error = dsl_dataset_hold(dp, name, FTAG, &ds);
-
- if (error == 0) {
- error = dsl_dataset_user_hold_check_one(ds, htag,
- dduha->dduha_minor != 0, tx);
- dsl_dataset_rele(ds, FTAG);
- }
-
- if (error == 0) {
- fnvlist_add_string(dduha->dduha_chkholds, name, htag);
- } else {
- /*
- * We register ENOENT errors so they can be correctly
- * reported if needed, such as when all holds fail.
- */
- fnvlist_add_int32(dduha->dduha_errlist, name, error);
- if (error != ENOENT)
- return (error);
- }
- }
-
- return (0);
-}
-
-
-static void
-dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds,
- const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- uint64_t zapobj;
-
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-
- if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
- /*
- * This is the first user hold for this dataset. Create
- * the userrefs zap object.
- */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
- zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
- } else {
- zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
- }
- ds->ds_userrefs++;
-
- VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));
-
- if (minor != 0) {
- char name[MAXNAMELEN];
- nvlist_t *tags;
-
- VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
- htag, now, tx));
- (void) snprintf(name, sizeof (name), "%llx",
- (u_longlong_t)ds->ds_object);
-
- if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) {
- tags = fnvlist_alloc();
- fnvlist_add_boolean(tags, htag);
- fnvlist_add_nvlist(tmpholds, name, tags);
- fnvlist_free(tags);
- } else {
- fnvlist_add_boolean(tags, htag);
- }
- }
-
- spa_history_log_internal_ds(ds, "hold", tx,
- "tag=%s temp=%d refs=%llu",
- htag, minor != 0, ds->ds_userrefs);
-}
-
-typedef struct zfs_hold_cleanup_arg {
- char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t zhca_spa_load_guid;
- nvlist_t *zhca_holds;
-} zfs_hold_cleanup_arg_t;
-
-static void
-dsl_dataset_user_release_onexit(void *arg)
-{
- zfs_hold_cleanup_arg_t *ca = arg;
- spa_t *spa;
- int error;
-
- error = spa_open(ca->zhca_spaname, &spa, FTAG);
- if (error != 0) {
- zfs_dbgmsg("couldn't release holds on pool=%s "
- "because pool is no longer loaded",
- ca->zhca_spaname);
- return;
- }
- if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
- zfs_dbgmsg("couldn't release holds on pool=%s "
- "because pool is no longer loaded (guid doesn't match)",
- ca->zhca_spaname);
- spa_close(spa, FTAG);
- return;
- }
-
- (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds);
- fnvlist_free(ca->zhca_holds);
- kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
- spa_close(spa, FTAG);
-}
-
-static void
-dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor)
-{
- zfs_hold_cleanup_arg_t *ca;
-
- if (minor == 0 || nvlist_empty(holds)) {
- fnvlist_free(holds);
- return;
- }
-
- ASSERT(spa != NULL);
- ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
-
- (void) strlcpy(ca->zhca_spaname, spa_name(spa),
- sizeof (ca->zhca_spaname));
- ca->zhca_spa_load_guid = spa_load_guid(spa);
- ca->zhca_holds = holds;
- VERIFY0(zfs_onexit_add_cb(minor,
- dsl_dataset_user_release_onexit, ca, NULL));
-}
-
-void
-dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
- minor_t minor, uint64_t now, dmu_tx_t *tx)
-{
- nvlist_t *tmpholds;
-
- if (minor != 0)
- tmpholds = fnvlist_alloc();
- else
- tmpholds = NULL;
- dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx);
- dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor);
-}
-
-static void
-dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_user_hold_arg_t *dduha = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- nvlist_t *tmpholds;
- uint64_t now = gethrestime_sec();
-
- if (dduha->dduha_minor != 0)
- tmpholds = fnvlist_alloc();
- else
- tmpholds = NULL;
- for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL);
- pair != NULL;
- pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) {
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
- dsl_dataset_user_hold_sync_one_impl(tmpholds, ds,
- fnvpair_value_string(pair), dduha->dduha_minor, now, tx);
- dsl_dataset_rele(ds, FTAG);
- }
- dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor);
-}
-
-/*
- * The full semantics of this function are described in the comment above
- * lzc_hold().
- *
- * To summarize:
- * holds is nvl of snapname -> holdname
- * errlist will be filled in with snapname -> error
- *
- * The snaphosts must all be in the same pool.
- *
- * Holds for snapshots that don't exist will be skipped.
- *
- * If none of the snapshots for requested holds exist then ENOENT will be
- * returned.
- *
- * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned
- * up when the process exits.
- *
- * On success all the holds, for snapshots that existed, will be created and 0
- * will be returned.
- *
- * On failure no holds will be created, the errlist will be filled in,
- * and an errno will returned.
- *
- * In all cases the errlist will contain entries for holds where the snapshot
- * didn't exist.
- */
-int
-dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
-{
- dsl_dataset_user_hold_arg_t dduha;
- nvpair_t *pair;
- int ret;
-
- pair = nvlist_next_nvpair(holds, NULL);
- if (pair == NULL)
- return (0);
-
- dduha.dduha_holds = holds;
- dduha.dduha_chkholds = fnvlist_alloc();
- dduha.dduha_errlist = errlist;
- dduha.dduha_minor = cleanup_minor;
-
- ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
- dsl_dataset_user_hold_sync, &dduha,
- fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
- fnvlist_free(dduha.dduha_chkholds);
-
- return (ret);
-}
-
-typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
- dsl_dataset_t **dsp);
-
-typedef struct dsl_dataset_user_release_arg {
- dsl_holdfunc_t *ddura_holdfunc;
- nvlist_t *ddura_holds;
- nvlist_t *ddura_todelete;
- nvlist_t *ddura_errlist;
- nvlist_t *ddura_chkholds;
-} dsl_dataset_user_release_arg_t;
-
-/* Place a dataset hold on the snapshot identified by passed dsobj string */
-static int
-dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
- dsl_dataset_t **dsp)
-{
- return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp));
-}
-
-static int
-dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
- dsl_dataset_t *ds, nvlist_t *holds, const char *snapname)
-{
- uint64_t zapobj;
- nvlist_t *holds_found;
- objset_t *mos;
- int numholds;
-
- if (!ds->ds_is_snapshot)
- return (SET_ERROR(EINVAL));
-
- if (nvlist_empty(holds))
- return (0);
-
- numholds = 0;
- mos = ds->ds_dir->dd_pool->dp_meta_objset;
- zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
- holds_found = fnvlist_alloc();
-
- for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
- pair = nvlist_next_nvpair(holds, pair)) {
- uint64_t tmp;
- int error;
- const char *holdname = nvpair_name(pair);
-
- if (zapobj != 0)
- error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp);
- else
- error = SET_ERROR(ENOENT);
-
- /*
- * Non-existent holds are put on the errlist, but don't
- * cause an overall failure.
- */
- if (error == ENOENT) {
- if (ddura->ddura_errlist != NULL) {
- char *errtag = kmem_asprintf("%s#%s",
- snapname, holdname);
- fnvlist_add_int32(ddura->ddura_errlist, errtag,
- ENOENT);
- strfree(errtag);
- }
- continue;
- }
-
- if (error != 0) {
- fnvlist_free(holds_found);
- return (error);
- }
-
- fnvlist_add_boolean(holds_found, holdname);
- numholds++;
- }
-
- if (DS_IS_DEFER_DESTROY(ds) &&
- dsl_dataset_phys(ds)->ds_num_children == 1 &&
- ds->ds_userrefs == numholds) {
- /* we need to destroy the snapshot as well */
- if (dsl_dataset_long_held(ds)) {
- fnvlist_free(holds_found);
- return (SET_ERROR(EBUSY));
- }
- fnvlist_add_boolean(ddura->ddura_todelete, snapname);
- }
-
- if (numholds != 0) {
- fnvlist_add_nvlist(ddura->ddura_chkholds, snapname,
- holds_found);
- }
- fnvlist_free(holds_found);
-
- return (0);
-}
-
-static int
-dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_user_release_arg_t *ddura;
- dsl_holdfunc_t *holdfunc;
- dsl_pool_t *dp;
-
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- dp = dmu_tx_pool(tx);
-
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-
- ddura = arg;
- holdfunc = ddura->ddura_holdfunc;
-
- for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL);
- pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
- int error;
- dsl_dataset_t *ds;
- nvlist_t *holds;
- const char *snapname = nvpair_name(pair);
-
- error = nvpair_value_nvlist(pair, &holds);
- if (error != 0)
- error = (SET_ERROR(EINVAL));
- else
- error = holdfunc(dp, snapname, FTAG, &ds);
- if (error == 0) {
- error = dsl_dataset_user_release_check_one(ddura, ds,
- holds, snapname);
- dsl_dataset_rele(ds, FTAG);
- }
- if (error != 0) {
- if (ddura->ddura_errlist != NULL) {
- fnvlist_add_int32(ddura->ddura_errlist,
- snapname, error);
- }
- /*
- * Non-existent snapshots are put on the errlist,
- * but don't cause an overall failure.
- */
- if (error != ENOENT)
- return (error);
- }
- }
-
- return (0);
-}
-
-static void
-dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
- dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
-
- for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
- pair = nvlist_next_nvpair(holds, pair)) {
- int error;
- const char *holdname = nvpair_name(pair);
-
- /* Remove temporary hold if one exists. */
- error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
- VERIFY(error == 0 || error == ENOENT);
-
- VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
- holdname, tx));
- ds->ds_userrefs--;
-
- spa_history_log_internal_ds(ds, "release", tx,
- "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs);
- }
-}
-
-static void
-dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_dataset_user_release_arg_t *ddura = arg;
- dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc;
- dsl_pool_t *dp = dmu_tx_pool(tx);
-
- ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-
- for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL);
- pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds,
- pair)) {
- dsl_dataset_t *ds;
- const char *name = nvpair_name(pair);
-
- VERIFY0(holdfunc(dp, name, FTAG, &ds));
-
- dsl_dataset_user_release_sync_one(ds,
- fnvpair_value_nvlist(pair), tx);
- if (nvlist_exists(ddura->ddura_todelete, name)) {
- ASSERT(ds->ds_userrefs == 0 &&
- dsl_dataset_phys(ds)->ds_num_children == 1 &&
- DS_IS_DEFER_DESTROY(ds));
- dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
- }
- dsl_dataset_rele(ds, FTAG);
- }
-}
-
-/*
- * The full semantics of this function are described in the comment above
- * lzc_release().
- *
- * To summarize:
- * Releases holds specified in the nvl holds.
- *
- * holds is nvl of snapname -> { holdname, ... }
- * errlist will be filled in with snapname -> error
- *
- * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots,
- * otherwise they should be the names of shapshots.
- *
- * As a release may cause snapshots to be destroyed this trys to ensure they
- * aren't mounted.
- *
- * The release of non-existent holds are skipped.
- *
- * At least one hold must have been released for the this function to succeed
- * and return 0.
- */
-static int
-dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
- dsl_pool_t *tmpdp)
-{
- dsl_dataset_user_release_arg_t ddura;
- nvpair_t *pair;
- char *pool;
- int error;
-
- pair = nvlist_next_nvpair(holds, NULL);
- if (pair == NULL)
- return (0);
-
- /*
- * The release may cause snapshots to be destroyed; make sure they
- * are not mounted.
- */
- if (tmpdp != NULL) {
- /* Temporary holds are specified by dsobj string. */
- ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
- pool = spa_name(tmpdp->dp_spa);
-#ifdef _KERNEL
- for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
- pair = nvlist_next_nvpair(holds, pair)) {
- dsl_dataset_t *ds;
-
- dsl_pool_config_enter(tmpdp, FTAG);
- error = dsl_dataset_hold_obj_string(tmpdp,
- nvpair_name(pair), FTAG, &ds);
- if (error == 0) {
- char name[ZFS_MAX_DATASET_NAME_LEN];
- dsl_dataset_name(ds, name);
- dsl_pool_config_exit(tmpdp, FTAG);
- dsl_dataset_rele(ds, FTAG);
- (void) zfs_unmount_snap(name);
- } else {
- dsl_pool_config_exit(tmpdp, FTAG);
- }
- }
-#endif
- } else {
- /* Non-temporary holds are specified by name. */
- ddura.ddura_holdfunc = dsl_dataset_hold;
- pool = nvpair_name(pair);
-#ifdef _KERNEL
- for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
- pair = nvlist_next_nvpair(holds, pair)) {
- (void) zfs_unmount_snap(nvpair_name(pair));
- }
-#endif
- }
-
- ddura.ddura_holds = holds;
- ddura.ddura_errlist = errlist;
- ddura.ddura_todelete = fnvlist_alloc();
- ddura.ddura_chkholds = fnvlist_alloc();
-
- error = dsl_sync_task(pool, dsl_dataset_user_release_check,
- dsl_dataset_user_release_sync, &ddura, 0,
- ZFS_SPACE_CHECK_EXTRA_RESERVED);
- fnvlist_free(ddura.ddura_todelete);
- fnvlist_free(ddura.ddura_chkholds);
-
- return (error);
-}
-
-/*
- * holds is nvl of snapname -> { holdname, ... }
- * errlist will be filled in with snapname -> error
- */
-int
-dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
-{
- return (dsl_dataset_user_release_impl(holds, errlist, NULL));
-}
-
-/*
- * holds is nvl of snapdsobj -> { holdname, ... }
- */
-void
-dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds)
-{
- ASSERT(dp != NULL);
- (void) dsl_dataset_user_release_impl(holds, NULL, dp);
-}
-
-int
-dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- int err;
-
- err = dsl_pool_hold(dsname, FTAG, &dp);
- if (err != 0)
- return (err);
- err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (err != 0) {
- dsl_pool_rele(dp, FTAG);
- return (err);
- }
-
- if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
- zap_attribute_t *za;
- zap_cursor_t zc;
-
- za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
- for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_userrefs_obj);
- zap_cursor_retrieve(&zc, za) == 0;
- zap_cursor_advance(&zc)) {
- fnvlist_add_uint64(nvl, za->za_name,
- za->za_first_integer);
- }
- zap_cursor_fini(&zc);
- kmem_free(za, sizeof (zap_attribute_t));
- }
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c
@@ -1,114 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://opensource.org/licenses/CDDL-1.0.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/edonr.h>
-#include <sys/abd.h>
-
-#define EDONR_MODE 512
-#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
-
-static int
-edonr_incremental(void *buf, size_t size, void *arg)
-{
- EdonRState *ctx = arg;
- EdonRUpdate(ctx, buf, size * 8);
- return (0);
-}
-
-/*
- * Native zio_checksum interface for the Edon-R hash function.
- */
-/*ARGSUSED*/
-void
-abd_checksum_edonr_native(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- uint8_t digest[EDONR_MODE / 8];
- EdonRState ctx;
-
- ASSERT(ctx_template != NULL);
- bcopy(ctx_template, &ctx, sizeof (ctx));
- (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
- EdonRFinal(&ctx, digest);
- bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
-}
-
-/*
- * Byteswapped zio_checksum interface for the Edon-R hash function.
- */
-void
-abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- zio_cksum_t tmp;
-
- abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
- zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
- zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
- zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
- zcp->zc_word[3] = BSWAP_64(zcp->zc_word[3]);
-}
-
-void *
-abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
-{
- EdonRState *ctx;
- uint8_t salt_block[EDONR_BLOCK_SIZE];
-
- /*
- * Edon-R needs all but the last hash invocation to be on full-size
- * blocks, but the salt is too small. Rather than simply padding it
- * with zeros, we expand the salt into a new salt block of proper
- * size by double-hashing it (the new salt block will be composed of
- * H(salt) || H(H(salt))).
- */
- CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
- EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
- salt_block);
- EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block +
- EDONR_MODE / 8);
-
- /*
- * Feed the new salt block into the hash function - this will serve
- * as our MAC key.
- */
- ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
- EdonRInit(ctx, EDONR_MODE);
- EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8);
- return (ctx);
-}
-
-void
-abd_checksum_edonr_tmpl_free(void *ctx_template)
-{
- EdonRState *ctx = ctx_template;
-
- bzero(ctx, sizeof (*ctx));
- kmem_free(ctx, sizeof (*ctx));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c
@@ -1,69 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/debug.h>
-#include <sys/types.h>
-#include <sys/zmod.h>
-
-#ifdef _KERNEL
-#include <sys/systm.h>
-#else
-#include <strings.h>
-#endif
-
-size_t
-gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- size_t dstlen = d_len;
-
- ASSERT(d_len <= s_len);
-
- if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
- if (d_len != s_len)
- return (s_len);
-
- bcopy(s_start, d_start, s_len);
- return (s_len);
- }
-
- return (dstlen);
-}
-
-/*ARGSUSED*/
-int
-gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- size_t dstlen = d_len;
-
- ASSERT(d_len >= s_len);
-
- if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK)
- return (-1);
-
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs
@@ -1,80 +0,0 @@
-#
-# CDDL HEADER START
-#
-# This file and its contents are supplied under the terms of the
-# Common Development and Distribution License ("CDDL"), version 1.0.
-# You may only use this file in accordance with the terms of version
-# 1.0 of the CDDL.
-#
-# A full copy of the text of the CDDL should have accompanied this
-# source. A copy of the CDDL is also available via the Internet at
-# http://www.illumos.org/license/CDDL.
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright (c) 2017 by Delphix. All rights reserved.
-#
-
-Introduction
-------------
-
-This README describes the Lua interpreter source code that lives in the ZFS
-source tree to enable execution of ZFS channel programs, including its
-maintenance policy, the modifications that have been made to it, and how it
-should (and should not) be used.
-
-For a description of the Lua language and features exposed by ZFS channel
-programs, please refer to the zfs-program(1m) man page instead.
-
-
-Maintenance policy
-------------------
-
-The Lua runtime is considered stable software. Channel programs don't need much
-complicated logic, so updates to the Lua runtime from upstream are viewed as
-nice-to-have, but not required for channel programs to be well-supported. As
-such, the Lua runtime in ZFS should be updated on an as-needed basis for
-security vulnerabilities, but not much else.
-
-
-Modifications to Lua
---------------------
-
-The version of the Lua runtime we're using in ZFS has been modified in a variety
-of ways to make it more useful for the specific purpose of running channel
-programs. These changes include:
-
-1. "Normal" Lua uses floating point for all numbers it stores, but those aren't
- useful inside ZFS / the kernel. We have changed the runtime to use int64_t
- throughout for all numbers.
-2. Some of the Lua standard libraries do file I/O or spawn processes, but
- neither of these make sense from inside channel programs. We have removed
- those libraries rather than reimplementing them using kernel APIs.
-3. The "normal" Lua runtime handles errors by failing fatally, but since this
- version of Lua runs inside the kernel we must handle these failures and
- return meaningful error codes to userland. We have customized the Lua
- failure paths so that they aren't fatal.
-4. Running poorly-vetted code inside the kernel is always a risk; even if the
- ability to do so is restricted to the root user, it's still possible to write
- an incorrect program that results in an infinite loop or massive memory use.
- We've added new protections into the Lua interpreter to limit the runtime
- (measured in number of Lua instructions run) and memory overhead of running
- a channel program.
-5. The Lua bytecode is not designed to be secure / safe, so it would be easy to
- pass invalid bytecode which can panic the kernel. By comparison, the parser
- is hardened and fails gracefully on invalid input. Therefore, we only accept
- Lua source code at the ioctl level and then interpret it inside the kernel.
-
-Each of these modifications have been tested in the zfs-test suite. If / when
-new modifications are made, new tests should be added to the suite located in
-zfs-tests/tests/functional/channel_program/lua_core.
-
-
-How to use this Lua interpreter
--------------------------------
-
-From the above, it should be clear that this is not a general-purpose Lua
-interpreter. Additional work would be required to extricate this custom version
-of Lua from ZFS and make it usable by other areas of the kernel.
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h
@@ -1,24 +0,0 @@
-/*
-** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
-** Auxiliary functions from Lua API
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lapi_h
-#define lapi_h
-
-
-#include "llimits.h"
-#include "lstate.h"
-
-#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \
- "stack overflow");}
-
-#define adjustresults(L,nres) \
- { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; }
-
-#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \
- "not enough elements in the stack")
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c
@@ -1,1283 +0,0 @@
-/*
-** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lua API
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define lapi_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lapi.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lgc.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "ltm.h"
-#include "lundump.h"
-#include "lvm.h"
-
-
-
-const char lua_ident[] =
- "$LuaVersion: " LUA_COPYRIGHT " $"
- "$LuaAuthors: " LUA_AUTHORS " $";
-
-
-/* value at a non-valid index */
-#define NONVALIDVALUE cast(TValue *, luaO_nilobject)
-
-/* corresponding test */
-#define isvalid(o) ((o) != luaO_nilobject)
-
-/* test for pseudo index */
-#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX)
-
-/* test for valid but not pseudo index */
-#define isstackindex(i, o) (isvalid(o) && !ispseudo(i))
-
-#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index")
-
-#define api_checkstackindex(L, i, o) \
- api_check(L, isstackindex(i, o), "index not in the stack")
-
-
-static TValue *index2addr (lua_State *L, int idx) {
- CallInfo *ci = L->ci;
- if (idx > 0) {
- TValue *o = ci->func + idx;
- api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index");
- if (o >= L->top) return NONVALIDVALUE;
- else return o;
- }
- else if (!ispseudo(idx)) { /* negative index */
- api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index");
- return L->top + idx;
- }
- else if (idx == LUA_REGISTRYINDEX)
- return &G(L)->l_registry;
- else { /* upvalues */
- idx = LUA_REGISTRYINDEX - idx;
- api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large");
- if (ttislcf(ci->func)) /* light C function? */
- return NONVALIDVALUE; /* it has no upvalues */
- else {
- CClosure *func = clCvalue(ci->func);
- return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE;
- }
- }
-}
-
-
-/*
-** to be called by 'lua_checkstack' in protected mode, to grow stack
-** capturing memory errors
-*/
-static void growstack (lua_State *L, void *ud) {
- int size = *(int *)ud;
- luaD_growstack(L, size);
-}
-
-
-LUA_API int lua_checkstack (lua_State *L, int size) {
- int res;
- CallInfo *ci = L->ci;
- lua_lock(L);
- if (L->stack_last - L->top > size) /* stack large enough? */
- res = 1; /* yes; check is OK */
- else { /* no; need to grow stack */
- int inuse = cast_int(L->top - L->stack) + EXTRA_STACK;
- if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? */
- res = 0; /* no */
- else /* try to grow stack */
- res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK);
- }
- if (res && ci->top < L->top + size)
- ci->top = L->top + size; /* adjust frame top */
- lua_unlock(L);
- return res;
-}
-
-
-LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) {
- int i;
- if (from == to) return;
- lua_lock(to);
- api_checknelems(from, n);
- api_check(from, G(from) == G(to), "moving among independent states");
- api_check(from, to->ci->top - to->top >= n, "not enough elements to move");
- from->top -= n;
- for (i = 0; i < n; i++) {
- setobj2s(to, to->top++, from->top + i);
- }
- lua_unlock(to);
-}
-
-
-LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) {
- lua_CFunction old;
- lua_lock(L);
- old = G(L)->panic;
- G(L)->panic = panicf;
- lua_unlock(L);
- return old;
-}
-
-
-LUA_API const lua_Number *lua_version (lua_State *L) {
- static const lua_Number version = LUA_VERSION_NUM;
- if (L == NULL) return &version;
- else return G(L)->version;
-}
-
-
-
-/*
-** basic stack manipulation
-*/
-
-
-/*
-** convert an acceptable stack index into an absolute index
-*/
-LUA_API int lua_absindex (lua_State *L, int idx) {
- return (idx > 0 || ispseudo(idx))
- ? idx
- : cast_int(L->top - L->ci->func + idx);
-}
-
-
-LUA_API int lua_gettop (lua_State *L) {
- return cast_int(L->top - (L->ci->func + 1));
-}
-
-
-LUA_API void lua_settop (lua_State *L, int idx) {
- StkId func = L->ci->func;
- lua_lock(L);
- if (idx >= 0) {
- api_check(L, idx <= L->stack_last - (func + 1), "new top too large");
- while (L->top < (func + 1) + idx)
- setnilvalue(L->top++);
- L->top = (func + 1) + idx;
- }
- else {
- api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top");
- L->top += idx+1; /* `subtract' index (index is negative) */
- }
- lua_unlock(L);
-}
-
-
-LUA_API void lua_remove (lua_State *L, int idx) {
- StkId p;
- lua_lock(L);
- p = index2addr(L, idx);
- api_checkstackindex(L, idx, p);
- while (++p < L->top) setobjs2s(L, p-1, p);
- L->top--;
- lua_unlock(L);
-}
-
-
-LUA_API void lua_insert (lua_State *L, int idx) {
- StkId p;
- StkId q;
- lua_lock(L);
- p = index2addr(L, idx);
- api_checkstackindex(L, idx, p);
- for (q = L->top; q > p; q--) /* use L->top as a temporary */
- setobjs2s(L, q, q - 1);
- setobjs2s(L, p, L->top);
- lua_unlock(L);
-}
-
-
-static void moveto (lua_State *L, TValue *fr, int idx) {
- TValue *to = index2addr(L, idx);
- api_checkvalidindex(L, to);
- setobj(L, to, fr);
- if (idx < LUA_REGISTRYINDEX) /* function upvalue? */
- luaC_barrier(L, clCvalue(L->ci->func), fr);
- /* LUA_REGISTRYINDEX does not need gc barrier
- (collector revisits it before finishing collection) */
-}
-
-
-LUA_API void lua_replace (lua_State *L, int idx) {
- lua_lock(L);
- api_checknelems(L, 1);
- moveto(L, L->top - 1, idx);
- L->top--;
- lua_unlock(L);
-}
-
-
-LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) {
- TValue *fr;
- lua_lock(L);
- fr = index2addr(L, fromidx);
- moveto(L, fr, toidx);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_pushvalue (lua_State *L, int idx) {
- lua_lock(L);
- setobj2s(L, L->top, index2addr(L, idx));
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-
-/*
-** access functions (stack -> C)
-*/
-
-
-LUA_API int lua_type (lua_State *L, int idx) {
- StkId o = index2addr(L, idx);
- return (isvalid(o) ? ttypenv(o) : LUA_TNONE);
-}
-
-
-LUA_API const char *lua_typename (lua_State *L, int t) {
- UNUSED(L);
- return ttypename(t);
-}
-
-
-LUA_API int lua_iscfunction (lua_State *L, int idx) {
- StkId o = index2addr(L, idx);
- return (ttislcf(o) || (ttisCclosure(o)));
-}
-
-
-LUA_API int lua_isnumber (lua_State *L, int idx) {
- TValue n;
- const TValue *o = index2addr(L, idx);
- return tonumber(o, &n);
-}
-
-
-LUA_API int lua_isstring (lua_State *L, int idx) {
- int t = lua_type(L, idx);
- return (t == LUA_TSTRING || t == LUA_TNUMBER);
-}
-
-
-LUA_API int lua_isuserdata (lua_State *L, int idx) {
- const TValue *o = index2addr(L, idx);
- return (ttisuserdata(o) || ttislightuserdata(o));
-}
-
-
-LUA_API int lua_rawequal (lua_State *L, int index1, int index2) {
- StkId o1 = index2addr(L, index1);
- StkId o2 = index2addr(L, index2);
- return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0;
-}
-
-
-LUA_API void lua_arith (lua_State *L, int op) {
- StkId o1; /* 1st operand */
- StkId o2; /* 2nd operand */
- lua_lock(L);
- if (op != LUA_OPUNM) /* all other operations expect two operands */
- api_checknelems(L, 2);
- else { /* for unary minus, add fake 2nd operand */
- api_checknelems(L, 1);
- setobjs2s(L, L->top, L->top - 1);
- L->top++;
- }
- o1 = L->top - 2;
- o2 = L->top - 1;
- if (ttisnumber(o1) && ttisnumber(o2)) {
- setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2)));
- }
- else
- luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD));
- L->top--;
- lua_unlock(L);
-}
-
-
-LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) {
- StkId o1, o2;
- int i = 0;
- lua_lock(L); /* may call tag method */
- o1 = index2addr(L, index1);
- o2 = index2addr(L, index2);
- if (isvalid(o1) && isvalid(o2)) {
- switch (op) {
- case LUA_OPEQ: i = equalobj(L, o1, o2); break;
- case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break;
- case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break;
- default: api_check(L, 0, "invalid option");
- }
- }
- lua_unlock(L);
- return i;
-}
-
-
-LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) {
- TValue n;
- const TValue *o = index2addr(L, idx);
- if (tonumber(o, &n)) {
- if (isnum) *isnum = 1;
- return nvalue(o);
- }
- else {
- if (isnum) *isnum = 0;
- return 0;
- }
-}
-
-
-LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) {
- TValue n;
- const TValue *o = index2addr(L, idx);
- if (tonumber(o, &n)) {
- lua_Integer res;
- lua_Number num = nvalue(o);
- lua_number2integer(res, num);
- if (isnum) *isnum = 1;
- return res;
- }
- else {
- if (isnum) *isnum = 0;
- return 0;
- }
-}
-
-
-LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) {
- TValue n;
- const TValue *o = index2addr(L, idx);
- if (tonumber(o, &n)) {
- lua_Unsigned res;
- lua_Number num = nvalue(o);
- lua_number2unsigned(res, num);
- if (isnum) *isnum = 1;
- return res;
- }
- else {
- if (isnum) *isnum = 0;
- return 0;
- }
-}
-
-
-LUA_API int lua_toboolean (lua_State *L, int idx) {
- const TValue *o = index2addr(L, idx);
- return !l_isfalse(o);
-}
-
-
-LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) {
- StkId o = index2addr(L, idx);
- if (!ttisstring(o)) {
- lua_lock(L); /* `luaV_tostring' may create a new string */
- if (!luaV_tostring(L, o)) { /* conversion failed? */
- if (len != NULL) *len = 0;
- lua_unlock(L);
- return NULL;
- }
- luaC_checkGC(L);
- o = index2addr(L, idx); /* previous call may reallocate the stack */
- lua_unlock(L);
- }
- if (len != NULL) *len = tsvalue(o)->len;
- return svalue(o);
-}
-
-
-LUA_API size_t lua_rawlen (lua_State *L, int idx) {
- StkId o = index2addr(L, idx);
- switch (ttypenv(o)) {
- case LUA_TSTRING: return tsvalue(o)->len;
- case LUA_TUSERDATA: return uvalue(o)->len;
- case LUA_TTABLE: return luaH_getn(hvalue(o));
- default: return 0;
- }
-}
-
-
-LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) {
- StkId o = index2addr(L, idx);
- if (ttislcf(o)) return fvalue(o);
- else if (ttisCclosure(o))
- return clCvalue(o)->f;
- else return NULL; /* not a C function */
-}
-
-
-LUA_API void *lua_touserdata (lua_State *L, int idx) {
- StkId o = index2addr(L, idx);
- switch (ttypenv(o)) {
- case LUA_TUSERDATA: return (rawuvalue(o) + 1);
- case LUA_TLIGHTUSERDATA: return pvalue(o);
- default: return NULL;
- }
-}
-
-
-LUA_API lua_State *lua_tothread (lua_State *L, int idx) {
- StkId o = index2addr(L, idx);
- return (!ttisthread(o)) ? NULL : thvalue(o);
-}
-
-
-LUA_API const void *lua_topointer (lua_State *L, int idx) {
- StkId o = index2addr(L, idx);
- switch (ttype(o)) {
- case LUA_TTABLE: return hvalue(o);
- case LUA_TLCL: return clLvalue(o);
- case LUA_TCCL: return clCvalue(o);
- case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o)));
- case LUA_TTHREAD: return thvalue(o);
- case LUA_TUSERDATA:
- case LUA_TLIGHTUSERDATA:
- return lua_touserdata(L, idx);
- default: return NULL;
- }
-}
-
-
-
-/*
-** push functions (C -> stack)
-*/
-
-
-LUA_API void lua_pushnil (lua_State *L) {
- lua_lock(L);
- setnilvalue(L->top);
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_pushnumber (lua_State *L, lua_Number n) {
- lua_lock(L);
- setnvalue(L->top, n);
- luai_checknum(L, L->top,
- luaG_runerror(L, "C API - attempt to push a signaling NaN"));
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) {
- lua_lock(L);
- setnvalue(L->top, cast_num(n));
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) {
- lua_Number n;
- lua_lock(L);
- n = lua_unsigned2number(u);
- setnvalue(L->top, n);
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) {
- TString *ts;
- lua_lock(L);
- luaC_checkGC(L);
- ts = luaS_newlstr(L, s, len);
- setsvalue2s(L, L->top, ts);
- api_incr_top(L);
- lua_unlock(L);
- return getstr(ts);
-}
-
-
-LUA_API const char *lua_pushstring (lua_State *L, const char *s) {
- if (s == NULL) {
- lua_pushnil(L);
- return NULL;
- }
- else {
- TString *ts;
- lua_lock(L);
- luaC_checkGC(L);
- ts = luaS_new(L, s);
- setsvalue2s(L, L->top, ts);
- api_incr_top(L);
- lua_unlock(L);
- return getstr(ts);
- }
-}
-
-
-LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt,
- va_list argp) {
- const char *ret;
- lua_lock(L);
- luaC_checkGC(L);
- ret = luaO_pushvfstring(L, fmt, argp);
- lua_unlock(L);
- return ret;
-}
-
-
-LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) {
- const char *ret;
- va_list argp;
- lua_lock(L);
- luaC_checkGC(L);
- va_start(argp, fmt);
- ret = luaO_pushvfstring(L, fmt, argp);
- va_end(argp);
- lua_unlock(L);
- return ret;
-}
-
-
-LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) {
- lua_lock(L);
- if (n == 0) {
- setfvalue(L->top, fn);
- }
- else {
- Closure *cl;
- api_checknelems(L, n);
- api_check(L, n <= MAXUPVAL, "upvalue index too large");
- luaC_checkGC(L);
- cl = luaF_newCclosure(L, n);
- cl->c.f = fn;
- L->top -= n;
- while (n--)
- setobj2n(L, &cl->c.upvalue[n], L->top + n);
- setclCvalue(L, L->top, cl);
- }
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_pushboolean (lua_State *L, int b) {
- lua_lock(L);
- setbvalue(L->top, (b != 0)); /* ensure that true is 1 */
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_pushlightuserdata (lua_State *L, void *p) {
- lua_lock(L);
- setpvalue(L->top, p);
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API int lua_pushthread (lua_State *L) {
- lua_lock(L);
- setthvalue(L, L->top, L);
- api_incr_top(L);
- lua_unlock(L);
- return (G(L)->mainthread == L);
-}
-
-
-
-/*
-** get functions (Lua -> stack)
-*/
-
-
-LUA_API void lua_getglobal (lua_State *L, const char *var) {
- Table *reg = hvalue(&G(L)->l_registry);
- const TValue *gt; /* global table */
- lua_lock(L);
- gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
- setsvalue2s(L, L->top++, luaS_new(L, var));
- luaV_gettable(L, gt, L->top - 1, L->top - 1);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_gettable (lua_State *L, int idx) {
- StkId t;
- lua_lock(L);
- t = index2addr(L, idx);
- luaV_gettable(L, t, L->top - 1, L->top - 1);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_getfield (lua_State *L, int idx, const char *k) {
- StkId t;
- lua_lock(L);
- t = index2addr(L, idx);
- setsvalue2s(L, L->top, luaS_new(L, k));
- api_incr_top(L);
- luaV_gettable(L, t, L->top - 1, L->top - 1);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_rawget (lua_State *L, int idx) {
- StkId t;
- lua_lock(L);
- t = index2addr(L, idx);
- api_check(L, ttistable(t), "table expected");
- setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1));
- lua_unlock(L);
-}
-
-
-LUA_API void lua_rawgeti (lua_State *L, int idx, int n) {
- StkId t;
- lua_lock(L);
- t = index2addr(L, idx);
- api_check(L, ttistable(t), "table expected");
- setobj2s(L, L->top, luaH_getint(hvalue(t), n));
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) {
- StkId t;
- TValue k;
- lua_lock(L);
- t = index2addr(L, idx);
- api_check(L, ttistable(t), "table expected");
- setpvalue(&k, cast(void *, p));
- setobj2s(L, L->top, luaH_get(hvalue(t), &k));
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API void lua_createtable (lua_State *L, int narray, int nrec) {
- Table *t;
- lua_lock(L);
- luaC_checkGC(L);
- t = luaH_new(L);
- sethvalue(L, L->top, t);
- api_incr_top(L);
- if (narray > 0 || nrec > 0)
- luaH_resize(L, t, narray, nrec);
- lua_unlock(L);
-}
-
-
-LUA_API int lua_getmetatable (lua_State *L, int objindex) {
- const TValue *obj;
- Table *mt = NULL;
- int res;
- lua_lock(L);
- obj = index2addr(L, objindex);
- switch (ttypenv(obj)) {
- case LUA_TTABLE:
- mt = hvalue(obj)->metatable;
- break;
- case LUA_TUSERDATA:
- mt = uvalue(obj)->metatable;
- break;
- default:
- mt = G(L)->mt[ttypenv(obj)];
- break;
- }
- if (mt == NULL)
- res = 0;
- else {
- sethvalue(L, L->top, mt);
- api_incr_top(L);
- res = 1;
- }
- lua_unlock(L);
- return res;
-}
-
-
-LUA_API void lua_getuservalue (lua_State *L, int idx) {
- StkId o;
- lua_lock(L);
- o = index2addr(L, idx);
- api_check(L, ttisuserdata(o), "userdata expected");
- if (uvalue(o)->env) {
- sethvalue(L, L->top, uvalue(o)->env);
- } else
- setnilvalue(L->top);
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-/*
-** set functions (stack -> Lua)
-*/
-
-
-LUA_API void lua_setglobal (lua_State *L, const char *var) {
- Table *reg = hvalue(&G(L)->l_registry);
- const TValue *gt; /* global table */
- lua_lock(L);
- api_checknelems(L, 1);
- gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
- setsvalue2s(L, L->top++, luaS_new(L, var));
- luaV_settable(L, gt, L->top - 1, L->top - 2);
- L->top -= 2; /* pop value and key */
- lua_unlock(L);
-}
-
-
-LUA_API void lua_settable (lua_State *L, int idx) {
- StkId t;
- lua_lock(L);
- api_checknelems(L, 2);
- t = index2addr(L, idx);
- luaV_settable(L, t, L->top - 2, L->top - 1);
- L->top -= 2; /* pop index and value */
- lua_unlock(L);
-}
-
-
-LUA_API void lua_setfield (lua_State *L, int idx, const char *k) {
- StkId t;
- lua_lock(L);
- api_checknelems(L, 1);
- t = index2addr(L, idx);
- setsvalue2s(L, L->top++, luaS_new(L, k));
- luaV_settable(L, t, L->top - 1, L->top - 2);
- L->top -= 2; /* pop value and key */
- lua_unlock(L);
-}
-
-
-LUA_API void lua_rawset (lua_State *L, int idx) {
- StkId t;
- lua_lock(L);
- api_checknelems(L, 2);
- t = index2addr(L, idx);
- api_check(L, ttistable(t), "table expected");
- setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1);
- invalidateTMcache(hvalue(t));
- luaC_barrierback(L, gcvalue(t), L->top-1);
- L->top -= 2;
- lua_unlock(L);
-}
-
-
-LUA_API void lua_rawseti (lua_State *L, int idx, int n) {
- StkId t;
- lua_lock(L);
- api_checknelems(L, 1);
- t = index2addr(L, idx);
- api_check(L, ttistable(t), "table expected");
- luaH_setint(L, hvalue(t), n, L->top - 1);
- luaC_barrierback(L, gcvalue(t), L->top-1);
- L->top--;
- lua_unlock(L);
-}
-
-
-LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) {
- StkId t;
- TValue k;
- lua_lock(L);
- api_checknelems(L, 1);
- t = index2addr(L, idx);
- api_check(L, ttistable(t), "table expected");
- setpvalue(&k, cast(void *, p));
- setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1);
- luaC_barrierback(L, gcvalue(t), L->top - 1);
- L->top--;
- lua_unlock(L);
-}
-
-
-LUA_API int lua_setmetatable (lua_State *L, int objindex) {
- TValue *obj;
- Table *mt;
- lua_lock(L);
- api_checknelems(L, 1);
- obj = index2addr(L, objindex);
- if (ttisnil(L->top - 1))
- mt = NULL;
- else {
- api_check(L, ttistable(L->top - 1), "table expected");
- mt = hvalue(L->top - 1);
- }
- switch (ttypenv(obj)) {
- case LUA_TTABLE: {
- hvalue(obj)->metatable = mt;
- if (mt) {
- luaC_objbarrierback(L, gcvalue(obj), mt);
- luaC_checkfinalizer(L, gcvalue(obj), mt);
- }
- break;
- }
- case LUA_TUSERDATA: {
- uvalue(obj)->metatable = mt;
- if (mt) {
- luaC_objbarrier(L, rawuvalue(obj), mt);
- luaC_checkfinalizer(L, gcvalue(obj), mt);
- }
- break;
- }
- default: {
- G(L)->mt[ttypenv(obj)] = mt;
- break;
- }
- }
- L->top--;
- lua_unlock(L);
- return 1;
-}
-
-
-LUA_API void lua_setuservalue (lua_State *L, int idx) {
- StkId o;
- lua_lock(L);
- api_checknelems(L, 1);
- o = index2addr(L, idx);
- api_check(L, ttisuserdata(o), "userdata expected");
- if (ttisnil(L->top - 1))
- uvalue(o)->env = NULL;
- else {
- api_check(L, ttistable(L->top - 1), "table expected");
- uvalue(o)->env = hvalue(L->top - 1);
- luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1));
- }
- L->top--;
- lua_unlock(L);
-}
-
-
-/*
-** `load' and `call' functions (run Lua code)
-*/
-
-
-#define checkresults(L,na,nr) \
- api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \
- "results from function overflow current stack size")
-
-
-LUA_API int lua_getctx (lua_State *L, int *ctx) {
- if (L->ci->callstatus & CIST_YIELDED) {
- if (ctx) *ctx = L->ci->u.c.ctx;
- return L->ci->u.c.status;
- }
- else return LUA_OK;
-}
-
-
-LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx,
- lua_CFunction k) {
- StkId func;
- lua_lock(L);
- api_check(L, k == NULL || !isLua(L->ci),
- "cannot use continuations inside hooks");
- api_checknelems(L, nargs+1);
- api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
- checkresults(L, nargs, nresults);
- func = L->top - (nargs+1);
- if (k != NULL && L->nny == 0) { /* need to prepare continuation? */
- L->ci->u.c.k = k; /* save continuation */
- L->ci->u.c.ctx = ctx; /* save context */
- luaD_call(L, func, nresults, 1); /* do the call */
- }
- else /* no continuation or no yieldable */
- luaD_call(L, func, nresults, 0); /* just do the call */
- adjustresults(L, nresults);
- lua_unlock(L);
-}
-
-
-
-/*
-** Execute a protected call.
-*/
-struct CallS { /* data to `f_call' */
- StkId func;
- int nresults;
-};
-
-
-static void f_call (lua_State *L, void *ud) {
- struct CallS *c = cast(struct CallS *, ud);
- luaD_call(L, c->func, c->nresults, 0);
-}
-
-
-
-LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc,
- int ctx, lua_CFunction k) {
- struct CallS c;
- int status;
- ptrdiff_t func;
- lua_lock(L);
- api_check(L, k == NULL || !isLua(L->ci),
- "cannot use continuations inside hooks");
- api_checknelems(L, nargs+1);
- api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
- checkresults(L, nargs, nresults);
- if (errfunc == 0)
- func = 0;
- else {
- StkId o = index2addr(L, errfunc);
- api_checkstackindex(L, errfunc, o);
- func = savestack(L, o);
- }
- c.func = L->top - (nargs+1); /* function to be called */
- if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? */
- c.nresults = nresults; /* do a 'conventional' protected call */
- status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func);
- }
- else { /* prepare continuation (call is already protected by 'resume') */
- CallInfo *ci = L->ci;
- ci->u.c.k = k; /* save continuation */
- ci->u.c.ctx = ctx; /* save context */
- /* save information for error recovery */
- ci->extra = savestack(L, c.func);
- ci->u.c.old_allowhook = L->allowhook;
- ci->u.c.old_errfunc = L->errfunc;
- L->errfunc = func;
- /* mark that function may do error recovery */
- ci->callstatus |= CIST_YPCALL;
- luaD_call(L, c.func, nresults, 1); /* do the call */
- ci->callstatus &= ~CIST_YPCALL;
- L->errfunc = ci->u.c.old_errfunc;
- status = LUA_OK; /* if it is here, there were no errors */
- }
- adjustresults(L, nresults);
- lua_unlock(L);
- return status;
-}
-
-
-LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data,
- const char *chunkname, const char *mode) {
- ZIO z;
- int status;
- lua_lock(L);
- if (!chunkname) chunkname = "?";
- luaZ_init(L, &z, reader, data);
- status = luaD_protectedparser(L, &z, chunkname, mode);
- if (status == LUA_OK) { /* no errors? */
- LClosure *f = clLvalue(L->top - 1); /* get newly created function */
- if (f->nupvalues == 1) { /* does it have one upvalue? */
- /* get global table from registry */
- Table *reg = hvalue(&G(L)->l_registry);
- const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
- /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */
- setobj(L, f->upvals[0]->v, gt);
- luaC_barrier(L, f->upvals[0], gt);
- }
- }
- lua_unlock(L);
- return status;
-}
-
-
-LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) {
- int status;
- TValue *o;
- lua_lock(L);
- api_checknelems(L, 1);
- o = L->top - 1;
- if (isLfunction(o))
- status = luaU_dump(L, getproto(o), writer, data, 0);
- else
- status = 1;
- lua_unlock(L);
- return status;
-}
-
-
-LUA_API int lua_status (lua_State *L) {
- return L->status;
-}
-
-
-/*
-** Garbage-collection function
-*/
-
-LUA_API int lua_gc (lua_State *L, int what, int data) {
- int res = 0;
- global_State *g;
- lua_lock(L);
- g = G(L);
- switch (what) {
- case LUA_GCSTOP: {
- g->gcrunning = 0;
- break;
- }
- case LUA_GCRESTART: {
- luaE_setdebt(g, 0);
- g->gcrunning = 1;
- break;
- }
- case LUA_GCCOLLECT: {
- luaC_fullgc(L, 0);
- break;
- }
- case LUA_GCCOUNT: {
- /* GC values are expressed in Kbytes: #bytes/2^10 */
- res = cast_int(gettotalbytes(g) >> 10);
- break;
- }
- case LUA_GCCOUNTB: {
- res = cast_int(gettotalbytes(g) & 0x3ff);
- break;
- }
- case LUA_GCSTEP: {
- if (g->gckind == KGC_GEN) { /* generational mode? */
- res = (g->GCestimate == 0); /* true if it will do major collection */
- luaC_forcestep(L); /* do a single step */
- }
- else {
- lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE;
- if (g->gcrunning)
- debt += g->GCdebt; /* include current debt */
- luaE_setdebt(g, debt);
- luaC_forcestep(L);
- if (g->gcstate == GCSpause) /* end of cycle? */
- res = 1; /* signal it */
- }
- break;
- }
- case LUA_GCSETPAUSE: {
- res = g->gcpause;
- g->gcpause = data;
- break;
- }
- case LUA_GCSETMAJORINC: {
- res = g->gcmajorinc;
- g->gcmajorinc = data;
- break;
- }
- case LUA_GCSETSTEPMUL: {
- res = g->gcstepmul;
- g->gcstepmul = data;
- break;
- }
- case LUA_GCISRUNNING: {
- res = g->gcrunning;
- break;
- }
- case LUA_GCGEN: { /* change collector to generational mode */
- luaC_changemode(L, KGC_GEN);
- break;
- }
- case LUA_GCINC: { /* change collector to incremental mode */
- luaC_changemode(L, KGC_NORMAL);
- break;
- }
- default: res = -1; /* invalid option */
- }
- lua_unlock(L);
- return res;
-}
-
-
-
-/*
-** miscellaneous functions
-*/
-
-
-LUA_API int lua_error (lua_State *L) {
- lua_lock(L);
- api_checknelems(L, 1);
- luaG_errormsg(L);
- /* code unreachable; will unlock when control actually leaves the kernel */
- return 0; /* to avoid warnings */
-}
-
-
-LUA_API int lua_next (lua_State *L, int idx) {
- StkId t;
- int more;
- lua_lock(L);
- t = index2addr(L, idx);
- api_check(L, ttistable(t), "table expected");
- more = luaH_next(L, hvalue(t), L->top - 1);
- if (more) {
- api_incr_top(L);
- }
- else /* no more elements */
- L->top -= 1; /* remove key */
- lua_unlock(L);
- return more;
-}
-
-
-LUA_API void lua_concat (lua_State *L, int n) {
- lua_lock(L);
- api_checknelems(L, n);
- if (n >= 2) {
- luaC_checkGC(L);
- luaV_concat(L, n);
- }
- else if (n == 0) { /* push empty string */
- setsvalue2s(L, L->top, luaS_newlstr(L, "", 0));
- api_incr_top(L);
- }
- /* else n == 1; nothing to do */
- lua_unlock(L);
-}
-
-
-LUA_API void lua_len (lua_State *L, int idx) {
- StkId t;
- lua_lock(L);
- t = index2addr(L, idx);
- luaV_objlen(L, L->top, t);
- api_incr_top(L);
- lua_unlock(L);
-}
-
-
-LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) {
- lua_Alloc f;
- lua_lock(L);
- if (ud) *ud = G(L)->ud;
- f = G(L)->frealloc;
- lua_unlock(L);
- return f;
-}
-
-
-LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) {
- lua_lock(L);
- G(L)->ud = ud;
- G(L)->frealloc = f;
- lua_unlock(L);
-}
-
-
-LUA_API void *lua_newuserdata (lua_State *L, size_t size) {
- Udata *u;
- lua_lock(L);
- luaC_checkGC(L);
- u = luaS_newudata(L, size, NULL);
- setuvalue(L, L->top, u);
- api_incr_top(L);
- lua_unlock(L);
- return u + 1;
-}
-
-
-
-static const char *aux_upvalue (StkId fi, int n, TValue **val,
- GCObject **owner) {
- switch (ttype(fi)) {
- case LUA_TCCL: { /* C closure */
- CClosure *f = clCvalue(fi);
- if (!(1 <= n && n <= f->nupvalues)) return NULL;
- *val = &f->upvalue[n-1];
- if (owner) *owner = obj2gco(f);
- return "";
- }
- case LUA_TLCL: { /* Lua closure */
- LClosure *f = clLvalue(fi);
- TString *name;
- Proto *p = f->p;
- if (!(1 <= n && n <= p->sizeupvalues)) return NULL;
- *val = f->upvals[n-1]->v;
- if (owner) *owner = obj2gco(f->upvals[n - 1]);
- name = p->upvalues[n-1].name;
- return (name == NULL) ? "" : getstr(name);
- }
- default: return NULL; /* not a closure */
- }
-}
-
-
-LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) {
- const char *name;
- TValue *val = NULL; /* to avoid warnings */
- lua_lock(L);
- name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL);
- if (name) {
- setobj2s(L, L->top, val);
- api_incr_top(L);
- }
- lua_unlock(L);
- return name;
-}
-
-
-LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) {
- const char *name;
- TValue *val = NULL; /* to avoid warnings */
- GCObject *owner = NULL; /* to avoid warnings */
- StkId fi;
- lua_lock(L);
- fi = index2addr(L, funcindex);
- api_checknelems(L, 1);
- name = aux_upvalue(fi, n, &val, &owner);
- if (name) {
- L->top--;
- setobj(L, val, L->top);
- luaC_barrier(L, owner, L->top);
- }
- lua_unlock(L);
- return name;
-}
-
-
-static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) {
- LClosure *f;
- StkId fi = index2addr(L, fidx);
- api_check(L, ttisLclosure(fi), "Lua function expected");
- f = clLvalue(fi);
- api_check(L, (1 <= n && n <= f->p->sizeupvalues), "invalid upvalue index");
- if (pf) *pf = f;
- return &f->upvals[n - 1]; /* get its upvalue pointer */
-}
-
-
-LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) {
- StkId fi = index2addr(L, fidx);
- switch (ttype(fi)) {
- case LUA_TLCL: { /* lua closure */
- return *getupvalref(L, fidx, n, NULL);
- }
- case LUA_TCCL: { /* C closure */
- CClosure *f = clCvalue(fi);
- api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index");
- return &f->upvalue[n - 1];
- }
- default: {
- api_check(L, 0, "closure expected");
- return NULL;
- }
- }
-}
-
-
-LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1,
- int fidx2, int n2) {
- LClosure *f1;
- UpVal **up1 = getupvalref(L, fidx1, n1, &f1);
- UpVal **up2 = getupvalref(L, fidx2, n2, NULL);
- *up1 = *up2;
- luaC_objbarrier(L, f1, *up2);
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h
@@ -1,176 +0,0 @@
-/*
-** $Id: lauxlib.h,v 1.120.1.1 2013/04/12 18:48:47 roberto Exp $
-** Auxiliary functions for building Lua libraries
-** See Copyright Notice in lua.h
-*/
-
-
-#ifndef lauxlib_h
-#define lauxlib_h
-
-
-#include <sys/zfs_context.h>
-
-#include "lua.h"
-
-
-
-/* extra error code for `luaL_load' */
-#define LUA_ERRFILE (LUA_ERRERR+1)
-
-
-typedef struct luaL_Reg {
- const char *name;
- lua_CFunction func;
-} luaL_Reg;
-
-
-LUALIB_API void (luaL_checkversion_) (lua_State *L, lua_Number ver);
-#define luaL_checkversion(L) luaL_checkversion_(L, LUA_VERSION_NUM)
-
-LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e);
-LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e);
-LUALIB_API const char *(luaL_tolstring) (lua_State *L, int idx, size_t *len);
-LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg);
-LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg,
- size_t *l);
-LUALIB_API const char *(luaL_optlstring) (lua_State *L, int numArg,
- const char *def, size_t *l);
-LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg);
-LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def);
-
-LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg);
-LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg,
- lua_Integer def);
-LUALIB_API lua_Unsigned (luaL_checkunsigned) (lua_State *L, int numArg);
-LUALIB_API lua_Unsigned (luaL_optunsigned) (lua_State *L, int numArg,
- lua_Unsigned def);
-
-LUALIB_API void (luaL_checkstack) (lua_State *L, int sz, const char *msg);
-LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t);
-LUALIB_API void (luaL_checkany) (lua_State *L, int narg);
-
-LUALIB_API int (luaL_newmetatable) (lua_State *L, const char *tname);
-LUALIB_API void (luaL_setmetatable) (lua_State *L, const char *tname);
-LUALIB_API void *(luaL_testudata) (lua_State *L, int ud, const char *tname);
-LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname);
-
-LUALIB_API void (luaL_where) (lua_State *L, int lvl);
-LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...);
-
-LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def,
- const char *const lst[]);
-
-/* pre-defined references */
-#define LUA_NOREF (-2)
-#define LUA_REFNIL (-1)
-
-LUALIB_API int (luaL_ref) (lua_State *L, int t);
-LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref);
-
-LUALIB_API int (luaL_loadbufferx) (lua_State *L, const char *buff, size_t sz,
- const char *name, const char *mode);
-LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s);
-
-LUALIB_API int (luaL_len) (lua_State *L, int idx);
-
-LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p,
- const char *r);
-
-LUALIB_API void (luaL_setfuncs) (lua_State *L, const luaL_Reg *l, int nup);
-
-LUALIB_API int (luaL_getsubtable) (lua_State *L, int idx, const char *fname);
-
-LUALIB_API void (luaL_traceback) (lua_State *L, lua_State *L1,
- const char *msg, int level);
-
-LUALIB_API void (luaL_requiref) (lua_State *L, const char *modname,
- lua_CFunction openf, int glb);
-
-/*
-** ===============================================================
-** some useful macros
-** ===============================================================
-*/
-
-
-#define luaL_newlibtable(L,l) \
- lua_createtable(L, 0, sizeof(l)/sizeof((l)[0]) - 1)
-
-#define luaL_newlib(L,l) (luaL_newlibtable(L,l), luaL_setfuncs(L,l,0))
-
-#define luaL_argcheck(L, cond,numarg,extramsg) \
- ((void)((cond) || luaL_argerror(L, (numarg), (extramsg))))
-#define luaL_checkstring(L,n) (luaL_checklstring(L, (n), NULL))
-#define luaL_optstring(L,n,d) (luaL_optlstring(L, (n), (d), NULL))
-#define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n)))
-#define luaL_optint(L,n,d) ((int)luaL_optinteger(L, (n), (d)))
-#define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n)))
-#define luaL_optlong(L,n,d) ((long)luaL_optinteger(L, (n), (d)))
-
-#define luaL_typename(L,i) lua_typename(L, lua_type(L,(i)))
-
-#define luaL_dofile(L, fn) \
- (luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0))
-
-#define luaL_dostring(L, s) \
- (luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0))
-
-#define luaL_getmetatable(L,n) (lua_getfield(L, LUA_REGISTRYINDEX, (n)))
-
-#define luaL_opt(L,f,n,d) (lua_isnoneornil(L,(n)) ? (d) : f(L,(n)))
-
-#define luaL_loadbuffer(L,s,sz,n) luaL_loadbufferx(L,s,sz,n,NULL)
-
-
-/*
-** {======================================================
-** Generic Buffer manipulation
-** =======================================================
-*/
-
-typedef struct luaL_Buffer {
- char *b; /* buffer address */
- size_t size; /* buffer size */
- size_t n; /* number of characters in buffer */
- lua_State *L;
- char initb[LUAL_BUFFERSIZE]; /* initial buffer */
-} luaL_Buffer;
-
-
-#define luaL_addchar(B,c) \
- ((void)((B)->n < (B)->size || luaL_prepbuffsize((B), 1)), \
- ((B)->b[(B)->n++] = (c)))
-
-#define luaL_addsize(B,s) ((B)->n += (s))
-
-LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B);
-LUALIB_API char *(luaL_prepbuffsize) (luaL_Buffer *B, size_t sz);
-LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l);
-LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s);
-LUALIB_API void (luaL_addvalue) (luaL_Buffer *B);
-LUALIB_API void (luaL_pushresult) (luaL_Buffer *B);
-LUALIB_API void (luaL_pushresultsize) (luaL_Buffer *B, size_t sz);
-LUALIB_API char *(luaL_buffinitsize) (lua_State *L, luaL_Buffer *B, size_t sz);
-
-#define luaL_prepbuffer(B) luaL_prepbuffsize(B, LUAL_BUFFERSIZE)
-
-/* }====================================================== */
-
-
-/* compatibility with old module system */
-#if defined(LUA_COMPAT_MODULE)
-
-LUALIB_API void (luaL_pushmodule) (lua_State *L, const char *modname,
- int sizehint);
-LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname,
- const luaL_Reg *l, int nup);
-
-#define luaL_register(L,n,l) (luaL_openlib(L,(n),(l),0))
-
-#endif
-
-
-#endif
-
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c
@@ -1,791 +0,0 @@
-/*
-** $Id: lauxlib.c,v 1.248.1.1 2013/04/12 18:48:47 roberto Exp $
-** Auxiliary functions for building Lua libraries
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-/* This file uses only the official API of Lua.
-** Any function declared here could be written as an application function.
-*/
-
-#define lauxlib_c
-#define LUA_LIB
-
-#include "lua.h"
-
-#include "lauxlib.h"
-
-
-/*
-** {======================================================
-** Traceback
-** =======================================================
-*/
-
-
-#define LEVELS1 12 /* size of the first part of the stack */
-#define LEVELS2 10 /* size of the second part of the stack */
-
-
-
-/*
-** search for 'objidx' in table at index -1.
-** return 1 + string at top if find a good name.
-*/
-static int findfield (lua_State *L, int objidx, int level) {
- if (level == 0 || !lua_istable(L, -1))
- return 0; /* not found */
- lua_pushnil(L); /* start 'next' loop */
- while (lua_next(L, -2)) { /* for each pair in table */
- if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */
- if (lua_rawequal(L, objidx, -1)) { /* found object? */
- lua_pop(L, 1); /* remove value (but keep name) */
- return 1;
- }
- else if (findfield(L, objidx, level - 1)) { /* try recursively */
- lua_remove(L, -2); /* remove table (but keep name) */
- lua_pushliteral(L, ".");
- lua_insert(L, -2); /* place '.' between the two names */
- lua_concat(L, 3);
- return 1;
- }
- }
- lua_pop(L, 1); /* remove value */
- }
- return 0; /* not found */
-}
-
-
-static int pushglobalfuncname (lua_State *L, lua_Debug *ar) {
- int top = lua_gettop(L);
- lua_getinfo(L, "f", ar); /* push function */
- lua_pushglobaltable(L);
- if (findfield(L, top + 1, 2)) {
- lua_copy(L, -1, top + 1); /* move name to proper place */
- lua_pop(L, 2); /* remove pushed values */
- return 1;
- }
- else {
- lua_settop(L, top); /* remove function and global table */
- return 0;
- }
-}
-
-
-static void pushfuncname (lua_State *L, lua_Debug *ar) {
- if (*ar->namewhat != '\0') /* is there a name? */
- lua_pushfstring(L, "function " LUA_QS, ar->name);
- else if (*ar->what == 'm') /* main? */
- lua_pushliteral(L, "main chunk");
- else if (*ar->what == 'C') {
- if (pushglobalfuncname(L, ar)) {
- lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1));
- lua_remove(L, -2); /* remove name */
- }
- else
- lua_pushliteral(L, "?");
- }
- else
- lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined);
-}
-
-
-static int countlevels (lua_State *L) {
- lua_Debug ar;
- int li = 1, le = 1;
- /* find an upper bound */
- while (lua_getstack(L, le, &ar)) { li = le; le *= 2; }
- /* do a binary search */
- while (li < le) {
- int m = (li + le)/2;
- if (lua_getstack(L, m, &ar)) li = m + 1;
- else le = m;
- }
- return le - 1;
-}
-
-
-LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1,
- const char *msg, int level) {
- lua_Debug ar;
- int top = lua_gettop(L);
- int numlevels = countlevels(L1);
- int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0;
- if (msg) lua_pushfstring(L, "%s\n", msg);
- lua_pushliteral(L, "stack traceback:");
- while (lua_getstack(L1, level++, &ar)) {
- if (level == mark) { /* too many levels? */
- lua_pushliteral(L, "\n\t..."); /* add a '...' */
- level = numlevels - LEVELS2; /* and skip to last ones */
- }
- else {
- lua_getinfo(L1, "Slnt", &ar);
- lua_pushfstring(L, "\n\t%s:", ar.short_src);
- if (ar.currentline > 0)
- lua_pushfstring(L, "%d:", ar.currentline);
- lua_pushliteral(L, " in ");
- pushfuncname(L, &ar);
- if (ar.istailcall)
- lua_pushliteral(L, "\n\t(...tail calls...)");
- lua_concat(L, lua_gettop(L) - top);
- }
- }
- lua_concat(L, lua_gettop(L) - top);
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Error-report functions
-** =======================================================
-*/
-
-LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) {
- lua_Debug ar;
- if (!lua_getstack(L, 0, &ar)) /* no stack frame? */
- return luaL_error(L, "bad argument #%d (%s)", narg, extramsg);
- lua_getinfo(L, "n", &ar);
- if (strcmp(ar.namewhat, "method") == 0) {
- narg--; /* do not count `self' */
- if (narg == 0) /* error is in the self argument itself? */
- return luaL_error(L, "calling " LUA_QS " on bad self (%s)",
- ar.name, extramsg);
- }
- if (ar.name == NULL)
- ar.name = (pushglobalfuncname(L, &ar)) ? lua_tostring(L, -1) : "?";
- return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)",
- narg, ar.name, extramsg);
-}
-
-
-static int typeerror (lua_State *L, int narg, const char *tname) {
- const char *msg = lua_pushfstring(L, "%s expected, got %s",
- tname, luaL_typename(L, narg));
- return luaL_argerror(L, narg, msg);
-}
-
-
-static void tag_error (lua_State *L, int narg, int tag) {
- typeerror(L, narg, lua_typename(L, tag));
-}
-
-
-LUALIB_API void luaL_where (lua_State *L, int level) {
- lua_Debug ar;
- if (lua_getstack(L, level, &ar)) { /* check function at level */
- lua_getinfo(L, "Sl", &ar); /* get info about it */
- if (ar.currentline > 0) { /* is there info? */
- lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline);
- return;
- }
- }
- lua_pushliteral(L, ""); /* else, no information available... */
-}
-
-
-LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) {
- va_list argp;
- va_start(argp, fmt);
- luaL_where(L, 1);
- lua_pushvfstring(L, fmt, argp);
- va_end(argp);
- lua_concat(L, 2);
- return lua_error(L);
-}
-
-
-#if !defined(inspectstat) /* { */
-
-#if defined(LUA_USE_POSIX)
-
-#include <sys/wait.h>
-
-/*
-** use appropriate macros to interpret 'pclose' return status
-*/
-#define inspectstat(stat,what) \
- if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \
- else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; }
-
-#else
-
-#define inspectstat(stat,what) /* no op */
-
-#endif
-
-#endif /* } */
-
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Userdata's metatable manipulation
-** =======================================================
-*/
-
-LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) {
- luaL_getmetatable(L, tname); /* try to get metatable */
- if (!lua_isnil(L, -1)) /* name already in use? */
- return 0; /* leave previous value on top, but return 0 */
- lua_pop(L, 1);
- lua_newtable(L); /* create metatable */
- lua_pushvalue(L, -1);
- lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */
- return 1;
-}
-
-
-LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) {
- luaL_getmetatable(L, tname);
- lua_setmetatable(L, -2);
-}
-
-
-LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) {
- void *p = lua_touserdata(L, ud);
- if (p != NULL) { /* value is a userdata? */
- if (lua_getmetatable(L, ud)) { /* does it have a metatable? */
- luaL_getmetatable(L, tname); /* get correct metatable */
- if (!lua_rawequal(L, -1, -2)) /* not the same? */
- p = NULL; /* value is a userdata with wrong metatable */
- lua_pop(L, 2); /* remove both metatables */
- return p;
- }
- }
- return NULL; /* value is not a userdata with a metatable */
-}
-
-
-LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) {
- void *p = luaL_testudata(L, ud, tname);
- if (p == NULL) typeerror(L, ud, tname);
- return p;
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Argument check functions
-** =======================================================
-*/
-
-LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def,
- const char *const lst[]) {
- const char *name = (def) ? luaL_optstring(L, narg, def) :
- luaL_checkstring(L, narg);
- int i;
- for (i=0; lst[i]; i++)
- if (strcmp(lst[i], name) == 0)
- return i;
- return luaL_argerror(L, narg,
- lua_pushfstring(L, "invalid option " LUA_QS, name));
-}
-
-
-LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) {
- /* keep some extra space to run error routines, if needed */
- const int extra = LUA_MINSTACK;
- if (!lua_checkstack(L, space + extra)) {
- if (msg)
- luaL_error(L, "stack overflow (%s)", msg);
- else
- luaL_error(L, "stack overflow");
- }
-}
-
-
-LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) {
- if (lua_type(L, narg) != t)
- tag_error(L, narg, t);
-}
-
-
-LUALIB_API void luaL_checkany (lua_State *L, int narg) {
- if (lua_type(L, narg) == LUA_TNONE)
- luaL_argerror(L, narg, "value expected");
-}
-
-
-LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) {
- const char *s = lua_tolstring(L, narg, len);
- if (!s) tag_error(L, narg, LUA_TSTRING);
- return s;
-}
-
-
-LUALIB_API const char *luaL_optlstring (lua_State *L, int narg,
- const char *def, size_t *len) {
- if (lua_isnoneornil(L, narg)) {
- if (len)
- *len = (def ? strlen(def) : 0);
- return def;
- }
- else return luaL_checklstring(L, narg, len);
-}
-
-
-LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) {
- int isnum;
- lua_Number d = lua_tonumberx(L, narg, &isnum);
- if (!isnum)
- tag_error(L, narg, LUA_TNUMBER);
- return d;
-}
-
-
-LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) {
- return luaL_opt(L, luaL_checknumber, narg, def);
-}
-
-
-LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) {
- int isnum;
- lua_Integer d = lua_tointegerx(L, narg, &isnum);
- if (!isnum)
- tag_error(L, narg, LUA_TNUMBER);
- return d;
-}
-
-
-LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) {
- int isnum;
- lua_Unsigned d = lua_tounsignedx(L, narg, &isnum);
- if (!isnum)
- tag_error(L, narg, LUA_TNUMBER);
- return d;
-}
-
-
-LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg,
- lua_Integer def) {
- return luaL_opt(L, luaL_checkinteger, narg, def);
-}
-
-
-LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg,
- lua_Unsigned def) {
- return luaL_opt(L, luaL_checkunsigned, narg, def);
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Generic Buffer manipulation
-** =======================================================
-*/
-
-/*
-** check whether buffer is using a userdata on the stack as a temporary
-** buffer
-*/
-#define buffonstack(B) ((B)->b != (B)->initb)
-
-
-/*
-** returns a pointer to a free area with at least 'sz' bytes
-*/
-LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) {
- lua_State *L = B->L;
- if (B->size - B->n < sz) { /* not enough space? */
- char *newbuff;
- size_t newsize = B->size * 2; /* double buffer size */
- if (newsize - B->n < sz) /* not big enough? */
- newsize = B->n + sz;
- if (newsize < B->n || newsize - B->n < sz)
- luaL_error(L, "buffer too large");
- /* create larger buffer */
- newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char));
- /* move content to new buffer */
- memcpy(newbuff, B->b, B->n * sizeof(char));
- if (buffonstack(B))
- lua_remove(L, -2); /* remove old buffer */
- B->b = newbuff;
- B->size = newsize;
- }
- return &B->b[B->n];
-}
-
-
-LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) {
- char *b = luaL_prepbuffsize(B, l);
- memcpy(b, s, l * sizeof(char));
- luaL_addsize(B, l);
-}
-
-
-LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) {
- luaL_addlstring(B, s, strlen(s));
-}
-
-
-LUALIB_API void luaL_pushresult (luaL_Buffer *B) {
- lua_State *L = B->L;
- lua_pushlstring(L, B->b, B->n);
- if (buffonstack(B))
- lua_remove(L, -2); /* remove old buffer */
-}
-
-
-LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) {
- luaL_addsize(B, sz);
- luaL_pushresult(B);
-}
-
-
-LUALIB_API void luaL_addvalue (luaL_Buffer *B) {
- lua_State *L = B->L;
- size_t l;
- const char *s = lua_tolstring(L, -1, &l);
- if (buffonstack(B))
- lua_insert(L, -2); /* put value below buffer */
- luaL_addlstring(B, s, l);
- lua_remove(L, (buffonstack(B)) ? -2 : -1); /* remove value */
-}
-
-
-LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) {
- B->L = L;
- B->b = B->initb;
- B->n = 0;
- B->size = LUAL_BUFFERSIZE;
-}
-
-
-LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) {
- luaL_buffinit(L, B);
- return luaL_prepbuffsize(B, sz);
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Reference system
-** =======================================================
-*/
-
-/* index of free-list header */
-#define freelist 0
-
-
-LUALIB_API int luaL_ref (lua_State *L, int t) {
- int ref;
- if (lua_isnil(L, -1)) {
- lua_pop(L, 1); /* remove from stack */
- return LUA_REFNIL; /* `nil' has a unique fixed reference */
- }
- t = lua_absindex(L, t);
- lua_rawgeti(L, t, freelist); /* get first free element */
- ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */
- lua_pop(L, 1); /* remove it from stack */
- if (ref != 0) { /* any free element? */
- lua_rawgeti(L, t, ref); /* remove it from list */
- lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */
- }
- else /* no free elements */
- ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */
- lua_rawseti(L, t, ref);
- return ref;
-}
-
-
-LUALIB_API void luaL_unref (lua_State *L, int t, int ref) {
- if (ref >= 0) {
- t = lua_absindex(L, t);
- lua_rawgeti(L, t, freelist);
- lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */
- lua_pushinteger(L, ref);
- lua_rawseti(L, t, freelist); /* t[freelist] = ref */
- }
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Load functions
-** =======================================================
-*/
-
-typedef struct LoadS {
- const char *s;
- size_t size;
-} LoadS;
-
-
-static const char *getS (lua_State *L, void *ud, size_t *size) {
- LoadS *ls = (LoadS *)ud;
- (void)L; /* not used */
- if (ls->size == 0) return NULL;
- *size = ls->size;
- ls->size = 0;
- return ls->s;
-}
-
-
-LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size,
- const char *name, const char *mode) {
- LoadS ls;
- ls.s = buff;
- ls.size = size;
- return lua_load(L, getS, &ls, name, mode);
-}
-
-
-LUALIB_API int luaL_loadstring (lua_State *L, const char *s) {
- return luaL_loadbuffer(L, s, strlen(s), s);
-}
-
-/* }====================================================== */
-
-
-
-LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) {
- if (!lua_getmetatable(L, obj)) /* no metatable? */
- return 0;
- lua_pushstring(L, event);
- lua_rawget(L, -2);
- if (lua_isnil(L, -1)) {
- lua_pop(L, 2); /* remove metatable and metafield */
- return 0;
- }
- else {
- lua_remove(L, -2); /* remove only metatable */
- return 1;
- }
-}
-
-
-LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) {
- obj = lua_absindex(L, obj);
- if (!luaL_getmetafield(L, obj, event)) /* no metafield? */
- return 0;
- lua_pushvalue(L, obj);
- lua_call(L, 1, 1);
- return 1;
-}
-
-
-LUALIB_API int luaL_len (lua_State *L, int idx) {
- int l;
- int isnum;
- lua_len(L, idx);
- l = (int)lua_tointegerx(L, -1, &isnum);
- if (!isnum)
- luaL_error(L, "object length is not a number");
- lua_pop(L, 1); /* remove object */
- return l;
-}
-
-
-LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) {
- if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */
- switch (lua_type(L, idx)) {
- case LUA_TNUMBER:
- case LUA_TSTRING:
- lua_pushvalue(L, idx);
- break;
- case LUA_TBOOLEAN:
- lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false"));
- break;
- case LUA_TNIL:
- lua_pushliteral(L, "nil");
- break;
- default:
- lua_pushfstring(L, "%s: %p", luaL_typename(L, idx),
- lua_topointer(L, idx));
- break;
- }
- }
- return lua_tolstring(L, -1, len);
-}
-
-
-/*
-** {======================================================
-** Compatibility with 5.1 module functions
-** =======================================================
-*/
-#if defined(LUA_COMPAT_MODULE)
-
-static const char *luaL_findtable (lua_State *L, int idx,
- const char *fname, int szhint) {
- const char *e;
- if (idx) lua_pushvalue(L, idx);
- do {
- e = strchr(fname, '.');
- if (e == NULL) e = fname + strlen(fname);
- lua_pushlstring(L, fname, e - fname);
- lua_rawget(L, -2);
- if (lua_isnil(L, -1)) { /* no such field? */
- lua_pop(L, 1); /* remove this nil */
- lua_createtable(L, 0, (*e == '.' ? 1 : szhint)); /* new table for field */
- lua_pushlstring(L, fname, e - fname);
- lua_pushvalue(L, -2);
- lua_settable(L, -4); /* set new table into field */
- }
- else if (!lua_istable(L, -1)) { /* field has a non-table value? */
- lua_pop(L, 2); /* remove table and value */
- return fname; /* return problematic part of the name */
- }
- lua_remove(L, -2); /* remove previous table */
- fname = e + 1;
- } while (*e == '.');
- return NULL;
-}
-
-
-/*
-** Count number of elements in a luaL_Reg list.
-*/
-static int libsize (const luaL_Reg *l) {
- int size = 0;
- for (; l && l->name; l++) size++;
- return size;
-}
-
-
-/*
-** Find or create a module table with a given name. The function
-** first looks at the _LOADED table and, if that fails, try a
-** global variable with that name. In any case, leaves on the stack
-** the module table.
-*/
-LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname,
- int sizehint) {
- luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */
- lua_getfield(L, -1, modname); /* get _LOADED[modname] */
- if (!lua_istable(L, -1)) { /* not found? */
- lua_pop(L, 1); /* remove previous result */
- /* try global variable (and create one if it does not exist) */
- lua_pushglobaltable(L);
- if (luaL_findtable(L, 0, modname, sizehint) != NULL)
- luaL_error(L, "name conflict for module " LUA_QS, modname);
- lua_pushvalue(L, -1);
- lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */
- }
- lua_remove(L, -2); /* remove _LOADED table */
-}
-
-
-LUALIB_API void luaL_openlib (lua_State *L, const char *libname,
- const luaL_Reg *l, int nup) {
- luaL_checkversion(L);
- if (libname) {
- luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */
- lua_insert(L, -(nup + 1)); /* move library table to below upvalues */
- }
- if (l)
- luaL_setfuncs(L, l, nup);
- else
- lua_pop(L, nup); /* remove upvalues */
-}
-
-#endif
-/* }====================================================== */
-
-/*
-** set functions from list 'l' into table at top - 'nup'; each
-** function gets the 'nup' elements at the top as upvalues.
-** Returns with only the table at the stack.
-*/
-LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) {
- luaL_checkversion(L);
- luaL_checkstack(L, nup, "too many upvalues");
- for (; l->name != NULL; l++) { /* fill the table with given functions */
- int i;
- for (i = 0; i < nup; i++) /* copy upvalues to the top */
- lua_pushvalue(L, -nup);
- lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */
- lua_setfield(L, -(nup + 2), l->name);
- }
- lua_pop(L, nup); /* remove upvalues */
-}
-
-
-/*
-** ensure that stack[idx][fname] has a table and push that table
-** into the stack
-*/
-LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) {
- lua_getfield(L, idx, fname);
- if (lua_istable(L, -1)) return 1; /* table already there */
- else {
- lua_pop(L, 1); /* remove previous result */
- idx = lua_absindex(L, idx);
- lua_newtable(L);
- lua_pushvalue(L, -1); /* copy to be left at top */
- lua_setfield(L, idx, fname); /* assign new table to field */
- return 0; /* false, because did not find table there */
- }
-}
-
-
-/*
-** stripped-down 'require'. Calls 'openf' to open a module,
-** registers the result in 'package.loaded' table and, if 'glb'
-** is true, also registers the result in the global table.
-** Leaves resulting module on the top.
-*/
-LUALIB_API void luaL_requiref (lua_State *L, const char *modname,
- lua_CFunction openf, int glb) {
- lua_pushcfunction(L, openf);
- lua_pushstring(L, modname); /* argument to open function */
- lua_call(L, 1, 1); /* open module */
- luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED");
- lua_pushvalue(L, -2); /* make copy of module (call result) */
- lua_setfield(L, -2, modname); /* _LOADED[modname] = module */
- lua_pop(L, 1); /* remove _LOADED table */
- if (glb) {
- lua_pushvalue(L, -1); /* copy of 'mod' */
- lua_setglobal(L, modname); /* _G[modname] = module */
- }
-}
-
-
-LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p,
- const char *r) {
- const char *wild;
- size_t l = strlen(p);
- luaL_Buffer b;
- luaL_buffinit(L, &b);
- while ((wild = strstr(s, p)) != NULL) {
- luaL_addlstring(&b, s, wild - s); /* push prefix */
- luaL_addstring(&b, r); /* push replacement in place of pattern */
- s = wild + l; /* continue after `p' */
- }
- luaL_addstring(&b, s); /* push last suffix */
- luaL_pushresult(&b);
- return lua_tostring(L, -1);
-}
-
-
-LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) {
- const lua_Number *v = lua_version(L);
- if (v != lua_version(NULL))
- luaL_error(L, "multiple Lua VMs detected");
- else if (*v != ver)
- luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f",
- ver, *v);
- /* check conversions number -> integer types */
- lua_pushnumber(L, -(lua_Number)0x1234);
- if (lua_tointeger(L, -1) != -0x1234 ||
- lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234)
- luaL_error(L, "bad conversion number->int;"
- " must recompile Lua with proper settings");
- lua_pop(L, 1);
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c
@@ -1,296 +0,0 @@
-/*
-** $Id: lbaselib.c,v 1.276.1.1 2013/04/12 18:48:47 roberto Exp $
-** Basic library
-** See Copyright Notice in lua.h
-*/
-
-/* The following built-in lua functions have been removed and are not available
- * for use in ZFS channel programs:
- *
- * dofile
- * loadfile
- * load
- * pcall
- * print
- * xpcall
- */
-
-#include <sys/zfs_context.h>
-#include <sys/ctype.h>
-#ifdef illumos
-#define toupper(C) (((C) >= 'a' && (C) <= 'z')? (C) - 'a' + 'A': (C))
-#else
-#define isalnum(C) (isalpha(C) || isdigit(C))
-#endif
-
-#define lbaselib_c
-#define LUA_LIB
-
-#include "lua.h"
-
-#include "lauxlib.h"
-#include "lualib.h"
-
-#define SPACECHARS " \f\n\r\t\v"
-
-static int luaB_tonumber (lua_State *L) {
- if (lua_isnoneornil(L, 2)) { /* standard conversion */
- int isnum;
- lua_Number n = lua_tonumberx(L, 1, &isnum);
- if (isnum) {
- lua_pushnumber(L, n);
- return 1;
- } /* else not a number; must be something */
- luaL_checkany(L, 1);
- }
- else {
- size_t l;
- const char *s = luaL_checklstring(L, 1, &l);
- const char *e = s + l; /* end point for 's' */
- int base = luaL_checkint(L, 2);
- int neg = 0;
- luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range");
- s += strspn(s, SPACECHARS); /* skip initial spaces */
- if (*s == '-') { s++; neg = 1; } /* handle signal */
- else if (*s == '+') s++;
- if (isalnum((unsigned char)*s)) {
- lua_Number n = 0;
- do {
- int digit = (isdigit((unsigned char)*s)) ? *s - '0'
- : toupper((unsigned char)*s) - 'A' + 10;
- if (digit >= base) break; /* invalid numeral; force a fail */
- n = n * (lua_Number)base + (lua_Number)digit;
- s++;
- } while (isalnum((unsigned char)*s));
- s += strspn(s, SPACECHARS); /* skip trailing spaces */
- if (s == e) { /* no invalid trailing characters? */
- lua_pushnumber(L, (neg) ? -n : n);
- return 1;
- } /* else not a number */
- } /* else not a number */
- }
- lua_pushnil(L); /* not a number */
- return 1;
-}
-
-
-static int luaB_error (lua_State *L) {
- int level = luaL_optint(L, 2, 1);
- lua_settop(L, 1);
- if (lua_isstring(L, 1) && level > 0) { /* add extra information? */
- luaL_where(L, level);
- lua_pushvalue(L, 1);
- lua_concat(L, 2);
- }
- return lua_error(L);
-}
-
-
-static int luaB_getmetatable (lua_State *L) {
- luaL_checkany(L, 1);
- if (!lua_getmetatable(L, 1)) {
- lua_pushnil(L);
- return 1; /* no metatable */
- }
- luaL_getmetafield(L, 1, "__metatable");
- return 1; /* returns either __metatable field (if present) or metatable */
-}
-
-
-static int luaB_setmetatable (lua_State *L) {
- int t = lua_type(L, 2);
- luaL_checktype(L, 1, LUA_TTABLE);
- luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2,
- "nil or table expected");
- if (luaL_getmetafield(L, 1, "__metatable"))
- return luaL_error(L, "cannot change a protected metatable");
- lua_settop(L, 2);
- lua_setmetatable(L, 1);
- return 1;
-}
-
-
-static int luaB_rawequal (lua_State *L) {
- luaL_checkany(L, 1);
- luaL_checkany(L, 2);
- lua_pushboolean(L, lua_rawequal(L, 1, 2));
- return 1;
-}
-
-
-static int luaB_rawlen (lua_State *L) {
- int t = lua_type(L, 1);
- luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1,
- "table or string expected");
- lua_pushinteger(L, lua_rawlen(L, 1));
- return 1;
-}
-
-
-static int luaB_rawget (lua_State *L) {
- luaL_checktype(L, 1, LUA_TTABLE);
- luaL_checkany(L, 2);
- lua_settop(L, 2);
- lua_rawget(L, 1);
- return 1;
-}
-
-static int luaB_rawset (lua_State *L) {
- luaL_checktype(L, 1, LUA_TTABLE);
- luaL_checkany(L, 2);
- luaL_checkany(L, 3);
- lua_settop(L, 3);
- lua_rawset(L, 1);
- return 1;
-}
-
-
-static int luaB_collectgarbage (lua_State *L) {
- static const char *const opts[] = {"stop", "restart", "collect",
- "count", "step", "setpause", "setstepmul",
- "setmajorinc", "isrunning", "generational", "incremental", NULL};
- static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT,
- LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL,
- LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC};
- int o = optsnum[luaL_checkoption(L, 1, "collect", opts)];
- int ex = luaL_optint(L, 2, 0);
- int res = lua_gc(L, o, ex);
- switch (o) {
- case LUA_GCCOUNT: {
- int b = lua_gc(L, LUA_GCCOUNTB, 0);
- lua_pushnumber(L, res + ((lua_Number)b/1024));
- lua_pushinteger(L, b);
- return 2;
- }
- case LUA_GCSTEP: case LUA_GCISRUNNING: {
- lua_pushboolean(L, res);
- return 1;
- }
- default: {
- lua_pushinteger(L, res);
- return 1;
- }
- }
-}
-
-
-static int luaB_type (lua_State *L) {
- luaL_checkany(L, 1);
- lua_pushstring(L, luaL_typename(L, 1));
- return 1;
-}
-
-
-static int pairsmeta (lua_State *L, const char *method, int iszero,
- lua_CFunction iter) {
- if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? */
- luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */
- lua_pushcfunction(L, iter); /* will return generator, */
- lua_pushvalue(L, 1); /* state, */
- if (iszero) lua_pushinteger(L, 0); /* and initial value */
- else lua_pushnil(L);
- }
- else {
- lua_pushvalue(L, 1); /* argument 'self' to metamethod */
- lua_call(L, 1, 3); /* get 3 values from metamethod */
- }
- return 3;
-}
-
-
-static int luaB_next (lua_State *L) {
- luaL_checktype(L, 1, LUA_TTABLE);
- lua_settop(L, 2); /* create a 2nd argument if there isn't one */
- if (lua_next(L, 1))
- return 2;
- else {
- lua_pushnil(L);
- return 1;
- }
-}
-
-
-static int luaB_pairs (lua_State *L) {
- return pairsmeta(L, "__pairs", 0, luaB_next);
-}
-
-
-static int ipairsaux (lua_State *L) {
- int i = luaL_checkint(L, 2);
- luaL_checktype(L, 1, LUA_TTABLE);
- i++; /* next value */
- lua_pushinteger(L, i);
- lua_rawgeti(L, 1, i);
- return (lua_isnil(L, -1)) ? 1 : 2;
-}
-
-
-static int luaB_ipairs (lua_State *L) {
- return pairsmeta(L, "__ipairs", 1, ipairsaux);
-}
-
-
-static int luaB_assert (lua_State *L) {
- if (!lua_toboolean(L, 1))
- return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!"));
- return lua_gettop(L);
-}
-
-
-static int luaB_select (lua_State *L) {
- int n = lua_gettop(L);
- if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') {
- lua_pushinteger(L, n-1);
- return 1;
- }
- else {
- int i = luaL_checkint(L, 1);
- if (i < 0) i = n + i;
- else if (i > n) i = n;
- luaL_argcheck(L, 1 <= i, 1, "index out of range");
- return n - i;
- }
-}
-
-static int luaB_tostring (lua_State *L) {
- luaL_checkany(L, 1);
- luaL_tolstring(L, 1, NULL);
- return 1;
-}
-
-static const luaL_Reg base_funcs[] = {
- {"assert", luaB_assert},
- {"collectgarbage", luaB_collectgarbage},
- {"error", luaB_error},
- {"getmetatable", luaB_getmetatable},
- {"ipairs", luaB_ipairs},
-#if defined(LUA_COMPAT_LOADSTRING)
- {"loadstring", luaB_load},
-#endif
- {"next", luaB_next},
- {"pairs", luaB_pairs},
- {"rawequal", luaB_rawequal},
- {"rawlen", luaB_rawlen},
- {"rawget", luaB_rawget},
- {"rawset", luaB_rawset},
- {"select", luaB_select},
- {"setmetatable", luaB_setmetatable},
- {"tonumber", luaB_tonumber},
- {"tostring", luaB_tostring},
- {"type", luaB_type},
- {NULL, NULL}
-};
-
-
-LUAMOD_API int luaopen_base (lua_State *L) {
- /* set global _G */
- lua_pushglobaltable(L);
- lua_pushglobaltable(L);
- lua_setfield(L, -2, "_G");
- /* open lib into global table */
- luaL_setfuncs(L, base_funcs, 0);
- lua_pushliteral(L, LUA_VERSION);
- lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */
- return 1;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c
@@ -1,212 +0,0 @@
-/*
-** $Id: lbitlib.c,v 1.18.1.2 2013/07/09 18:01:41 roberto Exp $
-** Standard library for bitwise operations
-** See Copyright Notice in lua.h
-*/
-
-#define lbitlib_c
-#define LUA_LIB
-
-#include "lua.h"
-
-#include "lauxlib.h"
-#include "lualib.h"
-
-
-/* number of bits to consider in a number */
-#if !defined(LUA_NBITS)
-#define LUA_NBITS 32
-#endif
-
-
-#define ALLONES (~(((~(lua_Unsigned)0) << (LUA_NBITS - 1)) << 1))
-
-/* macro to trim extra bits */
-#define trim(x) ((x) & ALLONES)
-
-
-/* builds a number with 'n' ones (1 <= n <= LUA_NBITS) */
-#define mask(n) (~((ALLONES << 1) << ((n) - 1)))
-
-
-typedef lua_Unsigned b_uint;
-
-
-
-static b_uint andaux (lua_State *L) {
- int i, n = lua_gettop(L);
- b_uint r = ~(b_uint)0;
- for (i = 1; i <= n; i++)
- r &= luaL_checkunsigned(L, i);
- return trim(r);
-}
-
-
-static int b_and (lua_State *L) {
- b_uint r = andaux(L);
- lua_pushunsigned(L, r);
- return 1;
-}
-
-
-static int b_test (lua_State *L) {
- b_uint r = andaux(L);
- lua_pushboolean(L, r != 0);
- return 1;
-}
-
-
-static int b_or (lua_State *L) {
- int i, n = lua_gettop(L);
- b_uint r = 0;
- for (i = 1; i <= n; i++)
- r |= luaL_checkunsigned(L, i);
- lua_pushunsigned(L, trim(r));
- return 1;
-}
-
-
-static int b_xor (lua_State *L) {
- int i, n = lua_gettop(L);
- b_uint r = 0;
- for (i = 1; i <= n; i++)
- r ^= luaL_checkunsigned(L, i);
- lua_pushunsigned(L, trim(r));
- return 1;
-}
-
-
-static int b_not (lua_State *L) {
- b_uint r = ~luaL_checkunsigned(L, 1);
- lua_pushunsigned(L, trim(r));
- return 1;
-}
-
-
-static int b_shift (lua_State *L, b_uint r, int i) {
- if (i < 0) { /* shift right? */
- i = -i;
- r = trim(r);
- if (i >= LUA_NBITS) r = 0;
- else r >>= i;
- }
- else { /* shift left */
- if (i >= LUA_NBITS) r = 0;
- else r <<= i;
- r = trim(r);
- }
- lua_pushunsigned(L, r);
- return 1;
-}
-
-
-static int b_lshift (lua_State *L) {
- return b_shift(L, luaL_checkunsigned(L, 1), luaL_checkint(L, 2));
-}
-
-
-static int b_rshift (lua_State *L) {
- return b_shift(L, luaL_checkunsigned(L, 1), -luaL_checkint(L, 2));
-}
-
-
-static int b_arshift (lua_State *L) {
- b_uint r = luaL_checkunsigned(L, 1);
- int i = luaL_checkint(L, 2);
- if (i < 0 || !(r & ((b_uint)1 << (LUA_NBITS - 1))))
- return b_shift(L, r, -i);
- else { /* arithmetic shift for 'negative' number */
- if (i >= LUA_NBITS) r = ALLONES;
- else
- r = trim((r >> i) | ~(~(b_uint)0 >> i)); /* add signal bit */
- lua_pushunsigned(L, r);
- return 1;
- }
-}
-
-
-static int b_rot (lua_State *L, int i) {
- b_uint r = luaL_checkunsigned(L, 1);
- i &= (LUA_NBITS - 1); /* i = i % NBITS */
- r = trim(r);
- if (i != 0) /* avoid undefined shift of LUA_NBITS when i == 0 */
- r = (r << i) | (r >> (LUA_NBITS - i));
- lua_pushunsigned(L, trim(r));
- return 1;
-}
-
-
-static int b_lrot (lua_State *L) {
- return b_rot(L, luaL_checkint(L, 2));
-}
-
-
-static int b_rrot (lua_State *L) {
- return b_rot(L, -luaL_checkint(L, 2));
-}
-
-
-/*
-** get field and width arguments for field-manipulation functions,
-** checking whether they are valid.
-** ('luaL_error' called without 'return' to avoid later warnings about
-** 'width' being used uninitialized.)
-*/
-static int fieldargs (lua_State *L, int farg, int *width) {
- int f = luaL_checkint(L, farg);
- int w = luaL_optint(L, farg + 1, 1);
- luaL_argcheck(L, 0 <= f, farg, "field cannot be negative");
- luaL_argcheck(L, 0 < w, farg + 1, "width must be positive");
- if (f + w > LUA_NBITS)
- luaL_error(L, "trying to access non-existent bits");
- *width = w;
- return f;
-}
-
-
-static int b_extract (lua_State *L) {
- int w;
- b_uint r = luaL_checkunsigned(L, 1);
- int f = fieldargs(L, 2, &w);
- r = (r >> f) & mask(w);
- lua_pushunsigned(L, r);
- return 1;
-}
-
-
-static int b_replace (lua_State *L) {
- int w;
- b_uint r = luaL_checkunsigned(L, 1);
- b_uint v = luaL_checkunsigned(L, 2);
- int f = fieldargs(L, 3, &w);
- int m = mask(w);
- v &= m; /* erase bits outside given width */
- r = (r & ~(m << f)) | (v << f);
- lua_pushunsigned(L, r);
- return 1;
-}
-
-
-static const luaL_Reg bitlib[] = {
- {"arshift", b_arshift},
- {"band", b_and},
- {"bnot", b_not},
- {"bor", b_or},
- {"bxor", b_xor},
- {"btest", b_test},
- {"extract", b_extract},
- {"lrotate", b_lrot},
- {"lshift", b_lshift},
- {"replace", b_replace},
- {"rrotate", b_rrot},
- {"rshift", b_rshift},
- {NULL, NULL}
-};
-
-
-
-LUAMOD_API int luaopen_bit32 (lua_State *L) {
- luaL_newlib(L, bitlib);
- return 1;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h
@@ -1,83 +0,0 @@
-/*
-** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $
-** Code generator for Lua
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lcode_h
-#define lcode_h
-
-#include "llex.h"
-#include "lobject.h"
-#include "lopcodes.h"
-#include "lparser.h"
-
-
-/*
-** Marks the end of a patch list. It is an invalid value both as an absolute
-** address, and as a list link (would link an element to itself).
-*/
-#define NO_JUMP (-1)
-
-
-/*
-** grep "ORDER OPR" if you change these enums (ORDER OP)
-*/
-typedef enum BinOpr {
- OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW,
- OPR_CONCAT,
- OPR_EQ, OPR_LT, OPR_LE,
- OPR_NE, OPR_GT, OPR_GE,
- OPR_AND, OPR_OR,
- OPR_NOBINOPR
-} BinOpr;
-
-
-typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr;
-
-
-#define getcode(fs,e) ((fs)->f->code[(e)->u.info])
-
-#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx)
-
-#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET)
-
-#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t)
-
-LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx);
-LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C);
-LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k);
-LUAI_FUNC void luaK_fixline (FuncState *fs, int line);
-LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n);
-LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n);
-LUAI_FUNC void luaK_checkstack (FuncState *fs, int n);
-LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s);
-LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r);
-LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e);
-LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e);
-LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e);
-LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e);
-LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e);
-LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e);
-LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key);
-LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k);
-LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e);
-LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e);
-LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e);
-LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults);
-LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e);
-LUAI_FUNC int luaK_jump (FuncState *fs);
-LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret);
-LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target);
-LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list);
-LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level);
-LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2);
-LUAI_FUNC int luaK_getlabel (FuncState *fs);
-LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line);
-LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v);
-LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1,
- expdesc *v2, int line);
-LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore);
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c
@@ -1,885 +0,0 @@
-/*
-** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $
-** Code generator for Lua
-** See Copyright Notice in lua.h
-*/
-
-#include <sys/zfs_context.h>
-
-#define lcode_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lcode.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lgc.h"
-#include "llex.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lopcodes.h"
-#include "lparser.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "lvm.h"
-
-
-#define hasjumps(e) ((e)->t != (e)->f)
-
-
-static int isnumeral(expdesc *e) {
- return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP);
-}
-
-
-void luaK_nil (FuncState *fs, int from, int n) {
- Instruction *previous;
- int l = from + n - 1; /* last register to set nil */
- if (fs->pc > fs->lasttarget) { /* no jumps to current position? */
- previous = &fs->f->code[fs->pc-1];
- if (GET_OPCODE(*previous) == OP_LOADNIL) {
- int pfrom = GETARG_A(*previous);
- int pl = pfrom + GETARG_B(*previous);
- if ((pfrom <= from && from <= pl + 1) ||
- (from <= pfrom && pfrom <= l + 1)) { /* can connect both? */
- if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */
- if (pl > l) l = pl; /* l = max(l, pl) */
- SETARG_A(*previous, from);
- SETARG_B(*previous, l - from);
- return;
- }
- } /* else go through */
- }
- luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */
-}
-
-
-int luaK_jump (FuncState *fs) {
- int jpc = fs->jpc; /* save list of jumps to here */
- int j;
- fs->jpc = NO_JUMP;
- j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP);
- luaK_concat(fs, &j, jpc); /* keep them on hold */
- return j;
-}
-
-
-void luaK_ret (FuncState *fs, int first, int nret) {
- luaK_codeABC(fs, OP_RETURN, first, nret+1, 0);
-}
-
-
-static int condjump (FuncState *fs, OpCode op, int A, int B, int C) {
- luaK_codeABC(fs, op, A, B, C);
- return luaK_jump(fs);
-}
-
-
-static void fixjump (FuncState *fs, int pc, int dest) {
- Instruction *jmp = &fs->f->code[pc];
- int offset = dest-(pc+1);
- lua_assert(dest != NO_JUMP);
- if (abs(offset) > MAXARG_sBx)
- luaX_syntaxerror(fs->ls, "control structure too long");
- SETARG_sBx(*jmp, offset);
-}
-
-
-/*
-** returns current `pc' and marks it as a jump target (to avoid wrong
-** optimizations with consecutive instructions not in the same basic block).
-*/
-int luaK_getlabel (FuncState *fs) {
- fs->lasttarget = fs->pc;
- return fs->pc;
-}
-
-
-static int getjump (FuncState *fs, int pc) {
- int offset = GETARG_sBx(fs->f->code[pc]);
- if (offset == NO_JUMP) /* point to itself represents end of list */
- return NO_JUMP; /* end of list */
- else
- return (pc+1)+offset; /* turn offset into absolute position */
-}
-
-
-static Instruction *getjumpcontrol (FuncState *fs, int pc) {
- Instruction *pi = &fs->f->code[pc];
- if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1))))
- return pi-1;
- else
- return pi;
-}
-
-
-/*
-** check whether list has any jump that do not produce a value
-** (or produce an inverted value)
-*/
-static int need_value (FuncState *fs, int list) {
- for (; list != NO_JUMP; list = getjump(fs, list)) {
- Instruction i = *getjumpcontrol(fs, list);
- if (GET_OPCODE(i) != OP_TESTSET) return 1;
- }
- return 0; /* not found */
-}
-
-
-static int patchtestreg (FuncState *fs, int node, int reg) {
- Instruction *i = getjumpcontrol(fs, node);
- if (GET_OPCODE(*i) != OP_TESTSET)
- return 0; /* cannot patch other instructions */
- if (reg != NO_REG && reg != GETARG_B(*i))
- SETARG_A(*i, reg);
- else /* no register to put value or register already has the value */
- *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i));
-
- return 1;
-}
-
-
-static void removevalues (FuncState *fs, int list) {
- for (; list != NO_JUMP; list = getjump(fs, list))
- patchtestreg(fs, list, NO_REG);
-}
-
-
-static void patchlistaux (FuncState *fs, int list, int vtarget, int reg,
- int dtarget) {
- while (list != NO_JUMP) {
- int next = getjump(fs, list);
- if (patchtestreg(fs, list, reg))
- fixjump(fs, list, vtarget);
- else
- fixjump(fs, list, dtarget); /* jump to default target */
- list = next;
- }
-}
-
-
-static void dischargejpc (FuncState *fs) {
- patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc);
- fs->jpc = NO_JUMP;
-}
-
-
-void luaK_patchlist (FuncState *fs, int list, int target) {
- if (target == fs->pc)
- luaK_patchtohere(fs, list);
- else {
- lua_assert(target < fs->pc);
- patchlistaux(fs, list, target, NO_REG, target);
- }
-}
-
-
-LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) {
- level++; /* argument is +1 to reserve 0 as non-op */
- while (list != NO_JUMP) {
- int next = getjump(fs, list);
- lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP &&
- (GETARG_A(fs->f->code[list]) == 0 ||
- GETARG_A(fs->f->code[list]) >= level));
- SETARG_A(fs->f->code[list], level);
- list = next;
- }
-}
-
-
-void luaK_patchtohere (FuncState *fs, int list) {
- luaK_getlabel(fs);
- luaK_concat(fs, &fs->jpc, list);
-}
-
-
-void luaK_concat (FuncState *fs, int *l1, int l2) {
- if (l2 == NO_JUMP) return;
- else if (*l1 == NO_JUMP)
- *l1 = l2;
- else {
- int list = *l1;
- int next;
- while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */
- list = next;
- fixjump(fs, list, l2);
- }
-}
-
-
-static int luaK_code (FuncState *fs, Instruction i) {
- Proto *f = fs->f;
- dischargejpc(fs); /* `pc' will change */
- /* put new instruction in code array */
- luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction,
- MAX_INT, "opcodes");
- f->code[fs->pc] = i;
- /* save corresponding line information */
- luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int,
- MAX_INT, "opcodes");
- f->lineinfo[fs->pc] = fs->ls->lastline;
- return fs->pc++;
-}
-
-
-int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) {
- lua_assert(getOpMode(o) == iABC);
- lua_assert(getBMode(o) != OpArgN || b == 0);
- lua_assert(getCMode(o) != OpArgN || c == 0);
- lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C);
- return luaK_code(fs, CREATE_ABC(o, a, b, c));
-}
-
-
-int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) {
- lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx);
- lua_assert(getCMode(o) == OpArgN);
- lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx);
- return luaK_code(fs, CREATE_ABx(o, a, bc));
-}
-
-
-static int codeextraarg (FuncState *fs, int a) {
- lua_assert(a <= MAXARG_Ax);
- return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a));
-}
-
-
-int luaK_codek (FuncState *fs, int reg, int k) {
- if (k <= MAXARG_Bx)
- return luaK_codeABx(fs, OP_LOADK, reg, k);
- else {
- int p = luaK_codeABx(fs, OP_LOADKX, reg, 0);
- codeextraarg(fs, k);
- return p;
- }
-}
-
-
-void luaK_checkstack (FuncState *fs, int n) {
- int newstack = fs->freereg + n;
- if (newstack > fs->f->maxstacksize) {
- if (newstack >= MAXSTACK)
- luaX_syntaxerror(fs->ls, "function or expression too complex");
- fs->f->maxstacksize = cast_byte(newstack);
- }
-}
-
-
-void luaK_reserveregs (FuncState *fs, int n) {
- luaK_checkstack(fs, n);
- fs->freereg += n;
-}
-
-
-static void freereg (FuncState *fs, int reg) {
- if (!ISK(reg) && reg >= fs->nactvar) {
- fs->freereg--;
- lua_assert(reg == fs->freereg);
- }
-}
-
-
-static void freeexp (FuncState *fs, expdesc *e) {
- if (e->k == VNONRELOC)
- freereg(fs, e->u.info);
-}
-
-
-static int addk (FuncState *fs, TValue *key, TValue *v) {
- lua_State *L = fs->ls->L;
- TValue *idx = luaH_set(L, fs->h, key);
- Proto *f = fs->f;
- int k, oldsize;
- if (ttisnumber(idx)) {
- lua_Number n = nvalue(idx);
- lua_number2int(k, n);
- if (luaV_rawequalobj(&f->k[k], v))
- return k;
- /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0");
- go through and create a new entry for this value */
- }
- /* constant not found; create a new entry */
- oldsize = f->sizek;
- k = fs->nk;
- /* numerical value does not need GC barrier;
- table has no metatable, so it does not need to invalidate cache */
- setnvalue(idx, cast_num(k));
- luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants");
- while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]);
- setobj(L, &f->k[k], v);
- fs->nk++;
- luaC_barrier(L, f, v);
- return k;
-}
-
-
-int luaK_stringK (FuncState *fs, TString *s) {
- TValue o;
- setsvalue(fs->ls->L, &o, s);
- return addk(fs, &o, &o);
-}
-
-
-int luaK_numberK (FuncState *fs, lua_Number r) {
- int n;
- lua_State *L = fs->ls->L;
- TValue o;
- setnvalue(&o, r);
- if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */
- /* use raw representation as key to avoid numeric problems */
- setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r)));
- n = addk(fs, L->top - 1, &o);
- L->top--;
- }
- else
- n = addk(fs, &o, &o); /* regular case */
- return n;
-}
-
-
-static int boolK (FuncState *fs, int b) {
- TValue o;
- setbvalue(&o, b);
- return addk(fs, &o, &o);
-}
-
-
-static int nilK (FuncState *fs) {
- TValue k, v;
- setnilvalue(&v);
- /* cannot use nil as key; instead use table itself to represent nil */
- sethvalue(fs->ls->L, &k, fs->h);
- return addk(fs, &k, &v);
-}
-
-
-void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) {
- if (e->k == VCALL) { /* expression is an open function call? */
- SETARG_C(getcode(fs, e), nresults+1);
- }
- else if (e->k == VVARARG) {
- SETARG_B(getcode(fs, e), nresults+1);
- SETARG_A(getcode(fs, e), fs->freereg);
- luaK_reserveregs(fs, 1);
- }
-}
-
-
-void luaK_setoneret (FuncState *fs, expdesc *e) {
- if (e->k == VCALL) { /* expression is an open function call? */
- e->k = VNONRELOC;
- e->u.info = GETARG_A(getcode(fs, e));
- }
- else if (e->k == VVARARG) {
- SETARG_B(getcode(fs, e), 2);
- e->k = VRELOCABLE; /* can relocate its simple result */
- }
-}
-
-
-void luaK_dischargevars (FuncState *fs, expdesc *e) {
- switch (e->k) {
- case VLOCAL: {
- e->k = VNONRELOC;
- break;
- }
- case VUPVAL: {
- e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0);
- e->k = VRELOCABLE;
- break;
- }
- case VINDEXED: {
- OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */
- freereg(fs, e->u.ind.idx);
- if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? */
- freereg(fs, e->u.ind.t);
- op = OP_GETTABLE;
- }
- e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx);
- e->k = VRELOCABLE;
- break;
- }
- case VVARARG:
- case VCALL: {
- luaK_setoneret(fs, e);
- break;
- }
- default: break; /* there is one value available (somewhere) */
- }
-}
-
-
-static int code_label (FuncState *fs, int A, int b, int jump) {
- luaK_getlabel(fs); /* those instructions may be jump targets */
- return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump);
-}
-
-
-static void discharge2reg (FuncState *fs, expdesc *e, int reg) {
- luaK_dischargevars(fs, e);
- switch (e->k) {
- case VNIL: {
- luaK_nil(fs, reg, 1);
- break;
- }
- case VFALSE: case VTRUE: {
- luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0);
- break;
- }
- case VK: {
- luaK_codek(fs, reg, e->u.info);
- break;
- }
- case VKNUM: {
- luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval));
- break;
- }
- case VRELOCABLE: {
- Instruction *pc = &getcode(fs, e);
- SETARG_A(*pc, reg);
- break;
- }
- case VNONRELOC: {
- if (reg != e->u.info)
- luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0);
- break;
- }
- default: {
- lua_assert(e->k == VVOID || e->k == VJMP);
- return; /* nothing to do... */
- }
- }
- e->u.info = reg;
- e->k = VNONRELOC;
-}
-
-
-static void discharge2anyreg (FuncState *fs, expdesc *e) {
- if (e->k != VNONRELOC) {
- luaK_reserveregs(fs, 1);
- discharge2reg(fs, e, fs->freereg-1);
- }
-}
-
-
-static void exp2reg (FuncState *fs, expdesc *e, int reg) {
- discharge2reg(fs, e, reg);
- if (e->k == VJMP)
- luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */
- if (hasjumps(e)) {
- int final; /* position after whole expression */
- int p_f = NO_JUMP; /* position of an eventual LOAD false */
- int p_t = NO_JUMP; /* position of an eventual LOAD true */
- if (need_value(fs, e->t) || need_value(fs, e->f)) {
- int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs);
- p_f = code_label(fs, reg, 0, 1);
- p_t = code_label(fs, reg, 1, 0);
- luaK_patchtohere(fs, fj);
- }
- final = luaK_getlabel(fs);
- patchlistaux(fs, e->f, final, reg, p_f);
- patchlistaux(fs, e->t, final, reg, p_t);
- }
- e->f = e->t = NO_JUMP;
- e->u.info = reg;
- e->k = VNONRELOC;
-}
-
-
-void luaK_exp2nextreg (FuncState *fs, expdesc *e) {
- luaK_dischargevars(fs, e);
- freeexp(fs, e);
- luaK_reserveregs(fs, 1);
- exp2reg(fs, e, fs->freereg - 1);
-}
-
-
-int luaK_exp2anyreg (FuncState *fs, expdesc *e) {
- luaK_dischargevars(fs, e);
- if (e->k == VNONRELOC) {
- if (!hasjumps(e)) return e->u.info; /* exp is already in a register */
- if (e->u.info >= fs->nactvar) { /* reg. is not a local? */
- exp2reg(fs, e, e->u.info); /* put value on it */
- return e->u.info;
- }
- }
- luaK_exp2nextreg(fs, e); /* default */
- return e->u.info;
-}
-
-
-void luaK_exp2anyregup (FuncState *fs, expdesc *e) {
- if (e->k != VUPVAL || hasjumps(e))
- luaK_exp2anyreg(fs, e);
-}
-
-
-void luaK_exp2val (FuncState *fs, expdesc *e) {
- if (hasjumps(e))
- luaK_exp2anyreg(fs, e);
- else
- luaK_dischargevars(fs, e);
-}
-
-
-int luaK_exp2RK (FuncState *fs, expdesc *e) {
- luaK_exp2val(fs, e);
- switch (e->k) {
- case VTRUE:
- case VFALSE:
- case VNIL: {
- if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */
- e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE));
- e->k = VK;
- return RKASK(e->u.info);
- }
- else break;
- }
- case VKNUM: {
- e->u.info = luaK_numberK(fs, e->u.nval);
- e->k = VK;
- /* go through */
- }
- case VK: {
- if (e->u.info <= MAXINDEXRK) /* constant fits in argC? */
- return RKASK(e->u.info);
- else break;
- }
- default: break;
- }
- /* not a constant in the right range: put it in a register */
- return luaK_exp2anyreg(fs, e);
-}
-
-
-void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) {
- switch (var->k) {
- case VLOCAL: {
- freeexp(fs, ex);
- exp2reg(fs, ex, var->u.info);
- return;
- }
- case VUPVAL: {
- int e = luaK_exp2anyreg(fs, ex);
- luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0);
- break;
- }
- case VINDEXED: {
- OpCode op = (var->u.ind.vt == VLOCAL) ? OP_SETTABLE : OP_SETTABUP;
- int e = luaK_exp2RK(fs, ex);
- luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e);
- break;
- }
- default: {
- lua_assert(0); /* invalid var kind to store */
- break;
- }
- }
- freeexp(fs, ex);
-}
-
-
-void luaK_self (FuncState *fs, expdesc *e, expdesc *key) {
- int ereg;
- luaK_exp2anyreg(fs, e);
- ereg = e->u.info; /* register where 'e' was placed */
- freeexp(fs, e);
- e->u.info = fs->freereg; /* base register for op_self */
- e->k = VNONRELOC;
- luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */
- luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key));
- freeexp(fs, key);
-}
-
-
-static void invertjump (FuncState *fs, expdesc *e) {
- Instruction *pc = getjumpcontrol(fs, e->u.info);
- lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET &&
- GET_OPCODE(*pc) != OP_TEST);
- SETARG_A(*pc, !(GETARG_A(*pc)));
-}
-
-
-static int jumponcond (FuncState *fs, expdesc *e, int cond) {
- if (e->k == VRELOCABLE) {
- Instruction ie = getcode(fs, e);
- if (GET_OPCODE(ie) == OP_NOT) {
- fs->pc--; /* remove previous OP_NOT */
- return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond);
- }
- /* else go through */
- }
- discharge2anyreg(fs, e);
- freeexp(fs, e);
- return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond);
-}
-
-
-void luaK_goiftrue (FuncState *fs, expdesc *e) {
- int pc; /* pc of last jump */
- luaK_dischargevars(fs, e);
- switch (e->k) {
- case VJMP: {
- invertjump(fs, e);
- pc = e->u.info;
- break;
- }
- case VK: case VKNUM: case VTRUE: {
- pc = NO_JUMP; /* always true; do nothing */
- break;
- }
- default: {
- pc = jumponcond(fs, e, 0);
- break;
- }
- }
- luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */
- luaK_patchtohere(fs, e->t);
- e->t = NO_JUMP;
-}
-
-
-void luaK_goiffalse (FuncState *fs, expdesc *e) {
- int pc; /* pc of last jump */
- luaK_dischargevars(fs, e);
- switch (e->k) {
- case VJMP: {
- pc = e->u.info;
- break;
- }
- case VNIL: case VFALSE: {
- pc = NO_JUMP; /* always false; do nothing */
- break;
- }
- default: {
- pc = jumponcond(fs, e, 1);
- break;
- }
- }
- luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */
- luaK_patchtohere(fs, e->f);
- e->f = NO_JUMP;
-}
-
-
-static void codenot (FuncState *fs, expdesc *e) {
- luaK_dischargevars(fs, e);
- switch (e->k) {
- case VNIL: case VFALSE: {
- e->k = VTRUE;
- break;
- }
- case VK: case VKNUM: case VTRUE: {
- e->k = VFALSE;
- break;
- }
- case VJMP: {
- invertjump(fs, e);
- break;
- }
- case VRELOCABLE:
- case VNONRELOC: {
- discharge2anyreg(fs, e);
- freeexp(fs, e);
- e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0);
- e->k = VRELOCABLE;
- break;
- }
- default: {
- lua_assert(0); /* cannot happen */
- break;
- }
- }
- /* interchange true and false lists */
- { int temp = e->f; e->f = e->t; e->t = temp; }
- removevalues(fs, e->f);
- removevalues(fs, e->t);
-}
-
-
-void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) {
- lua_assert(!hasjumps(t));
- t->u.ind.t = t->u.info;
- t->u.ind.idx = luaK_exp2RK(fs, k);
- t->u.ind.vt = (t->k == VUPVAL) ? VUPVAL
- : check_exp(vkisinreg(t->k), VLOCAL);
- t->k = VINDEXED;
-}
-
-
-static int constfolding (OpCode op, expdesc *e1, expdesc *e2) {
- lua_Number r;
- if (!isnumeral(e1) || !isnumeral(e2)) return 0;
- if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0)
- return 0; /* do not attempt to divide by 0 */
- /*
- * Patched: check for MIN_INT / -1
- */
- if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1)
- return 0;
- r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval);
- e1->u.nval = r;
- return 1;
-}
-
-
-static void codearith (FuncState *fs, OpCode op,
- expdesc *e1, expdesc *e2, int line) {
- if (constfolding(op, e1, e2))
- return;
- else {
- int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0;
- int o1 = luaK_exp2RK(fs, e1);
- if (o1 > o2) {
- freeexp(fs, e1);
- freeexp(fs, e2);
- }
- else {
- freeexp(fs, e2);
- freeexp(fs, e1);
- }
- e1->u.info = luaK_codeABC(fs, op, 0, o1, o2);
- e1->k = VRELOCABLE;
- luaK_fixline(fs, line);
- }
-}
-
-
-static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1,
- expdesc *e2) {
- int o1 = luaK_exp2RK(fs, e1);
- int o2 = luaK_exp2RK(fs, e2);
- freeexp(fs, e2);
- freeexp(fs, e1);
- if (cond == 0 && op != OP_EQ) {
- int temp; /* exchange args to replace by `<' or `<=' */
- temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */
- cond = 1;
- }
- e1->u.info = condjump(fs, op, cond, o1, o2);
- e1->k = VJMP;
-}
-
-
-void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) {
- expdesc e2;
- e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0;
- switch (op) {
- case OPR_MINUS: {
- if (isnumeral(e)) /* minus constant? */
- e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */
- else {
- luaK_exp2anyreg(fs, e);
- codearith(fs, OP_UNM, e, &e2, line);
- }
- break;
- }
- case OPR_NOT: codenot(fs, e); break;
- case OPR_LEN: {
- luaK_exp2anyreg(fs, e); /* cannot operate on constants */
- codearith(fs, OP_LEN, e, &e2, line);
- break;
- }
- default: lua_assert(0);
- }
-}
-
-
-void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) {
- switch (op) {
- case OPR_AND: {
- luaK_goiftrue(fs, v);
- break;
- }
- case OPR_OR: {
- luaK_goiffalse(fs, v);
- break;
- }
- case OPR_CONCAT: {
- luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */
- break;
- }
- case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
- case OPR_MOD: case OPR_POW: {
- if (!isnumeral(v)) luaK_exp2RK(fs, v);
- break;
- }
- default: {
- luaK_exp2RK(fs, v);
- break;
- }
- }
-}
-
-
-void luaK_posfix (FuncState *fs, BinOpr op,
- expdesc *e1, expdesc *e2, int line) {
- switch (op) {
- case OPR_AND: {
- lua_assert(e1->t == NO_JUMP); /* list must be closed */
- luaK_dischargevars(fs, e2);
- luaK_concat(fs, &e2->f, e1->f);
- *e1 = *e2;
- break;
- }
- case OPR_OR: {
- lua_assert(e1->f == NO_JUMP); /* list must be closed */
- luaK_dischargevars(fs, e2);
- luaK_concat(fs, &e2->t, e1->t);
- *e1 = *e2;
- break;
- }
- case OPR_CONCAT: {
- luaK_exp2val(fs, e2);
- if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) {
- lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1);
- freeexp(fs, e1);
- SETARG_B(getcode(fs, e2), e1->u.info);
- e1->k = VRELOCABLE; e1->u.info = e2->u.info;
- }
- else {
- luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */
- codearith(fs, OP_CONCAT, e1, e2, line);
- }
- break;
- }
- case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
- case OPR_MOD: case OPR_POW: {
- codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line);
- break;
- }
- case OPR_EQ: case OPR_LT: case OPR_LE: {
- codecomp(fs, cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2);
- break;
- }
- case OPR_NE: case OPR_GT: case OPR_GE: {
- codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2);
- break;
- }
- default: lua_assert(0);
- }
-}
-
-
-void luaK_fixline (FuncState *fs, int line) {
- fs->f->lineinfo[fs->pc - 1] = line;
-}
-
-
-void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) {
- int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1;
- int b = (tostore == LUA_MULTRET) ? 0 : tostore;
- lua_assert(tostore != 0);
- if (c <= MAXARG_C)
- luaK_codeABC(fs, OP_SETLIST, base, b, c);
- else if (c <= MAXARG_Ax) {
- luaK_codeABC(fs, OP_SETLIST, base, b, 0);
- codeextraarg(fs, c);
- }
- else
- luaX_syntaxerror(fs->ls, "constructor too long");
- fs->freereg = base + 1; /* free registers with list values */
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#include "lua.h"
-
-#include <sys/zfs_context.h>
-
-ssize_t
-lcompat_sprintf(char *buf, const char *fmt, ...)
-{
- ssize_t res;
- va_list args;
-
- va_start(args, fmt);
- res = vsnprintf(buf, INT_MAX, fmt, args);
- va_end(args);
-
- return (res);
-}
-
-int64_t
-lcompat_strtoll(const char *str, char **ptr)
-{
- int base;
- const char *cp;
- int digits;
- int64_t value;
- boolean_t is_negative;
-
- cp = str;
- while (*cp == ' ' || *cp == '\t' || *cp == '\n') {
- cp++;
- }
- is_negative = (*cp == '-');
- if (is_negative) {
- cp++;
- }
- base = 10;
-
- if (*cp == '0') {
- base = 8;
- cp++;
- if (*cp == 'x' || *cp == 'X') {
- base = 16;
- cp++;
- }
- }
-
- value = 0;
- for (; *cp != '\0'; cp++) {
- if (*cp >= '0' && *cp <= '9') {
- digits = *cp - '0';
- } else if (*cp >= 'a' && *cp <= 'f') {
- digits = *cp - 'a' + 10;
- } else if (*cp >= 'A' && *cp <= 'F') {
- digits = *cp - 'A' + 10;
- } else {
- break;
- }
- if (digits >= base) {
- break;
- }
- value = (value * base) + digits;
- }
-
- if (ptr != NULL) {
- *ptr = (char *)cp;
- }
- if (is_negative) {
- value = -value;
- }
- return (value);
-}
-
-int64_t
-lcompat_pow(int64_t x, int64_t y)
-{
- int64_t result = 1;
- if (y < 0)
- return (0);
-
- while (y) {
- if (y & 1)
- result *= x;
- y >>= 1;
- x *= x;
- }
- return (result);
-}
-
-int
-lcompat_hashnum(int64_t x)
-{
- x = (~x) + (x << 18);
- x = x ^ (x >> 31);
- x = x * 21;
- x = x ^ (x >> 11);
- x = x + (x << 6);
- x = x ^ (x >> 22);
- return ((int)x);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c
@@ -1,154 +0,0 @@
-/*
-** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $
-** Coroutine Library
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define lcorolib_c
-#define LUA_LIB
-
-#include "lua.h"
-
-#include "lauxlib.h"
-#include "lualib.h"
-
-
-static int auxresume (lua_State *L, lua_State *co, int narg) {
- int status;
- if (!lua_checkstack(co, narg)) {
- lua_pushliteral(L, "too many arguments to resume");
- return -1; /* error flag */
- }
- if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) {
- lua_pushliteral(L, "cannot resume dead coroutine");
- return -1; /* error flag */
- }
- lua_xmove(L, co, narg);
- status = lua_resume(co, L, narg);
- if (status == LUA_OK || status == LUA_YIELD) {
- int nres = lua_gettop(co);
- if (!lua_checkstack(L, nres + 1)) {
- lua_pop(co, nres); /* remove results anyway */
- lua_pushliteral(L, "too many results to resume");
- return -1; /* error flag */
- }
- lua_xmove(co, L, nres); /* move yielded values */
- return nres;
- }
- else {
- lua_xmove(co, L, 1); /* move error message */
- return -1; /* error flag */
- }
-}
-
-
-static int luaB_coresume (lua_State *L) {
- lua_State *co = lua_tothread(L, 1);
- int r;
- luaL_argcheck(L, co, 1, "coroutine expected");
- r = auxresume(L, co, lua_gettop(L) - 1);
- if (r < 0) {
- lua_pushboolean(L, 0);
- lua_insert(L, -2);
- return 2; /* return false + error message */
- }
- else {
- lua_pushboolean(L, 1);
- lua_insert(L, -(r + 1));
- return r + 1; /* return true + `resume' returns */
- }
-}
-
-
-static int luaB_auxwrap (lua_State *L) {
- lua_State *co = lua_tothread(L, lua_upvalueindex(1));
- int r = auxresume(L, co, lua_gettop(L));
- if (r < 0) {
- if (lua_isstring(L, -1)) { /* error object is a string? */
- luaL_where(L, 1); /* add extra info */
- lua_insert(L, -2);
- lua_concat(L, 2);
- }
- return lua_error(L); /* propagate error */
- }
- return r;
-}
-
-
-static int luaB_cocreate (lua_State *L) {
- lua_State *NL;
- luaL_checktype(L, 1, LUA_TFUNCTION);
- NL = lua_newthread(L);
- lua_pushvalue(L, 1); /* move function to top */
- lua_xmove(L, NL, 1); /* move function from L to NL */
- return 1;
-}
-
-
-static int luaB_cowrap (lua_State *L) {
- luaB_cocreate(L);
- lua_pushcclosure(L, luaB_auxwrap, 1);
- return 1;
-}
-
-
-static int luaB_yield (lua_State *L) {
- return lua_yield(L, lua_gettop(L));
-}
-
-
-static int luaB_costatus (lua_State *L) {
- lua_State *co = lua_tothread(L, 1);
- luaL_argcheck(L, co, 1, "coroutine expected");
- if (L == co) lua_pushliteral(L, "running");
- else {
- switch (lua_status(co)) {
- case LUA_YIELD:
- lua_pushliteral(L, "suspended");
- break;
- case LUA_OK: {
- lua_Debug ar;
- if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */
- lua_pushliteral(L, "normal"); /* it is running */
- else if (lua_gettop(co) == 0)
- lua_pushliteral(L, "dead");
- else
- lua_pushliteral(L, "suspended"); /* initial state */
- break;
- }
- default: /* some error occurred */
- lua_pushliteral(L, "dead");
- break;
- }
- }
- return 1;
-}
-
-
-static int luaB_corunning (lua_State *L) {
- int ismain = lua_pushthread(L);
- lua_pushboolean(L, ismain);
- return 2;
-}
-
-
-static const luaL_Reg co_funcs[] = {
- {"create", luaB_cocreate},
- {"resume", luaB_coresume},
- {"running", luaB_corunning},
- {"status", luaB_costatus},
- {"wrap", luaB_cowrap},
- {"yield", luaB_yield},
- {NULL, NULL}
-};
-
-
-
-LUAMOD_API int luaopen_coroutine (lua_State *L) {
- luaL_newlib(L, co_funcs);
- return 1;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h
@@ -1,93 +0,0 @@
-/*
-** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $
-** 'ctype' functions for Lua
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lctype_h
-#define lctype_h
-
-#include "lua.h"
-
-
-/*
-** WARNING: the functions defined here do not necessarily correspond
-** to the similar functions in the standard C ctype.h. They are
-** optimized for the specific needs of Lua
-*/
-
-#if !defined(LUA_USE_CTYPE)
-
-#if 'A' == 65 && '0' == 48
-/* ASCII case: can use its own tables; faster and fixed */
-#define LUA_USE_CTYPE 0
-#else
-/* must use standard C ctype */
-#define LUA_USE_CTYPE 1
-#endif
-
-#endif
-
-
-#if !LUA_USE_CTYPE /* { */
-
-#include "llimits.h"
-
-
-#define ALPHABIT 0
-#define DIGITBIT 1
-#define PRINTBIT 2
-#define SPACEBIT 3
-#define XDIGITBIT 4
-
-
-#define MASK(B) (1 << (B))
-
-
-/*
-** add 1 to char to allow index -1 (EOZ)
-*/
-#define testprop(c,p) (luai_ctype_[(c)+1] & (p))
-
-/*
-** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_'
-*/
-#define lislalpha(c) testprop(c, MASK(ALPHABIT))
-#define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
-#define lisdigit(c) testprop(c, MASK(DIGITBIT))
-#define lisspace(c) testprop(c, MASK(SPACEBIT))
-#define lisprint(c) testprop(c, MASK(PRINTBIT))
-#define lisxdigit(c) testprop(c, MASK(XDIGITBIT))
-
-/*
-** this 'ltolower' only works for alphabetic characters
-*/
-#define ltolower(c) ((c) | ('A' ^ 'a'))
-
-
-/* two more entries for 0 and -1 (EOZ) */
-LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2];
-
-
-#else /* }{ */
-
-/*
-** use standard C ctypes
-*/
-
-#include <ctype.h>
-
-
-#define lislalpha(c) (isalpha(c) || (c) == '_')
-#define lislalnum(c) (isalnum(c) || (c) == '_')
-#define lisdigit(c) (isdigit(c))
-#define lisspace(c) (isspace(c))
-#define lisprint(c) (isprint(c))
-#define lisxdigit(c) (isxdigit(c))
-
-#define ltolower(c) (tolower(c))
-
-#endif /* } */
-
-#endif
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c
@@ -1,52 +0,0 @@
-/*
-** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $
-** 'ctype' functions for Lua
-** See Copyright Notice in lua.h
-*/
-
-#define lctype_c
-#define LUA_CORE
-
-#include "lctype.h"
-
-#if !LUA_USE_CTYPE /* { */
-
-#include <sys/zfs_context.h>
-
-LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = {
- 0x00, /* EOZ */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */
- 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */
- 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
- 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */
- 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
- 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. */
- 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05,
- 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 6. */
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 7. */
- 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 9. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* f. */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-};
-
-#endif /* } */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h
@@ -1,34 +0,0 @@
-/*
-** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
-** Auxiliary functions from Debug Interface module
-** See Copyright Notice in lua.h
-*/
-
-#ifndef ldebug_h
-#define ldebug_h
-
-
-#include "lstate.h"
-
-
-#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1)
-
-#define getfuncline(f,pc) (((f)->lineinfo) ? (f)->lineinfo[pc] : 0)
-
-#define resethookcount(L) (L->hookcount = L->basehookcount)
-
-/* Active Lua function (given call info) */
-#define ci_func(ci) (clLvalue((ci)->func))
-
-
-LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o,
- const char *opname);
-LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2);
-LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1,
- const TValue *p2);
-LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1,
- const TValue *p2);
-LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...);
-LUAI_FUNC l_noret luaG_errormsg (lua_State *L);
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c
@@ -1,607 +0,0 @@
-/*
-** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $
-** Debug Interface
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define ldebug_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lapi.h"
-#include "lcode.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lobject.h"
-#include "lopcodes.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "ltm.h"
-#include "lvm.h"
-
-
-
-#define noLuaClosure(f) ((f) == NULL || (f)->c.tt == LUA_TCCL)
-
-
-static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name);
-
-
-static int currentpc (CallInfo *ci) {
- lua_assert(isLua(ci));
- return pcRel(ci->u.l.savedpc, ci_func(ci)->p);
-}
-
-
-static int currentline (CallInfo *ci) {
- return getfuncline(ci_func(ci)->p, currentpc(ci));
-}
-
-
-static void swapextra (lua_State *L) {
- if (L->status == LUA_YIELD) {
- CallInfo *ci = L->ci; /* get function that yielded */
- StkId temp = ci->func; /* exchange its 'func' and 'extra' values */
- ci->func = restorestack(L, ci->extra);
- ci->extra = savestack(L, temp);
- }
-}
-
-
-/*
-** this function can be called asynchronous (e.g. during a signal)
-*/
-LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) {
- if (func == NULL || mask == 0) { /* turn off hooks? */
- mask = 0;
- func = NULL;
- }
- if (isLua(L->ci))
- L->oldpc = L->ci->u.l.savedpc;
- L->hook = func;
- L->basehookcount = count;
- resethookcount(L);
- L->hookmask = cast_byte(mask);
- return 1;
-}
-
-
-LUA_API lua_Hook lua_gethook (lua_State *L) {
- return L->hook;
-}
-
-
-LUA_API int lua_gethookmask (lua_State *L) {
- return L->hookmask;
-}
-
-
-LUA_API int lua_gethookcount (lua_State *L) {
- return L->basehookcount;
-}
-
-
-LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) {
- int status;
- CallInfo *ci;
- if (level < 0) return 0; /* invalid (negative) level */
- lua_lock(L);
- for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous)
- level--;
- if (level == 0 && ci != &L->base_ci) { /* level found? */
- status = 1;
- ar->i_ci = ci;
- }
- else status = 0; /* no such level */
- lua_unlock(L);
- return status;
-}
-
-
-static const char *upvalname (Proto *p, int uv) {
- TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name);
- if (s == NULL) return "?";
- else return getstr(s);
-}
-
-
-static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
- int nparams = clLvalue(ci->func)->p->numparams;
- if (n >= ci->u.l.base - ci->func - nparams)
- return NULL; /* no such vararg */
- else {
- *pos = ci->func + nparams + n;
- return "(*vararg)"; /* generic name for any vararg */
- }
-}
-
-
-static const char *findlocal (lua_State *L, CallInfo *ci, int n,
- StkId *pos) {
- const char *name = NULL;
- StkId base;
- if (isLua(ci)) {
- if (n < 0) /* access to vararg values? */
- return findvararg(ci, -n, pos);
- else {
- base = ci->u.l.base;
- name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
- }
- }
- else
- base = ci->func + 1;
- if (name == NULL) { /* no 'standard' name? */
- StkId limit = (ci == L->ci) ? L->top : ci->next->func;
- if (limit - base >= n && n > 0) /* is 'n' inside 'ci' stack? */
- name = "(*temporary)"; /* generic name for any valid slot */
- else
- return NULL; /* no name */
- }
- *pos = base + (n - 1);
- return name;
-}
-
-
-LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) {
- const char *name;
- lua_lock(L);
- swapextra(L);
- if (ar == NULL) { /* information about non-active function? */
- if (!isLfunction(L->top - 1)) /* not a Lua function? */
- name = NULL;
- else /* consider live variables at function start (parameters) */
- name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0);
- }
- else { /* active function; get information through 'ar' */
- StkId pos = 0; /* to avoid warnings */
- name = findlocal(L, ar->i_ci, n, &pos);
- if (name) {
- setobj2s(L, L->top, pos);
- api_incr_top(L);
- }
- }
- swapextra(L);
- lua_unlock(L);
- return name;
-}
-
-
-LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) {
- StkId pos = 0; /* to avoid warnings */
- const char *name;
- lua_lock(L);
- swapextra(L);
- name = findlocal(L, ar->i_ci, n, &pos);
- if (name)
- setobjs2s(L, pos, L->top - 1);
- L->top--; /* pop value */
- swapextra(L);
- lua_unlock(L);
- return name;
-}
-
-
-static void funcinfo (lua_Debug *ar, Closure *cl) {
- if (noLuaClosure(cl)) {
- ar->source = "=[C]";
- ar->linedefined = -1;
- ar->lastlinedefined = -1;
- ar->what = "C";
- }
- else {
- Proto *p = cl->l.p;
- ar->source = p->source ? getstr(p->source) : "=?";
- ar->linedefined = p->linedefined;
- ar->lastlinedefined = p->lastlinedefined;
- ar->what = (ar->linedefined == 0) ? "main" : "Lua";
- }
- luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE);
-}
-
-
-static void collectvalidlines (lua_State *L, Closure *f) {
- if (noLuaClosure(f)) {
- setnilvalue(L->top);
- api_incr_top(L);
- }
- else {
- int i;
- TValue v;
- int *lineinfo = f->l.p->lineinfo;
- Table *t = luaH_new(L); /* new table to store active lines */
- sethvalue(L, L->top, t); /* push it on stack */
- api_incr_top(L);
- setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */
- for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */
- luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */
- }
-}
-
-
-static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar,
- Closure *f, CallInfo *ci) {
- int status = 1;
- for (; *what; what++) {
- switch (*what) {
- case 'S': {
- funcinfo(ar, f);
- break;
- }
- case 'l': {
- ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1;
- break;
- }
- case 'u': {
- ar->nups = (f == NULL) ? 0 : f->c.nupvalues;
- if (noLuaClosure(f)) {
- ar->isvararg = 1;
- ar->nparams = 0;
- }
- else {
- ar->isvararg = f->l.p->is_vararg;
- ar->nparams = f->l.p->numparams;
- }
- break;
- }
- case 't': {
- ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0;
- break;
- }
- case 'n': {
- /* calling function is a known Lua function? */
- if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous))
- ar->namewhat = getfuncname(L, ci->previous, &ar->name);
- else
- ar->namewhat = NULL;
- if (ar->namewhat == NULL) {
- ar->namewhat = ""; /* not found */
- ar->name = NULL;
- }
- break;
- }
- case 'L':
- case 'f': /* handled by lua_getinfo */
- break;
- default: status = 0; /* invalid option */
- }
- }
- return status;
-}
-
-
-LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) {
- int status;
- Closure *cl;
- CallInfo *ci;
- StkId func;
- lua_lock(L);
- swapextra(L);
- if (*what == '>') {
- ci = NULL;
- func = L->top - 1;
- api_check(L, ttisfunction(func), "function expected");
- what++; /* skip the '>' */
- L->top--; /* pop function */
- }
- else {
- ci = ar->i_ci;
- func = ci->func;
- lua_assert(ttisfunction(ci->func));
- }
- cl = ttisclosure(func) ? clvalue(func) : NULL;
- status = auxgetinfo(L, what, ar, cl, ci);
- if (strchr(what, 'f')) {
- setobjs2s(L, L->top, func);
- api_incr_top(L);
- }
- swapextra(L);
- if (strchr(what, 'L'))
- collectvalidlines(L, cl);
- lua_unlock(L);
- return status;
-}
-
-
-/*
-** {======================================================
-** Symbolic Execution
-** =======================================================
-*/
-
-static const char *getobjname (Proto *p, int lastpc, int reg,
- const char **name);
-
-
-/*
-** find a "name" for the RK value 'c'
-*/
-static void kname (Proto *p, int pc, int c, const char **name) {
- if (ISK(c)) { /* is 'c' a constant? */
- TValue *kvalue = &p->k[INDEXK(c)];
- if (ttisstring(kvalue)) { /* literal constant? */
- *name = svalue(kvalue); /* it is its own name */
- return;
- }
- /* else no reasonable name found */
- }
- else { /* 'c' is a register */
- const char *what = getobjname(p, pc, c, name); /* search for 'c' */
- if (what && *what == 'c') { /* found a constant name? */
- return; /* 'name' already filled */
- }
- /* else no reasonable name found */
- }
- *name = "?"; /* no reasonable name found */
-}
-
-
-static int filterpc (int pc, int jmptarget) {
- if (pc < jmptarget) /* is code conditional (inside a jump)? */
- return -1; /* cannot know who sets that register */
- else return pc; /* current position sets that register */
-}
-
-
-/*
-** try to find last instruction before 'lastpc' that modified register 'reg'
-*/
-static int findsetreg (Proto *p, int lastpc, int reg) {
- int pc;
- int setreg = -1; /* keep last instruction that changed 'reg' */
- int jmptarget = 0; /* any code before this address is conditional */
- for (pc = 0; pc < lastpc; pc++) {
- Instruction i = p->code[pc];
- OpCode op = GET_OPCODE(i);
- int a = GETARG_A(i);
- switch (op) {
- case OP_LOADNIL: {
- int b = GETARG_B(i);
- if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */
- setreg = filterpc(pc, jmptarget);
- break;
- }
- case OP_TFORCALL: {
- if (reg >= a + 2) /* affect all regs above its base */
- setreg = filterpc(pc, jmptarget);
- break;
- }
- case OP_CALL:
- case OP_TAILCALL: {
- if (reg >= a) /* affect all registers above base */
- setreg = filterpc(pc, jmptarget);
- break;
- }
- case OP_JMP: {
- int b = GETARG_sBx(i);
- int dest = pc + 1 + b;
- /* jump is forward and do not skip `lastpc'? */
- if (pc < dest && dest <= lastpc) {
- if (dest > jmptarget)
- jmptarget = dest; /* update 'jmptarget' */
- }
- break;
- }
- case OP_TEST: {
- if (reg == a) /* jumped code can change 'a' */
- setreg = filterpc(pc, jmptarget);
- break;
- }
- default:
- if (testAMode(op) && reg == a) /* any instruction that set A */
- setreg = filterpc(pc, jmptarget);
- break;
- }
- }
- return setreg;
-}
-
-
-static const char *getobjname (Proto *p, int lastpc, int reg,
- const char **name) {
- int pc;
- *name = luaF_getlocalname(p, reg + 1, lastpc);
- if (*name) /* is a local? */
- return "local";
- /* else try symbolic execution */
- pc = findsetreg(p, lastpc, reg);
- if (pc != -1) { /* could find instruction? */
- Instruction i = p->code[pc];
- OpCode op = GET_OPCODE(i);
- switch (op) {
- case OP_MOVE: {
- int b = GETARG_B(i); /* move from 'b' to 'a' */
- if (b < GETARG_A(i))
- return getobjname(p, pc, b, name); /* get name for 'b' */
- break;
- }
- case OP_GETTABUP:
- case OP_GETTABLE: {
- int k = GETARG_C(i); /* key index */
- int t = GETARG_B(i); /* table index */
- const char *vn = (op == OP_GETTABLE) /* name of indexed variable */
- ? luaF_getlocalname(p, t + 1, pc)
- : upvalname(p, t);
- kname(p, pc, k, name);
- return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field";
- }
- case OP_GETUPVAL: {
- *name = upvalname(p, GETARG_B(i));
- return "upvalue";
- }
- case OP_LOADK:
- case OP_LOADKX: {
- int b = (op == OP_LOADK) ? GETARG_Bx(i)
- : GETARG_Ax(p->code[pc + 1]);
- if (ttisstring(&p->k[b])) {
- *name = svalue(&p->k[b]);
- return "constant";
- }
- break;
- }
- case OP_SELF: {
- int k = GETARG_C(i); /* key index */
- kname(p, pc, k, name);
- return "method";
- }
- default: break; /* go through to return NULL */
- }
- }
- return NULL; /* could not find reasonable name */
-}
-
-
-static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) {
- TMS tm;
- Proto *p = ci_func(ci)->p; /* calling function */
- int pc = currentpc(ci); /* calling instruction index */
- Instruction i = p->code[pc]; /* calling instruction */
- switch (GET_OPCODE(i)) {
- case OP_CALL:
- case OP_TAILCALL: /* get function name */
- return getobjname(p, pc, GETARG_A(i), name);
- case OP_TFORCALL: { /* for iterator */
- *name = "for iterator";
- return "for iterator";
- }
- /* all other instructions can call only through metamethods */
- case OP_SELF:
- case OP_GETTABUP:
- case OP_GETTABLE: tm = TM_INDEX; break;
- case OP_SETTABUP:
- case OP_SETTABLE: tm = TM_NEWINDEX; break;
- case OP_EQ: tm = TM_EQ; break;
- case OP_ADD: tm = TM_ADD; break;
- case OP_SUB: tm = TM_SUB; break;
- case OP_MUL: tm = TM_MUL; break;
- case OP_DIV: tm = TM_DIV; break;
- case OP_MOD: tm = TM_MOD; break;
- case OP_POW: tm = TM_POW; break;
- case OP_UNM: tm = TM_UNM; break;
- case OP_LEN: tm = TM_LEN; break;
- case OP_LT: tm = TM_LT; break;
- case OP_LE: tm = TM_LE; break;
- case OP_CONCAT: tm = TM_CONCAT; break;
- default:
- return NULL; /* else no useful name can be found */
- }
- *name = getstr(G(L)->tmname[tm]);
- return "metamethod";
-}
-
-/* }====================================================== */
-
-
-
-/*
-** only ANSI way to check whether a pointer points to an array
-** (used only for error messages, so efficiency is not a big concern)
-*/
-static int isinstack (CallInfo *ci, const TValue *o) {
- StkId p;
- for (p = ci->u.l.base; p < ci->top; p++)
- if (o == p) return 1;
- return 0;
-}
-
-
-static const char *getupvalname (CallInfo *ci, const TValue *o,
- const char **name) {
- LClosure *c = ci_func(ci);
- int i;
- for (i = 0; i < c->nupvalues; i++) {
- if (c->upvals[i]->v == o) {
- *name = upvalname(c->p, i);
- return "upvalue";
- }
- }
- return NULL;
-}
-
-
-l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) {
- CallInfo *ci = L->ci;
- const char *name = NULL;
- const char *t = objtypename(o);
- const char *kind = NULL;
- if (isLua(ci)) {
- kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */
- if (!kind && isinstack(ci, o)) /* no? try a register */
- kind = getobjname(ci_func(ci)->p, currentpc(ci),
- cast_int(o - ci->u.l.base), &name);
- }
- if (kind)
- luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)",
- op, kind, name, t);
- else
- luaG_runerror(L, "attempt to %s a %s value", op, t);
-}
-
-
-l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) {
- if (ttisstring(p1) || ttisnumber(p1)) p1 = p2;
- lua_assert(!ttisstring(p1) && !ttisnumber(p1));
- luaG_typeerror(L, p1, "concatenate");
-}
-
-
-l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) {
- TValue temp;
- if (luaV_tonumber(p1, &temp) == NULL)
- p2 = p1; /* first operand is wrong */
- luaG_typeerror(L, p2, "perform arithmetic on");
-}
-
-
-l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) {
- const char *t1 = objtypename(p1);
- const char *t2 = objtypename(p2);
- if (t1 == t2)
- luaG_runerror(L, "attempt to compare two %s values", t1);
- else
- luaG_runerror(L, "attempt to compare %s with %s", t1, t2);
-}
-
-
-static void addinfo (lua_State *L, const char *msg) {
- CallInfo *ci = L->ci;
- if (isLua(ci)) { /* is Lua code? */
- char buff[LUA_IDSIZE]; /* add file:line information */
- int line = currentline(ci);
- TString *src = ci_func(ci)->p->source;
- if (src)
- luaO_chunkid(buff, getstr(src), LUA_IDSIZE);
- else { /* no source available; use "?" instead */
- buff[0] = '?'; buff[1] = '\0';
- }
- luaO_pushfstring(L, "%s:%d: %s", buff, line, msg);
- }
-}
-
-
-l_noret luaG_errormsg (lua_State *L) {
- if (L->errfunc != 0) { /* is there an error handling function? */
- StkId errfunc = restorestack(L, L->errfunc);
- if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR);
- setobjs2s(L, L->top, L->top - 1); /* move argument */
- setobjs2s(L, L->top - 1, errfunc); /* push function */
- L->top++;
- luaD_call(L, L->top - 2, 1, 0); /* call it */
- }
- luaD_throw(L, LUA_ERRRUN);
-}
-
-
-l_noret luaG_runerror (lua_State *L, const char *fmt, ...) {
- va_list argp;
- va_start(argp, fmt);
- addinfo(L, luaO_pushvfstring(L, fmt, argp));
- va_end(argp);
- luaG_errormsg(L);
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h
@@ -1,46 +0,0 @@
-/*
-** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $
-** Stack and Call structure of Lua
-** See Copyright Notice in lua.h
-*/
-
-#ifndef ldo_h
-#define ldo_h
-
-
-#include "lobject.h"
-#include "lstate.h"
-#include "lzio.h"
-
-
-#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \
- luaD_growstack(L, n); else condmovestack(L);
-
-
-#define incr_top(L) {L->top++; luaD_checkstack(L,0);}
-
-#define savestack(L,p) ((char *)(p) - (char *)L->stack)
-#define restorestack(L,n) ((TValue *)((char *)L->stack + (n)))
-
-
-/* type of protected functions, to be ran by `runprotected' */
-typedef void (*Pfunc) (lua_State *L, void *ud);
-
-LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
- const char *mode);
-LUAI_FUNC void luaD_hook (lua_State *L, int event, int line);
-LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults);
-LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults,
- int allowyield);
-LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u,
- ptrdiff_t oldtop, ptrdiff_t ef);
-LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult);
-LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize);
-LUAI_FUNC void luaD_growstack (lua_State *L, int n);
-LUAI_FUNC void luaD_shrinkstack (lua_State *L);
-
-LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode);
-LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud);
-
-#endif
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c
@@ -1,691 +0,0 @@
-/*
-** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $
-** Stack and Call structure of Lua
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define ldo_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lapi.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lgc.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lopcodes.h"
-#include "lparser.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "ltm.h"
-#include "lundump.h"
-#include "lvm.h"
-#include "lzio.h"
-
-
-
-
-/*
-** {======================================================
-** Error-recovery functions
-** =======================================================
-*/
-
-/*
-** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By
-** default, Lua handles errors with exceptions when compiling as
-** C++ code, with _longjmp/_setjmp when asked to use them, and with
-** longjmp/setjmp otherwise.
-*/
-#if !defined(LUAI_THROW)
-
-#ifdef _KERNEL
-#ifdef illumos
-#define LUAI_THROW(L,c) longjmp(&(c)->b)
-#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a }
-#define luai_jmpbuf label_t
-#else
-#define LUAI_THROW(L,c) longjmp((c)->b, 1)
-#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
-#define luai_jmpbuf jmp_buf
-#endif
-#else
-#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP)
-/* C++ exceptions */
-#define LUAI_THROW(L,c) throw(c)
-#define LUAI_TRY(L,c,a) \
- try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; }
-#define luai_jmpbuf int /* dummy variable */
-
-#elif defined(LUA_USE_ULONGJMP)
-/* in Unix, try _longjmp/_setjmp (more efficient) */
-#define LUAI_THROW(L,c) _longjmp((c)->b, 1)
-#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a }
-#define luai_jmpbuf jmp_buf
-
-#else
-/* default handling with long jumps */
-#define LUAI_THROW(L,c) longjmp((c)->b, 1)
-#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
-#define luai_jmpbuf jmp_buf
-
-#endif
-
-#endif
-
-#endif
-
-
-/* chain list of long jump buffers */
-struct lua_longjmp {
- struct lua_longjmp *previous;
- luai_jmpbuf b;
- volatile int status; /* error code */
-};
-
-
-static void seterrorobj (lua_State *L, int errcode, StkId oldtop) {
- switch (errcode) {
- case LUA_ERRMEM: { /* memory error? */
- setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */
- break;
- }
- case LUA_ERRERR: {
- setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling"));
- break;
- }
- default: {
- setobjs2s(L, oldtop, L->top - 1); /* error message on current top */
- break;
- }
- }
- L->top = oldtop + 1;
-}
-
-
-l_noret luaD_throw (lua_State *L, int errcode) {
- if (L->errorJmp) { /* thread has an error handler? */
- L->errorJmp->status = errcode; /* set status */
- LUAI_THROW(L, L->errorJmp); /* jump to it */
- }
- else { /* thread has no error handler */
- L->status = cast_byte(errcode); /* mark it as dead */
- if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */
- setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */
- luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */
- }
- else { /* no handler at all; abort */
- if (G(L)->panic) { /* panic function? */
- lua_unlock(L);
- G(L)->panic(L); /* call it (last chance to jump out) */
- }
- panic("no error handler");
- }
- }
-}
-
-
-int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) {
- unsigned short oldnCcalls = L->nCcalls;
- struct lua_longjmp lj;
- lj.status = LUA_OK;
- lj.previous = L->errorJmp; /* chain new error handler */
- L->errorJmp = &lj;
- LUAI_TRY(L, &lj,
- (*f)(L, ud);
- );
- L->errorJmp = lj.previous; /* restore old error handler */
- L->nCcalls = oldnCcalls;
- return lj.status;
-}
-
-/* }====================================================== */
-
-
-static void correctstack (lua_State *L, TValue *oldstack) {
- CallInfo *ci;
- GCObject *up;
- L->top = (L->top - oldstack) + L->stack;
- for (up = L->openupval; up != NULL; up = up->gch.next)
- gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack;
- for (ci = L->ci; ci != NULL; ci = ci->previous) {
- ci->top = (ci->top - oldstack) + L->stack;
- ci->func = (ci->func - oldstack) + L->stack;
- if (isLua(ci))
- ci->u.l.base = (ci->u.l.base - oldstack) + L->stack;
- }
-}
-
-
-/* some space for error handling */
-#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200)
-
-
-void luaD_reallocstack (lua_State *L, int newsize) {
- TValue *oldstack = L->stack;
- int lim = L->stacksize;
- lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE);
- lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK);
- luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue);
- for (; lim < newsize; lim++)
- setnilvalue(L->stack + lim); /* erase new segment */
- L->stacksize = newsize;
- L->stack_last = L->stack + newsize - EXTRA_STACK;
- correctstack(L, oldstack);
-}
-
-
-void luaD_growstack (lua_State *L, int n) {
- int size = L->stacksize;
- if (size > LUAI_MAXSTACK) /* error after extra size? */
- luaD_throw(L, LUA_ERRERR);
- else {
- int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK;
- int newsize = 2 * size;
- if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK;
- if (newsize < needed) newsize = needed;
- if (newsize > LUAI_MAXSTACK) { /* stack overflow? */
- luaD_reallocstack(L, ERRORSTACKSIZE);
- luaG_runerror(L, "stack overflow");
- }
- else
- luaD_reallocstack(L, newsize);
- }
-}
-
-
-static int stackinuse (lua_State *L) {
- CallInfo *ci;
- StkId lim = L->top;
- for (ci = L->ci; ci != NULL; ci = ci->previous) {
- lua_assert(ci->top <= L->stack_last);
- if (lim < ci->top) lim = ci->top;
- }
- return cast_int(lim - L->stack) + 1; /* part of stack in use */
-}
-
-
-void luaD_shrinkstack (lua_State *L) {
- int inuse = stackinuse(L);
- int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK;
- if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK;
- if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */
- goodsize >= L->stacksize) /* would grow instead of shrink? */
- condmovestack(L); /* don't change stack (change only for debugging) */
- else
- luaD_reallocstack(L, goodsize); /* shrink it */
-}
-
-
-void luaD_hook (lua_State *L, int event, int line) {
- lua_Hook hook = L->hook;
- if (hook && L->allowhook) {
- CallInfo *ci = L->ci;
- ptrdiff_t top = savestack(L, L->top);
- ptrdiff_t ci_top = savestack(L, ci->top);
- lua_Debug ar;
- ar.event = event;
- ar.currentline = line;
- ar.i_ci = ci;
- luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
- ci->top = L->top + LUA_MINSTACK;
- lua_assert(ci->top <= L->stack_last);
- L->allowhook = 0; /* cannot call hooks inside a hook */
- ci->callstatus |= CIST_HOOKED;
- lua_unlock(L);
- (*hook)(L, &ar);
- lua_lock(L);
- lua_assert(!L->allowhook);
- L->allowhook = 1;
- ci->top = restorestack(L, ci_top);
- L->top = restorestack(L, top);
- ci->callstatus &= ~CIST_HOOKED;
- }
-}
-
-
-static void callhook (lua_State *L, CallInfo *ci) {
- int hook = LUA_HOOKCALL;
- ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */
- if (isLua(ci->previous) &&
- GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) {
- ci->callstatus |= CIST_TAIL;
- hook = LUA_HOOKTAILCALL;
- }
- luaD_hook(L, hook, -1);
- ci->u.l.savedpc--; /* correct 'pc' */
-}
-
-
-static StkId adjust_varargs (lua_State *L, Proto *p, int actual) {
- int i;
- int nfixargs = p->numparams;
- StkId base, fixed;
- lua_assert(actual >= nfixargs);
- /* move fixed parameters to final position */
- luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */
- fixed = L->top - actual; /* first fixed argument */
- base = L->top; /* final position of first argument */
- for (i=0; i<nfixargs; i++) {
- setobjs2s(L, L->top++, fixed + i);
- setnilvalue(fixed + i);
- }
- return base;
-}
-
-
-static StkId tryfuncTM (lua_State *L, StkId func) {
- const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL);
- StkId p;
- ptrdiff_t funcr = savestack(L, func);
- if (!ttisfunction(tm))
- luaG_typeerror(L, func, "call");
- /* Open a hole inside the stack at `func' */
- for (p = L->top; p > func; p--) setobjs2s(L, p, p-1);
- incr_top(L);
- func = restorestack(L, funcr); /* previous call may change stack */
- setobj2s(L, func, tm); /* tag method is the new function to be called */
- return func;
-}
-
-
-
-#define next_ci(L) (L->ci = (L->ci->next ? L->ci->next : luaE_extendCI(L)))
-
-
-/*
-** returns true if function has been executed (C function)
-*/
-int luaD_precall (lua_State *L, StkId func, int nresults) {
- lua_CFunction f;
- CallInfo *ci;
- int n; /* number of arguments (Lua) or returns (C) */
- ptrdiff_t funcr = savestack(L, func);
- switch (ttype(func)) {
- case LUA_TLCF: /* light C function */
- f = fvalue(func);
- goto Cfunc;
- case LUA_TCCL: { /* C closure */
- f = clCvalue(func)->f;
- Cfunc:
- luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
- ci = next_ci(L); /* now 'enter' new function */
- ci->nresults = nresults;
- ci->func = restorestack(L, funcr);
- ci->top = L->top + LUA_MINSTACK;
- lua_assert(ci->top <= L->stack_last);
- ci->callstatus = 0;
- luaC_checkGC(L); /* stack grow uses memory */
- if (L->hookmask & LUA_MASKCALL)
- luaD_hook(L, LUA_HOOKCALL, -1);
- lua_unlock(L);
- n = (*f)(L); /* do the actual call */
- lua_lock(L);
- api_checknelems(L, n);
- luaD_poscall(L, L->top - n);
- return 1;
- }
- case LUA_TLCL: { /* Lua function: prepare its call */
- StkId base;
- Proto *p = clLvalue(func)->p;
- n = cast_int(L->top - func) - 1; /* number of real arguments */
- luaD_checkstack(L, p->maxstacksize);
- for (; n < p->numparams; n++)
- setnilvalue(L->top++); /* complete missing arguments */
- if (!p->is_vararg) {
- func = restorestack(L, funcr);
- base = func + 1;
- }
- else {
- base = adjust_varargs(L, p, n);
- func = restorestack(L, funcr); /* previous call can change stack */
- }
- ci = next_ci(L); /* now 'enter' new function */
- ci->nresults = nresults;
- ci->func = func;
- ci->u.l.base = base;
- ci->top = base + p->maxstacksize;
- lua_assert(ci->top <= L->stack_last);
- ci->u.l.savedpc = p->code; /* starting point */
- ci->callstatus = CIST_LUA;
- L->top = ci->top;
- luaC_checkGC(L); /* stack grow uses memory */
- if (L->hookmask & LUA_MASKCALL)
- callhook(L, ci);
- return 0;
- }
- default: { /* not a function */
- func = tryfuncTM(L, func); /* retry with 'function' tag method */
- return luaD_precall(L, func, nresults); /* now it must be a function */
- }
- }
-}
-
-
-int luaD_poscall (lua_State *L, StkId firstResult) {
- StkId res;
- int wanted, i;
- CallInfo *ci = L->ci;
- if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) {
- if (L->hookmask & LUA_MASKRET) {
- ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */
- luaD_hook(L, LUA_HOOKRET, -1);
- firstResult = restorestack(L, fr);
- }
- L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */
- }
- res = ci->func; /* res == final position of 1st result */
- wanted = ci->nresults;
- L->ci = ci = ci->previous; /* back to caller */
- /* move results to correct place */
- for (i = wanted; i != 0 && firstResult < L->top; i--)
- setobjs2s(L, res++, firstResult++);
- while (i-- > 0)
- setnilvalue(res++);
- L->top = res;
- return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */
-}
-
-
-/*
-** Call a function (C or Lua). The function to be called is at *func.
-** The arguments are on the stack, right after the function.
-** When returns, all the results are on the stack, starting at the original
-** function position.
-*/
-void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) {
- if (++L->nCcalls >= LUAI_MAXCCALLS) {
- if (L->nCcalls == LUAI_MAXCCALLS)
- luaG_runerror(L, "C stack overflow");
- else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3)))
- luaD_throw(L, LUA_ERRERR); /* error while handing stack error */
- }
- if (!allowyield) L->nny++;
- if (!luaD_precall(L, func, nResults)) /* is a Lua function? */
- luaV_execute(L); /* call it */
- if (!allowyield) L->nny--;
- L->nCcalls--;
-}
-
-
-static void finishCcall (lua_State *L) {
- CallInfo *ci = L->ci;
- int n;
- lua_assert(ci->u.c.k != NULL); /* must have a continuation */
- lua_assert(L->nny == 0);
- if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */
- ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */
- L->errfunc = ci->u.c.old_errfunc;
- }
- /* finish 'lua_callk'/'lua_pcall' */
- adjustresults(L, ci->nresults);
- /* call continuation function */
- if (!(ci->callstatus & CIST_STAT)) /* no call status? */
- ci->u.c.status = LUA_YIELD; /* 'default' status */
- lua_assert(ci->u.c.status != LUA_OK);
- ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED;
- lua_unlock(L);
- n = (*ci->u.c.k)(L);
- lua_lock(L);
- api_checknelems(L, n);
- /* finish 'luaD_precall' */
- luaD_poscall(L, L->top - n);
-}
-
-
-static void unroll (lua_State *L, void *ud) {
- UNUSED(ud);
- for (;;) {
- if (L->ci == &L->base_ci) /* stack is empty? */
- return; /* coroutine finished normally */
- if (!isLua(L->ci)) /* C function? */
- finishCcall(L);
- else { /* Lua function */
- luaV_finishOp(L); /* finish interrupted instruction */
- luaV_execute(L); /* execute down to higher C 'boundary' */
- }
- }
-}
-
-
-/*
-** check whether thread has a suspended protected call
-*/
-static CallInfo *findpcall (lua_State *L) {
- CallInfo *ci;
- for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */
- if (ci->callstatus & CIST_YPCALL)
- return ci;
- }
- return NULL; /* no pending pcall */
-}
-
-
-static int recover (lua_State *L, int status) {
- StkId oldtop;
- CallInfo *ci = findpcall(L);
- if (ci == NULL) return 0; /* no recovery point */
- /* "finish" luaD_pcall */
- oldtop = restorestack(L, ci->extra);
- luaF_close(L, oldtop);
- seterrorobj(L, status, oldtop);
- L->ci = ci;
- L->allowhook = ci->u.c.old_allowhook;
- L->nny = 0; /* should be zero to be yieldable */
- luaD_shrinkstack(L);
- L->errfunc = ci->u.c.old_errfunc;
- ci->callstatus |= CIST_STAT; /* call has error status */
- ci->u.c.status = status; /* (here it is) */
- return 1; /* continue running the coroutine */
-}
-
-
-/*
-** signal an error in the call to 'resume', not in the execution of the
-** coroutine itself. (Such errors should not be handled by any coroutine
-** error handler and should not kill the coroutine.)
-*/
-static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) {
- L->top = firstArg; /* remove args from the stack */
- setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */
- api_incr_top(L);
- luaD_throw(L, -1); /* jump back to 'lua_resume' */
-}
-
-
-/*
-** do the work for 'lua_resume' in protected mode
-*/
-static void resume_cb (lua_State *L, void *ud) {
- int nCcalls = L->nCcalls;
- StkId firstArg = cast(StkId, ud);
- CallInfo *ci = L->ci;
- if (nCcalls >= LUAI_MAXCCALLS)
- resume_error(L, "C stack overflow", firstArg);
- if (L->status == LUA_OK) { /* may be starting a coroutine */
- if (ci != &L->base_ci) /* not in base level? */
- resume_error(L, "cannot resume non-suspended coroutine", firstArg);
- /* coroutine is in base level; start running it */
- if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */
- luaV_execute(L); /* call it */
- }
- else if (L->status != LUA_YIELD)
- resume_error(L, "cannot resume dead coroutine", firstArg);
- else { /* resuming from previous yield */
- L->status = LUA_OK;
- ci->func = restorestack(L, ci->extra);
- if (isLua(ci)) /* yielded inside a hook? */
- luaV_execute(L); /* just continue running Lua code */
- else { /* 'common' yield */
- if (ci->u.c.k != NULL) { /* does it have a continuation? */
- int n;
- ci->u.c.status = LUA_YIELD; /* 'default' status */
- ci->callstatus |= CIST_YIELDED;
- lua_unlock(L);
- n = (*ci->u.c.k)(L); /* call continuation */
- lua_lock(L);
- api_checknelems(L, n);
- firstArg = L->top - n; /* yield results come from continuation */
- }
- luaD_poscall(L, firstArg); /* finish 'luaD_precall' */
- }
- unroll(L, NULL);
- }
- lua_assert(nCcalls == L->nCcalls);
-}
-
-
-LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) {
- int status;
- int oldnny = L->nny; /* save 'nny' */
- lua_lock(L);
- luai_userstateresume(L, nargs);
- L->nCcalls = (from) ? from->nCcalls + 1 : 1;
- L->nny = 0; /* allow yields */
- api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs);
- status = luaD_rawrunprotected(L, resume_cb, L->top - nargs);
- if (status == -1) /* error calling 'lua_resume'? */
- status = LUA_ERRRUN;
- else { /* yield or regular error */
- while (status != LUA_OK && status != LUA_YIELD) { /* error? */
- if (recover(L, status)) /* recover point? */
- status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */
- else { /* unrecoverable error */
- L->status = cast_byte(status); /* mark thread as `dead' */
- seterrorobj(L, status, L->top);
- L->ci->top = L->top;
- break;
- }
- }
- lua_assert(status == L->status);
- }
- L->nny = oldnny; /* restore 'nny' */
- L->nCcalls--;
- lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0));
- lua_unlock(L);
- return status;
-}
-
-
-LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) {
- CallInfo *ci = L->ci;
- luai_userstateyield(L, nresults);
- lua_lock(L);
- api_checknelems(L, nresults);
- if (L->nny > 0) {
- if (L != G(L)->mainthread)
- luaG_runerror(L, "attempt to yield across a C-call boundary");
- else
- luaG_runerror(L, "attempt to yield from outside a coroutine");
- }
- L->status = LUA_YIELD;
- ci->extra = savestack(L, ci->func); /* save current 'func' */
- if (isLua(ci)) { /* inside a hook? */
- api_check(L, k == NULL, "hooks cannot continue after yielding");
- }
- else {
- if ((ci->u.c.k = k) != NULL) /* is there a continuation? */
- ci->u.c.ctx = ctx; /* save context */
- ci->func = L->top - nresults - 1; /* protect stack below results */
- luaD_throw(L, LUA_YIELD);
- }
- lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */
- lua_unlock(L);
- return 0; /* return to 'luaD_hook' */
-}
-
-
-int luaD_pcall (lua_State *L, Pfunc func, void *u,
- ptrdiff_t old_top, ptrdiff_t ef) {
- int status;
- CallInfo *old_ci = L->ci;
- lu_byte old_allowhooks = L->allowhook;
- unsigned short old_nny = L->nny;
- ptrdiff_t old_errfunc = L->errfunc;
- L->errfunc = ef;
- status = luaD_rawrunprotected(L, func, u);
- if (status != LUA_OK) { /* an error occurred? */
- StkId oldtop = restorestack(L, old_top);
- luaF_close(L, oldtop); /* close possible pending closures */
- seterrorobj(L, status, oldtop);
- L->ci = old_ci;
- L->allowhook = old_allowhooks;
- L->nny = old_nny;
- luaD_shrinkstack(L);
- }
- L->errfunc = old_errfunc;
- return status;
-}
-
-
-
-/*
-** Execute a protected parser.
-*/
-struct SParser { /* data to `f_parser' */
- ZIO *z;
- Mbuffer buff; /* dynamic structure used by the scanner */
- Dyndata dyd; /* dynamic structures used by the parser */
- const char *mode;
- const char *name;
-};
-
-
-static void checkmode (lua_State *L, const char *mode, const char *x) {
- if (mode && strchr(mode, x[0]) == NULL) {
- luaO_pushfstring(L,
- "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode);
- luaD_throw(L, LUA_ERRSYNTAX);
- }
-}
-
-
-static void f_parser (lua_State *L, void *ud) {
- int i;
- Closure *cl;
- struct SParser *p = cast(struct SParser *, ud);
- int c = zgetc(p->z); /* read first character */
- if (c == LUA_SIGNATURE[0]) {
- checkmode(L, p->mode, "binary");
- cl = luaU_undump(L, p->z, &p->buff, p->name);
- }
- else {
- checkmode(L, p->mode, "text");
- cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c);
- }
- lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues);
- for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */
- UpVal *up = luaF_newupval(L);
- cl->l.upvals[i] = up;
- luaC_objbarrier(L, cl, up);
- }
-}
-
-
-int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
- const char *mode) {
- struct SParser p;
- int status;
- L->nny++; /* cannot yield during parsing */
- p.z = z; p.name = name; p.mode = mode;
- p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0;
- p.dyd.gt.arr = NULL; p.dyd.gt.size = 0;
- p.dyd.label.arr = NULL; p.dyd.label.size = 0;
- luaZ_initbuffer(L, &p.buff);
- status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc);
- luaZ_freebuffer(L, &p.buff);
- luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size);
- luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size);
- luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size);
- L->nny--;
- return status;
-}
-
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c
@@ -1,173 +0,0 @@
-/*
-** $Id: ldump.c,v 2.17.1.1 2013/04/12 18:48:47 roberto Exp $
-** save precompiled Lua chunks
-** See Copyright Notice in lua.h
-*/
-
-#include <sys/zfs_context.h>
-
-#define ldump_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lobject.h"
-#include "lstate.h"
-#include "lundump.h"
-
-typedef struct {
- lua_State* L;
- lua_Writer writer;
- void* data;
- int strip;
- int status;
-} DumpState;
-
-#define DumpMem(b,n,size,D) DumpBlock(b,(n)*(size),D)
-#define DumpVar(x,D) DumpMem(&x,1,sizeof(x),D)
-
-static void DumpBlock(const void* b, size_t size, DumpState* D)
-{
- if (D->status==0)
- {
- lua_unlock(D->L);
- D->status=(*D->writer)(D->L,b,size,D->data);
- lua_lock(D->L);
- }
-}
-
-static void DumpChar(int y, DumpState* D)
-{
- char x=(char)y;
- DumpVar(x,D);
-}
-
-static void DumpInt(int x, DumpState* D)
-{
- DumpVar(x,D);
-}
-
-static void DumpNumber(lua_Number x, DumpState* D)
-{
- DumpVar(x,D);
-}
-
-static void DumpVector(const void* b, int n, size_t size, DumpState* D)
-{
- DumpInt(n,D);
- DumpMem(b,n,size,D);
-}
-
-static void DumpString(const TString* s, DumpState* D)
-{
- if (s==NULL)
- {
- size_t size=0;
- DumpVar(size,D);
- }
- else
- {
- size_t size=s->tsv.len+1; /* include trailing '\0' */
- DumpVar(size,D);
- DumpBlock(getstr(s),size*sizeof(char),D);
- }
-}
-
-#define DumpCode(f,D) DumpVector(f->code,f->sizecode,sizeof(Instruction),D)
-
-static void DumpFunction(const Proto* f, DumpState* D);
-
-static void DumpConstants(const Proto* f, DumpState* D)
-{
- int i,n=f->sizek;
- DumpInt(n,D);
- for (i=0; i<n; i++)
- {
- const TValue* o=&f->k[i];
- DumpChar(ttypenv(o),D);
- switch (ttypenv(o))
- {
- case LUA_TNIL:
- break;
- case LUA_TBOOLEAN:
- DumpChar(bvalue(o),D);
- break;
- case LUA_TNUMBER:
- DumpNumber(nvalue(o),D);
- break;
- case LUA_TSTRING:
- DumpString(rawtsvalue(o),D);
- break;
- default: lua_assert(0);
- }
- }
- n=f->sizep;
- DumpInt(n,D);
- for (i=0; i<n; i++) DumpFunction(f->p[i],D);
-}
-
-static void DumpUpvalues(const Proto* f, DumpState* D)
-{
- int i,n=f->sizeupvalues;
- DumpInt(n,D);
- for (i=0; i<n; i++)
- {
- DumpChar(f->upvalues[i].instack,D);
- DumpChar(f->upvalues[i].idx,D);
- }
-}
-
-static void DumpDebug(const Proto* f, DumpState* D)
-{
- int i,n;
- DumpString((D->strip) ? NULL : f->source,D);
- n= (D->strip) ? 0 : f->sizelineinfo;
- DumpVector(f->lineinfo,n,sizeof(int),D);
- n= (D->strip) ? 0 : f->sizelocvars;
- DumpInt(n,D);
- for (i=0; i<n; i++)
- {
- DumpString(f->locvars[i].varname,D);
- DumpInt(f->locvars[i].startpc,D);
- DumpInt(f->locvars[i].endpc,D);
- }
- n= (D->strip) ? 0 : f->sizeupvalues;
- DumpInt(n,D);
- for (i=0; i<n; i++) DumpString(f->upvalues[i].name,D);
-}
-
-static void DumpFunction(const Proto* f, DumpState* D)
-{
- DumpInt(f->linedefined,D);
- DumpInt(f->lastlinedefined,D);
- DumpChar(f->numparams,D);
- DumpChar(f->is_vararg,D);
- DumpChar(f->maxstacksize,D);
- DumpCode(f,D);
- DumpConstants(f,D);
- DumpUpvalues(f,D);
- DumpDebug(f,D);
-}
-
-static void DumpHeader(DumpState* D)
-{
- lu_byte h[LUAC_HEADERSIZE];
- luaU_header(h);
- DumpBlock(h,LUAC_HEADERSIZE,D);
-}
-
-/*
-** dump Lua function as precompiled chunk
-*/
-int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip)
-{
- DumpState D;
- D.L=L;
- D.writer=w;
- D.data=data;
- D.strip=strip;
- D.status=0;
- DumpHeader(&D);
- DumpFunction(f,&D);
- return D.status;
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h
@@ -1,33 +0,0 @@
-/*
-** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $
-** Auxiliary functions to manipulate prototypes and closures
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lfunc_h
-#define lfunc_h
-
-
-#include "lobject.h"
-
-
-#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \
- cast(int, sizeof(TValue)*((n)-1)))
-
-#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \
- cast(int, sizeof(TValue *)*((n)-1)))
-
-
-LUAI_FUNC Proto *luaF_newproto (lua_State *L);
-LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems);
-LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems);
-LUAI_FUNC UpVal *luaF_newupval (lua_State *L);
-LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level);
-LUAI_FUNC void luaF_close (lua_State *L, StkId level);
-LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f);
-LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv);
-LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number,
- int pc);
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c
@@ -1,161 +0,0 @@
-/*
-** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $
-** Auxiliary functions to manipulate prototypes and closures
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define lfunc_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lfunc.h"
-#include "lgc.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lstate.h"
-
-
-
-Closure *luaF_newCclosure (lua_State *L, int n) {
- Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl;
- c->c.nupvalues = cast_byte(n);
- return c;
-}
-
-
-Closure *luaF_newLclosure (lua_State *L, int n) {
- Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl;
- c->l.p = NULL;
- c->l.nupvalues = cast_byte(n);
- while (n--) c->l.upvals[n] = NULL;
- return c;
-}
-
-
-UpVal *luaF_newupval (lua_State *L) {
- UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv;
- uv->v = &uv->u.value;
- setnilvalue(uv->v);
- return uv;
-}
-
-
-UpVal *luaF_findupval (lua_State *L, StkId level) {
- global_State *g = G(L);
- GCObject **pp = &L->openupval;
- UpVal *p;
- UpVal *uv;
- while (*pp != NULL && (p = gco2uv(*pp))->v >= level) {
- GCObject *o = obj2gco(p);
- lua_assert(p->v != &p->u.value);
- lua_assert(!isold(o) || isold(obj2gco(L)));
- if (p->v == level) { /* found a corresponding upvalue? */
- if (isdead(g, o)) /* is it dead? */
- changewhite(o); /* resurrect it */
- return p;
- }
- pp = &p->next;
- }
- /* not found: create a new one */
- uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv;
- uv->v = level; /* current value lives in the stack */
- uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */
- uv->u.l.next = g->uvhead.u.l.next;
- uv->u.l.next->u.l.prev = uv;
- g->uvhead.u.l.next = uv;
- lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
- return uv;
-}
-
-
-static void unlinkupval (UpVal *uv) {
- lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
- uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */
- uv->u.l.prev->u.l.next = uv->u.l.next;
-}
-
-
-void luaF_freeupval (lua_State *L, UpVal *uv) {
- if (uv->v != &uv->u.value) /* is it open? */
- unlinkupval(uv); /* remove from open list */
- luaM_free(L, uv); /* free upvalue */
-}
-
-
-void luaF_close (lua_State *L, StkId level) {
- UpVal *uv;
- global_State *g = G(L);
- while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) {
- GCObject *o = obj2gco(uv);
- lua_assert(!isblack(o) && uv->v != &uv->u.value);
- L->openupval = uv->next; /* remove from `open' list */
- if (isdead(g, o))
- luaF_freeupval(L, uv); /* free upvalue */
- else {
- unlinkupval(uv); /* remove upvalue from 'uvhead' list */
- setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */
- uv->v = &uv->u.value; /* now current value lives here */
- gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */
- g->allgc = o;
- luaC_checkupvalcolor(g, uv);
- }
- }
-}
-
-
-Proto *luaF_newproto (lua_State *L) {
- Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p;
- f->k = NULL;
- f->sizek = 0;
- f->p = NULL;
- f->sizep = 0;
- f->code = NULL;
- f->cache = NULL;
- f->sizecode = 0;
- f->lineinfo = NULL;
- f->sizelineinfo = 0;
- f->upvalues = NULL;
- f->sizeupvalues = 0;
- f->numparams = 0;
- f->is_vararg = 0;
- f->maxstacksize = 0;
- f->locvars = NULL;
- f->sizelocvars = 0;
- f->linedefined = 0;
- f->lastlinedefined = 0;
- f->source = NULL;
- return f;
-}
-
-
-void luaF_freeproto (lua_State *L, Proto *f) {
- luaM_freearray(L, f->code, f->sizecode);
- luaM_freearray(L, f->p, f->sizep);
- luaM_freearray(L, f->k, f->sizek);
- luaM_freearray(L, f->lineinfo, f->sizelineinfo);
- luaM_freearray(L, f->locvars, f->sizelocvars);
- luaM_freearray(L, f->upvalues, f->sizeupvalues);
- luaM_free(L, f);
-}
-
-
-/*
-** Look for n-th local variable at line `line' in function `func'.
-** Returns NULL if not found.
-*/
-const char *luaF_getlocalname (const Proto *f, int local_number, int pc) {
- int i;
- for (i = 0; i<f->sizelocvars && f->locvars[i].startpc <= pc; i++) {
- if (pc < f->locvars[i].endpc) { /* is variable active? */
- local_number--;
- if (local_number == 0)
- return getstr(f->locvars[i].varname);
- }
- }
- return NULL; /* not found */
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h
@@ -1,157 +0,0 @@
-/*
-** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
-** Garbage Collector
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lgc_h
-#define lgc_h
-
-
-#include "lobject.h"
-#include "lstate.h"
-
-/*
-** Collectable objects may have one of three colors: white, which
-** means the object is not marked; gray, which means the
-** object is marked, but its references may be not marked; and
-** black, which means that the object and all its references are marked.
-** The main invariant of the garbage collector, while marking objects,
-** is that a black object can never point to a white one. Moreover,
-** any gray object must be in a "gray list" (gray, grayagain, weak,
-** allweak, ephemeron) so that it can be visited again before finishing
-** the collection cycle. These lists have no meaning when the invariant
-** is not being enforced (e.g., sweep phase).
-*/
-
-
-
-/* how much to allocate before next GC step */
-#if !defined(GCSTEPSIZE)
-/* ~100 small strings */
-#define GCSTEPSIZE (cast_int(100 * sizeof(TString)))
-#endif
-
-
-/*
-** Possible states of the Garbage Collector
-*/
-#define GCSpropagate 0
-#define GCSatomic 1
-#define GCSsweepstring 2
-#define GCSsweepudata 3
-#define GCSsweep 4
-#define GCSpause 5
-
-
-#define issweepphase(g) \
- (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep)
-
-#define isgenerational(g) ((g)->gckind == KGC_GEN)
-
-/*
-** macros to tell when main invariant (white objects cannot point to black
-** ones) must be kept. During a non-generational collection, the sweep
-** phase may break the invariant, as objects turned white may point to
-** still-black objects. The invariant is restored when sweep ends and
-** all objects are white again. During a generational collection, the
-** invariant must be kept all times.
-*/
-
-#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic)
-
-
-/*
-** Outside the collector, the state in generational mode is kept in
-** 'propagate', so 'keepinvariant' is always true.
-*/
-#define keepinvariantout(g) \
- check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \
- g->gcstate <= GCSatomic)
-
-
-/*
-** some useful bit tricks
-*/
-#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m)))
-#define setbits(x,m) ((x) |= (m))
-#define testbits(x,m) ((x) & (m))
-#define bitmask(b) (1<<(b))
-#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))
-#define l_setbit(x,b) setbits(x, bitmask(b))
-#define resetbit(x,b) resetbits(x, bitmask(b))
-#define testbit(x,b) testbits(x, bitmask(b))
-
-
-/* Layout for bit use in `marked' field: */
-#define WHITE0BIT 0 /* object is white (type 0) */
-#define WHITE1BIT 1 /* object is white (type 1) */
-#define BLACKBIT 2 /* object is black */
-#define FINALIZEDBIT 3 /* object has been separated for finalization */
-#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */
-#define FIXEDBIT 5 /* object is fixed (should not be collected) */
-#define OLDBIT 6 /* object is old (only in generational mode) */
-/* bit 7 is currently used by tests (luaL_checkmemory) */
-
-#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT)
-
-
-#define iswhite(x) testbits((x)->gch.marked, WHITEBITS)
-#define isblack(x) testbit((x)->gch.marked, BLACKBIT)
-#define isgray(x) /* neither white nor black */ \
- (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT)))
-
-#define isold(x) testbit((x)->gch.marked, OLDBIT)
-
-/* MOVE OLD rule: whenever an object is moved to the beginning of
- a GC list, its old bit must be cleared */
-#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT)
-
-#define otherwhite(g) (g->currentwhite ^ WHITEBITS)
-#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow)))
-#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked)
-
-#define changewhite(x) ((x)->gch.marked ^= WHITEBITS)
-#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT)
-
-#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x)))
-
-#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS)
-
-
-#define luaC_condGC(L,c) \
- {if (G(L)->GCdebt > 0) {c;}; condchangemem(L);}
-#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);)
-
-
-#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
- luaC_barrier_(L,obj2gco(p),gcvalue(v)); }
-
-#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
- luaC_barrierback_(L,p); }
-
-#define luaC_objbarrier(L,p,o) \
- { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \
- luaC_barrier_(L,obj2gco(p),obj2gco(o)); }
-
-#define luaC_objbarrierback(L,p,o) \
- { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); }
-
-#define luaC_barrierproto(L,p,c) \
- { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); }
-
-LUAI_FUNC void luaC_freeallobjects (lua_State *L);
-LUAI_FUNC void luaC_step (lua_State *L);
-LUAI_FUNC void luaC_forcestep (lua_State *L);
-LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask);
-LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency);
-LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz,
- GCObject **list, int offset);
-LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v);
-LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o);
-LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c);
-LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt);
-LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv);
-LUAI_FUNC void luaC_changemode (lua_State *L, int mode);
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c
@@ -1,1220 +0,0 @@
-/*
-** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $
-** Garbage Collector
-** See Copyright Notice in lua.h
-*/
-
-#include <sys/zfs_context.h>
-
-#define lgc_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lgc.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "ltm.h"
-
-
-
-/*
-** cost of sweeping one element (the size of a small object divided
-** by some adjust for the sweep speed)
-*/
-#define GCSWEEPCOST ((sizeof(TString) + 4) / 4)
-
-/* maximum number of elements to sweep in each single step */
-#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4))
-
-/* maximum number of finalizers to call in each GC step */
-#define GCFINALIZENUM 4
-
-
-/*
-** macro to adjust 'stepmul': 'stepmul' is actually used like
-** 'stepmul / STEPMULADJ' (value chosen by tests)
-*/
-#define STEPMULADJ 200
-
-
-/*
-** macro to adjust 'pause': 'pause' is actually used like
-** 'pause / PAUSEADJ' (value chosen by tests)
-*/
-#define PAUSEADJ 100
-
-
-/*
-** 'makewhite' erases all color bits plus the old bit and then
-** sets only the current white bit
-*/
-#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS))
-#define makewhite(g,x) \
- (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g)))
-
-#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS)
-#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT)
-
-
-#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT)
-
-#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n)))
-
-
-#define checkconsistency(obj) \
- lua_longassert(!iscollectable(obj) || righttt(obj))
-
-
-#define markvalue(g,o) { checkconsistency(o); \
- if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); }
-
-#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \
- reallymarkobject(g, obj2gco(t)); }
-
-static void reallymarkobject (global_State *g, GCObject *o);
-
-
-/*
-** {======================================================
-** Generic functions
-** =======================================================
-*/
-
-
-/*
-** one after last element in a hash array
-*/
-#define gnodelast(h) gnode(h, cast(size_t, sizenode(h)))
-
-
-/*
-** link table 'h' into list pointed by 'p'
-*/
-#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h))
-
-
-/*
-** if key is not marked, mark its entry as dead (therefore removing it
-** from the table)
-*/
-static void removeentry (Node *n) {
- lua_assert(ttisnil(gval(n)));
- if (valiswhite(gkey(n)))
- setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */
-}
-
-
-/*
-** tells whether a key or value can be cleared from a weak
-** table. Non-collectable objects are never removed from weak
-** tables. Strings behave as `values', so are never removed too. for
-** other objects: if really collected, cannot keep them; for objects
-** being finalized, keep them in keys, but not in values
-*/
-static int iscleared (global_State *g, const TValue *o) {
- if (!iscollectable(o)) return 0;
- else if (ttisstring(o)) {
- markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */
- return 0;
- }
- else return iswhite(gcvalue(o));
-}
-
-
-/*
-** barrier that moves collector forward, that is, mark the white object
-** being pointed by a black object.
-*/
-void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) {
- global_State *g = G(L);
- lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o));
- lua_assert(g->gcstate != GCSpause);
- lua_assert(gch(o)->tt != LUA_TTABLE);
- if (keepinvariantout(g)) /* must keep invariant? */
- reallymarkobject(g, v); /* restore invariant */
- else { /* sweep phase */
- lua_assert(issweepphase(g));
- makewhite(g, o); /* mark main obj. as white to avoid other barriers */
- }
-}
-
-
-/*
-** barrier that moves collector backward, that is, mark the black object
-** pointing to a white object as gray again. (Current implementation
-** only works for tables; access to 'gclist' is not uniform across
-** different types.)
-*/
-void luaC_barrierback_ (lua_State *L, GCObject *o) {
- global_State *g = G(L);
- lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE);
- black2gray(o); /* make object gray (again) */
- gco2t(o)->gclist = g->grayagain;
- g->grayagain = o;
-}
-
-
-/*
-** barrier for prototypes. When creating first closure (cache is
-** NULL), use a forward barrier; this may be the only closure of the
-** prototype (if it is a "regular" function, with a single instance)
-** and the prototype may be big, so it is better to avoid traversing
-** it again. Otherwise, use a backward barrier, to avoid marking all
-** possible instances.
-*/
-LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) {
- global_State *g = G(L);
- lua_assert(isblack(obj2gco(p)));
- if (p->cache == NULL) { /* first time? */
- luaC_objbarrier(L, p, c);
- }
- else { /* use a backward barrier */
- black2gray(obj2gco(p)); /* make prototype gray (again) */
- p->gclist = g->grayagain;
- g->grayagain = obj2gco(p);
- }
-}
-
-
-/*
-** check color (and invariants) for an upvalue that was closed,
-** i.e., moved into the 'allgc' list
-*/
-void luaC_checkupvalcolor (global_State *g, UpVal *uv) {
- GCObject *o = obj2gco(uv);
- lua_assert(!isblack(o)); /* open upvalues are never black */
- if (isgray(o)) {
- if (keepinvariant(g)) {
- resetoldbit(o); /* see MOVE OLD rule */
- gray2black(o); /* it is being visited now */
- markvalue(g, uv->v);
- }
- else {
- lua_assert(issweepphase(g));
- makewhite(g, o);
- }
- }
-}
-
-
-/*
-** create a new collectable object (with given type and size) and link
-** it to '*list'. 'offset' tells how many bytes to allocate before the
-** object itself (used only by states).
-*/
-GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list,
- int offset) {
- global_State *g = G(L);
- char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz));
- GCObject *o = obj2gco(raw + offset);
- if (list == NULL)
- list = &g->allgc; /* standard list for collectable objects */
- gch(o)->marked = luaC_white(g);
- gch(o)->tt = tt;
- gch(o)->next = *list;
- *list = o;
- return o;
-}
-
-/* }====================================================== */
-
-
-
-/*
-** {======================================================
-** Mark functions
-** =======================================================
-*/
-
-
-/*
-** mark an object. Userdata, strings, and closed upvalues are visited
-** and turned black here. Other objects are marked gray and added
-** to appropriate list to be visited (and turned black) later. (Open
-** upvalues are already linked in 'headuv' list.)
-*/
-static void reallymarkobject (global_State *g, GCObject *o) {
- lu_mem size;
- white2gray(o);
- switch (gch(o)->tt) {
- case LUA_TSHRSTR:
- case LUA_TLNGSTR: {
- size = sizestring(gco2ts(o));
- break; /* nothing else to mark; make it black */
- }
- case LUA_TUSERDATA: {
- Table *mt = gco2u(o)->metatable;
- markobject(g, mt);
- markobject(g, gco2u(o)->env);
- size = sizeudata(gco2u(o));
- break;
- }
- case LUA_TUPVAL: {
- UpVal *uv = gco2uv(o);
- markvalue(g, uv->v);
- if (uv->v != &uv->u.value) /* open? */
- return; /* open upvalues remain gray */
- size = sizeof(UpVal);
- break;
- }
- case LUA_TLCL: {
- gco2lcl(o)->gclist = g->gray;
- g->gray = o;
- return;
- }
- case LUA_TCCL: {
- gco2ccl(o)->gclist = g->gray;
- g->gray = o;
- return;
- }
- case LUA_TTABLE: {
- linktable(gco2t(o), &g->gray);
- return;
- }
- case LUA_TTHREAD: {
- gco2th(o)->gclist = g->gray;
- g->gray = o;
- return;
- }
- case LUA_TPROTO: {
- gco2p(o)->gclist = g->gray;
- g->gray = o;
- return;
- }
- default: lua_assert(0); return;
- }
- gray2black(o);
- g->GCmemtrav += size;
-}
-
-
-/*
-** mark metamethods for basic types
-*/
-static void markmt (global_State *g) {
- int i;
- for (i=0; i < LUA_NUMTAGS; i++)
- markobject(g, g->mt[i]);
-}
-
-
-/*
-** mark all objects in list of being-finalized
-*/
-static void markbeingfnz (global_State *g) {
- GCObject *o;
- for (o = g->tobefnz; o != NULL; o = gch(o)->next) {
- makewhite(g, o);
- reallymarkobject(g, o);
- }
-}
-
-
-/*
-** mark all values stored in marked open upvalues. (See comment in
-** 'lstate.h'.)
-*/
-static void remarkupvals (global_State *g) {
- UpVal *uv;
- for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) {
- if (isgray(obj2gco(uv)))
- markvalue(g, uv->v);
- }
-}
-
-
-/*
-** mark root set and reset all gray lists, to start a new
-** incremental (or full) collection
-*/
-static void restartcollection (global_State *g) {
- g->gray = g->grayagain = NULL;
- g->weak = g->allweak = g->ephemeron = NULL;
- markobject(g, g->mainthread);
- markvalue(g, &g->l_registry);
- markmt(g);
- markbeingfnz(g); /* mark any finalizing object left from previous cycle */
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Traverse functions
-** =======================================================
-*/
-
-static void traverseweakvalue (global_State *g, Table *h) {
- Node *n, *limit = gnodelast(h);
- /* if there is array part, assume it may have white values (do not
- traverse it just to check) */
- int hasclears = (h->sizearray > 0);
- for (n = gnode(h, 0); n < limit; n++) {
- checkdeadkey(n);
- if (ttisnil(gval(n))) /* entry is empty? */
- removeentry(n); /* remove it */
- else {
- lua_assert(!ttisnil(gkey(n)));
- markvalue(g, gkey(n)); /* mark key */
- if (!hasclears && iscleared(g, gval(n))) /* is there a white value? */
- hasclears = 1; /* table will have to be cleared */
- }
- }
- if (hasclears)
- linktable(h, &g->weak); /* has to be cleared later */
- else /* no white values */
- linktable(h, &g->grayagain); /* no need to clean */
-}
-
-
-static int traverseephemeron (global_State *g, Table *h) {
- int marked = 0; /* true if an object is marked in this traversal */
- int hasclears = 0; /* true if table has white keys */
- int prop = 0; /* true if table has entry "white-key -> white-value" */
- Node *n, *limit = gnodelast(h);
- int i;
- /* traverse array part (numeric keys are 'strong') */
- for (i = 0; i < h->sizearray; i++) {
- if (valiswhite(&h->array[i])) {
- marked = 1;
- reallymarkobject(g, gcvalue(&h->array[i]));
- }
- }
- /* traverse hash part */
- for (n = gnode(h, 0); n < limit; n++) {
- checkdeadkey(n);
- if (ttisnil(gval(n))) /* entry is empty? */
- removeentry(n); /* remove it */
- else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */
- hasclears = 1; /* table must be cleared */
- if (valiswhite(gval(n))) /* value not marked yet? */
- prop = 1; /* must propagate again */
- }
- else if (valiswhite(gval(n))) { /* value not marked yet? */
- marked = 1;
- reallymarkobject(g, gcvalue(gval(n))); /* mark it now */
- }
- }
- if (g->gcstate != GCSatomic || prop)
- linktable(h, &g->ephemeron); /* have to propagate again */
- else if (hasclears) /* does table have white keys? */
- linktable(h, &g->allweak); /* may have to clean white keys */
- else /* no white keys */
- linktable(h, &g->grayagain); /* no need to clean */
- return marked;
-}
-
-
-static void traversestrongtable (global_State *g, Table *h) {
- Node *n, *limit = gnodelast(h);
- int i;
- for (i = 0; i < h->sizearray; i++) /* traverse array part */
- markvalue(g, &h->array[i]);
- for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */
- checkdeadkey(n);
- if (ttisnil(gval(n))) /* entry is empty? */
- removeentry(n); /* remove it */
- else {
- lua_assert(!ttisnil(gkey(n)));
- markvalue(g, gkey(n)); /* mark key */
- markvalue(g, gval(n)); /* mark value */
- }
- }
-}
-
-
-static lu_mem traversetable (global_State *g, Table *h) {
- const char *weakkey, *weakvalue;
- const TValue *mode = gfasttm(g, h->metatable, TM_MODE);
- markobject(g, h->metatable);
- if (mode && ttisstring(mode) && /* is there a weak mode? */
- ((weakkey = strchr(svalue(mode), 'k')),
- (weakvalue = strchr(svalue(mode), 'v')),
- (weakkey || weakvalue))) { /* is really weak? */
- black2gray(obj2gco(h)); /* keep table gray */
- if (!weakkey) /* strong keys? */
- traverseweakvalue(g, h);
- else if (!weakvalue) /* strong values? */
- traverseephemeron(g, h);
- else /* all weak */
- linktable(h, &g->allweak); /* nothing to traverse now */
- }
- else /* not weak */
- traversestrongtable(g, h);
- return sizeof(Table) + sizeof(TValue) * h->sizearray +
- sizeof(Node) * cast(size_t, sizenode(h));
-}
-
-
-static int traverseproto (global_State *g, Proto *f) {
- int i;
- if (f->cache && iswhite(obj2gco(f->cache)))
- f->cache = NULL; /* allow cache to be collected */
- markobject(g, f->source);
- for (i = 0; i < f->sizek; i++) /* mark literals */
- markvalue(g, &f->k[i]);
- for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */
- markobject(g, f->upvalues[i].name);
- for (i = 0; i < f->sizep; i++) /* mark nested protos */
- markobject(g, f->p[i]);
- for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */
- markobject(g, f->locvars[i].varname);
- return sizeof(Proto) + sizeof(Instruction) * f->sizecode +
- sizeof(Proto *) * f->sizep +
- sizeof(TValue) * f->sizek +
- sizeof(int) * f->sizelineinfo +
- sizeof(LocVar) * f->sizelocvars +
- sizeof(Upvaldesc) * f->sizeupvalues;
-}
-
-
-static lu_mem traverseCclosure (global_State *g, CClosure *cl) {
- int i;
- for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
- markvalue(g, &cl->upvalue[i]);
- return sizeCclosure(cl->nupvalues);
-}
-
-static lu_mem traverseLclosure (global_State *g, LClosure *cl) {
- int i;
- markobject(g, cl->p); /* mark its prototype */
- for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
- markobject(g, cl->upvals[i]);
- return sizeLclosure(cl->nupvalues);
-}
-
-
-static lu_mem traversestack (global_State *g, lua_State *th) {
- int n = 0;
- StkId o = th->stack;
- if (o == NULL)
- return 1; /* stack not completely built yet */
- for (; o < th->top; o++) /* mark live elements in the stack */
- markvalue(g, o);
- if (g->gcstate == GCSatomic) { /* final traversal? */
- StkId lim = th->stack + th->stacksize; /* real end of stack */
- for (; o < lim; o++) /* clear not-marked stack slice */
- setnilvalue(o);
- }
- else { /* count call infos to compute size */
- CallInfo *ci;
- for (ci = &th->base_ci; ci != th->ci; ci = ci->next)
- n++;
- }
- return sizeof(lua_State) + sizeof(TValue) * th->stacksize +
- sizeof(CallInfo) * n;
-}
-
-
-/*
-** traverse one gray object, turning it to black (except for threads,
-** which are always gray).
-*/
-static void propagatemark (global_State *g) {
- lu_mem size;
- GCObject *o = g->gray;
- lua_assert(isgray(o));
- gray2black(o);
- switch (gch(o)->tt) {
- case LUA_TTABLE: {
- Table *h = gco2t(o);
- g->gray = h->gclist; /* remove from 'gray' list */
- size = traversetable(g, h);
- break;
- }
- case LUA_TLCL: {
- LClosure *cl = gco2lcl(o);
- g->gray = cl->gclist; /* remove from 'gray' list */
- size = traverseLclosure(g, cl);
- break;
- }
- case LUA_TCCL: {
- CClosure *cl = gco2ccl(o);
- g->gray = cl->gclist; /* remove from 'gray' list */
- size = traverseCclosure(g, cl);
- break;
- }
- case LUA_TTHREAD: {
- lua_State *th = gco2th(o);
- g->gray = th->gclist; /* remove from 'gray' list */
- th->gclist = g->grayagain;
- g->grayagain = o; /* insert into 'grayagain' list */
- black2gray(o);
- size = traversestack(g, th);
- break;
- }
- case LUA_TPROTO: {
- Proto *p = gco2p(o);
- g->gray = p->gclist; /* remove from 'gray' list */
- size = traverseproto(g, p);
- break;
- }
- default: lua_assert(0); return;
- }
- g->GCmemtrav += size;
-}
-
-
-static void propagateall (global_State *g) {
- while (g->gray) propagatemark(g);
-}
-
-
-static void propagatelist (global_State *g, GCObject *l) {
- lua_assert(g->gray == NULL); /* no grays left */
- g->gray = l;
- propagateall(g); /* traverse all elements from 'l' */
-}
-
-/*
-** retraverse all gray lists. Because tables may be reinserted in other
-** lists when traversed, traverse the original lists to avoid traversing
-** twice the same table (which is not wrong, but inefficient)
-*/
-static void retraversegrays (global_State *g) {
- GCObject *weak = g->weak; /* save original lists */
- GCObject *grayagain = g->grayagain;
- GCObject *ephemeron = g->ephemeron;
- g->weak = g->grayagain = g->ephemeron = NULL;
- propagateall(g); /* traverse main gray list */
- propagatelist(g, grayagain);
- propagatelist(g, weak);
- propagatelist(g, ephemeron);
-}
-
-
-static void convergeephemerons (global_State *g) {
- int changed;
- do {
- GCObject *w;
- GCObject *next = g->ephemeron; /* get ephemeron list */
- g->ephemeron = NULL; /* tables will return to this list when traversed */
- changed = 0;
- while ((w = next) != NULL) {
- next = gco2t(w)->gclist;
- if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */
- propagateall(g); /* propagate changes */
- changed = 1; /* will have to revisit all ephemeron tables */
- }
- }
- } while (changed);
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Sweep Functions
-** =======================================================
-*/
-
-
-/*
-** clear entries with unmarked keys from all weaktables in list 'l' up
-** to element 'f'
-*/
-static void clearkeys (global_State *g, GCObject *l, GCObject *f) {
- for (; l != f; l = gco2t(l)->gclist) {
- Table *h = gco2t(l);
- Node *n, *limit = gnodelast(h);
- for (n = gnode(h, 0); n < limit; n++) {
- if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) {
- setnilvalue(gval(n)); /* remove value ... */
- removeentry(n); /* and remove entry from table */
- }
- }
- }
-}
-
-
-/*
-** clear entries with unmarked values from all weaktables in list 'l' up
-** to element 'f'
-*/
-static void clearvalues (global_State *g, GCObject *l, GCObject *f) {
- for (; l != f; l = gco2t(l)->gclist) {
- Table *h = gco2t(l);
- Node *n, *limit = gnodelast(h);
- int i;
- for (i = 0; i < h->sizearray; i++) {
- TValue *o = &h->array[i];
- if (iscleared(g, o)) /* value was collected? */
- setnilvalue(o); /* remove value */
- }
- for (n = gnode(h, 0); n < limit; n++) {
- if (!ttisnil(gval(n)) && iscleared(g, gval(n))) {
- setnilvalue(gval(n)); /* remove value ... */
- removeentry(n); /* and remove entry from table */
- }
- }
- }
-}
-
-
-static void freeobj (lua_State *L, GCObject *o) {
- switch (gch(o)->tt) {
- case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break;
- case LUA_TLCL: {
- luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues));
- break;
- }
- case LUA_TCCL: {
- luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues));
- break;
- }
- case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break;
- case LUA_TTABLE: luaH_free(L, gco2t(o)); break;
- case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break;
- case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break;
- case LUA_TSHRSTR:
- G(L)->strt.nuse--;
- /* FALLTHROUGH */
- case LUA_TLNGSTR: {
- luaM_freemem(L, o, sizestring(gco2ts(o)));
- break;
- }
- default: lua_assert(0);
- }
-}
-
-
-#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM)
-static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count);
-
-
-/*
-** sweep the (open) upvalues of a thread and resize its stack and
-** list of call-info structures.
-*/
-static void sweepthread (lua_State *L, lua_State *L1) {
- if (L1->stack == NULL) return; /* stack not completely built yet */
- sweepwholelist(L, &L1->openupval); /* sweep open upvalues */
- luaE_freeCI(L1); /* free extra CallInfo slots */
- /* should not change the stack during an emergency gc cycle */
- if (G(L)->gckind != KGC_EMERGENCY)
- luaD_shrinkstack(L1);
-}
-
-
-/*
-** sweep at most 'count' elements from a list of GCObjects erasing dead
-** objects, where a dead (not alive) object is one marked with the "old"
-** (non current) white and not fixed.
-** In non-generational mode, change all non-dead objects back to white,
-** preparing for next collection cycle.
-** In generational mode, keep black objects black, and also mark them as
-** old; stop when hitting an old object, as all objects after that
-** one will be old too.
-** When object is a thread, sweep its list of open upvalues too.
-*/
-static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) {
- global_State *g = G(L);
- int ow = otherwhite(g);
- int toclear, toset; /* bits to clear and to set in all live objects */
- int tostop; /* stop sweep when this is true */
- if (isgenerational(g)) { /* generational mode? */
- toclear = ~0; /* clear nothing */
- toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */
- tostop = bitmask(OLDBIT); /* do not sweep old generation */
- }
- else { /* normal mode */
- toclear = maskcolors; /* clear all color bits + old bit */
- toset = luaC_white(g); /* make object white */
- tostop = 0; /* do not stop */
- }
- while (*p != NULL && count-- > 0) {
- GCObject *curr = *p;
- int marked = gch(curr)->marked;
- if (isdeadm(ow, marked)) { /* is 'curr' dead? */
- *p = gch(curr)->next; /* remove 'curr' from list */
- freeobj(L, curr); /* erase 'curr' */
- }
- else {
- if (testbits(marked, tostop))
- return NULL; /* stop sweeping this list */
- if (gch(curr)->tt == LUA_TTHREAD)
- sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */
- /* update marks */
- gch(curr)->marked = cast_byte((marked & toclear) | toset);
- p = &gch(curr)->next; /* go to next element */
- }
- }
- return (*p == NULL) ? NULL : p;
-}
-
-
-/*
-** sweep a list until a live object (or end of list)
-*/
-static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) {
- GCObject ** old = p;
- int i = 0;
- do {
- i++;
- p = sweeplist(L, p, 1);
- } while (p == old);
- if (n) *n += i;
- return p;
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** Finalization
-** =======================================================
-*/
-
-static void checkSizes (lua_State *L) {
- global_State *g = G(L);
- if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */
- int hs = g->strt.size / 2; /* half the size of the string table */
- if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */
- luaS_resize(L, hs); /* halve its size */
- luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */
- }
-}
-
-
-static GCObject *udata2finalize (global_State *g) {
- GCObject *o = g->tobefnz; /* get first element */
- lua_assert(isfinalized(o));
- g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */
- gch(o)->next = g->allgc; /* return it to 'allgc' list */
- g->allgc = o;
- resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */
- lua_assert(!isold(o)); /* see MOVE OLD rule */
- if (!keepinvariantout(g)) /* not keeping invariant? */
- makewhite(g, o); /* "sweep" object */
- return o;
-}
-
-
-static void dothecall (lua_State *L, void *ud) {
- UNUSED(ud);
- luaD_call(L, L->top - 2, 0, 0);
-}
-
-
-static void GCTM (lua_State *L, int propagateerrors) {
- global_State *g = G(L);
- const TValue *tm;
- TValue v;
- setgcovalue(L, &v, udata2finalize(g));
- tm = luaT_gettmbyobj(L, &v, TM_GC);
- if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */
- int status;
- lu_byte oldah = L->allowhook;
- int running = g->gcrunning;
- L->allowhook = 0; /* stop debug hooks during GC metamethod */
- g->gcrunning = 0; /* avoid GC steps */
- setobj2s(L, L->top, tm); /* push finalizer... */
- setobj2s(L, L->top + 1, &v); /* ... and its argument */
- L->top += 2; /* and (next line) call the finalizer */
- status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0);
- L->allowhook = oldah; /* restore hooks */
- g->gcrunning = running; /* restore state */
- if (status != LUA_OK && propagateerrors) { /* error while running __gc? */
- if (status == LUA_ERRRUN) { /* is there an error object? */
- const char *msg = (ttisstring(L->top - 1))
- ? svalue(L->top - 1)
- : "no message";
- luaO_pushfstring(L, "error in __gc metamethod (%s)", msg);
- status = LUA_ERRGCMM; /* error in __gc metamethod */
- }
- luaD_throw(L, status); /* re-throw error */
- }
- }
-}
-
-
-/*
-** move all unreachable objects (or 'all' objects) that need
-** finalization from list 'finobj' to list 'tobefnz' (to be finalized)
-*/
-static void separatetobefnz (lua_State *L, int all) {
- global_State *g = G(L);
- GCObject **p = &g->finobj;
- GCObject *curr;
- GCObject **lastnext = &g->tobefnz;
- /* find last 'next' field in 'tobefnz' list (to add elements in its end) */
- while (*lastnext != NULL)
- lastnext = &gch(*lastnext)->next;
- while ((curr = *p) != NULL) { /* traverse all finalizable objects */
- lua_assert(!isfinalized(curr));
- lua_assert(testbit(gch(curr)->marked, SEPARATED));
- if (!(iswhite(curr) || all)) /* not being collected? */
- p = &gch(curr)->next; /* don't bother with it */
- else {
- l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */
- *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */
- gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */
- *lastnext = curr;
- lastnext = &gch(curr)->next;
- }
- }
-}
-
-
-/*
-** if object 'o' has a finalizer, remove it from 'allgc' list (must
-** search the list to find it) and link it in 'finobj' list.
-*/
-void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) {
- global_State *g = G(L);
- if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */
- isfinalized(o) || /* ... or is finalized... */
- gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? */
- return; /* nothing to be done */
- else { /* move 'o' to 'finobj' list */
- GCObject **p;
- GCheader *ho = gch(o);
- if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */
- lua_assert(issweepphase(g));
- g->sweepgc = sweeptolive(L, g->sweepgc, NULL);
- }
- /* search for pointer pointing to 'o' */
- for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ }
- *p = ho->next; /* remove 'o' from root list */
- ho->next = g->finobj; /* link it in list 'finobj' */
- g->finobj = o;
- l_setbit(ho->marked, SEPARATED); /* mark it as such */
- if (!keepinvariantout(g)) /* not keeping invariant? */
- makewhite(g, o); /* "sweep" object */
- else
- resetoldbit(o); /* see MOVE OLD rule */
- }
-}
-
-/* }====================================================== */
-
-
-/*
-** {======================================================
-** GC control
-** =======================================================
-*/
-
-
-/*
-** set a reasonable "time" to wait before starting a new GC cycle;
-** cycle will start when memory use hits threshold
-*/
-static void setpause (global_State *g, l_mem estimate) {
- l_mem debt, threshold;
- estimate = estimate / PAUSEADJ; /* adjust 'estimate' */
- threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */
- ? estimate * g->gcpause /* no overflow */
- : MAX_LMEM; /* overflow; truncate to maximum */
- debt = -cast(l_mem, threshold - gettotalbytes(g));
- luaE_setdebt(g, debt);
-}
-
-
-#define sweepphases \
- (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep))
-
-
-/*
-** enter first sweep phase (strings) and prepare pointers for other
-** sweep phases. The calls to 'sweeptolive' make pointers point to an
-** object inside the list (instead of to the header), so that the real
-** sweep do not need to skip objects created between "now" and the start
-** of the real sweep.
-** Returns how many objects it swept.
-*/
-static int entersweep (lua_State *L) {
- global_State *g = G(L);
- int n = 0;
- g->gcstate = GCSsweepstring;
- lua_assert(g->sweepgc == NULL && g->sweepfin == NULL);
- /* prepare to sweep strings, finalizable objects, and regular objects */
- g->sweepstrgc = 0;
- g->sweepfin = sweeptolive(L, &g->finobj, &n);
- g->sweepgc = sweeptolive(L, &g->allgc, &n);
- return n;
-}
-
-
-/*
-** change GC mode
-*/
-void luaC_changemode (lua_State *L, int mode) {
- global_State *g = G(L);
- if (mode == g->gckind) return; /* nothing to change */
- if (mode == KGC_GEN) { /* change to generational mode */
- /* make sure gray lists are consistent */
- luaC_runtilstate(L, bitmask(GCSpropagate));
- g->GCestimate = gettotalbytes(g);
- g->gckind = KGC_GEN;
- }
- else { /* change to incremental mode */
- /* sweep all objects to turn them back to white
- (as white has not changed, nothing extra will be collected) */
- g->gckind = KGC_NORMAL;
- entersweep(L);
- luaC_runtilstate(L, ~sweepphases);
- }
-}
-
-
-/*
-** call all pending finalizers
-*/
-static void callallpendingfinalizers (lua_State *L, int propagateerrors) {
- global_State *g = G(L);
- while (g->tobefnz) {
- resetoldbit(g->tobefnz);
- GCTM(L, propagateerrors);
- }
-}
-
-
-void luaC_freeallobjects (lua_State *L) {
- global_State *g = G(L);
- int i;
- separatetobefnz(L, 1); /* separate all objects with finalizers */
- lua_assert(g->finobj == NULL);
- callallpendingfinalizers(L, 0);
- g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */
- g->gckind = KGC_NORMAL;
- sweepwholelist(L, &g->finobj); /* finalizers can create objs. in 'finobj' */
- sweepwholelist(L, &g->allgc);
- for (i = 0; i < g->strt.size; i++) /* free all string lists */
- sweepwholelist(L, &g->strt.hash[i]);
- lua_assert(g->strt.nuse == 0);
-}
-
-
-static l_mem atomic (lua_State *L) {
- global_State *g = G(L);
- l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */
- GCObject *origweak, *origall;
- lua_assert(!iswhite(obj2gco(g->mainthread)));
- markobject(g, L); /* mark running thread */
- /* registry and global metatables may be changed by API */
- markvalue(g, &g->l_registry);
- markmt(g); /* mark basic metatables */
- /* remark occasional upvalues of (maybe) dead threads */
- remarkupvals(g);
- propagateall(g); /* propagate changes */
- work += g->GCmemtrav; /* stop counting (do not (re)count grays) */
- /* traverse objects caught by write barrier and by 'remarkupvals' */
- retraversegrays(g);
- work -= g->GCmemtrav; /* restart counting */
- convergeephemerons(g);
- /* at this point, all strongly accessible objects are marked. */
- /* clear values from weak tables, before checking finalizers */
- clearvalues(g, g->weak, NULL);
- clearvalues(g, g->allweak, NULL);
- origweak = g->weak; origall = g->allweak;
- work += g->GCmemtrav; /* stop counting (objects being finalized) */
- separatetobefnz(L, 0); /* separate objects to be finalized */
- markbeingfnz(g); /* mark objects that will be finalized */
- propagateall(g); /* remark, to propagate `preserveness' */
- work -= g->GCmemtrav; /* restart counting */
- convergeephemerons(g);
- /* at this point, all resurrected objects are marked. */
- /* remove dead objects from weak tables */
- clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */
- clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */
- /* clear values from resurrected weak tables */
- clearvalues(g, g->weak, origweak);
- clearvalues(g, g->allweak, origall);
- g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */
- work += g->GCmemtrav; /* complete counting */
- return work; /* estimate of memory marked by 'atomic' */
-}
-
-
-static lu_mem singlestep (lua_State *L) {
- global_State *g = G(L);
- switch (g->gcstate) {
- case GCSpause: {
- /* start to count memory traversed */
- g->GCmemtrav = g->strt.size * sizeof(GCObject*);
- lua_assert(!isgenerational(g));
- restartcollection(g);
- g->gcstate = GCSpropagate;
- return g->GCmemtrav;
- }
- case GCSpropagate: {
- if (g->gray) {
- lu_mem oldtrav = g->GCmemtrav;
- propagatemark(g);
- return g->GCmemtrav - oldtrav; /* memory traversed in this step */
- }
- else { /* no more `gray' objects */
- lu_mem work;
- int sw;
- g->gcstate = GCSatomic; /* finish mark phase */
- g->GCestimate = g->GCmemtrav; /* save what was counted */;
- work = atomic(L); /* add what was traversed by 'atomic' */
- g->GCestimate += work; /* estimate of total memory traversed */
- sw = entersweep(L);
- return work + sw * GCSWEEPCOST;
- }
- }
- case GCSsweepstring: {
- int i;
- for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++)
- sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]);
- g->sweepstrgc += i;
- if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? */
- g->gcstate = GCSsweepudata;
- return i * GCSWEEPCOST;
- }
- case GCSsweepudata: {
- if (g->sweepfin) {
- g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX);
- return GCSWEEPMAX*GCSWEEPCOST;
- }
- else {
- g->gcstate = GCSsweep;
- return 0;
- }
- }
- case GCSsweep: {
- if (g->sweepgc) {
- g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX);
- return GCSWEEPMAX*GCSWEEPCOST;
- }
- else {
- /* sweep main thread */
- GCObject *mt = obj2gco(g->mainthread);
- sweeplist(L, &mt, 1);
- checkSizes(L);
- g->gcstate = GCSpause; /* finish collection */
- return GCSWEEPCOST;
- }
- }
- default: lua_assert(0); return 0;
- }
-}
-
-
-/*
-** advances the garbage collector until it reaches a state allowed
-** by 'statemask'
-*/
-void luaC_runtilstate (lua_State *L, int statesmask) {
- global_State *g = G(L);
- while (!testbit(statesmask, g->gcstate))
- singlestep(L);
-}
-
-
-static void generationalcollection (lua_State *L) {
- global_State *g = G(L);
- lua_assert(g->gcstate == GCSpropagate);
- if (g->GCestimate == 0) { /* signal for another major collection? */
- luaC_fullgc(L, 0); /* perform a full regular collection */
- g->GCestimate = gettotalbytes(g); /* update control */
- }
- else {
- lu_mem estimate = g->GCestimate;
- luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */
- g->gcstate = GCSpropagate; /* skip restart */
- if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc)
- g->GCestimate = 0; /* signal for a major collection */
- else
- g->GCestimate = estimate; /* keep estimate from last major coll. */
-
- }
- setpause(g, gettotalbytes(g));
- lua_assert(g->gcstate == GCSpropagate);
-}
-
-
-static void incstep (lua_State *L) {
- global_State *g = G(L);
- l_mem debt = g->GCdebt;
- int stepmul = g->gcstepmul;
- if (stepmul < 40) stepmul = 40; /* avoid ridiculous low values (and 0) */
- /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */
- debt = (debt / STEPMULADJ) + 1;
- debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM;
- do { /* always perform at least one single step */
- lu_mem work = singlestep(L); /* do some work */
- debt -= work;
- } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause);
- if (g->gcstate == GCSpause)
- setpause(g, g->GCestimate); /* pause until next cycle */
- else {
- debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */
- luaE_setdebt(g, debt);
- }
-}
-
-
-/*
-** performs a basic GC step
-*/
-void luaC_forcestep (lua_State *L) {
- global_State *g = G(L);
- int i;
- if (isgenerational(g)) generationalcollection(L);
- else incstep(L);
- /* run a few finalizers (or all of them at the end of a collect cycle) */
- for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++)
- GCTM(L, 1); /* call one finalizer */
-}
-
-
-/*
-** performs a basic GC step only if collector is running
-*/
-void luaC_step (lua_State *L) {
- global_State *g = G(L);
- if (g->gcrunning) luaC_forcestep(L);
- else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */
-}
-
-
-
-/*
-** performs a full GC cycle; if "isemergency", does not call
-** finalizers (which could change stack positions)
-*/
-void luaC_fullgc (lua_State *L, int isemergency) {
- global_State *g = G(L);
- int origkind = g->gckind;
- lua_assert(origkind != KGC_EMERGENCY);
- if (isemergency) /* do not run finalizers during emergency GC */
- g->gckind = KGC_EMERGENCY;
- else {
- g->gckind = KGC_NORMAL;
- callallpendingfinalizers(L, 1);
- }
- if (keepinvariant(g)) { /* may there be some black objects? */
- /* must sweep all objects to turn them back to white
- (as white has not changed, nothing will be collected) */
- entersweep(L);
- }
- /* finish any pending sweep phase to start a new cycle */
- luaC_runtilstate(L, bitmask(GCSpause));
- luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */
- luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */
- if (origkind == KGC_GEN) { /* generational mode? */
- /* generational mode must be kept in propagate phase */
- luaC_runtilstate(L, bitmask(GCSpropagate));
- }
- g->gckind = origkind;
- setpause(g, gettotalbytes(g));
- if (!isemergency) /* do not run finalizers during emergency GC */
- callallpendingfinalizers(L, 1);
-}
-
-/* }====================================================== */
-
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h
@@ -1,78 +0,0 @@
-/*
-** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lexical Analyzer
-** See Copyright Notice in lua.h
-*/
-
-#ifndef llex_h
-#define llex_h
-
-#include "lobject.h"
-#include "lzio.h"
-
-
-#define FIRST_RESERVED 257
-
-
-
-/*
-* WARNING: if you change the order of this enumeration,
-* grep "ORDER RESERVED"
-*/
-enum RESERVED {
- /* terminal symbols denoted by reserved words */
- TK_AND = FIRST_RESERVED, TK_BREAK,
- TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
- TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
- TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
- /* other terminal symbols */
- TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS,
- TK_NUMBER, TK_NAME, TK_STRING
-};
-
-/* number of reserved words */
-#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1))
-
-
-typedef union {
- lua_Number r;
- TString *ts;
-} SemInfo; /* semantics information */
-
-
-typedef struct Token {
- int token;
- SemInfo seminfo;
-} Token;
-
-
-/* state of the lexer plus state of the parser when shared by all
- functions */
-typedef struct LexState {
- int current; /* current character (charint) */
- int linenumber; /* input line counter */
- int lastline; /* line of last token `consumed' */
- Token t; /* current token */
- Token lookahead; /* look ahead token */
- struct FuncState *fs; /* current function (parser) */
- struct lua_State *L;
- ZIO *z; /* input stream */
- Mbuffer *buff; /* buffer for tokens */
- struct Dyndata *dyd; /* dynamic structures used by the parser */
- TString *source; /* current source name */
- TString *envn; /* environment variable name */
- char decpoint; /* locale decimal point */
-} LexState;
-
-
-LUAI_FUNC void luaX_init (lua_State *L);
-LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z,
- TString *source, int firstchar);
-LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l);
-LUAI_FUNC void luaX_next (LexState *ls);
-LUAI_FUNC int luaX_lookahead (LexState *ls);
-LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s);
-LUAI_FUNC const char *luaX_token2str (LexState *ls, int token);
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c
@@ -1,529 +0,0 @@
-/*
-** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $
-** Lexical Analyzer
-** See Copyright Notice in lua.h
-*/
-
-#include <sys/zfs_context.h>
-
-#define llex_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lctype.h"
-#include "ldo.h"
-#include "llex.h"
-#include "lobject.h"
-#include "lparser.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "lzio.h"
-
-
-
-#define next(ls) (ls->current = zgetc(ls->z))
-
-
-
-#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r')
-
-
-/* ORDER RESERVED */
-static const char *const luaX_tokens [] = {
- "and", "break", "do", "else", "elseif",
- "end", "false", "for", "function", "goto", "if",
- "in", "local", "nil", "not", "or", "repeat",
- "return", "then", "true", "until", "while",
- "..", "...", "==", ">=", "<=", "~=", "::", "<eof>",
- "<number>", "<name>", "<string>"
-};
-
-
-#define save_and_next(ls) (save(ls, ls->current), next(ls))
-
-
-static l_noret lexerror (LexState *ls, const char *msg, int token);
-
-
-static void save (LexState *ls, int c) {
- Mbuffer *b = ls->buff;
- if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
- size_t newsize;
- if (luaZ_sizebuffer(b) >= MAX_SIZET/2)
- lexerror(ls, "lexical element too long", 0);
- newsize = luaZ_sizebuffer(b) * 2;
- luaZ_resizebuffer(ls->L, b, newsize);
- }
- b->buffer[luaZ_bufflen(b)++] = cast(char, c);
-}
-
-
-void luaX_init (lua_State *L) {
- int i;
- for (i=0; i<NUM_RESERVED; i++) {
- TString *ts = luaS_new(L, luaX_tokens[i]);
- luaS_fix(ts); /* reserved words are never collected */
- ts->tsv.extra = cast_byte(i+1); /* reserved word */
- }
-}
-
-
-const char *luaX_token2str (LexState *ls, int token) {
- if (token < FIRST_RESERVED) { /* single-byte symbols? */
- lua_assert(token == cast(unsigned char, token));
- return (lisprint(token)) ? luaO_pushfstring(ls->L, LUA_QL("%c"), token) :
- luaO_pushfstring(ls->L, "char(%d)", token);
- }
- else {
- const char *s = luaX_tokens[token - FIRST_RESERVED];
- if (token < TK_EOS) /* fixed format (symbols and reserved words)? */
- return luaO_pushfstring(ls->L, LUA_QS, s);
- else /* names, strings, and numerals */
- return s;
- }
-}
-
-
-static const char *txtToken (LexState *ls, int token) {
- switch (token) {
- case TK_NAME:
- case TK_STRING:
- case TK_NUMBER:
- save(ls, '\0');
- return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff));
- default:
- return luaX_token2str(ls, token);
- }
-}
-
-
-static l_noret lexerror (LexState *ls, const char *msg, int token) {
- char buff[LUA_IDSIZE];
- luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE);
- msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg);
- if (token)
- luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
- luaD_throw(ls->L, LUA_ERRSYNTAX);
-}
-
-
-l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
- lexerror(ls, msg, ls->t.token);
-}
-
-
-/*
-** creates a new string and anchors it in function's table so that
-** it will not be collected until the end of the function's compilation
-** (by that time it should be anchored in function's prototype)
-*/
-TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
- lua_State *L = ls->L;
- TValue *o; /* entry for `str' */
- TString *ts = luaS_newlstr(L, str, l); /* create new string */
- setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */
- o = luaH_set(L, ls->fs->h, L->top - 1);
- if (ttisnil(o)) { /* not in use yet? (see 'addK') */
- /* boolean value does not need GC barrier;
- table has no metatable, so it does not need to invalidate cache */
- setbvalue(o, 1); /* t[string] = true */
- luaC_checkGC(L);
- }
- else { /* string already present */
- ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */
- }
- L->top--; /* remove string from stack */
- return ts;
-}
-
-
-/*
-** increment line number and skips newline sequence (any of
-** \n, \r, \n\r, or \r\n)
-*/
-static void inclinenumber (LexState *ls) {
- int old = ls->current;
- lua_assert(currIsNewline(ls));
- next(ls); /* skip `\n' or `\r' */
- if (currIsNewline(ls) && ls->current != old)
- next(ls); /* skip `\n\r' or `\r\n' */
- if (++ls->linenumber >= MAX_INT)
- lexerror(ls, "chunk has too many lines", 0);
-}
-
-
-void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
- int firstchar) {
- ls->decpoint = '.';
- ls->L = L;
- ls->current = firstchar;
- ls->lookahead.token = TK_EOS; /* no look-ahead token */
- ls->z = z;
- ls->fs = NULL;
- ls->linenumber = 1;
- ls->lastline = 1;
- ls->source = source;
- ls->envn = luaS_new(L, LUA_ENV); /* create env name */
- luaS_fix(ls->envn); /* never collect this name */
- luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */
-}
-
-
-
-/*
-** =======================================================
-** LEXICAL ANALYZER
-** =======================================================
-*/
-
-
-
-static int check_next (LexState *ls, const char *set) {
- if (ls->current == '\0' || !strchr(set, ls->current))
- return 0;
- save_and_next(ls);
- return 1;
-}
-
-
-/*
-** change all characters 'from' in buffer to 'to'
-*/
-static void buffreplace (LexState *ls, char from, char to) {
- size_t n = luaZ_bufflen(ls->buff);
- char *p = luaZ_buffer(ls->buff);
- while (n--)
- if (p[n] == from) p[n] = to;
-}
-
-
-#if !defined(getlocaledecpoint)
-#define getlocaledecpoint() (localeconv()->decimal_point[0])
-#endif
-
-
-#define buff2d(b,e) luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e)
-
-/*
-** in case of format error, try to change decimal point separator to
-** the one defined in the current locale and check again
-*/
-static void trydecpoint (LexState *ls, SemInfo *seminfo) {
- char old = ls->decpoint;
- ls->decpoint = getlocaledecpoint();
- buffreplace(ls, old, ls->decpoint); /* try new decimal separator */
- if (!buff2d(ls->buff, &seminfo->r)) {
- /* format error with correct decimal point: no more options */
- buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */
- lexerror(ls, "malformed number", TK_NUMBER);
- }
-}
-
-
-/* LUA_NUMBER */
-/*
-** this function is quite liberal in what it accepts, as 'luaO_str2d'
-** will reject ill-formed numerals.
-*/
-static void read_numeral (LexState *ls, SemInfo *seminfo) {
- const char *expo = "Ee";
- int first = ls->current;
- lua_assert(lisdigit(ls->current));
- save_and_next(ls);
- if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */
- expo = "Pp";
- for (;;) {
- if (check_next(ls, expo)) /* exponent part? */
- check_next(ls, "+-"); /* optional exponent sign */
- if (lisxdigit(ls->current) || ls->current == '.')
- save_and_next(ls);
- else break;
- }
- save(ls, '\0');
- buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */
- if (!buff2d(ls->buff, &seminfo->r)) /* format error? */
- trydecpoint(ls, seminfo); /* try to update decimal point separator */
-}
-
-
-/*
-** skip a sequence '[=*[' or ']=*]' and return its number of '='s or
-** -1 if sequence is malformed
-*/
-static int skip_sep (LexState *ls) {
- int count = 0;
- int s = ls->current;
- lua_assert(s == '[' || s == ']');
- save_and_next(ls);
- while (ls->current == '=') {
- save_and_next(ls);
- count++;
- }
- return (ls->current == s) ? count : (-count) - 1;
-}
-
-
-static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
- save_and_next(ls); /* skip 2nd `[' */
- if (currIsNewline(ls)) /* string starts with a newline? */
- inclinenumber(ls); /* skip it */
- for (;;) {
- switch (ls->current) {
- case EOZ:
- lexerror(ls, (seminfo) ? "unfinished long string" :
- "unfinished long comment", TK_EOS);
- break; /* to avoid warnings */
- case ']': {
- if (skip_sep(ls) == sep) {
- save_and_next(ls); /* skip 2nd `]' */
- goto endloop;
- }
- break;
- }
- case '\n': case '\r': {
- save(ls, '\n');
- inclinenumber(ls);
- if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */
- break;
- }
- default: {
- if (seminfo) save_and_next(ls);
- else next(ls);
- }
- }
- } endloop:
- if (seminfo)
- seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
- luaZ_bufflen(ls->buff) - 2*(2 + sep));
-}
-
-
-static void escerror (LexState *ls, int *c, int n, const char *msg) {
- int i;
- luaZ_resetbuffer(ls->buff); /* prepare error message */
- save(ls, '\\');
- for (i = 0; i < n && c[i] != EOZ; i++)
- save(ls, c[i]);
- lexerror(ls, msg, TK_STRING);
-}
-
-
-static int readhexaesc (LexState *ls) {
- int c[3], i; /* keep input for error message */
- int r = 0; /* result accumulator */
- c[0] = 'x'; /* for error message */
- for (i = 1; i < 3; i++) { /* read two hexadecimal digits */
- c[i] = next(ls);
- if (!lisxdigit(c[i]))
- escerror(ls, c, i + 1, "hexadecimal digit expected");
- r = (r << 4) + luaO_hexavalue(c[i]);
- }
- return r;
-}
-
-
-static int readdecesc (LexState *ls) {
- int c[3], i;
- int r = 0; /* result accumulator */
- for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */
- c[i] = ls->current;
- r = 10*r + c[i] - '0';
- next(ls);
- }
- if (r > UCHAR_MAX)
- escerror(ls, c, i, "decimal escape too large");
- return r;
-}
-
-
-static void read_string (LexState *ls, int del, SemInfo *seminfo) {
- save_and_next(ls); /* keep delimiter (for error messages) */
- while (ls->current != del) {
- switch (ls->current) {
- case EOZ:
- lexerror(ls, "unfinished string", TK_EOS);
- break; /* to avoid warnings */
- case '\n':
- case '\r':
- lexerror(ls, "unfinished string", TK_STRING);
- break; /* to avoid warnings */
- case '\\': { /* escape sequences */
- int c; /* final character to be saved */
- next(ls); /* do not save the `\' */
- switch (ls->current) {
- case 'a': c = '\a'; goto read_save;
- case 'b': c = '\b'; goto read_save;
- case 'f': c = '\f'; goto read_save;
- case 'n': c = '\n'; goto read_save;
- case 'r': c = '\r'; goto read_save;
- case 't': c = '\t'; goto read_save;
- case 'v': c = '\v'; goto read_save;
- case 'x': c = readhexaesc(ls); goto read_save;
- case '\n': case '\r':
- inclinenumber(ls); c = '\n'; goto only_save;
- case '\\': case '\"': case '\'':
- c = ls->current; goto read_save;
- case EOZ: goto no_save; /* will raise an error next loop */
- case 'z': { /* zap following span of spaces */
- next(ls); /* skip the 'z' */
- while (lisspace(ls->current)) {
- if (currIsNewline(ls)) inclinenumber(ls);
- else next(ls);
- }
- goto no_save;
- }
- default: {
- if (!lisdigit(ls->current))
- escerror(ls, &ls->current, 1, "invalid escape sequence");
- /* digital escape \ddd */
- c = readdecesc(ls);
- goto only_save;
- }
- }
- read_save: next(ls); /* read next character */
- only_save: save(ls, c); /* save 'c' */
- no_save: break;
- }
- default:
- save_and_next(ls);
- }
- }
- save_and_next(ls); /* skip delimiter */
- seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
- luaZ_bufflen(ls->buff) - 2);
-}
-
-
-static int llex (LexState *ls, SemInfo *seminfo) {
- luaZ_resetbuffer(ls->buff);
- for (;;) {
- switch (ls->current) {
- case '\n': case '\r': { /* line breaks */
- inclinenumber(ls);
- break;
- }
- case ' ': case '\f': case '\t': case '\v': { /* spaces */
- next(ls);
- break;
- }
- case '-': { /* '-' or '--' (comment) */
- next(ls);
- if (ls->current != '-') return '-';
- /* else is a comment */
- next(ls);
- if (ls->current == '[') { /* long comment? */
- int sep = skip_sep(ls);
- luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */
- if (sep >= 0) {
- read_long_string(ls, NULL, sep); /* skip long comment */
- luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */
- break;
- }
- }
- /* else short comment */
- while (!currIsNewline(ls) && ls->current != EOZ)
- next(ls); /* skip until end of line (or end of file) */
- break;
- }
- case '[': { /* long string or simply '[' */
- int sep = skip_sep(ls);
- if (sep >= 0) {
- read_long_string(ls, seminfo, sep);
- return TK_STRING;
- }
- else if (sep == -1) return '[';
- else lexerror(ls, "invalid long string delimiter", TK_STRING);
- }
- case '=': {
- next(ls);
- if (ls->current != '=') return '=';
- else { next(ls); return TK_EQ; }
- }
- case '<': {
- next(ls);
- if (ls->current != '=') return '<';
- else { next(ls); return TK_LE; }
- }
- case '>': {
- next(ls);
- if (ls->current != '=') return '>';
- else { next(ls); return TK_GE; }
- }
- case '~': {
- next(ls);
- if (ls->current != '=') return '~';
- else { next(ls); return TK_NE; }
- }
- case ':': {
- next(ls);
- if (ls->current != ':') return ':';
- else { next(ls); return TK_DBCOLON; }
- }
- case '"': case '\'': { /* short literal strings */
- read_string(ls, ls->current, seminfo);
- return TK_STRING;
- }
- case '.': { /* '.', '..', '...', or number */
- save_and_next(ls);
- if (check_next(ls, ".")) {
- if (check_next(ls, "."))
- return TK_DOTS; /* '...' */
- else return TK_CONCAT; /* '..' */
- }
- else if (!lisdigit(ls->current)) return '.';
- /* else go through */
- }
- /* FALLTHROUGH */
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9': {
- read_numeral(ls, seminfo);
- return TK_NUMBER;
- }
- case EOZ: {
- return TK_EOS;
- }
- default: {
- if (lislalpha(ls->current)) { /* identifier or reserved word? */
- TString *ts;
- do {
- save_and_next(ls);
- } while (lislalnum(ls->current));
- ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
- luaZ_bufflen(ls->buff));
- seminfo->ts = ts;
- if (isreserved(ts)) /* reserved word? */
- return ts->tsv.extra - 1 + FIRST_RESERVED;
- else {
- return TK_NAME;
- }
- }
- else { /* single-char tokens (+ - / ...) */
- int c = ls->current;
- next(ls);
- return c;
- }
- }
- }
- }
-}
-
-
-void luaX_next (LexState *ls) {
- ls->lastline = ls->linenumber;
- if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */
- ls->t = ls->lookahead; /* use this one */
- ls->lookahead.token = TK_EOS; /* and discharge it */
- }
- else
- ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
-}
-
-
-int luaX_lookahead (LexState *ls) {
- lua_assert(ls->lookahead.token == TK_EOS);
- ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
- return ls->lookahead.token;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h
@@ -1,308 +0,0 @@
-/*
-** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $
-** Limits, basic types, and some other `installation-dependent' definitions
-** See Copyright Notice in lua.h
-*/
-
-#ifndef llimits_h
-#define llimits_h
-
-
-#include <sys/zfs_context.h>
-
-#include "lua.h"
-
-
-typedef unsigned LUA_INT32 lu_int32;
-
-typedef LUAI_UMEM lu_mem;
-
-typedef LUAI_MEM l_mem;
-
-
-
-/* chars used as small naturals (so that `char' is reserved for characters) */
-typedef unsigned char lu_byte;
-
-
-#define MAX_SIZET ((size_t)(~(size_t)0)-2)
-
-#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2)
-
-#define MAX_LMEM ((l_mem) ((MAX_LUMEM >> 1) - 2))
-
-
-#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */
-
-/*
-** conversion of pointer to integer
-** this is for hashing only; there is no problem if the integer
-** cannot hold the whole pointer value
-*/
-#define IntPoint(p) ((unsigned int)(lu_mem)(p))
-
-
-
-/* type to ensure maximum alignment */
-#if !defined(LUAI_USER_ALIGNMENT_T)
-#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; }
-#endif
-
-typedef LUAI_USER_ALIGNMENT_T L_Umaxalign;
-
-
-/* result of a `usual argument conversion' over lua_Number */
-typedef LUAI_UACNUMBER l_uacNumber;
-
-
-/* internal assertions for in-house debugging */
-#if defined(lua_assert)
-#define check_exp(c,e) (lua_assert(c), (e))
-/* to avoid problems with conditions too long */
-#define lua_longassert(c) { if (!(c)) lua_assert(0); }
-#else
-#define lua_assert(c) ((void)0)
-#define check_exp(c,e) (e)
-#define lua_longassert(c) ((void)0)
-#endif
-
-/*
-** assertion for checking API calls
-*/
-#if !defined(luai_apicheck)
-
-#if defined(LUA_USE_APICHECK)
-#include <assert.h>
-#define luai_apicheck(L,e) assert(e)
-#else
-#define luai_apicheck(L,e) lua_assert(e)
-#endif
-
-#endif
-
-#define api_check(l,e,msg) luai_apicheck(l,(e) && msg)
-
-
-#if !defined(UNUSED)
-#define UNUSED(x) ((void)(x)) /* to avoid warnings */
-#endif
-
-
-#define cast(t, exp) ((t)(exp))
-
-#define cast_byte(i) cast(lu_byte, (i))
-#define cast_num(i) cast(lua_Number, (i))
-#define cast_int(i) cast(int, (i))
-#define cast_uchar(i) cast(unsigned char, (i))
-
-
-/*
-** non-return type
-*/
-#if defined(__GNUC__)
-#define l_noret void __attribute__((noreturn))
-#elif defined(_MSC_VER)
-#define l_noret void __declspec(noreturn)
-#else
-#define l_noret void
-#endif
-
-
-
-/*
-** maximum depth for nested C calls and syntactical nested non-terminals
-** in a program. (Value must fit in an unsigned short int.)
-**
-** Note: On amd64 platform, the limit has been measured to be 45. We set
-** the maximum lower to give a margin for changing the amount of stack
-** used by various functions involved in parsing and executing code.
-*/
-#if !defined(LUAI_MAXCCALLS)
-#define LUAI_MAXCCALLS 20
-#endif
-
-/*
-** maximum number of upvalues in a closure (both C and Lua). (Value
-** must fit in an unsigned char.)
-*/
-#define MAXUPVAL UCHAR_MAX
-
-
-/*
-** type for virtual-machine instructions
-** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h)
-*/
-typedef lu_int32 Instruction;
-
-
-
-/* maximum stack for a Lua function */
-#define MAXSTACK 250
-
-
-
-/* minimum size for the string table (must be power of 2) */
-#if !defined(MINSTRTABSIZE)
-#define MINSTRTABSIZE 32
-#endif
-
-
-/* minimum size for string buffer */
-#if !defined(LUA_MINBUFFER)
-#define LUA_MINBUFFER 32
-#endif
-
-
-#if !defined(lua_lock)
-#define lua_lock(L) ((void) 0)
-#define lua_unlock(L) ((void) 0)
-#endif
-
-#if !defined(luai_threadyield)
-#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);}
-#endif
-
-
-/*
-** these macros allow user-specific actions on threads when you defined
-** LUAI_EXTRASPACE and need to do something extra when a thread is
-** created/deleted/resumed/yielded.
-*/
-#if !defined(luai_userstateopen)
-#define luai_userstateopen(L) ((void)L)
-#endif
-
-#if !defined(luai_userstateclose)
-#define luai_userstateclose(L) ((void)L)
-#endif
-
-#if !defined(luai_userstatethread)
-#define luai_userstatethread(L,L1) ((void)L)
-#endif
-
-#if !defined(luai_userstatefree)
-#define luai_userstatefree(L,L1) ((void)L)
-#endif
-
-#if !defined(luai_userstateresume)
-#define luai_userstateresume(L,n) ((void)L)
-#endif
-
-#if !defined(luai_userstateyield)
-#define luai_userstateyield(L,n) ((void)L)
-#endif
-
-/*
-** lua_number2int is a macro to convert lua_Number to int.
-** lua_number2integer is a macro to convert lua_Number to lua_Integer.
-** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned.
-** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number.
-** luai_hashnum is a macro to hash a lua_Number value into an integer.
-** The hash must be deterministic and give reasonable values for
-** both small and large values (outside the range of integers).
-*/
-
-#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */
-/* trick with Microsoft assembler for X86 */
-
-#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i}
-#define lua_number2integer(i,n) lua_number2int(i, n)
-#define lua_number2unsigned(i,n) \
- {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;}
-
-
-#elif defined(LUA_IEEE754TRICK) /* }{ */
-/* the next trick should work on any machine using IEEE754 with
- a 32-bit int type */
-
-union luai_Cast { double l_d; LUA_INT32 l_p[2]; };
-
-#if !defined(LUA_IEEEENDIAN) /* { */
-#define LUAI_EXTRAIEEE \
- static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)};
-#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33)
-#else
-#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN
-#define LUAI_EXTRAIEEE /* empty */
-#endif /* } */
-
-#define lua_number2int32(i,n,t) \
- { LUAI_EXTRAIEEE \
- volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \
- (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; }
-
-#define luai_hashnum(i,n) \
- { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \
- (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add double bits for his hash */
-
-#define lua_number2int(i,n) lua_number2int32(i, n, int)
-#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned)
-
-/* the trick can be expanded to lua_Integer when it is a 32-bit value */
-#if defined(LUA_IEEELL)
-#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer)
-#endif
-
-#endif /* } */
-
-
-/* the following definitions always work, but may be slow */
-
-#if !defined(lua_number2int)
-#define lua_number2int(i,n) ((i)=(int)(n))
-#endif
-
-#if !defined(lua_number2integer)
-#define lua_number2integer(i,n) ((i)=(lua_Integer)(n))
-#endif
-
-#if !defined(lua_number2unsigned) /* { */
-/* the following definition assures proper modulo behavior */
-#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT)
-#include <math.h>
-#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1)
-#define lua_number2unsigned(i,n) \
- ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED))
-#else
-#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n))
-#endif
-#endif /* } */
-
-
-#if !defined(lua_unsigned2number)
-/* on several machines, coercion from unsigned to double is slow,
- so it may be worth to avoid */
-#define lua_unsigned2number(u) \
- (((u) <= (lua_Unsigned)INT_MAX) ? (lua_Number)(int)(u) : (lua_Number)(u))
-#endif
-
-
-
-#if defined(ltable_c) && !defined(luai_hashnum)
-
-extern int lcompat_hashnum(int64_t);
-
-#define luai_hashnum(i,n) (i = lcompat_hashnum(n))
-
-#endif
-
-
-
-/*
-** macro to control inclusion of some hard tests on stack reallocation
-*/
-#if !defined(HARDSTACKTESTS)
-#define condmovestack(L) ((void)0)
-#else
-/* realloc stack keeping its size */
-#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize)
-#endif
-
-#if !defined(HARDMEMTESTS)
-#define condchangemem(L) condmovestack(L)
-#else
-#define condchangemem(L) \
- ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1)))
-#endif
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h
@@ -1,57 +0,0 @@
-/*
-** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $
-** Interface to Memory Manager
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lmem_h
-#define lmem_h
-
-
-#include <sys/zfs_context.h>
-
-#include "llimits.h"
-#include "lua.h"
-
-
-/*
-** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is
-** always constant.
-** The macro is somewhat complex to avoid warnings:
-** +1 avoids warnings of "comparison has constant result";
-** cast to 'void' avoids warnings of "value unused".
-*/
-#define luaM_reallocv(L,b,on,n,e) \
- (cast(void, \
- (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \
- luaM_realloc_(L, (b), (on)*(e), (n)*(e)))
-
-#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0)
-#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0)
-#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0]))
-
-#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s))
-#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t)))
-#define luaM_newvector(L,n,t) \
- cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t)))
-
-#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s))
-
-#define luaM_growvector(L,v,nelems,size,t,limit,e) \
- if ((nelems)+1 > (size)) \
- ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e)))
-
-#define luaM_reallocvector(L, v,oldn,n,t) \
- ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t))))
-
-LUAI_FUNC l_noret luaM_toobig (lua_State *L);
-
-/* not to be called directly */
-LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize,
- size_t size);
-LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size,
- size_t size_elem, int limit,
- const char *what);
-
-#endif
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c
@@ -1,99 +0,0 @@
-/*
-** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $
-** Interface to Memory Manager
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define lmem_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "ldebug.h"
-#include "ldo.h"
-#include "lgc.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lstate.h"
-
-
-
-/*
-** About the realloc function:
-** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize);
-** (`osize' is the old size, `nsize' is the new size)
-**
-** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no
-** matter 'x').
-**
-** * frealloc(ud, p, x, 0) frees the block `p'
-** (in this specific case, frealloc must return NULL);
-** particularly, frealloc(ud, NULL, 0, 0) does nothing
-** (which is equivalent to free(NULL) in ANSI C)
-**
-** frealloc returns NULL if it cannot create or reallocate the area
-** (any reallocation to an equal or smaller size cannot fail!)
-*/
-
-
-
-#define MINSIZEARRAY 4
-
-
-void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems,
- int limit, const char *what) {
- void *newblock;
- int newsize;
- if (*size >= limit/2) { /* cannot double it? */
- if (*size >= limit) /* cannot grow even a little? */
- luaG_runerror(L, "too many %s (limit is %d)", what, limit);
- newsize = limit; /* still have at least one free place */
- }
- else {
- newsize = (*size)*2;
- if (newsize < MINSIZEARRAY)
- newsize = MINSIZEARRAY; /* minimum size */
- }
- newblock = luaM_reallocv(L, block, *size, newsize, size_elems);
- *size = newsize; /* update only when everything else is OK */
- return newblock;
-}
-
-
-l_noret luaM_toobig (lua_State *L) {
- luaG_runerror(L, "memory allocation error: block too big");
-}
-
-
-
-/*
-** generic allocation routine.
-*/
-void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) {
- void *newblock;
- global_State *g = G(L);
- size_t realosize = (block) ? osize : 0;
- lua_assert((realosize == 0) == (block == NULL));
-#if defined(HARDMEMTESTS)
- if (nsize > realosize && g->gcrunning)
- luaC_fullgc(L, 1); /* force a GC whenever possible */
-#endif
- newblock = (*g->frealloc)(g->ud, block, osize, nsize);
- if (newblock == NULL && nsize > 0) {
- api_check(L, nsize > realosize,
- "realloc cannot fail when shrinking a block");
- if (g->gcrunning) {
- luaC_fullgc(L, 1); /* try to free some memory... */
- newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */
- }
- if (newblock == NULL)
- luaD_throw(L, LUA_ERRMEM);
- }
- lua_assert((nsize == 0) == (newblock == NULL));
- g->GCdebt = (g->GCdebt + nsize) - realosize;
- return newblock;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h
@@ -1,606 +0,0 @@
-/*
-** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $
-** Type definitions for Lua objects
-** See Copyright Notice in lua.h
-*/
-
-
-#ifndef lobject_h
-#define lobject_h
-
-
-#include <sys/zfs_context.h>
-
-#include "llimits.h"
-#include "lua.h"
-
-
-/*
-** Extra tags for non-values
-*/
-#define LUA_TPROTO LUA_NUMTAGS
-#define LUA_TUPVAL (LUA_NUMTAGS+1)
-#define LUA_TDEADKEY (LUA_NUMTAGS+2)
-
-/*
-** number of all possible tags (including LUA_TNONE but excluding DEADKEY)
-*/
-#define LUA_TOTALTAGS (LUA_TUPVAL+2)
-
-
-/*
-** tags for Tagged Values have the following use of bits:
-** bits 0-3: actual tag (a LUA_T* value)
-** bits 4-5: variant bits
-** bit 6: whether value is collectable
-*/
-
-#define VARBITS (3 << 4)
-
-
-/*
-** LUA_TFUNCTION variants:
-** 0 - Lua function
-** 1 - light C function
-** 2 - regular C function (closure)
-*/
-
-/* Variant tags for functions */
-#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */
-#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */
-#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */
-
-
-/* Variant tags for strings */
-#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */
-#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */
-
-
-/* Bit mark for collectable types */
-#define BIT_ISCOLLECTABLE (1 << 6)
-
-/* mark a tag as collectable */
-#define ctb(t) ((t) | BIT_ISCOLLECTABLE)
-
-
-/*
-** Union of all collectable objects
-*/
-typedef union GCObject GCObject;
-
-
-/*
-** Common Header for all collectable objects (in macro form, to be
-** included in other objects)
-*/
-#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked
-
-
-/*
-** Common header in struct form
-*/
-typedef struct GCheader {
- CommonHeader;
-} GCheader;
-
-
-
-/*
-** Union of all Lua values
-*/
-typedef union Value Value;
-
-
-#define numfield lua_Number n; /* numbers */
-
-
-
-/*
-** Tagged Values. This is the basic representation of values in Lua,
-** an actual value plus a tag with its type.
-*/
-
-#define TValuefields Value value_; int tt_
-
-typedef struct lua_TValue TValue;
-
-
-/* macro defining a nil value */
-#define NILCONSTANT {NULL}, LUA_TNIL
-
-
-#define val_(o) ((o)->value_)
-#define num_(o) (val_(o).n)
-
-
-/* raw type tag of a TValue */
-#define rttype(o) ((o)->tt_)
-
-/* tag with no variants (bits 0-3) */
-#define novariant(x) ((x) & 0x0F)
-
-/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */
-#define ttype(o) (rttype(o) & 0x3F)
-
-/* type tag of a TValue with no variants (bits 0-3) */
-#define ttypenv(o) (novariant(rttype(o)))
-
-
-/* Macros to test type */
-#define checktag(o,t) (rttype(o) == (t))
-#define checktype(o,t) (ttypenv(o) == (t))
-#define ttisnumber(o) checktag((o), LUA_TNUMBER)
-#define ttisnil(o) checktag((o), LUA_TNIL)
-#define ttisboolean(o) checktag((o), LUA_TBOOLEAN)
-#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA)
-#define ttisstring(o) checktype((o), LUA_TSTRING)
-#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR))
-#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR))
-#define ttistable(o) checktag((o), ctb(LUA_TTABLE))
-#define ttisfunction(o) checktype(o, LUA_TFUNCTION)
-#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION)
-#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL))
-#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL))
-#define ttislcf(o) checktag((o), LUA_TLCF)
-#define ttisuserdata(o) checktag((o), ctb(LUA_TUSERDATA))
-#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD))
-#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY)
-
-#define ttisequal(o1,o2) (rttype(o1) == rttype(o2))
-
-/* Macros to access values */
-#define nvalue(o) check_exp(ttisnumber(o), num_(o))
-#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc)
-#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p)
-#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts)
-#define tsvalue(o) (&rawtsvalue(o)->tsv)
-#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u)
-#define uvalue(o) (&rawuvalue(o)->uv)
-#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl)
-#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l)
-#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c)
-#define fvalue(o) check_exp(ttislcf(o), val_(o).f)
-#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h)
-#define bvalue(o) check_exp(ttisboolean(o), val_(o).b)
-#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th)
-/* a dead value may get the 'gc' field, but cannot access its contents */
-#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc))
-
-#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0))
-
-
-#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE)
-
-
-/* Macros for internal tests */
-#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt)
-
-#define checkliveness(g,obj) \
- lua_longassert(!iscollectable(obj) || \
- (righttt(obj) && !isdead(g,gcvalue(obj))))
-
-
-/* Macros to set values */
-#define settt_(o,t) ((o)->tt_=(t))
-
-#define setnvalue(obj,x) \
- { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); }
-
-#define setnilvalue(obj) settt_(obj, LUA_TNIL)
-
-#define setfvalue(obj,x) \
- { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); }
-
-#define setpvalue(obj,x) \
- { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); }
-
-#define setbvalue(obj,x) \
- { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); }
-
-#define setgcovalue(L,obj,x) \
- { TValue *io=(obj); GCObject *i_g=(x); \
- val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); }
-
-#define setsvalue(L,obj,x) \
- { TValue *io=(obj); \
- TString *x_ = (x); \
- val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \
- checkliveness(G(L),io); }
-
-#define setuvalue(L,obj,x) \
- { TValue *io=(obj); \
- val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \
- checkliveness(G(L),io); }
-
-#define setthvalue(L,obj,x) \
- { TValue *io=(obj); \
- val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \
- checkliveness(G(L),io); }
-
-#define setclLvalue(L,obj,x) \
- { TValue *io=(obj); \
- val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \
- checkliveness(G(L),io); }
-
-#define setclCvalue(L,obj,x) \
- { TValue *io=(obj); \
- val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \
- checkliveness(G(L),io); }
-
-#define sethvalue(L,obj,x) \
- { TValue *io=(obj); \
- val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \
- checkliveness(G(L),io); }
-
-#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY)
-
-
-
-#define setobj(L,obj1,obj2) \
- { const TValue *io2=(obj2); TValue *io1=(obj1); \
- io1->value_ = io2->value_; io1->tt_ = io2->tt_; \
- checkliveness(G(L),io1); }
-
-
-/*
-** different types of assignments, according to destination
-*/
-
-/* from stack to (same) stack */
-#define setobjs2s setobj
-/* to stack (not from same stack) */
-#define setobj2s setobj
-#define setsvalue2s setsvalue
-#define sethvalue2s sethvalue
-#define setptvalue2s setptvalue
-/* from table to same table */
-#define setobjt2t setobj
-/* to table */
-#define setobj2t setobj
-/* to new object */
-#define setobj2n setobj
-#define setsvalue2n setsvalue
-
-
-/* check whether a number is valid (useful only for NaN trick) */
-#define luai_checknum(L,o,c) { /* empty */ }
-
-
-/*
-** {======================================================
-** NaN Trick
-** =======================================================
-*/
-#if defined(LUA_NANTRICK)
-
-/*
-** numbers are represented in the 'd_' field. All other values have the
-** value (NNMARK | tag) in 'tt__'. A number with such pattern would be
-** a "signaled NaN", which is never generated by regular operations by
-** the CPU (nor by 'strtod')
-*/
-
-/* allows for external implementation for part of the trick */
-#if !defined(NNMARK) /* { */
-
-
-#if !defined(LUA_IEEEENDIAN)
-#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN'
-#endif
-
-
-#define NNMARK 0x7FF7A500
-#define NNMASK 0x7FFFFF00
-
-#undef TValuefields
-#undef NILCONSTANT
-
-#if (LUA_IEEEENDIAN == 0) /* { */
-
-/* little endian */
-#define TValuefields \
- union { struct { Value v__; int tt__; } i; double d__; } u
-#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}}
-/* field-access macros */
-#define v_(o) ((o)->u.i.v__)
-#define d_(o) ((o)->u.d__)
-#define tt_(o) ((o)->u.i.tt__)
-
-#else /* }{ */
-
-/* big endian */
-#define TValuefields \
- union { struct { int tt__; Value v__; } i; double d__; } u
-#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}}
-/* field-access macros */
-#define v_(o) ((o)->u.i.v__)
-#define d_(o) ((o)->u.d__)
-#define tt_(o) ((o)->u.i.tt__)
-
-#endif /* } */
-
-#endif /* } */
-
-
-/* correspondence with standard representation */
-#undef val_
-#define val_(o) v_(o)
-#undef num_
-#define num_(o) d_(o)
-
-
-#undef numfield
-#define numfield /* no such field; numbers are the entire struct */
-
-/* basic check to distinguish numbers from non-numbers */
-#undef ttisnumber
-#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK)
-
-#define tag2tt(t) (NNMARK | (t))
-
-#undef rttype
-#define rttype(o) (ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff)
-
-#undef settt_
-#define settt_(o,t) (tt_(o) = tag2tt(t))
-
-#undef setnvalue
-#define setnvalue(obj,x) \
- { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); }
-
-#undef setobj
-#define setobj(L,obj1,obj2) \
- { const TValue *o2_=(obj2); TValue *o1_=(obj1); \
- o1_->u = o2_->u; \
- checkliveness(G(L),o1_); }
-
-
-/*
-** these redefinitions are not mandatory, but these forms are more efficient
-*/
-
-#undef checktag
-#undef checktype
-#define checktag(o,t) (tt_(o) == tag2tt(t))
-#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS))
-
-#undef ttisequal
-#define ttisequal(o1,o2) \
- (ttisnumber(o1) ? ttisnumber(o2) : (tt_(o1) == tt_(o2)))
-
-
-#undef luai_checknum
-#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; }
-
-#endif
-/* }====================================================== */
-
-
-
-/*
-** {======================================================
-** types and prototypes
-** =======================================================
-*/
-
-
-union Value {
- GCObject *gc; /* collectable objects */
- void *p; /* light userdata */
- int b; /* booleans */
- lua_CFunction f; /* light C functions */
- numfield /* numbers */
-};
-
-
-struct lua_TValue {
- TValuefields;
-};
-
-
-typedef TValue *StkId; /* index to stack elements */
-
-
-
-
-/*
-** Header for string value; string bytes follow the end of this structure
-*/
-typedef union TString {
- L_Umaxalign dummy; /* ensures maximum alignment for strings */
- struct {
- CommonHeader;
- lu_byte extra; /* reserved words for short strings; "has hash" for longs */
- unsigned int hash;
- size_t len; /* number of characters in string */
- } tsv;
-} TString;
-
-
-/* get the actual string (array of bytes) from a TString */
-#define getstr(ts) cast(const char *, (ts) + 1)
-
-/* get the actual string (array of bytes) from a Lua value */
-#define svalue(o) getstr(rawtsvalue(o))
-
-
-/*
-** Header for userdata; memory area follows the end of this structure
-*/
-typedef union Udata {
- L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */
- struct {
- CommonHeader;
- struct Table *metatable;
- struct Table *env;
- size_t len; /* number of bytes */
- } uv;
-} Udata;
-
-
-
-/*
-** Description of an upvalue for function prototypes
-*/
-typedef struct Upvaldesc {
- TString *name; /* upvalue name (for debug information) */
- lu_byte instack; /* whether it is in stack */
- lu_byte idx; /* index of upvalue (in stack or in outer function's list) */
-} Upvaldesc;
-
-
-/*
-** Description of a local variable for function prototypes
-** (used for debug information)
-*/
-typedef struct LocVar {
- TString *varname;
- int startpc; /* first point where variable is active */
- int endpc; /* first point where variable is dead */
-} LocVar;
-
-
-/*
-** Function Prototypes
-*/
-typedef struct Proto {
- CommonHeader;
- TValue *k; /* constants used by the function */
- Instruction *code;
- struct Proto **p; /* functions defined inside the function */
- int *lineinfo; /* map from opcodes to source lines (debug information) */
- LocVar *locvars; /* information about local variables (debug information) */
- Upvaldesc *upvalues; /* upvalue information */
- union Closure *cache; /* last created closure with this prototype */
- TString *source; /* used for debug information */
- int sizeupvalues; /* size of 'upvalues' */
- int sizek; /* size of `k' */
- int sizecode;
- int sizelineinfo;
- int sizep; /* size of `p' */
- int sizelocvars;
- int linedefined;
- int lastlinedefined;
- GCObject *gclist;
- lu_byte numparams; /* number of fixed parameters */
- lu_byte is_vararg;
- lu_byte maxstacksize; /* maximum stack used by this function */
-} Proto;
-
-
-
-/*
-** Lua Upvalues
-*/
-typedef struct UpVal {
- CommonHeader;
- TValue *v; /* points to stack or to its own value */
- union {
- TValue value; /* the value (when closed) */
- struct { /* double linked list (when open) */
- struct UpVal *prev;
- struct UpVal *next;
- } l;
- } u;
-} UpVal;
-
-
-/*
-** Closures
-*/
-
-#define ClosureHeader \
- CommonHeader; lu_byte nupvalues; GCObject *gclist
-
-typedef struct CClosure {
- ClosureHeader;
- lua_CFunction f;
- TValue upvalue[1]; /* list of upvalues */
-} CClosure;
-
-
-typedef struct LClosure {
- ClosureHeader;
- struct Proto *p;
- UpVal *upvals[1]; /* list of upvalues */
-} LClosure;
-
-
-typedef union Closure {
- CClosure c;
- LClosure l;
-} Closure;
-
-
-#define isLfunction(o) ttisLclosure(o)
-
-#define getproto(o) (clLvalue(o)->p)
-
-
-/*
-** Tables
-*/
-
-typedef union TKey {
- struct {
- TValuefields;
- struct Node *next; /* for chaining */
- } nk;
- TValue tvk;
-} TKey;
-
-
-typedef struct Node {
- TValue i_val;
- TKey i_key;
-} Node;
-
-
-typedef struct Table {
- CommonHeader;
- lu_byte flags; /* 1<<p means tagmethod(p) is not present */
- lu_byte lsizenode; /* log2 of size of `node' array */
- int sizearray; /* size of `array' array */
- TValue *array; /* array part */
- Node *node;
- Node *lastfree; /* any free position is before this position */
- struct Table *metatable;
- GCObject *gclist;
-} Table;
-
-
-
-/*
-** `module' operation for hashing (size is always a power of 2)
-*/
-#define lmod(s,size) \
- (check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1)))))
-
-
-#define twoto(x) (1<<(x))
-#define sizenode(t) (twoto((t)->lsizenode))
-
-
-/*
-** (address of) a fixed nil value
-*/
-#define luaO_nilobject (&luaO_nilobject_)
-
-
-LUAI_DDEC const TValue luaO_nilobject_;
-
-
-LUAI_FUNC int luaO_int2fb (unsigned int x);
-LUAI_FUNC int luaO_fb2int (int x);
-LUAI_FUNC int luaO_ceillog2 (unsigned int x);
-LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2);
-LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result);
-LUAI_FUNC int luaO_hexavalue (int c);
-LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt,
- va_list argp);
-LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...);
-LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len);
-
-
-#endif
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c
@@ -1,283 +0,0 @@
-/*
-** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
-** Some generic functions over Lua objects
-** See Copyright Notice in lua.h
-*/
-
-#include <sys/zfs_context.h>
-
-#define lobject_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lctype.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "lvm.h"
-
-
-
-LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT};
-
-
-/*
-** converts an integer to a "floating point byte", represented as
-** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if
-** eeeee != 0 and (xxx) otherwise.
-*/
-int luaO_int2fb (unsigned int x) {
- int e = 0; /* exponent */
- if (x < 8) return x;
- while (x >= 0x10) {
- x = (x+1) >> 1;
- e++;
- }
- return ((e+1) << 3) | (cast_int(x) - 8);
-}
-
-
-/* converts back */
-int luaO_fb2int (int x) {
- int e = (x >> 3) & 0x1f;
- if (e == 0) return x;
- else return ((x & 7) + 8) << (e - 1);
-}
-
-
-int luaO_ceillog2 (unsigned int x) {
- static const lu_byte log_2[256] = {
- 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
- 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
- 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
- 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
- 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
- };
- int l = 0;
- x--;
- while (x >= 256) { l += 8; x >>= 8; }
- return l + log_2[x];
-}
-
-
-lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) {
- switch (op) {
- case LUA_OPADD: return luai_numadd(NULL, v1, v2);
- case LUA_OPSUB: return luai_numsub(NULL, v1, v2);
- case LUA_OPMUL: return luai_nummul(NULL, v1, v2);
- case LUA_OPDIV: return luai_numdiv(NULL, v1, v2);
- case LUA_OPMOD: return luai_nummod(NULL, v1, v2);
- case LUA_OPPOW: return luai_numpow(NULL, v1, v2);
- case LUA_OPUNM: return luai_numunm(NULL, v1);
- default: lua_assert(0); return 0;
- }
-}
-
-
-int luaO_hexavalue (int c) {
- if (lisdigit(c)) return c - '0';
- else return ltolower(c) - 'a' + 10;
-}
-
-
-#if !defined(lua_strx2number)
-
-
-
-static int isneg (const char **s) {
- if (**s == '-') { (*s)++; return 1; }
- else if (**s == '+') (*s)++;
- return 0;
-}
-
-
-static lua_Number readhexa (const char **s, lua_Number r, int *count) {
- for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */
- r = (r * cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s)));
- (*count)++;
- }
- return r;
-}
-
-
-/*
-** convert an hexadecimal numeric string to a number, following
-** C99 specification for 'strtod'
-*/
-static lua_Number lua_strx2number (const char *s, char **endptr) {
- lua_Number r = 0.0;
- int e = 0, i = 0;
- int neg = 0; /* 1 if number is negative */
- *endptr = cast(char *, s); /* nothing is valid yet */
- while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */
- neg = isneg(&s); /* check signal */
- if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */
- return 0.0; /* invalid format (no '0x') */
- s += 2; /* skip '0x' */
- r = readhexa(&s, r, &i); /* read integer part */
- if (*s == '.') {
- s++; /* skip dot */
- r = readhexa(&s, r, &e); /* read fractional part */
- }
- if (i == 0 && e == 0)
- return 0.0; /* invalid format (no digit) */
- e *= -4; /* each fractional digit divides value by 2^-4 */
- *endptr = cast(char *, s); /* valid up to here */
- if (*s == 'p' || *s == 'P') { /* exponent part? */
- int exp1 = 0;
- int neg1;
- s++; /* skip 'p' */
- neg1 = isneg(&s); /* signal */
- if (!lisdigit(cast_uchar(*s)))
- goto ret; /* must have at least one digit */
- while (lisdigit(cast_uchar(*s))) /* read exponent */
- exp1 = exp1 * 10 + *(s++) - '0';
- if (neg1) exp1 = -exp1;
- e += exp1;
- }
- *endptr = cast(char *, s); /* valid up to here */
- ret:
- if (neg) r = -r;
- return (r * (1 << e));
-}
-
-#endif
-
-
-int luaO_str2d (const char *s, size_t len, lua_Number *result) {
- char *endptr;
- if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */
- return 0;
- else if (strpbrk(s, "xX")) /* hexa? */
- *result = lua_strx2number(s, &endptr);
- else
- *result = lua_str2number(s, &endptr);
- if (endptr == s) return 0; /* nothing recognized */
- while (lisspace(cast_uchar(*endptr))) endptr++;
- return (endptr == s + len); /* OK if no trailing characters */
-}
-
-
-
-static void pushstr (lua_State *L, const char *str, size_t l) {
- setsvalue2s(L, L->top++, luaS_newlstr(L, str, l));
-}
-
-
-/* this function handles only `%d', `%c', %f, %p, and `%s' formats */
-const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
- int n = 0;
- for (;;) {
- const char *e = strchr(fmt, '%');
- if (e == NULL) break;
- luaD_checkstack(L, 2); /* fmt + item */
- pushstr(L, fmt, e - fmt);
- switch (*(e+1)) {
- case 's': {
- const char *s = va_arg(argp, char *);
- if (s == NULL) s = "(null)";
- pushstr(L, s, strlen(s));
- break;
- }
- case 'c': {
- char buff;
- buff = cast(char, va_arg(argp, int));
- pushstr(L, &buff, 1);
- break;
- }
- case 'd': {
- setnvalue(L->top++, cast_num(va_arg(argp, int)));
- break;
- }
- case 'f': {
- setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber)));
- break;
- }
- case 'p': {
- char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */
- int l = lcompat_sprintf(buff, "%p", va_arg(argp, void *));
- pushstr(L, buff, l);
- break;
- }
- case '%': {
- pushstr(L, "%", 1);
- break;
- }
- default: {
- luaG_runerror(L,
- "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"),
- *(e + 1));
- }
- }
- n += 2;
- fmt = e+2;
- }
- luaD_checkstack(L, 1);
- pushstr(L, fmt, strlen(fmt));
- if (n > 0) luaV_concat(L, n + 1);
- return svalue(L->top - 1);
-}
-
-
-const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) {
- const char *msg;
- va_list argp;
- va_start(argp, fmt);
- msg = luaO_pushvfstring(L, fmt, argp);
- va_end(argp);
- return msg;
-}
-
-
-/* number of chars of a literal string without the ending \0 */
-#define LL(x) (sizeof(x)/sizeof(char) - 1)
-
-#define RETS "..."
-#define PRE "[string \""
-#define POS "\"]"
-
-#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) )
-
-void luaO_chunkid (char *out, const char *source, size_t bufflen) {
- size_t l = strlen(source);
- if (*source == '=') { /* 'literal' source */
- if (l <= bufflen) /* small enough? */
- memcpy(out, source + 1, l * sizeof(char));
- else { /* truncate it */
- addstr(out, source + 1, bufflen - 1);
- *out = '\0';
- }
- }
- else if (*source == '@') { /* file name */
- if (l <= bufflen) /* small enough? */
- memcpy(out, source + 1, l * sizeof(char));
- else { /* add '...' before rest of name */
- addstr(out, RETS, LL(RETS));
- bufflen -= LL(RETS);
- memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char));
- }
- }
- else { /* string; format as [string "source"] */
- const char *nl = strchr(source, '\n'); /* find first new line (if any) */
- addstr(out, PRE, LL(PRE)); /* add prefix */
- bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */
- if (l < bufflen && nl == NULL) { /* small one-line source? */
- addstr(out, source, l); /* keep it */
- }
- else {
- if (nl != NULL) l = nl - source; /* stop at first newline */
- if (l > bufflen) l = bufflen;
- addstr(out, source, l);
- addstr(out, RETS, LL(RETS));
- }
- memcpy(out, POS, (LL(POS) + 1) * sizeof(char));
- }
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h
@@ -1,288 +0,0 @@
-/*
-** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $
-** Opcodes for Lua virtual machine
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lopcodes_h
-#define lopcodes_h
-
-#include "llimits.h"
-
-
-/*===========================================================================
- We assume that instructions are unsigned numbers.
- All instructions have an opcode in the first 6 bits.
- Instructions can have the following fields:
- `A' : 8 bits
- `B' : 9 bits
- `C' : 9 bits
- 'Ax' : 26 bits ('A', 'B', and 'C' together)
- `Bx' : 18 bits (`B' and `C' together)
- `sBx' : signed Bx
-
- A signed argument is represented in excess K; that is, the number
- value is the unsigned value minus K. K is exactly the maximum value
- for that argument (so that -max is represented by 0, and +max is
- represented by 2*max), which is half the maximum for the corresponding
- unsigned argument.
-===========================================================================*/
-
-
-enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */
-
-
-/*
-** size and position of opcode arguments.
-*/
-#define SIZE_C 9
-#define SIZE_B 9
-#define SIZE_Bx (SIZE_C + SIZE_B)
-#define SIZE_A 8
-#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A)
-
-#define SIZE_OP 6
-
-#define POS_OP 0
-#define POS_A (POS_OP + SIZE_OP)
-#define POS_C (POS_A + SIZE_A)
-#define POS_B (POS_C + SIZE_C)
-#define POS_Bx POS_C
-#define POS_Ax POS_A
-
-
-/*
-** limits for opcode arguments.
-** we use (signed) int to manipulate most arguments,
-** so they must fit in LUAI_BITSINT-1 bits (-1 for sign)
-*/
-#if SIZE_Bx < LUAI_BITSINT-1
-#define MAXARG_Bx ((1<<SIZE_Bx)-1)
-#define MAXARG_sBx (MAXARG_Bx>>1) /* `sBx' is signed */
-#else
-#define MAXARG_Bx MAX_INT
-#define MAXARG_sBx MAX_INT
-#endif
-
-#if SIZE_Ax < LUAI_BITSINT-1
-#define MAXARG_Ax ((1<<SIZE_Ax)-1)
-#else
-#define MAXARG_Ax MAX_INT
-#endif
-
-
-#define MAXARG_A ((1<<SIZE_A)-1)
-#define MAXARG_B ((1<<SIZE_B)-1)
-#define MAXARG_C ((1<<SIZE_C)-1)
-
-
-/* creates a mask with `n' 1 bits at position `p' */
-#define MASK1(n,p) ((~((~(Instruction)0)<<(n)))<<(p))
-
-/* creates a mask with `n' 0 bits at position `p' */
-#define MASK0(n,p) (~MASK1(n,p))
-
-/*
-** the following macros help to manipulate instructions
-*/
-
-#define GET_OPCODE(i) (cast(OpCode, ((i)>>POS_OP) & MASK1(SIZE_OP,0)))
-#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \
- ((cast(Instruction, o)<<POS_OP)&MASK1(SIZE_OP,POS_OP))))
-
-#define getarg(i,pos,size) (cast(int, ((i)>>pos) & MASK1(size,0)))
-#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \
- ((cast(Instruction, v)<<pos)&MASK1(size,pos))))
-
-#define GETARG_A(i) getarg(i, POS_A, SIZE_A)
-#define SETARG_A(i,v) setarg(i, v, POS_A, SIZE_A)
-
-#define GETARG_B(i) getarg(i, POS_B, SIZE_B)
-#define SETARG_B(i,v) setarg(i, v, POS_B, SIZE_B)
-
-#define GETARG_C(i) getarg(i, POS_C, SIZE_C)
-#define SETARG_C(i,v) setarg(i, v, POS_C, SIZE_C)
-
-#define GETARG_Bx(i) getarg(i, POS_Bx, SIZE_Bx)
-#define SETARG_Bx(i,v) setarg(i, v, POS_Bx, SIZE_Bx)
-
-#define GETARG_Ax(i) getarg(i, POS_Ax, SIZE_Ax)
-#define SETARG_Ax(i,v) setarg(i, v, POS_Ax, SIZE_Ax)
-
-#define GETARG_sBx(i) (GETARG_Bx(i)-MAXARG_sBx)
-#define SETARG_sBx(i,b) SETARG_Bx((i),cast(unsigned int, (b)+MAXARG_sBx))
-
-
-#define CREATE_ABC(o,a,b,c) ((cast(Instruction, o)<<POS_OP) \
- | (cast(Instruction, a)<<POS_A) \
- | (cast(Instruction, b)<<POS_B) \
- | (cast(Instruction, c)<<POS_C))
-
-#define CREATE_ABx(o,a,bc) ((cast(Instruction, o)<<POS_OP) \
- | (cast(Instruction, a)<<POS_A) \
- | (cast(Instruction, bc)<<POS_Bx))
-
-#define CREATE_Ax(o,a) ((cast(Instruction, o)<<POS_OP) \
- | (cast(Instruction, a)<<POS_Ax))
-
-
-/*
-** Macros to operate RK indices
-*/
-
-/* this bit 1 means constant (0 means register) */
-#define BITRK (1 << (SIZE_B - 1))
-
-/* test whether value is a constant */
-#define ISK(x) ((x) & BITRK)
-
-/* gets the index of the constant */
-#define INDEXK(r) ((int)(r) & ~BITRK)
-
-#define MAXINDEXRK (BITRK - 1)
-
-/* code a constant index as a RK value */
-#define RKASK(x) ((x) | BITRK)
-
-
-/*
-** invalid register that fits in 8 bits
-*/
-#define NO_REG MAXARG_A
-
-
-/*
-** R(x) - register
-** Kst(x) - constant (in constant table)
-** RK(x) == if ISK(x) then Kst(INDEXK(x)) else R(x)
-*/
-
-
-/*
-** grep "ORDER OP" if you change these enums
-*/
-
-typedef enum {
-/*----------------------------------------------------------------------
-name args description
-------------------------------------------------------------------------*/
-OP_MOVE,/* A B R(A) := R(B) */
-OP_LOADK,/* A Bx R(A) := Kst(Bx) */
-OP_LOADKX,/* A R(A) := Kst(extra arg) */
-OP_LOADBOOL,/* A B C R(A) := (Bool)B; if (C) pc++ */
-OP_LOADNIL,/* A B R(A), R(A+1), ..., R(A+B) := nil */
-OP_GETUPVAL,/* A B R(A) := UpValue[B] */
-
-OP_GETTABUP,/* A B C R(A) := UpValue[B][RK(C)] */
-OP_GETTABLE,/* A B C R(A) := R(B)[RK(C)] */
-
-OP_SETTABUP,/* A B C UpValue[A][RK(B)] := RK(C) */
-OP_SETUPVAL,/* A B UpValue[B] := R(A) */
-OP_SETTABLE,/* A B C R(A)[RK(B)] := RK(C) */
-
-OP_NEWTABLE,/* A B C R(A) := {} (size = B,C) */
-
-OP_SELF,/* A B C R(A+1) := R(B); R(A) := R(B)[RK(C)] */
-
-OP_ADD,/* A B C R(A) := RK(B) + RK(C) */
-OP_SUB,/* A B C R(A) := RK(B) - RK(C) */
-OP_MUL,/* A B C R(A) := RK(B) * RK(C) */
-OP_DIV,/* A B C R(A) := RK(B) / RK(C) */
-OP_MOD,/* A B C R(A) := RK(B) % RK(C) */
-OP_POW,/* A B C R(A) := RK(B) ^ RK(C) */
-OP_UNM,/* A B R(A) := -R(B) */
-OP_NOT,/* A B R(A) := not R(B) */
-OP_LEN,/* A B R(A) := length of R(B) */
-
-OP_CONCAT,/* A B C R(A) := R(B).. ... ..R(C) */
-
-OP_JMP,/* A sBx pc+=sBx; if (A) close all upvalues >= R(A - 1) */
-OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */
-OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */
-OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */
-
-OP_TEST,/* A C if not (R(A) <=> C) then pc++ */
-OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */
-
-OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */
-OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */
-OP_RETURN,/* A B return R(A), ... ,R(A+B-2) (see note) */
-
-OP_FORLOOP,/* A sBx R(A)+=R(A+2);
- if R(A) <?= R(A+1) then { pc+=sBx; R(A+3)=R(A) }*/
-OP_FORPREP,/* A sBx R(A)-=R(A+2); pc+=sBx */
-
-OP_TFORCALL,/* A C R(A+3), ... ,R(A+2+C) := R(A)(R(A+1), R(A+2)); */
-OP_TFORLOOP,/* A sBx if R(A+1) ~= nil then { R(A)=R(A+1); pc += sBx }*/
-
-OP_SETLIST,/* A B C R(A)[(C-1)*FPF+i] := R(A+i), 1 <= i <= B */
-
-OP_CLOSURE,/* A Bx R(A) := closure(KPROTO[Bx]) */
-
-OP_VARARG,/* A B R(A), R(A+1), ..., R(A+B-2) = vararg */
-
-OP_EXTRAARG/* Ax extra (larger) argument for previous opcode */
-} OpCode;
-
-
-#define NUM_OPCODES (cast(int, OP_EXTRAARG) + 1)
-
-
-
-/*===========================================================================
- Notes:
- (*) In OP_CALL, if (B == 0) then B = top. If (C == 0), then `top' is
- set to last_result+1, so next open instruction (OP_CALL, OP_RETURN,
- OP_SETLIST) may use `top'.
-
- (*) In OP_VARARG, if (B == 0) then use actual number of varargs and
- set top (like in OP_CALL with C == 0).
-
- (*) In OP_RETURN, if (B == 0) then return up to `top'.
-
- (*) In OP_SETLIST, if (B == 0) then B = `top'; if (C == 0) then next
- 'instruction' is EXTRAARG(real C).
-
- (*) In OP_LOADKX, the next 'instruction' is always EXTRAARG.
-
- (*) For comparisons, A specifies what condition the test should accept
- (true or false).
-
- (*) All `skips' (pc++) assume that next instruction is a jump.
-
-===========================================================================*/
-
-
-/*
-** masks for instruction properties. The format is:
-** bits 0-1: op mode
-** bits 2-3: C arg mode
-** bits 4-5: B arg mode
-** bit 6: instruction set register A
-** bit 7: operator is a test (next instruction must be a jump)
-*/
-
-enum OpArgMask {
- OpArgN, /* argument is not used */
- OpArgU, /* argument is used */
- OpArgR, /* argument is a register or a jump offset */
- OpArgK /* argument is a constant or register/constant */
-};
-
-LUAI_DDEC const lu_byte luaP_opmodes[NUM_OPCODES];
-
-#define getOpMode(m) (cast(enum OpMode, luaP_opmodes[m] & 3))
-#define getBMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 4) & 3))
-#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3))
-#define testAMode(m) (luaP_opmodes[m] & (1 << 6))
-#define testTMode(m) (luaP_opmodes[m] & (1 << 7))
-
-
-LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */
-
-
-/* number of list items to accumulate before a SETLIST instruction */
-#define LFIELDS_PER_FLUSH 50
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c
@@ -1,107 +0,0 @@
-/*
-** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
-** Opcodes for Lua virtual machine
-** See Copyright Notice in lua.h
-*/
-
-
-#define lopcodes_c
-#define LUA_CORE
-
-
-#include "lopcodes.h"
-
-
-/* ORDER OP */
-
-LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = {
- "MOVE",
- "LOADK",
- "LOADKX",
- "LOADBOOL",
- "LOADNIL",
- "GETUPVAL",
- "GETTABUP",
- "GETTABLE",
- "SETTABUP",
- "SETUPVAL",
- "SETTABLE",
- "NEWTABLE",
- "SELF",
- "ADD",
- "SUB",
- "MUL",
- "DIV",
- "MOD",
- "POW",
- "UNM",
- "NOT",
- "LEN",
- "CONCAT",
- "JMP",
- "EQ",
- "LT",
- "LE",
- "TEST",
- "TESTSET",
- "CALL",
- "TAILCALL",
- "RETURN",
- "FORLOOP",
- "FORPREP",
- "TFORCALL",
- "TFORLOOP",
- "SETLIST",
- "CLOSURE",
- "VARARG",
- "EXTRAARG",
- NULL
-};
-
-
-#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m))
-
-LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = {
-/* T A B C mode opcode */
- opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */
- ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */
- ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */
- ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */
- ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */
- ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */
- ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */
- ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */
- ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */
- ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */
- ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */
- ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */
- ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */
- ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */
- ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */
- ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */
- ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV */
- ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */
- ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */
- ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */
- ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */
- ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */
- ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */
- ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */
- ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */
- ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */
- ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */
- ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */
- ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */
- ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */
- ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */
- ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_RETURN */
- ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */
- ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */
- ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */
- ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */
- ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */
- ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */
- ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_VARARG */
- ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */
-};
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h
@@ -1,119 +0,0 @@
-/*
-** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lua Parser
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lparser_h
-#define lparser_h
-
-#include "llimits.h"
-#include "lobject.h"
-#include "lzio.h"
-
-
-/*
-** Expression descriptor
-*/
-
-typedef enum {
- VVOID, /* no value */
- VNIL,
- VTRUE,
- VFALSE,
- VK, /* info = index of constant in `k' */
- VKNUM, /* nval = numerical value */
- VNONRELOC, /* info = result register */
- VLOCAL, /* info = local register */
- VUPVAL, /* info = index of upvalue in 'upvalues' */
- VINDEXED, /* t = table register/upvalue; idx = index R/K */
- VJMP, /* info = instruction pc */
- VRELOCABLE, /* info = instruction pc */
- VCALL, /* info = instruction pc */
- VVARARG /* info = instruction pc */
-} expkind;
-
-
-#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED)
-#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL)
-
-typedef struct expdesc {
- expkind k;
- union {
- struct { /* for indexed variables (VINDEXED) */
- short idx; /* index (R/K) */
- lu_byte t; /* table (register or upvalue) */
- lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */
- } ind;
- int info; /* for generic use */
- lua_Number nval; /* for VKNUM */
- } u;
- int t; /* patch list of `exit when true' */
- int f; /* patch list of `exit when false' */
-} expdesc;
-
-
-/* description of active local variable */
-typedef struct Vardesc {
- short idx; /* variable index in stack */
-} Vardesc;
-
-
-/* description of pending goto statements and label statements */
-typedef struct Labeldesc {
- TString *name; /* label identifier */
- int pc; /* position in code */
- int line; /* line where it appeared */
- lu_byte nactvar; /* local level where it appears in current block */
-} Labeldesc;
-
-
-/* list of labels or gotos */
-typedef struct Labellist {
- Labeldesc *arr; /* array */
- int n; /* number of entries in use */
- int size; /* array size */
-} Labellist;
-
-
-/* dynamic structures used by the parser */
-typedef struct Dyndata {
- struct { /* list of active local variables */
- Vardesc *arr;
- int n;
- int size;
- } actvar;
- Labellist gt; /* list of pending gotos */
- Labellist label; /* list of active labels */
-} Dyndata;
-
-
-/* control of blocks */
-struct BlockCnt; /* defined in lparser.c */
-
-
-/* state needed to generate code for a given function */
-typedef struct FuncState {
- Proto *f; /* current function header */
- Table *h; /* table to find (and reuse) elements in `k' */
- struct FuncState *prev; /* enclosing function */
- struct LexState *ls; /* lexical state */
- struct BlockCnt *bl; /* chain of current blocks */
- int pc; /* next position to code (equivalent to `ncode') */
- int lasttarget; /* 'label' of last 'jump label' */
- int jpc; /* list of pending jumps to `pc' */
- int nk; /* number of elements in `k' */
- int np; /* number of elements in `p' */
- int firstlocal; /* index of first local var (in Dyndata array) */
- short nlocvars; /* number of elements in 'f->locvars' */
- lu_byte nactvar; /* number of active local variables */
- lu_byte nups; /* number of upvalues */
- lu_byte freereg; /* first free register */
-} FuncState;
-
-
-LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
- Dyndata *dyd, const char *name, int firstchar);
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c
@@ -1,1637 +0,0 @@
-/*
-** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lua Parser
-** See Copyright Notice in lua.h
-*/
-
-#include <sys/zfs_context.h>
-
-#define lparser_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lcode.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "llex.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lopcodes.h"
-#include "lparser.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-
-
-
-/* maximum number of local variables per function (must be smaller
- than 250, due to the bytecode format) */
-#define MAXVARS 200
-
-
-#define hasmultret(k) ((k) == VCALL || (k) == VVARARG)
-
-
-
-/*
-** nodes for block list (list of active blocks)
-*/
-typedef struct BlockCnt {
- struct BlockCnt *previous; /* chain */
- short firstlabel; /* index of first label in this block */
- short firstgoto; /* index of first pending goto in this block */
- lu_byte nactvar; /* # active locals outside the block */
- lu_byte upval; /* true if some variable in the block is an upvalue */
- lu_byte isloop; /* true if `block' is a loop */
-} BlockCnt;
-
-
-
-/*
-** prototypes for recursive non-terminal functions
-*/
-static void statement (LexState *ls);
-static void expr (LexState *ls, expdesc *v);
-
-
-static void anchor_token (LexState *ls) {
- /* last token from outer function must be EOS */
- lua_assert(ls->fs != NULL || ls->t.token == TK_EOS);
- if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) {
- TString *ts = ls->t.seminfo.ts;
- luaX_newstring(ls, getstr(ts), ts->tsv.len);
- }
-}
-
-
-/* semantic error */
-static l_noret semerror (LexState *ls, const char *msg) {
- ls->t.token = 0; /* remove 'near to' from final message */
- luaX_syntaxerror(ls, msg);
-}
-
-
-static l_noret error_expected (LexState *ls, int token) {
- luaX_syntaxerror(ls,
- luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token)));
-}
-
-
-static l_noret errorlimit (FuncState *fs, int limit, const char *what) {
- lua_State *L = fs->ls->L;
- const char *msg;
- int line = fs->f->linedefined;
- const char *where = (line == 0)
- ? "main function"
- : luaO_pushfstring(L, "function at line %d", line);
- msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s",
- what, limit, where);
- luaX_syntaxerror(fs->ls, msg);
-}
-
-
-static void checklimit (FuncState *fs, int v, int l, const char *what) {
- if (v > l) errorlimit(fs, l, what);
-}
-
-
-static int testnext (LexState *ls, int c) {
- if (ls->t.token == c) {
- luaX_next(ls);
- return 1;
- }
- else return 0;
-}
-
-
-static void check (LexState *ls, int c) {
- if (ls->t.token != c)
- error_expected(ls, c);
-}
-
-
-static void checknext (LexState *ls, int c) {
- check(ls, c);
- luaX_next(ls);
-}
-
-
-#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); }
-
-
-
-static void check_match (LexState *ls, int what, int who, int where) {
- if (!testnext(ls, what)) {
- if (where == ls->linenumber)
- error_expected(ls, what);
- else {
- luaX_syntaxerror(ls, luaO_pushfstring(ls->L,
- "%s expected (to close %s at line %d)",
- luaX_token2str(ls, what), luaX_token2str(ls, who), where));
- }
- }
-}
-
-
-static TString *str_checkname (LexState *ls) {
- TString *ts;
- check(ls, TK_NAME);
- ts = ls->t.seminfo.ts;
- luaX_next(ls);
- return ts;
-}
-
-
-static void init_exp (expdesc *e, expkind k, int i) {
- e->f = e->t = NO_JUMP;
- e->k = k;
- e->u.info = i;
-}
-
-
-static void codestring (LexState *ls, expdesc *e, TString *s) {
- init_exp(e, VK, luaK_stringK(ls->fs, s));
-}
-
-
-static void checkname (LexState *ls, expdesc *e) {
- codestring(ls, e, str_checkname(ls));
-}
-
-
-static int registerlocalvar (LexState *ls, TString *varname) {
- FuncState *fs = ls->fs;
- Proto *f = fs->f;
- int oldsize = f->sizelocvars;
- luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars,
- LocVar, SHRT_MAX, "local variables");
- while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL;
- f->locvars[fs->nlocvars].varname = varname;
- luaC_objbarrier(ls->L, f, varname);
- return fs->nlocvars++;
-}
-
-
-static void new_localvar (LexState *ls, TString *name) {
- FuncState *fs = ls->fs;
- Dyndata *dyd = ls->dyd;
- int reg = registerlocalvar(ls, name);
- checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal,
- MAXVARS, "local variables");
- luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1,
- dyd->actvar.size, Vardesc, MAX_INT, "local variables");
- dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg);
-}
-
-
-static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) {
- new_localvar(ls, luaX_newstring(ls, name, sz));
-}
-
-#define new_localvarliteral(ls,v) \
- new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1)
-
-
-static LocVar *getlocvar (FuncState *fs, int i) {
- int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx;
- lua_assert(idx < fs->nlocvars);
- return &fs->f->locvars[idx];
-}
-
-
-static void adjustlocalvars (LexState *ls, int nvars) {
- FuncState *fs = ls->fs;
- fs->nactvar = cast_byte(fs->nactvar + nvars);
- for (; nvars; nvars--) {
- getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc;
- }
-}
-
-
-static void removevars (FuncState *fs, int tolevel) {
- fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel);
- while (fs->nactvar > tolevel)
- getlocvar(fs, --fs->nactvar)->endpc = fs->pc;
-}
-
-
-static int searchupvalue (FuncState *fs, TString *name) {
- int i;
- Upvaldesc *up = fs->f->upvalues;
- for (i = 0; i < fs->nups; i++) {
- if (luaS_eqstr(up[i].name, name)) return i;
- }
- return -1; /* not found */
-}
-
-
-static int newupvalue (FuncState *fs, TString *name, expdesc *v) {
- Proto *f = fs->f;
- int oldsize = f->sizeupvalues;
- checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues");
- luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues,
- Upvaldesc, MAXUPVAL, "upvalues");
- while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL;
- f->upvalues[fs->nups].instack = (v->k == VLOCAL);
- f->upvalues[fs->nups].idx = cast_byte(v->u.info);
- f->upvalues[fs->nups].name = name;
- luaC_objbarrier(fs->ls->L, f, name);
- return fs->nups++;
-}
-
-
-static int searchvar (FuncState *fs, TString *n) {
- int i;
- for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) {
- if (luaS_eqstr(n, getlocvar(fs, i)->varname))
- return i;
- }
- return -1; /* not found */
-}
-
-
-/*
- Mark block where variable at given level was defined
- (to emit close instructions later).
-*/
-static void markupval (FuncState *fs, int level) {
- BlockCnt *bl = fs->bl;
- while (bl->nactvar > level) bl = bl->previous;
- bl->upval = 1;
-}
-
-
-/*
- Find variable with given name 'n'. If it is an upvalue, add this
- upvalue into all intermediate functions.
-*/
-static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) {
- if (fs == NULL) /* no more levels? */
- return VVOID; /* default is global */
- else {
- int v = searchvar(fs, n); /* look up locals at current level */
- if (v >= 0) { /* found? */
- init_exp(var, VLOCAL, v); /* variable is local */
- if (!base)
- markupval(fs, v); /* local will be used as an upval */
- return VLOCAL;
- }
- else { /* not found as local at current level; try upvalues */
- int idx = searchupvalue(fs, n); /* try existing upvalues */
- if (idx < 0) { /* not found? */
- if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */
- return VVOID; /* not found; is a global */
- /* else was LOCAL or UPVAL */
- idx = newupvalue(fs, n, var); /* will be a new upvalue */
- }
- init_exp(var, VUPVAL, idx);
- return VUPVAL;
- }
- }
-}
-
-
-static void singlevar (LexState *ls, expdesc *var) {
- TString *varname = str_checkname(ls);
- FuncState *fs = ls->fs;
- if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */
- expdesc key;
- singlevaraux(fs, ls->envn, var, 1); /* get environment variable */
- lua_assert(var->k == VLOCAL || var->k == VUPVAL);
- codestring(ls, &key, varname); /* key is variable name */
- luaK_indexed(fs, var, &key); /* env[varname] */
- }
-}
-
-
-static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) {
- FuncState *fs = ls->fs;
- int extra = nvars - nexps;
- if (hasmultret(e->k)) {
- extra++; /* includes call itself */
- if (extra < 0) extra = 0;
- luaK_setreturns(fs, e, extra); /* last exp. provides the difference */
- if (extra > 1) luaK_reserveregs(fs, extra-1);
- }
- else {
- if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */
- if (extra > 0) {
- int reg = fs->freereg;
- luaK_reserveregs(fs, extra);
- luaK_nil(fs, reg, extra);
- }
- }
-}
-
-
-static void enterlevel (LexState *ls) {
- lua_State *L = ls->L;
- ++L->nCcalls;
- checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels");
-}
-
-
-#define leavelevel(ls) ((ls)->L->nCcalls--)
-
-
-static void closegoto (LexState *ls, int g, Labeldesc *label) {
- int i;
- FuncState *fs = ls->fs;
- Labellist *gl = &ls->dyd->gt;
- Labeldesc *gt = &gl->arr[g];
- lua_assert(luaS_eqstr(gt->name, label->name));
- if (gt->nactvar < label->nactvar) {
- TString *vname = getlocvar(fs, gt->nactvar)->varname;
- const char *msg = luaO_pushfstring(ls->L,
- "<goto %s> at line %d jumps into the scope of local " LUA_QS,
- getstr(gt->name), gt->line, getstr(vname));
- semerror(ls, msg);
- }
- luaK_patchlist(fs, gt->pc, label->pc);
- /* remove goto from pending list */
- for (i = g; i < gl->n - 1; i++)
- gl->arr[i] = gl->arr[i + 1];
- gl->n--;
-}
-
-
-/*
-** try to close a goto with existing labels; this solves backward jumps
-*/
-static int findlabel (LexState *ls, int g) {
- int i;
- BlockCnt *bl = ls->fs->bl;
- Dyndata *dyd = ls->dyd;
- Labeldesc *gt = &dyd->gt.arr[g];
- /* check labels in current block for a match */
- for (i = bl->firstlabel; i < dyd->label.n; i++) {
- Labeldesc *lb = &dyd->label.arr[i];
- if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */
- if (gt->nactvar > lb->nactvar &&
- (bl->upval || dyd->label.n > bl->firstlabel))
- luaK_patchclose(ls->fs, gt->pc, lb->nactvar);
- closegoto(ls, g, lb); /* close it */
- return 1;
- }
- }
- return 0; /* label not found; cannot close goto */
-}
-
-
-static int newlabelentry (LexState *ls, Labellist *l, TString *name,
- int line, int pc) {
- int n = l->n;
- luaM_growvector(ls->L, l->arr, n, l->size,
- Labeldesc, SHRT_MAX, "labels/gotos");
- l->arr[n].name = name;
- l->arr[n].line = line;
- l->arr[n].nactvar = ls->fs->nactvar;
- l->arr[n].pc = pc;
- l->n++;
- return n;
-}
-
-
-/*
-** check whether new label 'lb' matches any pending gotos in current
-** block; solves forward jumps
-*/
-static void findgotos (LexState *ls, Labeldesc *lb) {
- Labellist *gl = &ls->dyd->gt;
- int i = ls->fs->bl->firstgoto;
- while (i < gl->n) {
- if (luaS_eqstr(gl->arr[i].name, lb->name))
- closegoto(ls, i, lb);
- else
- i++;
- }
-}
-
-
-/*
-** "export" pending gotos to outer level, to check them against
-** outer labels; if the block being exited has upvalues, and
-** the goto exits the scope of any variable (which can be the
-** upvalue), close those variables being exited.
-*/
-static void movegotosout (FuncState *fs, BlockCnt *bl) {
- int i = bl->firstgoto;
- Labellist *gl = &fs->ls->dyd->gt;
- /* correct pending gotos to current block and try to close it
- with visible labels */
- while (i < gl->n) {
- Labeldesc *gt = &gl->arr[i];
- if (gt->nactvar > bl->nactvar) {
- if (bl->upval)
- luaK_patchclose(fs, gt->pc, bl->nactvar);
- gt->nactvar = bl->nactvar;
- }
- if (!findlabel(fs->ls, i))
- i++; /* move to next one */
- }
-}
-
-
-static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) {
- bl->isloop = isloop;
- bl->nactvar = fs->nactvar;
- bl->firstlabel = fs->ls->dyd->label.n;
- bl->firstgoto = fs->ls->dyd->gt.n;
- bl->upval = 0;
- bl->previous = fs->bl;
- fs->bl = bl;
- lua_assert(fs->freereg == fs->nactvar);
-}
-
-
-/*
-** create a label named "break" to resolve break statements
-*/
-static void breaklabel (LexState *ls) {
- TString *n = luaS_new(ls->L, "break");
- int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc);
- findgotos(ls, &ls->dyd->label.arr[l]);
-}
-
-/*
-** generates an error for an undefined 'goto'; choose appropriate
-** message when label name is a reserved word (which can only be 'break')
-*/
-static l_noret undefgoto (LexState *ls, Labeldesc *gt) {
- const char *msg = isreserved(gt->name)
- ? "<%s> at line %d not inside a loop"
- : "no visible label " LUA_QS " for <goto> at line %d";
- msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line);
- semerror(ls, msg);
-}
-
-
-static void leaveblock (FuncState *fs) {
- BlockCnt *bl = fs->bl;
- LexState *ls = fs->ls;
- if (bl->previous && bl->upval) {
- /* create a 'jump to here' to close upvalues */
- int j = luaK_jump(fs);
- luaK_patchclose(fs, j, bl->nactvar);
- luaK_patchtohere(fs, j);
- }
- if (bl->isloop)
- breaklabel(ls); /* close pending breaks */
- fs->bl = bl->previous;
- removevars(fs, bl->nactvar);
- lua_assert(bl->nactvar == fs->nactvar);
- fs->freereg = fs->nactvar; /* free registers */
- ls->dyd->label.n = bl->firstlabel; /* remove local labels */
- if (bl->previous) /* inner block? */
- movegotosout(fs, bl); /* update pending gotos to outer block */
- else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */
- undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */
-}
-
-
-/*
-** adds a new prototype into list of prototypes
-*/
-static Proto *addprototype (LexState *ls) {
- Proto *clp;
- lua_State *L = ls->L;
- FuncState *fs = ls->fs;
- Proto *f = fs->f; /* prototype of current function */
- if (fs->np >= f->sizep) {
- int oldsize = f->sizep;
- luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions");
- while (oldsize < f->sizep) f->p[oldsize++] = NULL;
- }
- f->p[fs->np++] = clp = luaF_newproto(L);
- luaC_objbarrier(L, f, clp);
- return clp;
-}
-
-
-/*
-** codes instruction to create new closure in parent function.
-** The OP_CLOSURE instruction must use the last available register,
-** so that, if it invokes the GC, the GC knows which registers
-** are in use at that time.
-*/
-static void codeclosure (LexState *ls, expdesc *v) {
- FuncState *fs = ls->fs->prev;
- init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1));
- luaK_exp2nextreg(fs, v); /* fix it at the last register */
-}
-
-
-static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) {
- lua_State *L = ls->L;
- Proto *f;
- fs->prev = ls->fs; /* linked list of funcstates */
- fs->ls = ls;
- ls->fs = fs;
- fs->pc = 0;
- fs->lasttarget = 0;
- fs->jpc = NO_JUMP;
- fs->freereg = 0;
- fs->nk = 0;
- fs->np = 0;
- fs->nups = 0;
- fs->nlocvars = 0;
- fs->nactvar = 0;
- fs->firstlocal = ls->dyd->actvar.n;
- fs->bl = NULL;
- f = fs->f;
- f->source = ls->source;
- f->maxstacksize = 2; /* registers 0/1 are always valid */
- fs->h = luaH_new(L);
- /* anchor table of constants (to avoid being collected) */
- sethvalue2s(L, L->top, fs->h);
- incr_top(L);
- enterblock(fs, bl, 0);
-}
-
-
-static void close_func (LexState *ls) {
- lua_State *L = ls->L;
- FuncState *fs = ls->fs;
- Proto *f = fs->f;
- luaK_ret(fs, 0, 0); /* final return */
- leaveblock(fs);
- luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction);
- f->sizecode = fs->pc;
- luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int);
- f->sizelineinfo = fs->pc;
- luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue);
- f->sizek = fs->nk;
- luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *);
- f->sizep = fs->np;
- luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar);
- f->sizelocvars = fs->nlocvars;
- luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc);
- f->sizeupvalues = fs->nups;
- lua_assert(fs->bl == NULL);
- ls->fs = fs->prev;
- /* last token read was anchored in defunct function; must re-anchor it */
- anchor_token(ls);
- L->top--; /* pop table of constants */
- luaC_checkGC(L);
-}
-
-
-
-/*============================================================*/
-/* GRAMMAR RULES */
-/*============================================================*/
-
-
-/*
-** check whether current token is in the follow set of a block.
-** 'until' closes syntactical blocks, but do not close scope,
-** so it handled in separate.
-*/
-static int block_follow (LexState *ls, int withuntil) {
- switch (ls->t.token) {
- case TK_ELSE: case TK_ELSEIF:
- case TK_END: case TK_EOS:
- return 1;
- case TK_UNTIL: return withuntil;
- default: return 0;
- }
-}
-
-
-static void statlist (LexState *ls) {
- /* statlist -> { stat [`;'] } */
- while (!block_follow(ls, 1)) {
- if (ls->t.token == TK_RETURN) {
- statement(ls);
- return; /* 'return' must be last statement */
- }
- statement(ls);
- }
-}
-
-
-static void fieldsel (LexState *ls, expdesc *v) {
- /* fieldsel -> ['.' | ':'] NAME */
- FuncState *fs = ls->fs;
- expdesc key;
- luaK_exp2anyregup(fs, v);
- luaX_next(ls); /* skip the dot or colon */
- checkname(ls, &key);
- luaK_indexed(fs, v, &key);
-}
-
-
-static void yindex (LexState *ls, expdesc *v) {
- /* index -> '[' expr ']' */
- luaX_next(ls); /* skip the '[' */
- expr(ls, v);
- luaK_exp2val(ls->fs, v);
- checknext(ls, ']');
-}
-
-
-/*
-** {======================================================================
-** Rules for Constructors
-** =======================================================================
-*/
-
-
-struct ConsControl {
- expdesc v; /* last list item read */
- expdesc *t; /* table descriptor */
- int nh; /* total number of `record' elements */
- int na; /* total number of array elements */
- int tostore; /* number of array elements pending to be stored */
-};
-
-
-static void recfield (LexState *ls, struct ConsControl *cc) {
- /* recfield -> (NAME | `['exp1`]') = exp1 */
- FuncState *fs = ls->fs;
- int reg = ls->fs->freereg;
- expdesc key, val;
- int rkkey;
- if (ls->t.token == TK_NAME) {
- checklimit(fs, cc->nh, MAX_INT, "items in a constructor");
- checkname(ls, &key);
- }
- else /* ls->t.token == '[' */
- yindex(ls, &key);
- cc->nh++;
- checknext(ls, '=');
- rkkey = luaK_exp2RK(fs, &key);
- expr(ls, &val);
- luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val));
- fs->freereg = reg; /* free registers */
-}
-
-
-static void closelistfield (FuncState *fs, struct ConsControl *cc) {
- if (cc->v.k == VVOID) return; /* there is no list item */
- luaK_exp2nextreg(fs, &cc->v);
- cc->v.k = VVOID;
- if (cc->tostore == LFIELDS_PER_FLUSH) {
- luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */
- cc->tostore = 0; /* no more items pending */
- }
-}
-
-
-static void lastlistfield (FuncState *fs, struct ConsControl *cc) {
- if (cc->tostore == 0) return;
- if (hasmultret(cc->v.k)) {
- luaK_setmultret(fs, &cc->v);
- luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET);
- cc->na--; /* do not count last expression (unknown number of elements) */
- }
- else {
- if (cc->v.k != VVOID)
- luaK_exp2nextreg(fs, &cc->v);
- luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore);
- }
-}
-
-
-static void listfield (LexState *ls, struct ConsControl *cc) {
- /* listfield -> exp */
- expr(ls, &cc->v);
- checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor");
- cc->na++;
- cc->tostore++;
-}
-
-
-static void field (LexState *ls, struct ConsControl *cc) {
- /* field -> listfield | recfield */
- switch(ls->t.token) {
- case TK_NAME: { /* may be 'listfield' or 'recfield' */
- if (luaX_lookahead(ls) != '=') /* expression? */
- listfield(ls, cc);
- else
- recfield(ls, cc);
- break;
- }
- case '[': {
- recfield(ls, cc);
- break;
- }
- default: {
- listfield(ls, cc);
- break;
- }
- }
-}
-
-
-static void constructor (LexState *ls, expdesc *t) {
- /* constructor -> '{' [ field { sep field } [sep] ] '}'
- sep -> ',' | ';' */
- FuncState *fs = ls->fs;
- int line = ls->linenumber;
- int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0);
- struct ConsControl cc;
- cc.na = cc.nh = cc.tostore = 0;
- cc.t = t;
- init_exp(t, VRELOCABLE, pc);
- init_exp(&cc.v, VVOID, 0); /* no value (yet) */
- luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */
- checknext(ls, '{');
- do {
- lua_assert(cc.v.k == VVOID || cc.tostore > 0);
- if (ls->t.token == '}') break;
- closelistfield(fs, &cc);
- field(ls, &cc);
- } while (testnext(ls, ',') || testnext(ls, ';'));
- check_match(ls, '}', '{', line);
- lastlistfield(fs, &cc);
- SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */
- SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */
-}
-
-/* }====================================================================== */
-
-
-
-static void parlist (LexState *ls) {
- /* parlist -> [ param { `,' param } ] */
- FuncState *fs = ls->fs;
- Proto *f = fs->f;
- int nparams = 0;
- f->is_vararg = 0;
- if (ls->t.token != ')') { /* is `parlist' not empty? */
- do {
- switch (ls->t.token) {
- case TK_NAME: { /* param -> NAME */
- new_localvar(ls, str_checkname(ls));
- nparams++;
- break;
- }
- case TK_DOTS: { /* param -> `...' */
- luaX_next(ls);
- f->is_vararg = 1;
- break;
- }
- default: luaX_syntaxerror(ls, "<name> or " LUA_QL("...") " expected");
- }
- } while (!f->is_vararg && testnext(ls, ','));
- }
- adjustlocalvars(ls, nparams);
- f->numparams = cast_byte(fs->nactvar);
- luaK_reserveregs(fs, fs->nactvar); /* reserve register for parameters */
-}
-
-
-static void body (LexState *ls, expdesc *e, int ismethod, int line) {
- /* body -> `(' parlist `)' block END */
- FuncState new_fs;
- BlockCnt bl;
- new_fs.f = addprototype(ls);
- new_fs.f->linedefined = line;
- open_func(ls, &new_fs, &bl);
- checknext(ls, '(');
- if (ismethod) {
- new_localvarliteral(ls, "self"); /* create 'self' parameter */
- adjustlocalvars(ls, 1);
- }
- parlist(ls);
- checknext(ls, ')');
- statlist(ls);
- new_fs.f->lastlinedefined = ls->linenumber;
- check_match(ls, TK_END, TK_FUNCTION, line);
- codeclosure(ls, e);
- close_func(ls);
-}
-
-
-static int explist (LexState *ls, expdesc *v) {
- /* explist -> expr { `,' expr } */
- int n = 1; /* at least one expression */
- expr(ls, v);
- while (testnext(ls, ',')) {
- luaK_exp2nextreg(ls->fs, v);
- expr(ls, v);
- n++;
- }
- return n;
-}
-
-
-static void funcargs (LexState *ls, expdesc *f, int line) {
- FuncState *fs = ls->fs;
- expdesc args;
- int base, nparams;
- switch (ls->t.token) {
- case '(': { /* funcargs -> `(' [ explist ] `)' */
- luaX_next(ls);
- if (ls->t.token == ')') /* arg list is empty? */
- args.k = VVOID;
- else {
- explist(ls, &args);
- luaK_setmultret(fs, &args);
- }
- check_match(ls, ')', '(', line);
- break;
- }
- case '{': { /* funcargs -> constructor */
- constructor(ls, &args);
- break;
- }
- case TK_STRING: { /* funcargs -> STRING */
- codestring(ls, &args, ls->t.seminfo.ts);
- luaX_next(ls); /* must use `seminfo' before `next' */
- break;
- }
- default: {
- luaX_syntaxerror(ls, "function arguments expected");
- }
- }
- lua_assert(f->k == VNONRELOC);
- base = f->u.info; /* base register for call */
- if (hasmultret(args.k))
- nparams = LUA_MULTRET; /* open call */
- else {
- if (args.k != VVOID)
- luaK_exp2nextreg(fs, &args); /* close last argument */
- nparams = fs->freereg - (base+1);
- }
- init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2));
- luaK_fixline(fs, line);
- fs->freereg = base+1; /* call remove function and arguments and leaves
- (unless changed) one result */
-}
-
-
-
-
-/*
-** {======================================================================
-** Expression parsing
-** =======================================================================
-*/
-
-
-static void primaryexp (LexState *ls, expdesc *v) {
- /* primaryexp -> NAME | '(' expr ')' */
- switch (ls->t.token) {
- case '(': {
- int line = ls->linenumber;
- luaX_next(ls);
- expr(ls, v);
- check_match(ls, ')', '(', line);
- luaK_dischargevars(ls->fs, v);
- return;
- }
- case TK_NAME: {
- singlevar(ls, v);
- return;
- }
- default: {
- luaX_syntaxerror(ls, "unexpected symbol");
- }
- }
-}
-
-
-static void suffixedexp (LexState *ls, expdesc *v) {
- /* suffixedexp ->
- primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */
- FuncState *fs = ls->fs;
- int line = ls->linenumber;
- primaryexp(ls, v);
- for (;;) {
- switch (ls->t.token) {
- case '.': { /* fieldsel */
- fieldsel(ls, v);
- break;
- }
- case '[': { /* `[' exp1 `]' */
- expdesc key;
- luaK_exp2anyregup(fs, v);
- yindex(ls, &key);
- luaK_indexed(fs, v, &key);
- break;
- }
- case ':': { /* `:' NAME funcargs */
- expdesc key;
- luaX_next(ls);
- checkname(ls, &key);
- luaK_self(fs, v, &key);
- funcargs(ls, v, line);
- break;
- }
- case '(': case TK_STRING: case '{': { /* funcargs */
- luaK_exp2nextreg(fs, v);
- funcargs(ls, v, line);
- break;
- }
- default: return;
- }
- }
-}
-
-
-static void simpleexp (LexState *ls, expdesc *v) {
- /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... |
- constructor | FUNCTION body | suffixedexp */
- switch (ls->t.token) {
- case TK_NUMBER: {
- init_exp(v, VKNUM, 0);
- v->u.nval = ls->t.seminfo.r;
- break;
- }
- case TK_STRING: {
- codestring(ls, v, ls->t.seminfo.ts);
- break;
- }
- case TK_NIL: {
- init_exp(v, VNIL, 0);
- break;
- }
- case TK_TRUE: {
- init_exp(v, VTRUE, 0);
- break;
- }
- case TK_FALSE: {
- init_exp(v, VFALSE, 0);
- break;
- }
- case TK_DOTS: { /* vararg */
- FuncState *fs = ls->fs;
- check_condition(ls, fs->f->is_vararg,
- "cannot use " LUA_QL("...") " outside a vararg function");
- init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0));
- break;
- }
- case '{': { /* constructor */
- constructor(ls, v);
- return;
- }
- case TK_FUNCTION: {
- luaX_next(ls);
- body(ls, v, 0, ls->linenumber);
- return;
- }
- default: {
- suffixedexp(ls, v);
- return;
- }
- }
- luaX_next(ls);
-}
-
-
-static UnOpr getunopr (int op) {
- switch (op) {
- case TK_NOT: return OPR_NOT;
- case '-': return OPR_MINUS;
- case '#': return OPR_LEN;
- default: return OPR_NOUNOPR;
- }
-}
-
-
-static BinOpr getbinopr (int op) {
- switch (op) {
- case '+': return OPR_ADD;
- case '-': return OPR_SUB;
- case '*': return OPR_MUL;
- case '/': return OPR_DIV;
- case '%': return OPR_MOD;
- case '^': return OPR_POW;
- case TK_CONCAT: return OPR_CONCAT;
- case TK_NE: return OPR_NE;
- case TK_EQ: return OPR_EQ;
- case '<': return OPR_LT;
- case TK_LE: return OPR_LE;
- case '>': return OPR_GT;
- case TK_GE: return OPR_GE;
- case TK_AND: return OPR_AND;
- case TK_OR: return OPR_OR;
- default: return OPR_NOBINOPR;
- }
-}
-
-
-static const struct {
- lu_byte left; /* left priority for each binary operator */
- lu_byte right; /* right priority */
-} priority[] = { /* ORDER OPR */
- {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */
- {10, 9}, {5, 4}, /* ^, .. (right associative) */
- {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */
- {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */
- {2, 2}, {1, 1} /* and, or */
-};
-
-#define UNARY_PRIORITY 8 /* priority for unary operators */
-
-
-/*
-** subexpr -> (simpleexp | unop subexpr) { binop subexpr }
-** where `binop' is any binary operator with a priority higher than `limit'
-*/
-static BinOpr subexpr (LexState *ls, expdesc *v, int limit) {
- BinOpr op;
- UnOpr uop;
- enterlevel(ls);
- uop = getunopr(ls->t.token);
- if (uop != OPR_NOUNOPR) {
- int line = ls->linenumber;
- luaX_next(ls);
- subexpr(ls, v, UNARY_PRIORITY);
- luaK_prefix(ls->fs, uop, v, line);
- }
- else simpleexp(ls, v);
- /* expand while operators have priorities higher than `limit' */
- op = getbinopr(ls->t.token);
- while (op != OPR_NOBINOPR && priority[op].left > limit) {
- expdesc v2;
- BinOpr nextop;
- int line = ls->linenumber;
- luaX_next(ls);
- luaK_infix(ls->fs, op, v);
- /* read sub-expression with higher priority */
- nextop = subexpr(ls, &v2, priority[op].right);
- luaK_posfix(ls->fs, op, v, &v2, line);
- op = nextop;
- }
- leavelevel(ls);
- return op; /* return first untreated operator */
-}
-
-
-static void expr (LexState *ls, expdesc *v) {
- subexpr(ls, v, 0);
-}
-
-/* }==================================================================== */
-
-
-
-/*
-** {======================================================================
-** Rules for Statements
-** =======================================================================
-*/
-
-
-static void block (LexState *ls) {
- /* block -> statlist */
- FuncState *fs = ls->fs;
- BlockCnt bl;
- enterblock(fs, &bl, 0);
- statlist(ls);
- leaveblock(fs);
-}
-
-
-/*
-** structure to chain all variables in the left-hand side of an
-** assignment
-*/
-struct LHS_assign {
- struct LHS_assign *prev;
- expdesc v; /* variable (global, local, upvalue, or indexed) */
-};
-
-
-/*
-** check whether, in an assignment to an upvalue/local variable, the
-** upvalue/local variable is begin used in a previous assignment to a
-** table. If so, save original upvalue/local value in a safe place and
-** use this safe copy in the previous assignment.
-*/
-static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) {
- FuncState *fs = ls->fs;
- int extra = fs->freereg; /* eventual position to save local variable */
- int conflict = 0;
- for (; lh; lh = lh->prev) { /* check all previous assignments */
- if (lh->v.k == VINDEXED) { /* assigning to a table? */
- /* table is the upvalue/local being assigned now? */
- if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) {
- conflict = 1;
- lh->v.u.ind.vt = VLOCAL;
- lh->v.u.ind.t = extra; /* previous assignment will use safe copy */
- }
- /* index is the local being assigned? (index cannot be upvalue) */
- if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) {
- conflict = 1;
- lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */
- }
- }
- }
- if (conflict) {
- /* copy upvalue/local value to a temporary (in position 'extra') */
- OpCode op = (v->k == VLOCAL) ? OP_MOVE : OP_GETUPVAL;
- luaK_codeABC(fs, op, extra, v->u.info, 0);
- luaK_reserveregs(fs, 1);
- }
-}
-
-
-static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) {
- expdesc e;
- check_condition(ls, vkisvar(lh->v.k), "syntax error");
- if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */
- struct LHS_assign nv;
- nv.prev = lh;
- suffixedexp(ls, &nv.v);
- if (nv.v.k != VINDEXED)
- check_conflict(ls, lh, &nv.v);
- checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS,
- "C levels");
- assignment(ls, &nv, nvars+1);
- }
- else { /* assignment -> `=' explist */
- int nexps;
- checknext(ls, '=');
- nexps = explist(ls, &e);
- if (nexps != nvars) {
- adjust_assign(ls, nvars, nexps, &e);
- if (nexps > nvars)
- ls->fs->freereg -= nexps - nvars; /* remove extra values */
- }
- else {
- luaK_setoneret(ls->fs, &e); /* close last expression */
- luaK_storevar(ls->fs, &lh->v, &e);
- return; /* avoid default */
- }
- }
- init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */
- luaK_storevar(ls->fs, &lh->v, &e);
-}
-
-
-static int cond (LexState *ls) {
- /* cond -> exp */
- expdesc v;
- expr(ls, &v); /* read condition */
- if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */
- luaK_goiftrue(ls->fs, &v);
- return v.f;
-}
-
-
-static void gotostat (LexState *ls, int pc) {
- int line = ls->linenumber;
- TString *label;
- int g;
- if (testnext(ls, TK_GOTO))
- label = str_checkname(ls);
- else {
- luaX_next(ls); /* skip break */
- label = luaS_new(ls->L, "break");
- }
- g = newlabelentry(ls, &ls->dyd->gt, label, line, pc);
- findlabel(ls, g); /* close it if label already defined */
-}
-
-
-/* check for repeated labels on the same block */
-static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) {
- int i;
- for (i = fs->bl->firstlabel; i < ll->n; i++) {
- if (luaS_eqstr(label, ll->arr[i].name)) {
- const char *msg = luaO_pushfstring(fs->ls->L,
- "label " LUA_QS " already defined on line %d",
- getstr(label), ll->arr[i].line);
- semerror(fs->ls, msg);
- }
- }
-}
-
-
-/* skip no-op statements */
-static void skipnoopstat (LexState *ls) {
- while (ls->t.token == ';' || ls->t.token == TK_DBCOLON)
- statement(ls);
-}
-
-
-static void labelstat (LexState *ls, TString *label, int line) {
- /* label -> '::' NAME '::' */
- FuncState *fs = ls->fs;
- Labellist *ll = &ls->dyd->label;
- int l; /* index of new label being created */
- checkrepeated(fs, ll, label); /* check for repeated labels */
- checknext(ls, TK_DBCOLON); /* skip double colon */
- /* create new entry for this label */
- l = newlabelentry(ls, ll, label, line, fs->pc);
- skipnoopstat(ls); /* skip other no-op statements */
- if (block_follow(ls, 0)) { /* label is last no-op statement in the block? */
- /* assume that locals are already out of scope */
- ll->arr[l].nactvar = fs->bl->nactvar;
- }
- findgotos(ls, &ll->arr[l]);
-}
-
-
-static void whilestat (LexState *ls, int line) {
- /* whilestat -> WHILE cond DO block END */
- FuncState *fs = ls->fs;
- int whileinit;
- int condexit;
- BlockCnt bl;
- luaX_next(ls); /* skip WHILE */
- whileinit = luaK_getlabel(fs);
- condexit = cond(ls);
- enterblock(fs, &bl, 1);
- checknext(ls, TK_DO);
- block(ls);
- luaK_jumpto(fs, whileinit);
- check_match(ls, TK_END, TK_WHILE, line);
- leaveblock(fs);
- luaK_patchtohere(fs, condexit); /* false conditions finish the loop */
-}
-
-
-static void repeatstat (LexState *ls, int line) {
- /* repeatstat -> REPEAT block UNTIL cond */
- int condexit;
- FuncState *fs = ls->fs;
- int repeat_init = luaK_getlabel(fs);
- BlockCnt bl1, bl2;
- enterblock(fs, &bl1, 1); /* loop block */
- enterblock(fs, &bl2, 0); /* scope block */
- luaX_next(ls); /* skip REPEAT */
- statlist(ls);
- check_match(ls, TK_UNTIL, TK_REPEAT, line);
- condexit = cond(ls); /* read condition (inside scope block) */
- if (bl2.upval) /* upvalues? */
- luaK_patchclose(fs, condexit, bl2.nactvar);
- leaveblock(fs); /* finish scope */
- luaK_patchlist(fs, condexit, repeat_init); /* close the loop */
- leaveblock(fs); /* finish loop */
-}
-
-
-static int exp1 (LexState *ls) {
- expdesc e;
- int reg;
- expr(ls, &e);
- luaK_exp2nextreg(ls->fs, &e);
- lua_assert(e.k == VNONRELOC);
- reg = e.u.info;
- return reg;
-}
-
-
-static void forbody (LexState *ls, int base, int line, int nvars, int isnum) {
- /* forbody -> DO block */
- BlockCnt bl;
- FuncState *fs = ls->fs;
- int prep, endfor;
- adjustlocalvars(ls, 3); /* control variables */
- checknext(ls, TK_DO);
- prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs);
- enterblock(fs, &bl, 0); /* scope for declared variables */
- adjustlocalvars(ls, nvars);
- luaK_reserveregs(fs, nvars);
- block(ls);
- leaveblock(fs); /* end of scope for declared variables */
- luaK_patchtohere(fs, prep);
- if (isnum) /* numeric for? */
- endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP);
- else { /* generic for */
- luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars);
- luaK_fixline(fs, line);
- endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP);
- }
- luaK_patchlist(fs, endfor, prep + 1);
- luaK_fixline(fs, line);
-}
-
-
-static void fornum (LexState *ls, TString *varname, int line) {
- /* fornum -> NAME = exp1,exp1[,exp1] forbody */
- FuncState *fs = ls->fs;
- int base = fs->freereg;
- new_localvarliteral(ls, "(for index)");
- new_localvarliteral(ls, "(for limit)");
- new_localvarliteral(ls, "(for step)");
- new_localvar(ls, varname);
- checknext(ls, '=');
- exp1(ls); /* initial value */
- checknext(ls, ',');
- exp1(ls); /* limit */
- if (testnext(ls, ','))
- exp1(ls); /* optional step */
- else { /* default step = 1 */
- luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1));
- luaK_reserveregs(fs, 1);
- }
- forbody(ls, base, line, 1, 1);
-}
-
-
-static void forlist (LexState *ls, TString *indexname) {
- /* forlist -> NAME {,NAME} IN explist forbody */
- FuncState *fs = ls->fs;
- expdesc e;
- int nvars = 4; /* gen, state, control, plus at least one declared var */
- int line;
- int base = fs->freereg;
- /* create control variables */
- new_localvarliteral(ls, "(for generator)");
- new_localvarliteral(ls, "(for state)");
- new_localvarliteral(ls, "(for control)");
- /* create declared variables */
- new_localvar(ls, indexname);
- while (testnext(ls, ',')) {
- new_localvar(ls, str_checkname(ls));
- nvars++;
- }
- checknext(ls, TK_IN);
- line = ls->linenumber;
- adjust_assign(ls, 3, explist(ls, &e), &e);
- luaK_checkstack(fs, 3); /* extra space to call generator */
- forbody(ls, base, line, nvars - 3, 0);
-}
-
-
-static void forstat (LexState *ls, int line) {
- /* forstat -> FOR (fornum | forlist) END */
- FuncState *fs = ls->fs;
- TString *varname;
- BlockCnt bl;
- enterblock(fs, &bl, 1); /* scope for loop and control variables */
- luaX_next(ls); /* skip `for' */
- varname = str_checkname(ls); /* first variable name */
- switch (ls->t.token) {
- case '=': fornum(ls, varname, line); break;
- case ',': case TK_IN: forlist(ls, varname); break;
- default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected");
- }
- check_match(ls, TK_END, TK_FOR, line);
- leaveblock(fs); /* loop scope (`break' jumps to this point) */
-}
-
-
-static void test_then_block (LexState *ls, int *escapelist) {
- /* test_then_block -> [IF | ELSEIF] cond THEN block */
- BlockCnt bl;
- FuncState *fs = ls->fs;
- expdesc v;
- int jf; /* instruction to skip 'then' code (if condition is false) */
- luaX_next(ls); /* skip IF or ELSEIF */
- expr(ls, &v); /* read condition */
- checknext(ls, TK_THEN);
- if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) {
- luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */
- enterblock(fs, &bl, 0); /* must enter block before 'goto' */
- gotostat(ls, v.t); /* handle goto/break */
- skipnoopstat(ls); /* skip other no-op statements */
- if (block_follow(ls, 0)) { /* 'goto' is the entire block? */
- leaveblock(fs);
- return; /* and that is it */
- }
- else /* must skip over 'then' part if condition is false */
- jf = luaK_jump(fs);
- }
- else { /* regular case (not goto/break) */
- luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */
- enterblock(fs, &bl, 0);
- jf = v.f;
- }
- statlist(ls); /* `then' part */
- leaveblock(fs);
- if (ls->t.token == TK_ELSE ||
- ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? */
- luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */
- luaK_patchtohere(fs, jf);
-}
-
-
-static void ifstat (LexState *ls, int line) {
- /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */
- FuncState *fs = ls->fs;
- int escapelist = NO_JUMP; /* exit list for finished parts */
- test_then_block(ls, &escapelist); /* IF cond THEN block */
- while (ls->t.token == TK_ELSEIF)
- test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */
- if (testnext(ls, TK_ELSE))
- block(ls); /* `else' part */
- check_match(ls, TK_END, TK_IF, line);
- luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */
-}
-
-
-static void localfunc (LexState *ls) {
- expdesc b;
- FuncState *fs = ls->fs;
- new_localvar(ls, str_checkname(ls)); /* new local variable */
- adjustlocalvars(ls, 1); /* enter its scope */
- body(ls, &b, 0, ls->linenumber); /* function created in next register */
- /* debug information will only see the variable after this point! */
- getlocvar(fs, b.u.info)->startpc = fs->pc;
-}
-
-
-static void localstat (LexState *ls) {
- /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */
- int nvars = 0;
- int nexps;
- expdesc e;
- do {
- new_localvar(ls, str_checkname(ls));
- nvars++;
- } while (testnext(ls, ','));
- if (testnext(ls, '='))
- nexps = explist(ls, &e);
- else {
- e.k = VVOID;
- nexps = 0;
- }
- adjust_assign(ls, nvars, nexps, &e);
- adjustlocalvars(ls, nvars);
-}
-
-
-static int funcname (LexState *ls, expdesc *v) {
- /* funcname -> NAME {fieldsel} [`:' NAME] */
- int ismethod = 0;
- singlevar(ls, v);
- while (ls->t.token == '.')
- fieldsel(ls, v);
- if (ls->t.token == ':') {
- ismethod = 1;
- fieldsel(ls, v);
- }
- return ismethod;
-}
-
-
-static void funcstat (LexState *ls, int line) {
- /* funcstat -> FUNCTION funcname body */
- int ismethod;
- expdesc v, b;
- luaX_next(ls); /* skip FUNCTION */
- ismethod = funcname(ls, &v);
- body(ls, &b, ismethod, line);
- luaK_storevar(ls->fs, &v, &b);
- luaK_fixline(ls->fs, line); /* definition `happens' in the first line */
-}
-
-
-static void exprstat (LexState *ls) {
- /* stat -> func | assignment */
- FuncState *fs = ls->fs;
- struct LHS_assign v;
- suffixedexp(ls, &v.v);
- if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */
- v.prev = NULL;
- assignment(ls, &v, 1);
- }
- else { /* stat -> func */
- check_condition(ls, v.v.k == VCALL, "syntax error");
- SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */
- }
-}
-
-
-static void retstat (LexState *ls) {
- /* stat -> RETURN [explist] [';'] */
- FuncState *fs = ls->fs;
- expdesc e;
- int first, nret; /* registers with returned values */
- if (block_follow(ls, 1) || ls->t.token == ';')
- first = nret = 0; /* return no values */
- else {
- nret = explist(ls, &e); /* optional return values */
- if (hasmultret(e.k)) {
- luaK_setmultret(fs, &e);
- if (e.k == VCALL && nret == 1) { /* tail call? */
- SET_OPCODE(getcode(fs,&e), OP_TAILCALL);
- lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar);
- }
- first = fs->nactvar;
- nret = LUA_MULTRET; /* return all values */
- }
- else {
- if (nret == 1) /* only one single value? */
- first = luaK_exp2anyreg(fs, &e);
- else {
- luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */
- first = fs->nactvar; /* return all `active' values */
- lua_assert(nret == fs->freereg - first);
- }
- }
- }
- luaK_ret(fs, first, nret);
- testnext(ls, ';'); /* skip optional semicolon */
-}
-
-
-static void statement (LexState *ls) {
- int line = ls->linenumber; /* may be needed for error messages */
- enterlevel(ls);
- switch (ls->t.token) {
- case ';': { /* stat -> ';' (empty statement) */
- luaX_next(ls); /* skip ';' */
- break;
- }
- case TK_IF: { /* stat -> ifstat */
- ifstat(ls, line);
- break;
- }
- case TK_WHILE: { /* stat -> whilestat */
- whilestat(ls, line);
- break;
- }
- case TK_DO: { /* stat -> DO block END */
- luaX_next(ls); /* skip DO */
- block(ls);
- check_match(ls, TK_END, TK_DO, line);
- break;
- }
- case TK_FOR: { /* stat -> forstat */
- forstat(ls, line);
- break;
- }
- case TK_REPEAT: { /* stat -> repeatstat */
- repeatstat(ls, line);
- break;
- }
- case TK_FUNCTION: { /* stat -> funcstat */
- funcstat(ls, line);
- break;
- }
- case TK_LOCAL: { /* stat -> localstat */
- luaX_next(ls); /* skip LOCAL */
- if (testnext(ls, TK_FUNCTION)) /* local function? */
- localfunc(ls);
- else
- localstat(ls);
- break;
- }
- case TK_DBCOLON: { /* stat -> label */
- luaX_next(ls); /* skip double colon */
- labelstat(ls, str_checkname(ls), line);
- break;
- }
- case TK_RETURN: { /* stat -> retstat */
- luaX_next(ls); /* skip RETURN */
- retstat(ls);
- break;
- }
- case TK_BREAK: /* stat -> breakstat */
- case TK_GOTO: { /* stat -> 'goto' NAME */
- gotostat(ls, luaK_jump(ls->fs));
- break;
- }
- default: { /* stat -> func | assignment */
- exprstat(ls);
- break;
- }
- }
- lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg &&
- ls->fs->freereg >= ls->fs->nactvar);
- ls->fs->freereg = ls->fs->nactvar; /* free registers */
- leavelevel(ls);
-}
-
-/* }====================================================================== */
-
-
-/*
-** compiles the main function, which is a regular vararg function with an
-** upvalue named LUA_ENV
-*/
-static void mainfunc (LexState *ls, FuncState *fs) {
- BlockCnt bl;
- expdesc v;
- open_func(ls, fs, &bl);
- fs->f->is_vararg = 1; /* main function is always vararg */
- init_exp(&v, VLOCAL, 0); /* create and... */
- newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */
- luaX_next(ls); /* read first token */
- statlist(ls); /* parse main body */
- check(ls, TK_EOS);
- close_func(ls);
-}
-
-
-Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
- Dyndata *dyd, const char *name, int firstchar) {
- LexState lexstate;
- FuncState funcstate;
- Closure *cl = luaF_newLclosure(L, 1); /* create main closure */
- /* anchor closure (to avoid being collected) */
- setclLvalue(L, L->top, cl);
- incr_top(L);
- funcstate.f = cl->l.p = luaF_newproto(L);
- funcstate.f->source = luaS_new(L, name); /* create and anchor TString */
- lexstate.buff = buff;
- lexstate.dyd = dyd;
- dyd->actvar.n = dyd->gt.n = dyd->label.n = 0;
- luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar);
- mainfunc(&lexstate, &funcstate);
- lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs);
- /* all scopes should be correctly finished */
- lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0);
- return cl; /* it's on the stack too */
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h
@@ -1,228 +0,0 @@
-/*
-** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $
-** Global State
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lstate_h
-#define lstate_h
-
-#include "lua.h"
-
-#include "lobject.h"
-#include "ltm.h"
-#include "lzio.h"
-
-
-/*
-
-** Some notes about garbage-collected objects: All objects in Lua must
-** be kept somehow accessible until being freed.
-**
-** Lua keeps most objects linked in list g->allgc. The link uses field
-** 'next' of the CommonHeader.
-**
-** Strings are kept in several lists headed by the array g->strt.hash.
-**
-** Open upvalues are not subject to independent garbage collection. They
-** are collected together with their respective threads. Lua keeps a
-** double-linked list with all open upvalues (g->uvhead) so that it can
-** mark objects referred by them. (They are always gray, so they must
-** be remarked in the atomic step. Usually their contents would be marked
-** when traversing the respective threads, but the thread may already be
-** dead, while the upvalue is still accessible through closures.)
-**
-** Objects with finalizers are kept in the list g->finobj.
-**
-** The list g->tobefnz links all objects being finalized.
-
-*/
-
-
-struct lua_longjmp; /* defined in ldo.c */
-
-
-
-/* extra stack space to handle TM calls and some other extras */
-#define EXTRA_STACK 5
-
-
-#define BASIC_STACK_SIZE (2*LUA_MINSTACK)
-
-
-/* kinds of Garbage Collection */
-#define KGC_NORMAL 0
-#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */
-#define KGC_GEN 2 /* generational collection */
-
-
-typedef struct stringtable {
- GCObject **hash;
- lu_int32 nuse; /* number of elements */
- int size;
-} stringtable;
-
-
-/*
-** information about a call
-*/
-typedef struct CallInfo {
- StkId func; /* function index in the stack */
- StkId top; /* top for this function */
- struct CallInfo *previous, *next; /* dynamic call link */
- short nresults; /* expected number of results from this function */
- lu_byte callstatus;
- ptrdiff_t extra;
- union {
- struct { /* only for Lua functions */
- StkId base; /* base for this function */
- const Instruction *savedpc;
- } l;
- struct { /* only for C functions */
- int ctx; /* context info. in case of yields */
- lua_CFunction k; /* continuation in case of yields */
- ptrdiff_t old_errfunc;
- lu_byte old_allowhook;
- lu_byte status;
- } c;
- } u;
-} CallInfo;
-
-
-/*
-** Bits in CallInfo status
-*/
-#define CIST_LUA (1<<0) /* call is running a Lua function */
-#define CIST_HOOKED (1<<1) /* call is running a debug hook */
-#define CIST_REENTRY (1<<2) /* call is running on same invocation of
- luaV_execute of previous call */
-#define CIST_YIELDED (1<<3) /* call reentered after suspension */
-#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */
-#define CIST_STAT (1<<5) /* call has an error status (pcall) */
-#define CIST_TAIL (1<<6) /* call was tail called */
-#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */
-
-
-#define isLua(ci) ((ci)->callstatus & CIST_LUA)
-
-
-/*
-** `global state', shared by all threads of this state
-*/
-typedef struct global_State {
- lua_Alloc frealloc; /* function to reallocate memory */
- void *ud; /* auxiliary data to `frealloc' */
- lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */
- l_mem GCdebt; /* bytes allocated not yet compensated by the collector */
- lu_mem GCmemtrav; /* memory traversed by the GC */
- lu_mem GCestimate; /* an estimate of the non-garbage memory in use */
- stringtable strt; /* hash table for strings */
- TValue l_registry;
- unsigned int seed; /* randomized seed for hashes */
- lu_byte currentwhite;
- lu_byte gcstate; /* state of garbage collector */
- lu_byte gckind; /* kind of GC running */
- lu_byte gcrunning; /* true if GC is running */
- int sweepstrgc; /* position of sweep in `strt' */
- GCObject *allgc; /* list of all collectable objects */
- GCObject *finobj; /* list of collectable objects with finalizers */
- GCObject **sweepgc; /* current position of sweep in list 'allgc' */
- GCObject **sweepfin; /* current position of sweep in list 'finobj' */
- GCObject *gray; /* list of gray objects */
- GCObject *grayagain; /* list of objects to be traversed atomically */
- GCObject *weak; /* list of tables with weak values */
- GCObject *ephemeron; /* list of ephemeron tables (weak keys) */
- GCObject *allweak; /* list of all-weak tables */
- GCObject *tobefnz; /* list of userdata to be GC */
- UpVal uvhead; /* head of double-linked list of all open upvalues */
- Mbuffer buff; /* temporary buffer for string concatenation */
- int gcpause; /* size of pause between successive GCs */
- int gcmajorinc; /* pause between major collections (only in gen. mode) */
- int gcstepmul; /* GC `granularity' */
- lua_CFunction panic; /* to be called in unprotected errors */
- struct lua_State *mainthread;
- const lua_Number *version; /* pointer to version number */
- TString *memerrmsg; /* memory-error message */
- TString *tmname[TM_N]; /* array with tag-method names */
- struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */
-} global_State;
-
-
-/*
-** `per thread' state
-*/
-struct lua_State {
- CommonHeader;
- lu_byte status;
- StkId top; /* first free slot in the stack */
- global_State *l_G;
- CallInfo *ci; /* call info for current function */
- const Instruction *oldpc; /* last pc traced */
- StkId stack_last; /* last free slot in the stack */
- StkId stack; /* stack base */
- int stacksize;
- unsigned short nny; /* number of non-yieldable calls in stack */
- unsigned short nCcalls; /* number of nested C calls */
- lu_byte hookmask;
- lu_byte allowhook;
- int basehookcount;
- int hookcount;
- lua_Hook hook;
- GCObject *openupval; /* list of open upvalues in this stack */
- GCObject *gclist;
- struct lua_longjmp *errorJmp; /* current error recover point */
- ptrdiff_t errfunc; /* current error handling function (stack index) */
- CallInfo base_ci; /* CallInfo for first level (C calling Lua) */
-};
-
-
-#define G(L) (L->l_G)
-
-
-/*
-** Union of all collectable objects
-*/
-union GCObject {
- GCheader gch; /* common header */
- union TString ts;
- union Udata u;
- union Closure cl;
- struct Table h;
- struct Proto p;
- struct UpVal uv;
- struct lua_State th; /* thread */
-};
-
-
-#define gch(o) (&(o)->gch)
-
-/* macros to convert a GCObject into a specific value */
-#define rawgco2ts(o) \
- check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts))
-#define gco2ts(o) (&rawgco2ts(o)->tsv)
-#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u))
-#define gco2u(o) (&rawgco2u(o)->uv)
-#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l))
-#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c))
-#define gco2cl(o) \
- check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl))
-#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h))
-#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p))
-#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv))
-#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th))
-
-/* macro to convert any Lua object into a GCObject */
-#define obj2gco(v) (cast(GCObject *, (v)))
-
-
-/* actual number of total bytes allocated */
-#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt)
-
-LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt);
-LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1);
-LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L);
-LUAI_FUNC void luaE_freeCI (lua_State *L);
-
-
-#endif
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c
@@ -1,321 +0,0 @@
-/*
-** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $
-** Global State
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define lstate_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lapi.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lgc.h"
-#include "llex.h"
-#include "lmem.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "ltm.h"
-
-
-#if !defined(LUAI_GCPAUSE)
-#define LUAI_GCPAUSE 200 /* 200% */
-#endif
-
-#if !defined(LUAI_GCMAJOR)
-#define LUAI_GCMAJOR 200 /* 200% */
-#endif
-
-#if !defined(LUAI_GCMUL)
-#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */
-#endif
-
-
-#define MEMERRMSG "not enough memory"
-
-
-/*
-** a macro to help the creation of a unique random seed when a state is
-** created; the seed is used to randomize hashes.
-*/
-#if !defined(luai_makeseed)
-#define luai_makeseed() cast(unsigned int, gethrtime())
-#endif
-
-
-
-/*
-** thread state + extra space
-*/
-typedef struct LX {
-#if defined(LUAI_EXTRASPACE)
- char buff[LUAI_EXTRASPACE];
-#endif
- lua_State l;
-} LX;
-
-
-/*
-** Main thread combines a thread state and the global state
-*/
-typedef struct LG {
- LX l;
- global_State g;
-} LG;
-
-
-
-#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l)))
-
-
-/*
-** Compute an initial seed as random as possible. In ANSI, rely on
-** Address Space Layout Randomization (if present) to increase
-** randomness..
-*/
-#define addbuff(b,p,e) \
- { size_t t = cast(size_t, e); \
- memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); }
-
-static unsigned int makeseed (lua_State *L) {
- char buff[4 * sizeof(size_t)];
- unsigned int h = luai_makeseed();
- int p = 0;
- addbuff(buff, p, L); /* heap variable */
- addbuff(buff, p, &h); /* local variable */
- addbuff(buff, p, luaO_nilobject); /* global variable */
- addbuff(buff, p, &lua_newstate); /* public function */
- lua_assert(p == sizeof(buff));
- return luaS_hash(buff, p, h);
-}
-
-
-/*
-** set GCdebt to a new value keeping the value (totalbytes + GCdebt)
-** invariant
-*/
-void luaE_setdebt (global_State *g, l_mem debt) {
- g->totalbytes -= (debt - g->GCdebt);
- g->GCdebt = debt;
-}
-
-
-CallInfo *luaE_extendCI (lua_State *L) {
- CallInfo *ci = luaM_new(L, CallInfo);
- lua_assert(L->ci->next == NULL);
- L->ci->next = ci;
- ci->previous = L->ci;
- ci->next = NULL;
- return ci;
-}
-
-
-void luaE_freeCI (lua_State *L) {
- CallInfo *ci = L->ci;
- CallInfo *next = ci->next;
- ci->next = NULL;
- while ((ci = next) != NULL) {
- next = ci->next;
- luaM_free(L, ci);
- }
-}
-
-
-static void stack_init (lua_State *L1, lua_State *L) {
- int i; CallInfo *ci;
- /* initialize stack array */
- L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue);
- L1->stacksize = BASIC_STACK_SIZE;
- for (i = 0; i < BASIC_STACK_SIZE; i++)
- setnilvalue(L1->stack + i); /* erase new stack */
- L1->top = L1->stack;
- L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK;
- /* initialize first ci */
- ci = &L1->base_ci;
- ci->next = ci->previous = NULL;
- ci->callstatus = 0;
- ci->func = L1->top;
- setnilvalue(L1->top++); /* 'function' entry for this 'ci' */
- ci->top = L1->top + LUA_MINSTACK;
- L1->ci = ci;
-}
-
-
-static void freestack (lua_State *L) {
- if (L->stack == NULL)
- return; /* stack not completely built yet */
- L->ci = &L->base_ci; /* free the entire 'ci' list */
- luaE_freeCI(L);
- luaM_freearray(L, L->stack, L->stacksize); /* free stack array */
-}
-
-
-/*
-** Create registry table and its predefined values
-*/
-static void init_registry (lua_State *L, global_State *g) {
- TValue mt;
- /* create registry */
- Table *registry = luaH_new(L);
- sethvalue(L, &g->l_registry, registry);
- luaH_resize(L, registry, LUA_RIDX_LAST, 0);
- /* registry[LUA_RIDX_MAINTHREAD] = L */
- setthvalue(L, &mt, L);
- luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt);
- /* registry[LUA_RIDX_GLOBALS] = table of globals */
- sethvalue(L, &mt, luaH_new(L));
- luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt);
-}
-
-
-/*
-** open parts of the state that may cause memory-allocation errors
-*/
-static void f_luaopen (lua_State *L, void *ud) {
- global_State *g = G(L);
- UNUSED(ud);
- stack_init(L, L); /* init stack */
- init_registry(L, g);
- luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */
- luaT_init(L);
- luaX_init(L);
- /* pre-create memory-error message */
- g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
- luaS_fix(g->memerrmsg); /* it should never be collected */
- g->gcrunning = 1; /* allow gc */
- g->version = lua_version(NULL);
- luai_userstateopen(L);
-}
-
-
-/*
-** preinitialize a state with consistent values without allocating
-** any memory (to avoid errors)
-*/
-static void preinit_state (lua_State *L, global_State *g) {
- G(L) = g;
- L->stack = NULL;
- L->ci = NULL;
- L->stacksize = 0;
- L->errorJmp = NULL;
- L->nCcalls = 0;
- L->hook = NULL;
- L->hookmask = 0;
- L->basehookcount = 0;
- L->allowhook = 1;
- resethookcount(L);
- L->openupval = NULL;
- L->nny = 1;
- L->status = LUA_OK;
- L->errfunc = 0;
-}
-
-
-static void close_state (lua_State *L) {
- global_State *g = G(L);
- luaF_close(L, L->stack); /* close all upvalues for this thread */
- luaC_freeallobjects(L); /* collect all objects */
- if (g->version) /* closing a fully built state? */
- luai_userstateclose(L);
- luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size);
- luaZ_freebuffer(L, &g->buff);
- freestack(L);
- lua_assert(gettotalbytes(g) == sizeof(LG));
- (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */
-}
-
-
-LUA_API lua_State *lua_newthread (lua_State *L) {
- lua_State *L1;
- lua_lock(L);
- luaC_checkGC(L);
- L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th;
- setthvalue(L, L->top, L1);
- api_incr_top(L);
- preinit_state(L1, G(L));
- L1->hookmask = L->hookmask;
- L1->basehookcount = L->basehookcount;
- L1->hook = L->hook;
- resethookcount(L1);
- luai_userstatethread(L, L1);
- stack_init(L1, L); /* init stack */
- lua_unlock(L);
- return L1;
-}
-
-
-void luaE_freethread (lua_State *L, lua_State *L1) {
- LX *l = fromstate(L1);
- luaF_close(L1, L1->stack); /* close all upvalues for this thread */
- lua_assert(L1->openupval == NULL);
- luai_userstatefree(L, L1);
- freestack(L1);
- luaM_free(L, l);
-}
-
-
-LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
- int i;
- lua_State *L;
- global_State *g;
- LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG)));
- if (l == NULL) return NULL;
- L = &l->l.l;
- g = &l->g;
- L->next = NULL;
- L->tt = LUA_TTHREAD;
- g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT);
- L->marked = luaC_white(g);
- g->gckind = KGC_NORMAL;
- preinit_state(L, g);
- g->frealloc = f;
- g->ud = ud;
- g->mainthread = L;
- g->seed = makeseed(L);
- g->uvhead.u.l.prev = &g->uvhead;
- g->uvhead.u.l.next = &g->uvhead;
- g->gcrunning = 0; /* no GC while building state */
- g->GCestimate = 0;
- g->strt.size = 0;
- g->strt.nuse = 0;
- g->strt.hash = NULL;
- setnilvalue(&g->l_registry);
- luaZ_initbuffer(L, &g->buff);
- g->panic = NULL;
- g->version = NULL;
- g->gcstate = GCSpause;
- g->allgc = NULL;
- g->finobj = NULL;
- g->tobefnz = NULL;
- g->sweepgc = g->sweepfin = NULL;
- g->gray = g->grayagain = NULL;
- g->weak = g->ephemeron = g->allweak = NULL;
- g->totalbytes = sizeof(LG);
- g->GCdebt = 0;
- g->gcpause = LUAI_GCPAUSE;
- g->gcmajorinc = LUAI_GCMAJOR;
- g->gcstepmul = LUAI_GCMUL;
- for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL;
- if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) {
- /* memory allocation error: free partial state */
- close_state(L);
- L = NULL;
- }
- return L;
-}
-
-
-LUA_API void lua_close (lua_State *L) {
- L = G(L)->mainthread; /* only the main thread can be closed */
- lua_lock(L);
- close_state(L);
-}
-
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h
@@ -1,46 +0,0 @@
-/*
-** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
-** String table (keep all strings handled by Lua)
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lstring_h
-#define lstring_h
-
-#include "lgc.h"
-#include "lobject.h"
-#include "lstate.h"
-
-
-#define sizestring(s) (sizeof(union TString)+((s)->len+1)*sizeof(char))
-
-#define sizeudata(u) (sizeof(union Udata)+(u)->len)
-
-#define luaS_newliteral(L, s) (luaS_newlstr(L, "" s, \
- (sizeof(s)/sizeof(char))-1))
-
-#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT)
-
-
-/*
-** test whether a string is a reserved word
-*/
-#define isreserved(s) ((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0)
-
-
-/*
-** equality for short strings, which are always internalized
-*/
-#define eqshrstr(a,b) check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b))
-
-
-LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed);
-LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b);
-LUAI_FUNC int luaS_eqstr (TString *a, TString *b);
-LUAI_FUNC void luaS_resize (lua_State *L, int newsize);
-LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e);
-LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l);
-LUAI_FUNC TString *luaS_new (lua_State *L, const char *str);
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c
@@ -1,185 +0,0 @@
-/*
-** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $
-** String table (keeps all strings handled by Lua)
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define lstring_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lmem.h"
-#include "lobject.h"
-#include "lstate.h"
-#include "lstring.h"
-
-
-/*
-** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to
-** compute its hash
-*/
-#if !defined(LUAI_HASHLIMIT)
-#define LUAI_HASHLIMIT 5
-#endif
-
-
-/*
-** equality for long strings
-*/
-int luaS_eqlngstr (TString *a, TString *b) {
- size_t len = a->tsv.len;
- lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR);
- return (a == b) || /* same instance or... */
- ((len == b->tsv.len) && /* equal length and ... */
- (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */
-}
-
-
-/*
-** equality for strings
-*/
-int luaS_eqstr (TString *a, TString *b) {
- return (a->tsv.tt == b->tsv.tt) &&
- (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b));
-}
-
-
-unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
- unsigned int h = seed ^ cast(unsigned int, l);
- size_t l1;
- size_t step = (l >> LUAI_HASHLIMIT) + 1;
- for (l1 = l; l1 >= step; l1 -= step)
- h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1]));
- return h;
-}
-
-
-/*
-** resizes the string table
-*/
-void luaS_resize (lua_State *L, int newsize) {
- int i;
- stringtable *tb = &G(L)->strt;
- /* cannot resize while GC is traversing strings */
- luaC_runtilstate(L, ~bitmask(GCSsweepstring));
- if (newsize > tb->size) {
- luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
- for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL;
- }
- /* rehash */
- for (i=0; i<tb->size; i++) {
- GCObject *p = tb->hash[i];
- tb->hash[i] = NULL;
- while (p) { /* for each node in the list */
- GCObject *next = gch(p)->next; /* save next */
- unsigned int h = lmod(gco2ts(p)->hash, newsize); /* new position */
- gch(p)->next = tb->hash[h]; /* chain it */
- tb->hash[h] = p;
- resetoldbit(p); /* see MOVE OLD rule */
- p = next;
- }
- }
- if (newsize < tb->size) {
- /* shrinking slice must be empty */
- lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
- luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
- }
- tb->size = newsize;
-}
-
-
-/*
-** creates a new string object
-*/
-static TString *createstrobj (lua_State *L, const char *str, size_t l,
- int tag, unsigned int h, GCObject **list) {
- TString *ts;
- size_t totalsize; /* total size of TString object */
- totalsize = sizeof(TString) + ((l + 1) * sizeof(char));
- ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts;
- ts->tsv.len = l;
- ts->tsv.hash = h;
- ts->tsv.extra = 0;
- memcpy(ts+1, str, l*sizeof(char));
- ((char *)(ts+1))[l] = '\0'; /* ending 0 */
- return ts;
-}
-
-
-/*
-** creates a new short string, inserting it into string table
-*/
-static TString *newshrstr (lua_State *L, const char *str, size_t l,
- unsigned int h) {
- GCObject **list; /* (pointer to) list where it will be inserted */
- stringtable *tb = &G(L)->strt;
- TString *s;
- if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2)
- luaS_resize(L, tb->size*2); /* too crowded */
- list = &tb->hash[lmod(h, tb->size)];
- s = createstrobj(L, str, l, LUA_TSHRSTR, h, list);
- tb->nuse++;
- return s;
-}
-
-
-/*
-** checks whether short string exists and reuses it or creates a new one
-*/
-static TString *internshrstr (lua_State *L, const char *str, size_t l) {
- GCObject *o;
- global_State *g = G(L);
- unsigned int h = luaS_hash(str, l, g->seed);
- for (o = g->strt.hash[lmod(h, g->strt.size)];
- o != NULL;
- o = gch(o)->next) {
- TString *ts = rawgco2ts(o);
- if (h == ts->tsv.hash &&
- l == ts->tsv.len &&
- (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
- if (isdead(G(L), o)) /* string is dead (but was not collected yet)? */
- changewhite(o); /* resurrect it */
- return ts;
- }
- }
- return newshrstr(L, str, l, h); /* not found; create a new string */
-}
-
-
-/*
-** new string (with explicit length)
-*/
-TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
- if (l <= LUAI_MAXSHORTLEN) /* short string? */
- return internshrstr(L, str, l);
- else {
- if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char))
- luaM_toobig(L);
- return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL);
- }
-}
-
-
-/*
-** new zero-terminated string
-*/
-TString *luaS_new (lua_State *L, const char *str) {
- return luaS_newlstr(L, str, strlen(str));
-}
-
-
-Udata *luaS_newudata (lua_State *L, size_t s, Table *e) {
- Udata *u;
- if (s > MAX_SIZET - sizeof(Udata))
- luaM_toobig(L);
- u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u;
- u->uv.len = s;
- u->uv.metatable = NULL;
- u->uv.env = e;
- return u;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c
@@ -1,1050 +0,0 @@
-/*
-** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $
-** Standard library for string operations and pattern-matching
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/ctype.h>
-#include <sys/zfs_context.h>
-
-#define lstrlib_c
-#define LUA_LIB
-
-#include "lua.h"
-
-#include "lauxlib.h"
-#include "lualib.h"
-
-
-/*
-** maximum number of captures that a pattern can do during
-** pattern-matching. This limit is arbitrary.
-*/
-#if !defined(LUA_MAXCAPTURES)
-#define LUA_MAXCAPTURES 32
-#endif
-
-
-/* macro to `unsign' a character */
-#define uchar(c) ((unsigned char)(c))
-
-/*
- * PATCHED: add missing character macros.
- */
-#ifdef illumos
-#define tolower(C) (((C) >= 'A' && (C) <= 'Z') ? (C) - 'A' + 'a' : (C))
-#define toupper(C) (((C) >= 'a' && (C) <= 'z') ? (C) - 'a' + 'A': (C))
-#define iscntrl(C) ((((C) >= 0) && ((C) <= 0x1f)) || ((C) == 0x7f))
-#else
-#define isalnum(C) (isalpha(C) || isdigit(C))
-#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f)
-#endif
-#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E)
-#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \
- ((C) >= 0x3A && (C) <= 0x40) || \
- ((C) >= 0x5B && (C) <= 0x60) || \
- ((C) >= 0x7B && (C) <= 0x7E))
-
-/*
- * The provided version of sprintf returns a char *, but str_format expects
- * it to return the number of characters printed. This version has the expected
- * behavior.
- */
-static size_t str_sprintf(char *buf, const char *fmt, ...) {
- va_list args;
- size_t len;
-
- va_start(args, fmt);
- len = vsnprintf(buf, INT_MAX, fmt, args);
- va_end(args);
-
- return len;
-}
-
-
-static int str_len (lua_State *L) {
- size_t l;
- luaL_checklstring(L, 1, &l);
- lua_pushinteger(L, (lua_Integer)l);
- return 1;
-}
-
-
-/* translate a relative string position: negative means back from end */
-static size_t posrelat (ptrdiff_t pos, size_t len) {
- if (pos >= 0) return (size_t)pos;
- else if (0u - (size_t)pos > len) return 0;
- else return len - ((size_t)-pos) + 1;
-}
-
-
-static int str_sub (lua_State *L) {
- size_t l;
- const char *s = luaL_checklstring(L, 1, &l);
- size_t start = posrelat(luaL_checkinteger(L, 2), l);
- size_t end = posrelat(luaL_optinteger(L, 3, -1), l);
- if (start < 1) start = 1;
- if (end > l) end = l;
- if (start <= end)
- lua_pushlstring(L, s + start - 1, end - start + 1);
- else lua_pushliteral(L, "");
- return 1;
-}
-
-
-static int str_reverse (lua_State *L) {
- size_t l, i;
- luaL_Buffer b;
- const char *s = luaL_checklstring(L, 1, &l);
- char *p = luaL_buffinitsize(L, &b, l);
- for (i = 0; i < l; i++)
- p[i] = s[l - i - 1];
- luaL_pushresultsize(&b, l);
- return 1;
-}
-
-
-static int str_lower (lua_State *L) {
- size_t l;
- size_t i;
- luaL_Buffer b;
- const char *s = luaL_checklstring(L, 1, &l);
- char *p = luaL_buffinitsize(L, &b, l);
- for (i=0; i<l; i++)
- p[i] = tolower(uchar(s[i]));
- luaL_pushresultsize(&b, l);
- return 1;
-}
-
-
-static int str_upper (lua_State *L) {
- size_t l;
- size_t i;
- luaL_Buffer b;
- const char *s = luaL_checklstring(L, 1, &l);
- char *p = luaL_buffinitsize(L, &b, l);
- for (i=0; i<l; i++)
- p[i] = toupper(uchar(s[i]));
- luaL_pushresultsize(&b, l);
- return 1;
-}
-
-
-/* reasonable limit to avoid arithmetic overflow */
-#define MAXSIZE ((~(size_t)0) >> 1)
-
-static int str_rep (lua_State *L) {
- size_t l, lsep;
- const char *s = luaL_checklstring(L, 1, &l);
- int n = luaL_checkint(L, 2);
- const char *sep = luaL_optlstring(L, 3, "", &lsep);
- if (n <= 0) lua_pushliteral(L, "");
- else if (l + lsep < l || l + lsep >= MAXSIZE / n) /* may overflow? */
- return luaL_error(L, "resulting string too large");
- else {
- size_t totallen = n * l + (n - 1) * lsep;
- luaL_Buffer b;
- char *p = luaL_buffinitsize(L, &b, totallen);
- while (n-- > 1) { /* first n-1 copies (followed by separator) */
- memcpy(p, s, l * sizeof(char)); p += l;
- if (lsep > 0) { /* avoid empty 'memcpy' (may be expensive) */
- memcpy(p, sep, lsep * sizeof(char)); p += lsep;
- }
- }
- memcpy(p, s, l * sizeof(char)); /* last copy (not followed by separator) */
- luaL_pushresultsize(&b, totallen);
- }
- return 1;
-}
-
-
-static int str_byte (lua_State *L) {
- size_t l;
- const char *s = luaL_checklstring(L, 1, &l);
- size_t posi = posrelat(luaL_optinteger(L, 2, 1), l);
- size_t pose = posrelat(luaL_optinteger(L, 3, posi), l);
- int n, i;
- if (posi < 1) posi = 1;
- if (pose > l) pose = l;
- if (posi > pose) return 0; /* empty interval; return no values */
- n = (int)(pose - posi + 1);
- if (posi + n <= pose) /* (size_t -> int) overflow? */
- return luaL_error(L, "string slice too long");
- luaL_checkstack(L, n, "string slice too long");
- for (i=0; i<n; i++)
- lua_pushinteger(L, uchar(s[posi+i-1]));
- return n;
-}
-
-
-static int str_char (lua_State *L) {
- int n = lua_gettop(L); /* number of arguments */
- int i;
- luaL_Buffer b;
- char *p = luaL_buffinitsize(L, &b, n);
- for (i=1; i<=n; i++) {
- int c = luaL_checkint(L, i);
- luaL_argcheck(L, uchar(c) == c, i, "value out of range");
- p[i - 1] = uchar(c);
- }
- luaL_pushresultsize(&b, n);
- return 1;
-}
-
-
-static int writer (lua_State *L, const void* b, size_t size, void* B) {
- (void)L;
- luaL_addlstring((luaL_Buffer*) B, (const char *)b, size);
- return 0;
-}
-
-
-static int str_dump (lua_State *L) {
- luaL_Buffer b;
- luaL_checktype(L, 1, LUA_TFUNCTION);
- lua_settop(L, 1);
- luaL_buffinit(L,&b);
- if (lua_dump(L, writer, &b) != 0)
- return luaL_error(L, "unable to dump given function");
- luaL_pushresult(&b);
- return 1;
-}
-
-
-
-/*
-** {======================================================
-** PATTERN MATCHING
-** =======================================================
-*/
-
-
-#define CAP_UNFINISHED (-1)
-#define CAP_POSITION (-2)
-
-
-typedef struct MatchState {
- int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
- const char *src_init; /* init of source string */
- const char *src_end; /* end ('\0') of source string */
- const char *p_end; /* end ('\0') of pattern */
- lua_State *L;
- int level; /* total number of captures (finished or unfinished) */
- struct {
- const char *init;
- ptrdiff_t len;
- } capture[LUA_MAXCAPTURES];
-} MatchState;
-
-
-/* recursive function */
-static const char *match (MatchState *ms, const char *s, const char *p);
-
-
-/* maximum recursion depth for 'match' */
-#if !defined(MAXCCALLS)
-#define MAXCCALLS 200
-#endif
-
-
-#define L_ESC '%'
-#define SPECIALS "^$*+?.([%-"
-
-
-static int check_capture (MatchState *ms, int l) {
- l -= '1';
- if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
- return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
- return l;
-}
-
-
-static int capture_to_close (MatchState *ms) {
- int level = ms->level;
- for (level--; level>=0; level--)
- if (ms->capture[level].len == CAP_UNFINISHED) return level;
- return luaL_error(ms->L, "invalid pattern capture");
-}
-
-
-static const char *classend (MatchState *ms, const char *p) {
- switch (*p++) {
- case L_ESC: {
- if (p == ms->p_end)
- luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
- return p+1;
- }
- case '[': {
- if (*p == '^') p++;
- do { /* look for a `]' */
- if (p == ms->p_end)
- luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
- if (*(p++) == L_ESC && p < ms->p_end)
- p++; /* skip escapes (e.g. `%]') */
- } while (*p != ']');
- return p+1;
- }
- default: {
- return p;
- }
- }
-}
-
-
-static int match_class (int c, int cl) {
- int res;
- switch (tolower(cl)) {
- case 'a' : res = isalpha(c); break;
- case 'c' : res = iscntrl(c); break;
- case 'd' : res = isdigit(c); break;
- case 'g' : res = isgraph(c); break;
- case 'l' : res = islower(c); break;
- case 'p' : res = ispunct(c); break;
- case 's' : res = isspace(c); break;
- case 'u' : res = isupper(c); break;
- case 'w' : res = isalnum(c); break;
- case 'x' : res = isxdigit(c); break;
- case 'z' : res = (c == 0); break; /* deprecated option */
- default: return (cl == c);
- }
- return (islower(cl) ? res : !res);
-}
-
-
-static int matchbracketclass (int c, const char *p, const char *ec) {
- int sig = 1;
- if (*(p+1) == '^') {
- sig = 0;
- p++; /* skip the `^' */
- }
- while (++p < ec) {
- if (*p == L_ESC) {
- p++;
- if (match_class(c, uchar(*p)))
- return sig;
- }
- else if ((*(p+1) == '-') && (p+2 < ec)) {
- p+=2;
- if (uchar(*(p-2)) <= c && c <= uchar(*p))
- return sig;
- }
- else if (uchar(*p) == c) return sig;
- }
- return !sig;
-}
-
-
-static int singlematch (MatchState *ms, const char *s, const char *p,
- const char *ep) {
- if (s >= ms->src_end)
- return 0;
- else {
- int c = uchar(*s);
- switch (*p) {
- case '.': return 1; /* matches any char */
- case L_ESC: return match_class(c, uchar(*(p+1)));
- case '[': return matchbracketclass(c, p, ep-1);
- default: return (uchar(*p) == c);
- }
- }
-}
-
-
-static const char *matchbalance (MatchState *ms, const char *s,
- const char *p) {
- if (p >= ms->p_end - 1)
- luaL_error(ms->L, "malformed pattern "
- "(missing arguments to " LUA_QL("%%b") ")");
- if (*s != *p) return NULL;
- else {
- int b = *p;
- int e = *(p+1);
- int cont = 1;
- while (++s < ms->src_end) {
- if (*s == e) {
- if (--cont == 0) return s+1;
- }
- else if (*s == b) cont++;
- }
- }
- return NULL; /* string ends out of balance */
-}
-
-
-static const char *max_expand (MatchState *ms, const char *s,
- const char *p, const char *ep) {
- ptrdiff_t i = 0; /* counts maximum expand for item */
- while (singlematch(ms, s + i, p, ep))
- i++;
- /* keeps trying to match with the maximum repetitions */
- while (i>=0) {
- const char *res = match(ms, (s+i), ep+1);
- if (res) return res;
- i--; /* else didn't match; reduce 1 repetition to try again */
- }
- return NULL;
-}
-
-
-static const char *min_expand (MatchState *ms, const char *s,
- const char *p, const char *ep) {
- for (;;) {
- const char *res = match(ms, s, ep+1);
- if (res != NULL)
- return res;
- else if (singlematch(ms, s, p, ep))
- s++; /* try with one more repetition */
- else return NULL;
- }
-}
-
-
-static const char *start_capture (MatchState *ms, const char *s,
- const char *p, int what) {
- const char *res;
- int level = ms->level;
- if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
- ms->capture[level].init = s;
- ms->capture[level].len = what;
- ms->level = level+1;
- if ((res=match(ms, s, p)) == NULL) /* match failed? */
- ms->level--; /* undo capture */
- return res;
-}
-
-
-static const char *end_capture (MatchState *ms, const char *s,
- const char *p) {
- int l = capture_to_close(ms);
- const char *res;
- ms->capture[l].len = s - ms->capture[l].init; /* close capture */
- if ((res = match(ms, s, p)) == NULL) /* match failed? */
- ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
- return res;
-}
-
-
-static const char *match_capture (MatchState *ms, const char *s, int l) {
- size_t len;
- l = check_capture(ms, l);
- len = ms->capture[l].len;
- if ((size_t)(ms->src_end-s) >= len &&
- memcmp(ms->capture[l].init, s, len) == 0)
- return s+len;
- else return NULL;
-}
-
-
-static const char *match (MatchState *ms, const char *s, const char *p) {
- if (ms->matchdepth-- == 0)
- luaL_error(ms->L, "pattern too complex");
- init: /* using goto's to optimize tail recursion */
- if (p != ms->p_end) { /* end of pattern? */
- switch (*p) {
- case '(': { /* start capture */
- if (*(p + 1) == ')') /* position capture? */
- s = start_capture(ms, s, p + 2, CAP_POSITION);
- else
- s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
- break;
- }
- case ')': { /* end capture */
- s = end_capture(ms, s, p + 1);
- break;
- }
- case '$': {
- if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
- goto dflt; /* no; go to default */
- s = (s == ms->src_end) ? s : NULL; /* check end of string */
- break;
- }
- case L_ESC: { /* escaped sequences not in the format class[*+?-]? */
- switch (*(p + 1)) {
- case 'b': { /* balanced string? */
- s = matchbalance(ms, s, p + 2);
- if (s != NULL) {
- p += 4; goto init; /* return match(ms, s, p + 4); */
- } /* else fail (s == NULL) */
- break;
- }
- case 'f': { /* frontier? */
- const char *ep; char previous;
- p += 2;
- if (*p != '[')
- luaL_error(ms->L, "missing " LUA_QL("[") " after "
- LUA_QL("%%f") " in pattern");
- ep = classend(ms, p); /* points to what is next */
- previous = (s == ms->src_init) ? '\0' : *(s - 1);
- if (!matchbracketclass(uchar(previous), p, ep - 1) &&
- matchbracketclass(uchar(*s), p, ep - 1)) {
- p = ep; goto init; /* return match(ms, s, ep); */
- }
- s = NULL; /* match failed */
- break;
- }
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- case '8': case '9': { /* capture results (%0-%9)? */
- s = match_capture(ms, s, uchar(*(p + 1)));
- if (s != NULL) {
- p += 2; goto init; /* return match(ms, s, p + 2) */
- }
- break;
- }
- default: goto dflt;
- }
- break;
- }
- default: dflt: { /* pattern class plus optional suffix */
- const char *ep = classend(ms, p); /* points to optional suffix */
- /* does not match at least once? */
- if (!singlematch(ms, s, p, ep)) {
- if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
- p = ep + 1; goto init; /* return match(ms, s, ep + 1); */
- }
- else /* '+' or no suffix */
- s = NULL; /* fail */
- }
- else { /* matched once */
- switch (*ep) { /* handle optional suffix */
- case '?': { /* optional */
- const char *res;
- if ((res = match(ms, s + 1, ep + 1)) != NULL)
- s = res;
- else {
- p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */
- }
- break;
- }
- case '+': /* 1 or more repetitions */
- s++; /* 1 match already done */
- /* FALLTHROUGH */
- case '*': /* 0 or more repetitions */
- s = max_expand(ms, s, p, ep);
- break;
- case '-': /* 0 or more repetitions (minimum) */
- s = min_expand(ms, s, p, ep);
- break;
- default: /* no suffix */
- s++; p = ep; goto init; /* return match(ms, s + 1, ep); */
- }
- }
- break;
- }
- }
- }
- ms->matchdepth++;
- return s;
-}
-
-
-
-static const char *lmemfind (const char *s1, size_t l1,
- const char *s2, size_t l2) {
- if (l2 == 0) return s1; /* empty strings are everywhere */
- else if (l2 > l1) return NULL; /* avoids a negative `l1' */
- else {
- const char *init; /* to search for a `*s2' inside `s1' */
- l2--; /* 1st char will be checked by `memchr' */
- l1 = l1-l2; /* `s2' cannot be found after that */
- while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
- init++; /* 1st char is already checked */
- if (memcmp(init, s2+1, l2) == 0)
- return init-1;
- else { /* correct `l1' and `s1' to try again */
- l1 -= init-s1;
- s1 = init;
- }
- }
- return NULL; /* not found */
- }
-}
-
-
-static void push_onecapture (MatchState *ms, int i, const char *s,
- const char *e) {
- if (i >= ms->level) {
- if (i == 0) /* ms->level == 0, too */
- lua_pushlstring(ms->L, s, e - s); /* add whole match */
- else
- luaL_error(ms->L, "invalid capture index");
- }
- else {
- ptrdiff_t l = ms->capture[i].len;
- if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
- if (l == CAP_POSITION)
- lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1);
- else
- lua_pushlstring(ms->L, ms->capture[i].init, l);
- }
-}
-
-
-static int push_captures (MatchState *ms, const char *s, const char *e) {
- int i;
- int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
- luaL_checkstack(ms->L, nlevels, "too many captures");
- for (i = 0; i < nlevels; i++)
- push_onecapture(ms, i, s, e);
- return nlevels; /* number of strings pushed */
-}
-
-
-/* check whether pattern has no special characters */
-static int nospecials (const char *p, size_t l) {
- size_t upto = 0;
- do {
- if (strpbrk(p + upto, SPECIALS))
- return 0; /* pattern has a special character */
- upto += strlen(p + upto) + 1; /* may have more after \0 */
- } while (upto <= l);
- return 1; /* no special chars found */
-}
-
-
-static int str_find_aux (lua_State *L, int find) {
- size_t ls, lp;
- const char *s = luaL_checklstring(L, 1, &ls);
- const char *p = luaL_checklstring(L, 2, &lp);
- size_t init = posrelat(luaL_optinteger(L, 3, 1), ls);
- if (init < 1) init = 1;
- else if (init > ls + 1) { /* start after string's end? */
- lua_pushnil(L); /* cannot find anything */
- return 1;
- }
- /* explicit request or no special characters? */
- if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) {
- /* do a plain search */
- const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp);
- if (s2) {
- lua_pushinteger(L, s2 - s + 1);
- lua_pushinteger(L, s2 - s + lp);
- return 2;
- }
- }
- else {
- MatchState ms;
- const char *s1 = s + init - 1;
- int anchor = (*p == '^');
- if (anchor) {
- p++; lp--; /* skip anchor character */
- }
- ms.L = L;
- ms.matchdepth = MAXCCALLS;
- ms.src_init = s;
- ms.src_end = s + ls;
- ms.p_end = p + lp;
- do {
- const char *res;
- ms.level = 0;
- lua_assert(ms.matchdepth == MAXCCALLS);
- if ((res=match(&ms, s1, p)) != NULL) {
- if (find) {
- lua_pushinteger(L, s1 - s + 1); /* start */
- lua_pushinteger(L, res - s); /* end */
- return push_captures(&ms, NULL, 0) + 2;
- }
- else
- return push_captures(&ms, s1, res);
- }
- } while (s1++ < ms.src_end && !anchor);
- }
- lua_pushnil(L); /* not found */
- return 1;
-}
-
-
-static int str_find (lua_State *L) {
- return str_find_aux(L, 1);
-}
-
-
-static int str_match (lua_State *L) {
- return str_find_aux(L, 0);
-}
-
-
-static int gmatch_aux (lua_State *L) {
- MatchState ms;
- size_t ls, lp;
- const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls);
- const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp);
- const char *src;
- ms.L = L;
- ms.matchdepth = MAXCCALLS;
- ms.src_init = s;
- ms.src_end = s+ls;
- ms.p_end = p + lp;
- for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
- src <= ms.src_end;
- src++) {
- const char *e;
- ms.level = 0;
- lua_assert(ms.matchdepth == MAXCCALLS);
- if ((e = match(&ms, src, p)) != NULL) {
- lua_Integer newstart = e-s;
- if (e == src) newstart++; /* empty match? go at least one position */
- lua_pushinteger(L, newstart);
- lua_replace(L, lua_upvalueindex(3));
- return push_captures(&ms, src, e);
- }
- }
- return 0; /* not found */
-}
-
-
-static int str_gmatch (lua_State *L) {
- luaL_checkstring(L, 1);
- luaL_checkstring(L, 2);
- lua_settop(L, 2);
- lua_pushinteger(L, 0);
- lua_pushcclosure(L, gmatch_aux, 3);
- return 1;
-}
-
-
-static void add_s (MatchState *ms, luaL_Buffer *b, const char *s,
- const char *e) {
- size_t l, i;
- const char *news = lua_tolstring(ms->L, 3, &l);
- for (i = 0; i < l; i++) {
- if (news[i] != L_ESC)
- luaL_addchar(b, news[i]);
- else {
- i++; /* skip ESC */
- if (!isdigit(uchar(news[i]))) {
- if (news[i] != L_ESC)
- luaL_error(ms->L, "invalid use of " LUA_QL("%c")
- " in replacement string", L_ESC);
- luaL_addchar(b, news[i]);
- }
- else if (news[i] == '0')
- luaL_addlstring(b, s, e - s);
- else {
- push_onecapture(ms, news[i] - '1', s, e);
- luaL_addvalue(b); /* add capture to accumulated result */
- }
- }
- }
-}
-
-
-static void add_value (MatchState *ms, luaL_Buffer *b, const char *s,
- const char *e, int tr) {
- lua_State *L = ms->L;
- switch (tr) {
- case LUA_TFUNCTION: {
- int n;
- lua_pushvalue(L, 3);
- n = push_captures(ms, s, e);
- lua_call(L, n, 1);
- break;
- }
- case LUA_TTABLE: {
- push_onecapture(ms, 0, s, e);
- lua_gettable(L, 3);
- break;
- }
- default: { /* LUA_TNUMBER or LUA_TSTRING */
- add_s(ms, b, s, e);
- return;
- }
- }
- if (!lua_toboolean(L, -1)) { /* nil or false? */
- lua_pop(L, 1);
- lua_pushlstring(L, s, e - s); /* keep original text */
- }
- else if (!lua_isstring(L, -1))
- luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
- luaL_addvalue(b); /* add result to accumulator */
-}
-
-
-static int str_gsub (lua_State *L) {
- size_t srcl, lp;
- const char *src = luaL_checklstring(L, 1, &srcl);
- const char *p = luaL_checklstring(L, 2, &lp);
- int tr = lua_type(L, 3);
- size_t max_s = luaL_optinteger(L, 4, srcl+1);
- int anchor = (*p == '^');
- size_t n = 0;
- MatchState ms;
- luaL_Buffer b;
- luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
- tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
- "string/function/table expected");
- luaL_buffinit(L, &b);
- if (anchor) {
- p++; lp--; /* skip anchor character */
- }
- ms.L = L;
- ms.matchdepth = MAXCCALLS;
- ms.src_init = src;
- ms.src_end = src+srcl;
- ms.p_end = p + lp;
- while (n < max_s) {
- const char *e;
- ms.level = 0;
- lua_assert(ms.matchdepth == MAXCCALLS);
- e = match(&ms, src, p);
- if (e) {
- n++;
- add_value(&ms, &b, src, e, tr);
- }
- if (e && e>src) /* non empty match? */
- src = e; /* skip it */
- else if (src < ms.src_end)
- luaL_addchar(&b, *src++);
- else break;
- if (anchor) break;
- }
- luaL_addlstring(&b, src, ms.src_end-src);
- luaL_pushresult(&b);
- lua_pushinteger(L, n); /* number of substitutions */
- return 2;
-}
-
-/* }====================================================== */
-
-
-
-/*
-** {======================================================
-** STRING FORMAT
-** =======================================================
-*/
-
-/*
-** LUA_INTFRMLEN is the length modifier for integer conversions in
-** 'string.format'; LUA_INTFRM_T is the integer type corresponding to
-** the previous length
-*/
-#if !defined(LUA_INTFRMLEN) /* { */
-#if defined(LUA_USE_LONGLONG)
-
-#define LUA_INTFRMLEN "ll"
-#define LUA_INTFRM_T long long
-
-#else
-
-#define LUA_INTFRMLEN "l"
-#define LUA_INTFRM_T long
-
-#endif
-#endif /* } */
-
-
-/*
-** LUA_FLTFRMLEN is the length modifier for float conversions in
-** 'string.format'; LUA_FLTFRM_T is the float type corresponding to
-** the previous length
-*/
-#if !defined(LUA_FLTFRMLEN)
-
-#define LUA_FLTFRMLEN ""
-#define LUA_FLTFRM_T double
-
-#endif
-
-
-/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */
-#define MAX_ITEM 512
-/* valid flags in a format specification */
-#define FLAGS "-+ #0"
-/*
-** maximum size of each format specification (such as '%-099.99d')
-** (+10 accounts for %99.99x plus margin of error)
-*/
-#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
-
-
-static void addquoted (lua_State *L, luaL_Buffer *b, int arg) {
- size_t l;
- const char *s = luaL_checklstring(L, arg, &l);
- luaL_addchar(b, '"');
- while (l--) {
- if (*s == '"' || *s == '\\' || *s == '\n') {
- luaL_addchar(b, '\\');
- luaL_addchar(b, *s);
- }
- else if (*s == '\0' || iscntrl(uchar(*s))) {
- char buff[10];
- if (!isdigit(uchar(*(s+1))))
- sprintf(buff, "\\%d", (int)uchar(*s));
- else
- sprintf(buff, "\\%03d", (int)uchar(*s));
- luaL_addstring(b, buff);
- }
- else
- luaL_addchar(b, *s);
- s++;
- }
- luaL_addchar(b, '"');
-}
-
-static const char *scanformat (lua_State *L, const char *strfrmt, char *form) {
- const char *p = strfrmt;
- while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */
- if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char))
- luaL_error(L, "invalid format (repeated flags)");
- if (isdigit(uchar(*p))) p++; /* skip width */
- if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
- if (*p == '.') {
- p++;
- if (isdigit(uchar(*p))) p++; /* skip precision */
- if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
- }
- if (isdigit(uchar(*p)))
- luaL_error(L, "invalid format (width or precision too long)");
- *(form++) = '%';
- memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char));
- form += p - strfrmt + 1;
- *form = '\0';
- return p;
-}
-
-
-/*
-** add length modifier into formats
-*/
-static void addlenmod (char *form, const char *lenmod) {
- size_t l = strlen(form);
- size_t lm = strlen(lenmod);
- char spec = form[l - 1];
- strcpy(form + l - 1, lenmod);
- form[l + lm - 1] = spec;
- form[l + lm] = '\0';
-}
-
-
-static int str_format (lua_State *L) {
- int top = lua_gettop(L);
- int arg = 1;
- size_t sfl;
- const char *strfrmt = luaL_checklstring(L, arg, &sfl);
- const char *strfrmt_end = strfrmt+sfl;
- luaL_Buffer b;
- luaL_buffinit(L, &b);
- while (strfrmt < strfrmt_end) {
- if (*strfrmt != L_ESC)
- luaL_addchar(&b, *strfrmt++);
- else if (*++strfrmt == L_ESC)
- luaL_addchar(&b, *strfrmt++); /* %% */
- else { /* format item */
- char form[MAX_FORMAT]; /* to store the format (`%...') */
- char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */
- int nb = 0; /* number of bytes in added item */
- if (++arg > top)
- luaL_argerror(L, arg, "no value");
- strfrmt = scanformat(L, strfrmt, form);
- switch (*strfrmt++) {
- case 'c': {
- nb = str_sprintf(buff, form, luaL_checkint(L, arg));
- break;
- }
- case 'd': case 'i': {
- lua_Number n = luaL_checknumber(L, arg);
- LUA_INTFRM_T ni = (LUA_INTFRM_T)n;
- lua_Number diff = n - (lua_Number)ni;
- luaL_argcheck(L, -1 < diff && diff < 1, arg,
- "not a number in proper range");
- addlenmod(form, LUA_INTFRMLEN);
- nb = str_sprintf(buff, form, ni);
- break;
- }
- case 'o': case 'u': case 'x': case 'X': {
- lua_Number n = luaL_checknumber(L, arg);
- unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n;
- lua_Number diff = n - (lua_Number)ni;
- luaL_argcheck(L, -1 < diff && diff < 1, arg,
- "not a non-negative number in proper range");
- addlenmod(form, LUA_INTFRMLEN);
- nb = str_sprintf(buff, form, ni);
- break;
- }
-#if defined(LUA_USE_FLOAT_FORMATS)
- case 'e': case 'E': case 'f':
-#if defined(LUA_USE_AFORMAT)
- case 'a': case 'A':
-#endif
- case 'g': case 'G': {
- addlenmod(form, LUA_FLTFRMLEN);
- nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg));
- break;
- }
-#endif
- case 'q': {
- addquoted(L, &b, arg);
- break;
- }
- case 's': {
- size_t l;
- const char *s = luaL_tolstring(L, arg, &l);
- if (!strchr(form, '.') && l >= 100) {
- /* no precision and string is too long to be formatted;
- keep original string */
- luaL_addvalue(&b);
- break;
- }
- else {
- nb = str_sprintf(buff, form, s);
- lua_pop(L, 1); /* remove result from 'luaL_tolstring' */
- break;
- }
- }
- default: { /* also treat cases `pnLlh' */
- return luaL_error(L, "invalid option " LUA_QL("%%%c") " to "
- LUA_QL("format"), *(strfrmt - 1));
- }
- }
- luaL_addsize(&b, nb);
- }
- }
- luaL_pushresult(&b);
- return 1;
-}
-
-/* }====================================================== */
-
-
-static const luaL_Reg strlib[] = {
- {"byte", str_byte},
- {"char", str_char},
- {"dump", str_dump},
- {"find", str_find},
- {"format", str_format},
- {"gmatch", str_gmatch},
- {"gsub", str_gsub},
- {"len", str_len},
- {"lower", str_lower},
- {"match", str_match},
- {"rep", str_rep},
- {"reverse", str_reverse},
- {"sub", str_sub},
- {"upper", str_upper},
- {NULL, NULL}
-};
-
-
-static void createmetatable (lua_State *L) {
- lua_createtable(L, 0, 1); /* table to be metatable for strings */
- lua_pushliteral(L, ""); /* dummy string */
- lua_pushvalue(L, -2); /* copy table */
- lua_setmetatable(L, -2); /* set table as metatable for strings */
- lua_pop(L, 1); /* pop dummy string */
- lua_pushvalue(L, -2); /* get string library */
- lua_setfield(L, -2, "__index"); /* metatable.__index = string */
- lua_pop(L, 1); /* pop metatable */
-}
-
-
-/*
-** Open string library
-*/
-LUAMOD_API int luaopen_string (lua_State *L) {
- luaL_newlib(L, strlib);
- createmetatable(L);
- return 1;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h
@@ -1,45 +0,0 @@
-/*
-** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $
-** Lua tables (hash)
-** See Copyright Notice in lua.h
-*/
-
-#ifndef ltable_h
-#define ltable_h
-
-#include "lobject.h"
-
-
-#define gnode(t,i) (&(t)->node[i])
-#define gkey(n) (&(n)->i_key.tvk)
-#define gval(n) (&(n)->i_val)
-#define gnext(n) ((n)->i_key.nk.next)
-
-#define invalidateTMcache(t) ((t)->flags = 0)
-
-/* returns the key, given the value of a table entry */
-#define keyfromval(v) \
- (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val))))
-
-
-LUAI_FUNC const TValue *luaH_getint (Table *t, int key);
-LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value);
-LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key);
-LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key);
-LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key);
-LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key);
-LUAI_FUNC Table *luaH_new (lua_State *L);
-LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize);
-LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize);
-LUAI_FUNC void luaH_free (lua_State *L, Table *t);
-LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key);
-LUAI_FUNC int luaH_getn (Table *t);
-
-
-#if defined(LUA_DEBUG)
-LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key);
-LUAI_FUNC int luaH_isdummy (Node *n);
-#endif
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c
@@ -1,589 +0,0 @@
-/*
-** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lua tables (hash)
-** See Copyright Notice in lua.h
-*/
-
-
-/*
-** Implementation of tables (aka arrays, objects, or hash tables).
-** Tables keep its elements in two parts: an array part and a hash part.
-** Non-negative integer keys are all candidates to be kept in the array
-** part. The actual size of the array is the largest `n' such that at
-** least half the slots between 0 and n are in use.
-** Hash uses a mix of chained scatter table with Brent's variation.
-** A main invariant of these tables is that, if an element is not
-** in its main position (i.e. the `original' position that its hash gives
-** to it), then the colliding element is in its own main position.
-** Hence even when the load factor reaches 100%, performance remains good.
-*/
-
-#include <sys/zfs_context.h>
-
-#define ltable_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "ldebug.h"
-#include "ldo.h"
-#include "lgc.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "lvm.h"
-
-
-/*
-** max size of array part is 2^MAXBITS
-*/
-#if LUAI_BITSINT >= 32
-#define MAXBITS 30
-#else
-#define MAXBITS (LUAI_BITSINT-2)
-#endif
-
-#define MAXASIZE (1 << MAXBITS)
-
-
-#define hashpow2(t,n) (gnode(t, lmod((n), sizenode(t))))
-
-#define hashstr(t,str) hashpow2(t, (str)->tsv.hash)
-#define hashboolean(t,p) hashpow2(t, p)
-
-
-/*
-** for some types, it is better to avoid modulus by power of 2, as
-** they tend to have many 2 factors.
-*/
-#define hashmod(t,n) (gnode(t, ((n) % ((sizenode(t)-1)|1))))
-
-
-#define hashpointer(t,p) hashmod(t, IntPoint(p))
-
-
-#define dummynode (&dummynode_)
-
-#define isdummy(n) ((n) == dummynode)
-
-static const Node dummynode_ = {
- {NILCONSTANT}, /* value */
- {{NILCONSTANT, NULL}} /* key */
-};
-
-
-/*
-** hash for lua_Numbers
-*/
-static Node *hashnum (const Table *t, lua_Number n) {
- int i;
- luai_hashnum(i, n);
- if (i < 0) {
- if (cast(unsigned int, i) == 0u - i) /* use unsigned to avoid overflows */
- i = 0; /* handle INT_MIN */
- i = -i; /* must be a positive value */
- }
- return hashmod(t, i);
-}
-
-
-
-/*
-** returns the `main' position of an element in a table (that is, the index
-** of its hash value)
-*/
-static Node *mainposition (const Table *t, const TValue *key) {
- switch (ttype(key)) {
- case LUA_TNUMBER:
- return hashnum(t, nvalue(key));
- case LUA_TLNGSTR: {
- TString *s = rawtsvalue(key);
- if (s->tsv.extra == 0) { /* no hash? */
- s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash);
- s->tsv.extra = 1; /* now it has its hash */
- }
- return hashstr(t, rawtsvalue(key));
- }
- case LUA_TSHRSTR:
- return hashstr(t, rawtsvalue(key));
- case LUA_TBOOLEAN:
- return hashboolean(t, bvalue(key));
- case LUA_TLIGHTUSERDATA:
- return hashpointer(t, pvalue(key));
- case LUA_TLCF:
- return hashpointer(t, fvalue(key));
- default:
- return hashpointer(t, gcvalue(key));
- }
-}
-
-
-/*
-** returns the index for `key' if `key' is an appropriate key to live in
-** the array part of the table, -1 otherwise.
-*/
-static int arrayindex (const TValue *key) {
- if (ttisnumber(key)) {
- lua_Number n = nvalue(key);
- int k;
- lua_number2int(k, n);
- if (luai_numeq(cast_num(k), n))
- return k;
- }
- return -1; /* `key' did not match some condition */
-}
-
-
-/*
-** returns the index of a `key' for table traversals. First goes all
-** elements in the array part, then elements in the hash part. The
-** beginning of a traversal is signaled by -1.
-*/
-static int findindex (lua_State *L, Table *t, StkId key) {
- int i;
- if (ttisnil(key)) return -1; /* first iteration */
- i = arrayindex(key);
- if (0 < i && i <= t->sizearray) /* is `key' inside array part? */
- return i-1; /* yes; that's the index (corrected to C) */
- else {
- Node *n = mainposition(t, key);
- for (;;) { /* check whether `key' is somewhere in the chain */
- /* key may be dead already, but it is ok to use it in `next' */
- if (luaV_rawequalobj(gkey(n), key) ||
- (ttisdeadkey(gkey(n)) && iscollectable(key) &&
- deadvalue(gkey(n)) == gcvalue(key))) {
- i = cast_int(n - gnode(t, 0)); /* key index in hash table */
- /* hash elements are numbered after array ones */
- return i + t->sizearray;
- }
- else n = gnext(n);
- if (n == NULL)
- luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */
- }
- }
-}
-
-
-int luaH_next (lua_State *L, Table *t, StkId key) {
- int i = findindex(L, t, key); /* find original element */
- for (i++; i < t->sizearray; i++) { /* try first array part */
- if (!ttisnil(&t->array[i])) { /* a non-nil value? */
- setnvalue(key, cast_num(i+1));
- setobj2s(L, key+1, &t->array[i]);
- return 1;
- }
- }
- for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */
- if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */
- setobj2s(L, key, gkey(gnode(t, i)));
- setobj2s(L, key+1, gval(gnode(t, i)));
- return 1;
- }
- }
- return 0; /* no more elements */
-}
-
-
-/*
-** {=============================================================
-** Rehash
-** ==============================================================
-*/
-
-
-static int computesizes (int nums[], int *narray) {
- int i;
- int twotoi; /* 2^i */
- int a = 0; /* number of elements smaller than 2^i */
- int na = 0; /* number of elements to go to array part */
- int n = 0; /* optimal size for array part */
- for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
- if (nums[i] > 0) {
- a += nums[i];
- if (a > twotoi/2) { /* more than half elements present? */
- n = twotoi; /* optimal size (till now) */
- na = a; /* all elements smaller than n will go to array part */
- }
- }
- if (a == *narray) break; /* all elements already counted */
- }
- *narray = n;
- lua_assert(*narray/2 <= na && na <= *narray);
- return na;
-}
-
-
-static int countint (const TValue *key, int *nums) {
- int k = arrayindex(key);
- if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? */
- nums[luaO_ceillog2(k)]++; /* count as such */
- return 1;
- }
- else
- return 0;
-}
-
-
-static int numusearray (const Table *t, int *nums) {
- int lg;
- int ttlg; /* 2^lg */
- int ause = 0; /* summation of `nums' */
- int i = 1; /* count to traverse all array keys */
- for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) { /* for each slice */
- int lc = 0; /* counter */
- int lim = ttlg;
- if (lim > t->sizearray) {
- lim = t->sizearray; /* adjust upper limit */
- if (i > lim)
- break; /* no more elements to count */
- }
- /* count elements in range (2^(lg-1), 2^lg] */
- for (; i <= lim; i++) {
- if (!ttisnil(&t->array[i-1]))
- lc++;
- }
- nums[lg] += lc;
- ause += lc;
- }
- return ause;
-}
-
-
-static int numusehash (const Table *t, int *nums, int *pnasize) {
- int totaluse = 0; /* total number of elements */
- int ause = 0; /* summation of `nums' */
- int i = sizenode(t);
- while (i--) {
- Node *n = &t->node[i];
- if (!ttisnil(gval(n))) {
- ause += countint(gkey(n), nums);
- totaluse++;
- }
- }
- *pnasize += ause;
- return totaluse;
-}
-
-
-static void setarrayvector (lua_State *L, Table *t, int size) {
- int i;
- luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
- for (i=t->sizearray; i<size; i++)
- setnilvalue(&t->array[i]);
- t->sizearray = size;
-}
-
-
-static void setnodevector (lua_State *L, Table *t, int size) {
- int lsize;
- if (size == 0) { /* no elements to hash part? */
- t->node = cast(Node *, dummynode); /* use common `dummynode' */
- lsize = 0;
- }
- else {
- int i;
- lsize = luaO_ceillog2(size);
- if (lsize > MAXBITS)
- luaG_runerror(L, "table overflow");
- size = twoto(lsize);
- t->node = luaM_newvector(L, size, Node);
- for (i=0; i<size; i++) {
- Node *n = gnode(t, i);
- gnext(n) = NULL;
- setnilvalue(gkey(n));
- setnilvalue(gval(n));
- }
- }
- t->lsizenode = cast_byte(lsize);
- t->lastfree = gnode(t, size); /* all positions are free */
-}
-
-
-void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) {
- int i;
- int oldasize = t->sizearray;
- int oldhsize = t->lsizenode;
- Node *nold = t->node; /* save old hash ... */
- if (nasize > oldasize) /* array part must grow? */
- setarrayvector(L, t, nasize);
- /* create new hash part with appropriate size */
- setnodevector(L, t, nhsize);
- if (nasize < oldasize) { /* array part must shrink? */
- t->sizearray = nasize;
- /* re-insert elements from vanishing slice */
- for (i=nasize; i<oldasize; i++) {
- if (!ttisnil(&t->array[i]))
- luaH_setint(L, t, i + 1, &t->array[i]);
- }
- /* shrink array */
- luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
- }
- /* re-insert elements from hash part */
- for (i = twoto(oldhsize) - 1; i >= 0; i--) {
- Node *old = nold+i;
- if (!ttisnil(gval(old))) {
- /* doesn't need barrier/invalidate cache, as entry was
- already present in the table */
- setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
- }
- }
- if (!isdummy(nold))
- luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */
-}
-
-
-void luaH_resizearray (lua_State *L, Table *t, int nasize) {
- int nsize = isdummy(t->node) ? 0 : sizenode(t);
- luaH_resize(L, t, nasize, nsize);
-}
-
-
-static void rehash (lua_State *L, Table *t, const TValue *ek) {
- int nasize, na;
- int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */
- int i;
- int totaluse;
- for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */
- nasize = numusearray(t, nums); /* count keys in array part */
- totaluse = nasize; /* all those keys are integer keys */
- totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */
- /* count extra key */
- nasize += countint(ek, nums);
- totaluse++;
- /* compute new size for array part */
- na = computesizes(nums, &nasize);
- /* resize the table to new computed sizes */
- luaH_resize(L, t, nasize, totaluse - na);
-}
-
-
-
-/*
-** }=============================================================
-*/
-
-
-Table *luaH_new (lua_State *L) {
- Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h;
- t->metatable = NULL;
- t->flags = cast_byte(~0);
- t->array = NULL;
- t->sizearray = 0;
- setnodevector(L, t, 0);
- return t;
-}
-
-
-void luaH_free (lua_State *L, Table *t) {
- if (!isdummy(t->node))
- luaM_freearray(L, t->node, cast(size_t, sizenode(t)));
- luaM_freearray(L, t->array, t->sizearray);
- luaM_free(L, t);
-}
-
-
-static Node *getfreepos (Table *t) {
- while (t->lastfree > t->node) {
- t->lastfree--;
- if (ttisnil(gkey(t->lastfree)))
- return t->lastfree;
- }
- return NULL; /* could not find a free place */
-}
-
-
-
-/*
-** inserts a new key into a hash table; first, check whether key's main
-** position is free. If not, check whether colliding node is in its main
-** position or not: if it is not, move colliding node to an empty place and
-** put new key in its main position; otherwise (colliding node is in its main
-** position), new key goes to an empty position.
-*/
-TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) {
- Node *mp;
- if (ttisnil(key)) luaG_runerror(L, "table index is nil");
- else if (ttisnumber(key) && luai_numisnan(L, nvalue(key)))
- luaG_runerror(L, "table index is NaN");
- mp = mainposition(t, key);
- if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */
- Node *othern;
- Node *n = getfreepos(t); /* get a free place */
- if (n == NULL) { /* cannot find a free place? */
- rehash(L, t, key); /* grow table */
- /* whatever called 'newkey' take care of TM cache and GC barrier */
- return luaH_set(L, t, key); /* insert key into grown table */
- }
- lua_assert(!isdummy(n));
- othern = mainposition(t, gkey(mp));
- if (othern != mp) { /* is colliding node out of its main position? */
- /* yes; move colliding node into free position */
- while (gnext(othern) != mp) othern = gnext(othern); /* find previous */
- gnext(othern) = n; /* redo the chain with `n' in place of `mp' */
- *n = *mp; /* copy colliding node into free pos. (mp->next also goes) */
- gnext(mp) = NULL; /* now `mp' is free */
- setnilvalue(gval(mp));
- }
- else { /* colliding node is in its own main position */
- /* new node will go into free position */
- gnext(n) = gnext(mp); /* chain new position */
- gnext(mp) = n;
- mp = n;
- }
- }
- setobj2t(L, gkey(mp), key);
- luaC_barrierback(L, obj2gco(t), key);
- lua_assert(ttisnil(gval(mp)));
- return gval(mp);
-}
-
-
-/*
-** search function for integers
-*/
-const TValue *luaH_getint (Table *t, int key) {
- /* (1 <= key && key <= t->sizearray) */
- if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray))
- return &t->array[key-1];
- else {
- lua_Number nk = cast_num(key);
- Node *n = hashnum(t, nk);
- do { /* check whether `key' is somewhere in the chain */
- if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk))
- return gval(n); /* that's it */
- else n = gnext(n);
- } while (n);
- return luaO_nilobject;
- }
-}
-
-
-/*
-** search function for short strings
-*/
-const TValue *luaH_getstr (Table *t, TString *key) {
- Node *n = hashstr(t, key);
- lua_assert(key->tsv.tt == LUA_TSHRSTR);
- do { /* check whether `key' is somewhere in the chain */
- if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key))
- return gval(n); /* that's it */
- else n = gnext(n);
- } while (n);
- return luaO_nilobject;
-}
-
-
-/*
-** main search function
-*/
-const TValue *luaH_get (Table *t, const TValue *key) {
- switch (ttype(key)) {
- case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key));
- case LUA_TNIL: return luaO_nilobject;
- case LUA_TNUMBER: {
- int k;
- lua_Number n = nvalue(key);
- lua_number2int(k, n);
- if (luai_numeq(cast_num(k), n)) /* index is int? */
- return luaH_getint(t, k); /* use specialized version */
- /* else go through */
- }
- /* FALLTHROUGH */
- default: {
- Node *n = mainposition(t, key);
- do { /* check whether `key' is somewhere in the chain */
- if (luaV_rawequalobj(gkey(n), key))
- return gval(n); /* that's it */
- else n = gnext(n);
- } while (n);
- return luaO_nilobject;
- }
- }
-}
-
-
-/*
-** beware: when using this function you probably need to check a GC
-** barrier and invalidate the TM cache.
-*/
-TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
- const TValue *p = luaH_get(t, key);
- if (p != luaO_nilobject)
- return cast(TValue *, p);
- else return luaH_newkey(L, t, key);
-}
-
-
-void luaH_setint (lua_State *L, Table *t, int key, TValue *value) {
- const TValue *p = luaH_getint(t, key);
- TValue *cell;
- if (p != luaO_nilobject)
- cell = cast(TValue *, p);
- else {
- TValue k;
- setnvalue(&k, cast_num(key));
- cell = luaH_newkey(L, t, &k);
- }
- setobj2t(L, cell, value);
-}
-
-
-static int unbound_search (Table *t, unsigned int j) {
- unsigned int i = j; /* i is zero or a present index */
- j++;
- /* find `i' and `j' such that i is present and j is not */
- while (!ttisnil(luaH_getint(t, j))) {
- i = j;
- j *= 2;
- if (j > cast(unsigned int, MAX_INT)) { /* overflow? */
- /* table was built with bad purposes: resort to linear search */
- i = 1;
- while (!ttisnil(luaH_getint(t, i))) i++;
- return i - 1;
- }
- }
- /* now do a binary search between them */
- while (j - i > 1) {
- unsigned int m = (i+j)/2;
- if (ttisnil(luaH_getint(t, m))) j = m;
- else i = m;
- }
- return i;
-}
-
-
-/*
-** Try to find a boundary in table `t'. A `boundary' is an integer index
-** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
-*/
-int luaH_getn (Table *t) {
- unsigned int j = t->sizearray;
- if (j > 0 && ttisnil(&t->array[j - 1])) {
- /* there is a boundary in the array part: (binary) search for it */
- unsigned int i = 0;
- while (j - i > 1) {
- unsigned int m = (i+j)/2;
- if (ttisnil(&t->array[m - 1])) j = m;
- else i = m;
- }
- return i;
- }
- /* else must find a boundary in hash part */
- else if (isdummy(t->node)) /* hash part is empty? */
- return j; /* that is easy... */
- else return unbound_search(t, j);
-}
-
-
-
-#if defined(LUA_DEBUG)
-
-Node *luaH_mainposition (const Table *t, const TValue *key) {
- return mainposition(t, key);
-}
-
-int luaH_isdummy (Node *n) { return isdummy(n); }
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c
@@ -1,284 +0,0 @@
-/*
-** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $
-** Library for Table Manipulation
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define ltablib_c
-#define LUA_LIB
-
-#include "lua.h"
-
-#include "lauxlib.h"
-#include "lualib.h"
-
-
-#define aux_getn(L,n) (luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n))
-
-
-
-#if defined(LUA_COMPAT_MAXN)
-static int maxn (lua_State *L) {
- lua_Number max = 0;
- luaL_checktype(L, 1, LUA_TTABLE);
- lua_pushnil(L); /* first key */
- while (lua_next(L, 1)) {
- lua_pop(L, 1); /* remove value */
- if (lua_type(L, -1) == LUA_TNUMBER) {
- lua_Number v = lua_tonumber(L, -1);
- if (v > max) max = v;
- }
- }
- lua_pushnumber(L, max);
- return 1;
-}
-#endif
-
-
-static int tinsert (lua_State *L) {
- int e = aux_getn(L, 1) + 1; /* first empty element */
- int pos; /* where to insert new element */
- switch (lua_gettop(L)) {
- case 2: { /* called with only 2 arguments */
- pos = e; /* insert new element at the end */
- break;
- }
- case 3: {
- int i;
- pos = luaL_checkint(L, 2); /* 2nd argument is the position */
- luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds");
- for (i = e; i > pos; i--) { /* move up elements */
- lua_rawgeti(L, 1, i-1);
- lua_rawseti(L, 1, i); /* t[i] = t[i-1] */
- }
- break;
- }
- default: {
- return luaL_error(L, "wrong number of arguments to " LUA_QL("insert"));
- }
- }
- lua_rawseti(L, 1, pos); /* t[pos] = v */
- return 0;
-}
-
-
-static int tremove (lua_State *L) {
- int size = aux_getn(L, 1);
- int pos = luaL_optint(L, 2, size);
- if (pos != size) /* validate 'pos' if given */
- luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds");
- lua_rawgeti(L, 1, pos); /* result = t[pos] */
- for ( ; pos < size; pos++) {
- lua_rawgeti(L, 1, pos+1);
- lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */
- }
- lua_pushnil(L);
- lua_rawseti(L, 1, pos); /* t[pos] = nil */
- return 1;
-}
-
-
-static void addfield (lua_State *L, luaL_Buffer *b, int i) {
- lua_rawgeti(L, 1, i);
- if (!lua_isstring(L, -1))
- luaL_error(L, "invalid value (%s) at index %d in table for "
- LUA_QL("concat"), luaL_typename(L, -1), i);
- luaL_addvalue(b);
-}
-
-
-static int tconcat (lua_State *L) {
- luaL_Buffer b;
- size_t lsep;
- int i, last;
- const char *sep = luaL_optlstring(L, 2, "", &lsep);
- luaL_checktype(L, 1, LUA_TTABLE);
- i = luaL_optint(L, 3, 1);
- last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1));
- luaL_buffinit(L, &b);
- for (; i < last; i++) {
- addfield(L, &b, i);
- luaL_addlstring(&b, sep, lsep);
- }
- if (i == last) /* add last value (if interval was not empty) */
- addfield(L, &b, i);
- luaL_pushresult(&b);
- return 1;
-}
-
-
-/*
-** {======================================================
-** Pack/unpack
-** =======================================================
-*/
-
-static int pack (lua_State *L) {
- int n = lua_gettop(L); /* number of elements to pack */
- lua_createtable(L, n, 1); /* create result table */
- lua_pushinteger(L, n);
- lua_setfield(L, -2, "n"); /* t.n = number of elements */
- if (n > 0) { /* at least one element? */
- int i;
- lua_pushvalue(L, 1);
- lua_rawseti(L, -2, 1); /* insert first element */
- lua_replace(L, 1); /* move table into index 1 */
- for (i = n; i >= 2; i--) /* assign other elements */
- lua_rawseti(L, 1, i);
- }
- return 1; /* return table */
-}
-
-
-static int unpack (lua_State *L) {
- int i, e;
- unsigned int n;
- luaL_checktype(L, 1, LUA_TTABLE);
- i = luaL_optint(L, 2, 1);
- e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1));
- if (i > e) return 0; /* empty range */
- n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */
- if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n))
- return luaL_error(L, "too many results to unpack");
- lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */
- while (i++ < e) /* push arg[i + 1...e] */
- lua_rawgeti(L, 1, i);
- return n;
-}
-
-/* }====================================================== */
-
-
-
-/*
-** {======================================================
-** Quicksort
-** (based on `Algorithms in MODULA-3', Robert Sedgewick;
-** Addison-Wesley, 1993.)
-** =======================================================
-*/
-
-
-static void set2 (lua_State *L, int i, int j) {
- lua_rawseti(L, 1, i);
- lua_rawseti(L, 1, j);
-}
-
-static int sort_comp (lua_State *L, int a, int b) {
- if (!lua_isnil(L, 2)) { /* function? */
- int res;
- lua_pushvalue(L, 2);
- lua_pushvalue(L, a-1); /* -1 to compensate function */
- lua_pushvalue(L, b-2); /* -2 to compensate function and `a' */
- lua_call(L, 2, 1);
- res = lua_toboolean(L, -1);
- lua_pop(L, 1);
- return res;
- }
- else /* a < b? */
- return lua_compare(L, a, b, LUA_OPLT);
-}
-
-static void auxsort (lua_State *L, int l, int u) {
- while (l < u) { /* for tail recursion */
- int i, j;
- /* sort elements a[l], a[(l+u)/2] and a[u] */
- lua_rawgeti(L, 1, l);
- lua_rawgeti(L, 1, u);
- if (sort_comp(L, -1, -2)) /* a[u] < a[l]? */
- set2(L, l, u); /* swap a[l] - a[u] */
- else
- lua_pop(L, 2);
- if (u-l == 1) break; /* only 2 elements */
- i = (l+u)/2;
- lua_rawgeti(L, 1, i);
- lua_rawgeti(L, 1, l);
- if (sort_comp(L, -2, -1)) /* a[i]<a[l]? */
- set2(L, i, l);
- else {
- lua_pop(L, 1); /* remove a[l] */
- lua_rawgeti(L, 1, u);
- if (sort_comp(L, -1, -2)) /* a[u]<a[i]? */
- set2(L, i, u);
- else
- lua_pop(L, 2);
- }
- if (u-l == 2) break; /* only 3 elements */
- lua_rawgeti(L, 1, i); /* Pivot */
- lua_pushvalue(L, -1);
- lua_rawgeti(L, 1, u-1);
- set2(L, i, u-1);
- /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */
- i = l; j = u-1;
- for (;;) { /* invariant: a[l..i] <= P <= a[j..u] */
- /* repeat ++i until a[i] >= P */
- while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) {
- if (i>=u) luaL_error(L, "invalid order function for sorting");
- lua_pop(L, 1); /* remove a[i] */
- }
- /* repeat --j until a[j] <= P */
- while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) {
- if (j<=l) luaL_error(L, "invalid order function for sorting");
- lua_pop(L, 1); /* remove a[j] */
- }
- if (j<i) {
- lua_pop(L, 3); /* pop pivot, a[i], a[j] */
- break;
- }
- set2(L, i, j);
- }
- lua_rawgeti(L, 1, u-1);
- lua_rawgeti(L, 1, i);
- set2(L, u-1, i); /* swap pivot (a[u-1]) with a[i] */
- /* a[l..i-1] <= a[i] == P <= a[i+1..u] */
- /* adjust so that smaller half is in [j..i] and larger one in [l..u] */
- if (i-l < u-i) {
- j=l; i=i-1; l=i+2;
- }
- else {
- j=i+1; i=u; u=j-2;
- }
- auxsort(L, j, i); /* call recursively the smaller one */
- } /* repeat the routine for the larger one */
-}
-
-static int sort (lua_State *L) {
- int n = aux_getn(L, 1);
- luaL_checkstack(L, 40, ""); /* assume array is smaller than 2^40 */
- if (!lua_isnoneornil(L, 2)) /* is there a 2nd argument? */
- luaL_checktype(L, 2, LUA_TFUNCTION);
- lua_settop(L, 2); /* make sure there is two arguments */
- auxsort(L, 1, n);
- return 0;
-}
-
-/* }====================================================== */
-
-
-static const luaL_Reg tab_funcs[] = {
- {"concat", tconcat},
-#if defined(LUA_COMPAT_MAXN)
- {"maxn", maxn},
-#endif
- {"insert", tinsert},
- {"pack", pack},
- {"unpack", unpack},
- {"remove", tremove},
- {"sort", sort},
- {NULL, NULL}
-};
-
-
-LUAMOD_API int luaopen_table (lua_State *L) {
- luaL_newlib(L, tab_funcs);
-#if defined(LUA_COMPAT_UNPACK)
- /* _G.unpack = table.unpack */
- lua_getfield(L, -1, "unpack");
- lua_setglobal(L, "unpack");
-#endif
- return 1;
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h
@@ -1,57 +0,0 @@
-/*
-** $Id: ltm.h,v 2.11.1.1 2013/04/12 18:48:47 roberto Exp $
-** Tag methods
-** See Copyright Notice in lua.h
-*/
-
-#ifndef ltm_h
-#define ltm_h
-
-
-#include "lobject.h"
-
-
-/*
-* WARNING: if you change the order of this enumeration,
-* grep "ORDER TM"
-*/
-typedef enum {
- TM_INDEX,
- TM_NEWINDEX,
- TM_GC,
- TM_MODE,
- TM_LEN,
- TM_EQ, /* last tag method with `fast' access */
- TM_ADD,
- TM_SUB,
- TM_MUL,
- TM_DIV,
- TM_MOD,
- TM_POW,
- TM_UNM,
- TM_LT,
- TM_LE,
- TM_CONCAT,
- TM_CALL,
- TM_N /* number of elements in the enum */
-} TMS;
-
-
-
-#define gfasttm(g,et,e) ((et) == NULL ? NULL : \
- ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e]))
-
-#define fasttm(l,et,e) gfasttm(G(l), et, e)
-
-#define ttypename(x) luaT_typenames_[(x) + 1]
-#define objtypename(x) ttypename(ttypenv(x))
-
-LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS];
-
-
-LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename);
-LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o,
- TMS event);
-LUAI_FUNC void luaT_init (lua_State *L);
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c
@@ -1,77 +0,0 @@
-/*
-** $Id: ltm.c,v 2.14.1.1 2013/04/12 18:48:47 roberto Exp $
-** Tag methods
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define ltm_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "lobject.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "ltm.h"
-
-
-static const char udatatypename[] = "userdata";
-
-LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = {
- "no value",
- "nil", "boolean", udatatypename, "number",
- "string", "table", "function", udatatypename, "thread",
- "proto", "upval" /* these last two cases are used for tests only */
-};
-
-
-void luaT_init (lua_State *L) {
- static const char *const luaT_eventname[] = { /* ORDER TM */
- "__index", "__newindex",
- "__gc", "__mode", "__len", "__eq",
- "__add", "__sub", "__mul", "__div", "__mod",
- "__pow", "__unm", "__lt", "__le",
- "__concat", "__call"
- };
- int i;
- for (i=0; i<TM_N; i++) {
- G(L)->tmname[i] = luaS_new(L, luaT_eventname[i]);
- luaS_fix(G(L)->tmname[i]); /* never collect these names */
- }
-}
-
-
-/*
-** function to be used with macro "fasttm": optimized for absence of
-** tag methods
-*/
-const TValue *luaT_gettm (Table *events, TMS event, TString *ename) {
- const TValue *tm = luaH_getstr(events, ename);
- lua_assert(event <= TM_EQ);
- if (ttisnil(tm)) { /* no tag method? */
- events->flags |= cast_byte(1u<<event); /* cache this fact */
- return NULL;
- }
- else return tm;
-}
-
-
-const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, TMS event) {
- Table *mt;
- switch (ttypenv(o)) {
- case LUA_TTABLE:
- mt = hvalue(o)->metatable;
- break;
- case LUA_TUSERDATA:
- mt = uvalue(o)->metatable;
- break;
- default:
- mt = G(L)->mt[ttypenv(o)];
- }
- return (mt ? luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject);
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h
@@ -1,443 +0,0 @@
-/*
-** $Id: lua.h,v 1.285.1.4 2015/02/21 14:04:50 roberto Exp $
-** Lua - A Scripting Language
-** Lua.org, PUC-Rio, Brazil (http://www.lua.org)
-** See Copyright Notice at the end of this file
-*/
-
-
-#ifndef lua_h
-#define lua_h
-
-#include <sys/zfs_context.h>
-
-#include "luaconf.h"
-
-
-#define LUA_VERSION_MAJOR "5"
-#define LUA_VERSION_MINOR "2"
-#define LUA_VERSION_NUM 502
-#define LUA_VERSION_RELEASE "4"
-
-#define LUA_VERSION "Lua " LUA_VERSION_MAJOR "." LUA_VERSION_MINOR
-#define LUA_RELEASE LUA_VERSION "." LUA_VERSION_RELEASE
-#define LUA_COPYRIGHT LUA_RELEASE " Copyright (C) 1994-2015 Lua.org, PUC-Rio"
-#define LUA_AUTHORS "R. Ierusalimschy, L. H. de Figueiredo, W. Celes"
-
-
-/* mark for precompiled code ('<esc>Lua') */
-#define LUA_SIGNATURE "\033Lua"
-
-/* option for multiple returns in 'lua_pcall' and 'lua_call' */
-#define LUA_MULTRET (-1)
-
-
-/*
-** pseudo-indices
-*/
-#define LUA_REGISTRYINDEX LUAI_FIRSTPSEUDOIDX
-#define lua_upvalueindex(i) (LUA_REGISTRYINDEX - (i))
-
-
-/* thread status */
-#define LUA_OK 0
-#define LUA_YIELD 1
-#define LUA_ERRRUN 2
-#define LUA_ERRSYNTAX 3
-#define LUA_ERRMEM 4
-#define LUA_ERRGCMM 5
-#define LUA_ERRERR 6
-
-
-typedef struct lua_State lua_State;
-
-typedef int (*lua_CFunction) (lua_State *L);
-
-
-/*
-** functions that read/write blocks when loading/dumping Lua chunks
-*/
-typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz);
-
-typedef int (*lua_Writer) (lua_State *L, const void* p, size_t sz, void* ud);
-
-
-/*
-** prototype for memory-allocation functions
-*/
-typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize);
-
-
-/*
-** basic types
-*/
-#define LUA_TNONE (-1)
-
-#define LUA_TNIL 0
-#define LUA_TBOOLEAN 1
-#define LUA_TLIGHTUSERDATA 2
-#define LUA_TNUMBER 3
-#define LUA_TSTRING 4
-#define LUA_TTABLE 5
-#define LUA_TFUNCTION 6
-#define LUA_TUSERDATA 7
-#define LUA_TTHREAD 8
-
-#define LUA_NUMTAGS 9
-
-
-
-/* minimum Lua stack available to a C function */
-#define LUA_MINSTACK 20
-
-
-/* predefined values in the registry */
-#define LUA_RIDX_MAINTHREAD 1
-#define LUA_RIDX_GLOBALS 2
-#define LUA_RIDX_LAST LUA_RIDX_GLOBALS
-
-
-/* type of numbers in Lua */
-typedef LUA_NUMBER lua_Number;
-
-
-/* type for integer functions */
-typedef LUA_INTEGER lua_Integer;
-
-/* unsigned integer type */
-typedef LUA_UNSIGNED lua_Unsigned;
-
-
-
-
-/*
-** generic extra include file
-*/
-#if defined(LUA_USER_H)
-#include LUA_USER_H
-#endif
-
-
-/*
-** RCS ident string
-*/
-extern const char lua_ident[];
-
-
-/*
-** state manipulation
-*/
-LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud);
-LUA_API void (lua_close) (lua_State *L);
-LUA_API lua_State *(lua_newthread) (lua_State *L);
-
-LUA_API lua_CFunction (lua_atpanic) (lua_State *L, lua_CFunction panicf);
-
-
-LUA_API const lua_Number *(lua_version) (lua_State *L);
-
-
-/*
-** basic stack manipulation
-*/
-LUA_API int (lua_absindex) (lua_State *L, int idx);
-LUA_API int (lua_gettop) (lua_State *L);
-LUA_API void (lua_settop) (lua_State *L, int idx);
-LUA_API void (lua_pushvalue) (lua_State *L, int idx);
-LUA_API void (lua_remove) (lua_State *L, int idx);
-LUA_API void (lua_insert) (lua_State *L, int idx);
-LUA_API void (lua_replace) (lua_State *L, int idx);
-LUA_API void (lua_copy) (lua_State *L, int fromidx, int toidx);
-LUA_API int (lua_checkstack) (lua_State *L, int sz);
-
-LUA_API void (lua_xmove) (lua_State *from, lua_State *to, int n);
-
-
-/*
-** access functions (stack -> C)
-*/
-
-LUA_API int (lua_isnumber) (lua_State *L, int idx);
-LUA_API int (lua_isstring) (lua_State *L, int idx);
-LUA_API int (lua_iscfunction) (lua_State *L, int idx);
-LUA_API int (lua_isuserdata) (lua_State *L, int idx);
-LUA_API int (lua_type) (lua_State *L, int idx);
-LUA_API const char *(lua_typename) (lua_State *L, int tp);
-
-LUA_API lua_Number (lua_tonumberx) (lua_State *L, int idx, int *isnum);
-LUA_API lua_Integer (lua_tointegerx) (lua_State *L, int idx, int *isnum);
-LUA_API lua_Unsigned (lua_tounsignedx) (lua_State *L, int idx, int *isnum);
-LUA_API int (lua_toboolean) (lua_State *L, int idx);
-LUA_API const char *(lua_tolstring) (lua_State *L, int idx, size_t *len);
-LUA_API size_t (lua_rawlen) (lua_State *L, int idx);
-LUA_API lua_CFunction (lua_tocfunction) (lua_State *L, int idx);
-LUA_API void *(lua_touserdata) (lua_State *L, int idx);
-LUA_API lua_State *(lua_tothread) (lua_State *L, int idx);
-LUA_API const void *(lua_topointer) (lua_State *L, int idx);
-
-
-/*
-** Comparison and arithmetic functions
-*/
-
-#define LUA_OPADD 0 /* ORDER TM */
-#define LUA_OPSUB 1
-#define LUA_OPMUL 2
-#define LUA_OPDIV 3
-#define LUA_OPMOD 4
-#define LUA_OPPOW 5
-#define LUA_OPUNM 6
-
-LUA_API void (lua_arith) (lua_State *L, int op);
-
-#define LUA_OPEQ 0
-#define LUA_OPLT 1
-#define LUA_OPLE 2
-
-LUA_API int (lua_rawequal) (lua_State *L, int idx1, int idx2);
-LUA_API int (lua_compare) (lua_State *L, int idx1, int idx2, int op);
-
-
-/*
-** push functions (C -> stack)
-*/
-LUA_API void (lua_pushnil) (lua_State *L);
-LUA_API void (lua_pushnumber) (lua_State *L, lua_Number n);
-LUA_API void (lua_pushinteger) (lua_State *L, lua_Integer n);
-LUA_API void (lua_pushunsigned) (lua_State *L, lua_Unsigned n);
-LUA_API const char *(lua_pushlstring) (lua_State *L, const char *s, size_t l);
-LUA_API const char *(lua_pushstring) (lua_State *L, const char *s);
-LUA_API const char *(lua_pushvfstring) (lua_State *L, const char *fmt,
- va_list argp);
-LUA_API const char *(lua_pushfstring) (lua_State *L, const char *fmt, ...);
-LUA_API void (lua_pushcclosure) (lua_State *L, lua_CFunction fn, int n);
-LUA_API void (lua_pushboolean) (lua_State *L, int b);
-LUA_API void (lua_pushlightuserdata) (lua_State *L, void *p);
-LUA_API int (lua_pushthread) (lua_State *L);
-
-
-/*
-** get functions (Lua -> stack)
-*/
-LUA_API void (lua_getglobal) (lua_State *L, const char *var);
-LUA_API void (lua_gettable) (lua_State *L, int idx);
-LUA_API void (lua_getfield) (lua_State *L, int idx, const char *k);
-LUA_API void (lua_rawget) (lua_State *L, int idx);
-LUA_API void (lua_rawgeti) (lua_State *L, int idx, int n);
-LUA_API void (lua_rawgetp) (lua_State *L, int idx, const void *p);
-LUA_API void (lua_createtable) (lua_State *L, int narr, int nrec);
-LUA_API void *(lua_newuserdata) (lua_State *L, size_t sz);
-LUA_API int (lua_getmetatable) (lua_State *L, int objindex);
-LUA_API void (lua_getuservalue) (lua_State *L, int idx);
-
-
-/*
-** set functions (stack -> Lua)
-*/
-LUA_API void (lua_setglobal) (lua_State *L, const char *var);
-LUA_API void (lua_settable) (lua_State *L, int idx);
-LUA_API void (lua_setfield) (lua_State *L, int idx, const char *k);
-LUA_API void (lua_rawset) (lua_State *L, int idx);
-LUA_API void (lua_rawseti) (lua_State *L, int idx, int n);
-LUA_API void (lua_rawsetp) (lua_State *L, int idx, const void *p);
-LUA_API int (lua_setmetatable) (lua_State *L, int objindex);
-LUA_API void (lua_setuservalue) (lua_State *L, int idx);
-
-
-/*
-** 'load' and 'call' functions (load and run Lua code)
-*/
-LUA_API void (lua_callk) (lua_State *L, int nargs, int nresults, int ctx,
- lua_CFunction k);
-#define lua_call(L,n,r) lua_callk(L, (n), (r), 0, NULL)
-
-LUA_API int (lua_getctx) (lua_State *L, int *ctx);
-
-LUA_API int (lua_pcallk) (lua_State *L, int nargs, int nresults, int errfunc,
- int ctx, lua_CFunction k);
-#define lua_pcall(L,n,r,f) lua_pcallk(L, (n), (r), (f), 0, NULL)
-
-LUA_API int (lua_load) (lua_State *L, lua_Reader reader, void *dt,
- const char *chunkname,
- const char *mode);
-
-LUA_API int (lua_dump) (lua_State *L, lua_Writer writer, void *data);
-
-
-/*
-** coroutine functions
-*/
-LUA_API int (lua_yieldk) (lua_State *L, int nresults, int ctx,
- lua_CFunction k);
-#define lua_yield(L,n) lua_yieldk(L, (n), 0, NULL)
-LUA_API int (lua_resume) (lua_State *L, lua_State *from, int narg);
-LUA_API int (lua_status) (lua_State *L);
-
-/*
-** garbage-collection function and options
-*/
-
-#define LUA_GCSTOP 0
-#define LUA_GCRESTART 1
-#define LUA_GCCOLLECT 2
-#define LUA_GCCOUNT 3
-#define LUA_GCCOUNTB 4
-#define LUA_GCSTEP 5
-#define LUA_GCSETPAUSE 6
-#define LUA_GCSETSTEPMUL 7
-#define LUA_GCSETMAJORINC 8
-#define LUA_GCISRUNNING 9
-#define LUA_GCGEN 10
-#define LUA_GCINC 11
-
-LUA_API int (lua_gc) (lua_State *L, int what, int data);
-
-
-/*
-** miscellaneous functions
-*/
-
-LUA_API int (lua_error) (lua_State *L);
-
-LUA_API int (lua_next) (lua_State *L, int idx);
-
-LUA_API void (lua_concat) (lua_State *L, int n);
-LUA_API void (lua_len) (lua_State *L, int idx);
-
-LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud);
-LUA_API void (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud);
-
-
-
-/*
-** ===============================================================
-** some useful macros
-** ===============================================================
-*/
-
-#define lua_tonumber(L,i) lua_tonumberx(L,i,NULL)
-#define lua_tointeger(L,i) lua_tointegerx(L,i,NULL)
-#define lua_tounsigned(L,i) lua_tounsignedx(L,i,NULL)
-
-#define lua_pop(L,n) lua_settop(L, -(n)-1)
-
-#define lua_newtable(L) lua_createtable(L, 0, 0)
-
-#define lua_register(L,n,f) (lua_pushcfunction(L, (f)), lua_setglobal(L, (n)))
-
-#define lua_pushcfunction(L,f) lua_pushcclosure(L, (f), 0)
-
-#define lua_isfunction(L,n) (lua_type(L, (n)) == LUA_TFUNCTION)
-#define lua_istable(L,n) (lua_type(L, (n)) == LUA_TTABLE)
-#define lua_islightuserdata(L,n) (lua_type(L, (n)) == LUA_TLIGHTUSERDATA)
-#define lua_isnil(L,n) (lua_type(L, (n)) == LUA_TNIL)
-#define lua_isboolean(L,n) (lua_type(L, (n)) == LUA_TBOOLEAN)
-#define lua_isthread(L,n) (lua_type(L, (n)) == LUA_TTHREAD)
-#define lua_isnone(L,n) (lua_type(L, (n)) == LUA_TNONE)
-#define lua_isnoneornil(L, n) (lua_type(L, (n)) <= 0)
-
-#define lua_pushliteral(L, s) \
- lua_pushlstring(L, "" s, (sizeof(s)/sizeof(char))-1)
-
-#define lua_pushglobaltable(L) \
- lua_rawgeti(L, LUA_REGISTRYINDEX, LUA_RIDX_GLOBALS)
-
-#define lua_tostring(L,i) lua_tolstring(L, (i), NULL)
-
-
-
-/*
-** {======================================================================
-** Debug API
-** =======================================================================
-*/
-
-
-/*
-** Event codes
-*/
-#define LUA_HOOKCALL 0
-#define LUA_HOOKRET 1
-#define LUA_HOOKLINE 2
-#define LUA_HOOKCOUNT 3
-#define LUA_HOOKTAILCALL 4
-
-
-/*
-** Event masks
-*/
-#define LUA_MASKCALL (1 << LUA_HOOKCALL)
-#define LUA_MASKRET (1 << LUA_HOOKRET)
-#define LUA_MASKLINE (1 << LUA_HOOKLINE)
-#define LUA_MASKCOUNT (1 << LUA_HOOKCOUNT)
-
-typedef struct lua_Debug lua_Debug; /* activation record */
-
-
-/* Functions to be called by the debugger in specific events */
-typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar);
-
-
-LUA_API int (lua_getstack) (lua_State *L, int level, lua_Debug *ar);
-LUA_API int (lua_getinfo) (lua_State *L, const char *what, lua_Debug *ar);
-LUA_API const char *(lua_getlocal) (lua_State *L, const lua_Debug *ar, int n);
-LUA_API const char *(lua_setlocal) (lua_State *L, const lua_Debug *ar, int n);
-LUA_API const char *(lua_getupvalue) (lua_State *L, int funcindex, int n);
-LUA_API const char *(lua_setupvalue) (lua_State *L, int funcindex, int n);
-
-LUA_API void *(lua_upvalueid) (lua_State *L, int fidx, int n);
-LUA_API void (lua_upvaluejoin) (lua_State *L, int fidx1, int n1,
- int fidx2, int n2);
-
-LUA_API int (lua_sethook) (lua_State *L, lua_Hook func, int mask, int count);
-LUA_API lua_Hook (lua_gethook) (lua_State *L);
-LUA_API int (lua_gethookmask) (lua_State *L);
-LUA_API int (lua_gethookcount) (lua_State *L);
-
-
-struct lua_Debug {
- int event;
- const char *name; /* (n) */
- const char *namewhat; /* (n) 'global', 'local', 'field', 'method' */
- const char *what; /* (S) 'Lua', 'C', 'main', 'tail' */
- const char *source; /* (S) */
- int currentline; /* (l) */
- int linedefined; /* (S) */
- int lastlinedefined; /* (S) */
- unsigned char nups; /* (u) number of upvalues */
- unsigned char nparams;/* (u) number of parameters */
- char isvararg; /* (u) */
- char istailcall; /* (t) */
- char short_src[LUA_IDSIZE]; /* (S) */
- /* private part */
- struct CallInfo *i_ci; /* active function */
-};
-
-/* }====================================================================== */
-
-
-/******************************************************************************
-* Copyright (C) 1994-2015 Lua.org, PUC-Rio.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to deal in the Software without restriction, including
-* without limitation the rights to use, copy, modify, merge, publish,
-* distribute, sublicense, and/or sell copies of the Software, and to
-* permit persons to whom the Software is furnished to do so, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-******************************************************************************/
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h
@@ -1,555 +0,0 @@
-/*
-** $Id: luaconf.h,v 1.176.1.2 2013/11/21 17:26:16 roberto Exp $
-** Configuration file for Lua
-** See Copyright Notice in lua.h
-*/
-
-
-#ifndef lconfig_h
-#define lconfig_h
-
-#include <sys/zfs_context.h>
-#ifdef illumos
-#include <sys/int_fmtio.h>
-#else
-#include <machine/_inttypes.h>
-#endif
-
-extern ssize_t lcompat_sprintf(char *, const char *, ...);
-extern int64_t lcompat_strtoll(const char *, char **);
-extern int64_t lcompat_pow(int64_t, int64_t);
-
-/*
-** ==================================================================
-** Search for "@@" to find all configurable definitions.
-** ===================================================================
-*/
-
-
-/*
-@@ LUA_ANSI controls the use of non-ansi features.
-** CHANGE it (define it) if you want Lua to avoid the use of any
-** non-ansi feature or library.
-*/
-#if !defined(LUA_ANSI) && defined(__STRICT_ANSI__)
-#define LUA_ANSI
-#endif
-
-
-#if !defined(LUA_ANSI) && defined(_WIN32) && !defined(_WIN32_WCE)
-#define LUA_WIN /* enable goodies for regular Windows platforms */
-#endif
-
-#if defined(LUA_WIN)
-#define LUA_DL_DLL
-#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */
-#endif
-
-
-
-#if defined(LUA_USE_LINUX)
-#define LUA_USE_POSIX
-#define LUA_USE_DLOPEN /* needs an extra library: -ldl */
-#define LUA_USE_READLINE /* needs some extra libraries */
-#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */
-#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */
-#define LUA_USE_LONGLONG /* assume support for long long */
-#endif
-
-#if defined(LUA_USE_MACOSX)
-#define LUA_USE_POSIX
-#define LUA_USE_DLOPEN /* does not need -ldl */
-#define LUA_USE_READLINE /* needs an extra library: -lreadline */
-#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */
-#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */
-#define LUA_USE_LONGLONG /* assume support for long long */
-#endif
-
-
-
-/*
-@@ LUA_USE_POSIX includes all functionality listed as X/Open System
-@* Interfaces Extension (XSI).
-** CHANGE it (define it) if your system is XSI compatible.
-*/
-#if defined(LUA_USE_POSIX)
-#define LUA_USE_MKSTEMP
-#define LUA_USE_ISATTY
-#define LUA_USE_POPEN
-#define LUA_USE_ULONGJMP
-#define LUA_USE_GMTIME_R
-#endif
-
-
-
-/*
-@@ LUA_PATH_DEFAULT is the default path that Lua uses to look for
-@* Lua libraries.
-@@ LUA_CPATH_DEFAULT is the default path that Lua uses to look for
-@* C libraries.
-** CHANGE them if your machine has a non-conventional directory
-** hierarchy or if you want to install your libraries in
-** non-conventional directories.
-*/
-#if defined(_WIN32) /* { */
-/*
-** In Windows, any exclamation mark ('!') in the path is replaced by the
-** path of the directory of the executable file of the current process.
-*/
-#define LUA_LDIR "!\\lua\\"
-#define LUA_CDIR "!\\"
-#define LUA_PATH_DEFAULT \
- LUA_LDIR"?.lua;" LUA_LDIR"?\\init.lua;" \
- LUA_CDIR"?.lua;" LUA_CDIR"?\\init.lua;" ".\\?.lua"
-#define LUA_CPATH_DEFAULT \
- LUA_CDIR"?.dll;" LUA_CDIR"loadall.dll;" ".\\?.dll"
-
-#else /* }{ */
-
-#define LUA_VDIR LUA_VERSION_MAJOR "." LUA_VERSION_MINOR "/"
-#define LUA_ROOT "/usr/local/"
-#define LUA_LDIR LUA_ROOT "share/lua/" LUA_VDIR
-#define LUA_CDIR LUA_ROOT "lib/lua/" LUA_VDIR
-#define LUA_PATH_DEFAULT \
- LUA_LDIR"?.lua;" LUA_LDIR"?/init.lua;" \
- LUA_CDIR"?.lua;" LUA_CDIR"?/init.lua;" "./?.lua"
-#define LUA_CPATH_DEFAULT \
- LUA_CDIR"?.so;" LUA_CDIR"loadall.so;" "./?.so"
-#endif /* } */
-
-
-/*
-@@ LUA_DIRSEP is the directory separator (for submodules).
-** CHANGE it if your machine does not use "/" as the directory separator
-** and is not Windows. (On Windows Lua automatically uses "\".)
-*/
-#if defined(_WIN32)
-#define LUA_DIRSEP "\\"
-#else
-#define LUA_DIRSEP "/"
-#endif
-
-
-/*
-@@ LUA_ENV is the name of the variable that holds the current
-@@ environment, used to access global names.
-** CHANGE it if you do not like this name.
-*/
-#define LUA_ENV "_ENV"
-
-
-/*
-@@ LUA_API is a mark for all core API functions.
-@@ LUALIB_API is a mark for all auxiliary library functions.
-@@ LUAMOD_API is a mark for all standard library opening functions.
-** CHANGE them if you need to define those functions in some special way.
-** For instance, if you want to create one Windows DLL with the core and
-** the libraries, you may want to use the following definition (define
-** LUA_BUILD_AS_DLL to get it).
-*/
-#if defined(LUA_BUILD_AS_DLL) /* { */
-
-#if defined(LUA_CORE) || defined(LUA_LIB) /* { */
-#define LUA_API __declspec(dllexport)
-#else /* }{ */
-#define LUA_API __declspec(dllimport)
-#endif /* } */
-
-#else /* }{ */
-
-#define LUA_API extern
-
-#endif /* } */
-
-
-/* more often than not the libs go together with the core */
-#define LUALIB_API LUA_API
-#define LUAMOD_API LUALIB_API
-
-
-/*
-@@ LUAI_FUNC is a mark for all extern functions that are not to be
-@* exported to outside modules.
-@@ LUAI_DDEF and LUAI_DDEC are marks for all extern (const) variables
-@* that are not to be exported to outside modules (LUAI_DDEF for
-@* definitions and LUAI_DDEC for declarations).
-** CHANGE them if you need to mark them in some special way. Elf/gcc
-** (versions 3.2 and later) mark them as "hidden" to optimize access
-** when Lua is compiled as a shared library. Not all elf targets support
-** this attribute. Unfortunately, gcc does not offer a way to check
-** whether the target offers that support, and those without support
-** give a warning about it. To avoid these warnings, change to the
-** default definition.
-*/
-#if defined(__GNUC__) && ((__GNUC__*100 + __GNUC_MINOR__) >= 302) && \
- defined(__ELF__) /* { */
-#define LUAI_FUNC __attribute__((visibility("hidden"))) extern
-#define LUAI_DDEC LUAI_FUNC
-#define LUAI_DDEF /* empty */
-
-#else /* }{ */
-#define LUAI_FUNC extern
-#define LUAI_DDEC extern
-#define LUAI_DDEF /* empty */
-#endif /* } */
-
-
-
-/*
-@@ LUA_QL describes how error messages quote program elements.
-** CHANGE it if you want a different appearance.
-*/
-#define LUA_QL(x) "'" x "'"
-#define LUA_QS LUA_QL("%s")
-
-
-/*
-@@ LUA_IDSIZE gives the maximum size for the description of the source
-@* of a function in debug information.
-** CHANGE it if you want a different size.
-*/
-#define LUA_IDSIZE 60
-
-
-/*
-@@ luai_writestringerror defines how to print error messages.
-** (A format string with one argument is enough for Lua...)
-*/
-#ifdef _KERNEL
-#define luai_writestringerror(s,p) \
- (zfs_dbgmsg((s), (p)))
-#else
-#define luai_writestringerror(s,p) \
- (fprintf(stderr, (s), (p)), fflush(stderr))
-#endif
-
-
-/*
-@@ LUAI_MAXSHORTLEN is the maximum length for short strings, that is,
-** strings that are internalized. (Cannot be smaller than reserved words
-** or tags for metamethods, as these strings must be internalized;
-** #("function") = 8, #("__newindex") = 10.)
-*/
-#define LUAI_MAXSHORTLEN 40
-
-
-
-/*
-** {==================================================================
-** Compatibility with previous versions
-** ===================================================================
-*/
-
-/*
-@@ LUA_COMPAT_ALL controls all compatibility options.
-** You can define it to get all options, or change specific options
-** to fit your specific needs.
-*/
-#if defined(LUA_COMPAT_ALL) /* { */
-
-/*
-@@ LUA_COMPAT_UNPACK controls the presence of global 'unpack'.
-** You can replace it with 'table.unpack'.
-*/
-#define LUA_COMPAT_UNPACK
-
-/*
-@@ LUA_COMPAT_LOADERS controls the presence of table 'package.loaders'.
-** You can replace it with 'package.searchers'.
-*/
-#define LUA_COMPAT_LOADERS
-
-/*
-@@ macro 'lua_cpcall' emulates deprecated function lua_cpcall.
-** You can call your C function directly (with light C functions).
-*/
-#define lua_cpcall(L,f,u) \
- (lua_pushcfunction(L, (f)), \
- lua_pushlightuserdata(L,(u)), \
- lua_pcall(L,1,0,0))
-
-
-/*
-@@ LUA_COMPAT_LOG10 defines the function 'log10' in the math library.
-** You can rewrite 'log10(x)' as 'log(x, 10)'.
-*/
-#define LUA_COMPAT_LOG10
-
-/*
-@@ LUA_COMPAT_LOADSTRING defines the function 'loadstring' in the base
-** library. You can rewrite 'loadstring(s)' as 'load(s)'.
-*/
-#define LUA_COMPAT_LOADSTRING
-
-/*
-@@ LUA_COMPAT_MAXN defines the function 'maxn' in the table library.
-*/
-#define LUA_COMPAT_MAXN
-
-/*
-@@ The following macros supply trivial compatibility for some
-** changes in the API. The macros themselves document how to
-** change your code to avoid using them.
-*/
-#define lua_strlen(L,i) lua_rawlen(L, (i))
-
-#define lua_objlen(L,i) lua_rawlen(L, (i))
-
-#define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ)
-#define lua_lessthan(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPLT)
-
-/*
-@@ LUA_COMPAT_MODULE controls compatibility with previous
-** module functions 'module' (Lua) and 'luaL_register' (C).
-*/
-#define LUA_COMPAT_MODULE
-
-#endif /* } */
-
-/* }================================================================== */
-
-
-
-/*
-@@ LUAI_BITSINT defines the number of bits in an int.
-** CHANGE here if Lua cannot automatically detect the number of bits of
-** your machine. Probably you do not need to change this.
-*/
-/* avoid overflows in comparison */
-#if INT_MAX-20 < 32760 /* { */
-#define LUAI_BITSINT 16
-#elif INT_MAX > 2147483640L /* }{ */
-/* int has at least 32 bits */
-#define LUAI_BITSINT 32
-#else /* }{ */
-#error "you must define LUA_BITSINT with number of bits in an integer"
-#endif /* } */
-
-
-/*
-@@ LUA_INT32 is a signed integer with exactly 32 bits.
-@@ LUAI_UMEM is an unsigned integer big enough to count the total
-@* memory used by Lua.
-@@ LUAI_MEM is a signed integer big enough to count the total memory
-@* used by Lua.
-** CHANGE here if for some weird reason the default definitions are not
-** good enough for your machine. Probably you do not need to change
-** this.
-*/
-#if LUAI_BITSINT >= 32 /* { */
-#define LUA_INT32 int
-#define LUAI_UMEM size_t
-#define LUAI_MEM ptrdiff_t
-#else /* }{ */
-/* 16-bit ints */
-#define LUA_INT32 long
-#define LUAI_UMEM unsigned long
-#define LUAI_MEM long
-#endif /* } */
-
-
-/*
-@@ LUAI_MAXSTACK limits the size of the Lua stack.
-** CHANGE it if you need a different limit. This limit is arbitrary;
-** its only purpose is to stop Lua from consuming unlimited stack
-** space (and to reserve some numbers for pseudo-indices).
-*/
-#if LUAI_BITSINT >= 32
-#define LUAI_MAXSTACK 1000000
-#else
-#define LUAI_MAXSTACK 15000
-#endif
-
-/* reserve some space for error handling */
-#define LUAI_FIRSTPSEUDOIDX (-LUAI_MAXSTACK - 1000)
-
-
-
-
-/*
-@@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system.
-** CHANGE it if it uses too much C-stack space.
-*/
-#define LUAL_BUFFERSIZE 1024
-
-
-
-
-/*
-** {==================================================================
-@@ LUA_NUMBER is the type of numbers in Lua.
-** CHANGE the following definitions only if you want to build Lua
-** with a number type different from double. You may also need to
-** change lua_number2int & lua_number2integer.
-** ===================================================================
-*/
-
-#define LUA_NUMBER int64_t
-
-/*
-@@ LUAI_UACNUMBER is the result of an 'usual argument conversion'
-@* over a number.
-*/
-#define LUAI_UACNUMBER int64_t
-
-
-/*
-@@ LUA_NUMBER_SCAN is the format for reading numbers.
-@@ LUA_NUMBER_FMT is the format for writing numbers.
-@@ lua_number2str converts a number to a string.
-@@ LUAI_MAXNUMBER2STR is maximum size of previous conversion.
-*/
-#define LUA_NUMBER_FMT "%" PRId64
-#define lua_number2str(s,n) lcompat_sprintf((s), LUA_NUMBER_FMT, (n))
-#define LUAI_MAXNUMBER2STR 32 /* 16 digits, sign, point, and \0 */
-
-
-/*
-@@ l_mathop allows the addition of an 'l' or 'f' to all math operations
-*/
-#define l_mathop(x) (x ## l)
-
-
-/*
-@@ lua_str2number converts a decimal numeric string to a number.
-@@ lua_strx2number converts an hexadecimal numeric string to a number.
-** In C99, 'strtod' does both conversions. C89, however, has no function
-** to convert floating hexadecimal strings to numbers. For these
-** systems, you can leave 'lua_strx2number' undefined and Lua will
-** provide its own implementation.
-*/
-#define lua_str2number(s,p) lcompat_strtoll((s), (p))
-
-#if defined(LUA_USE_STRTODHEX)
-#define lua_strx2number(s,p) lcompat_strtoll((s), (p))
-#endif
-
-
-/*
-@@ The luai_num* macros define the primitive operations over numbers.
-*/
-
-/* the following operations need the math library */
-#if defined(lobject_c) || defined(lvm_c)
-#define luai_nummod(L,a,b) ((a) % (b))
-#define luai_numpow(L,a,b) (lcompat_pow((a),(b)))
-#endif
-
-/* these are quite standard operations */
-#if defined(LUA_CORE)
-#define luai_numadd(L,a,b) ((a)+(b))
-#define luai_numsub(L,a,b) ((a)-(b))
-#define luai_nummul(L,a,b) ((a)*(b))
-#define luai_numdiv(L,a,b) ((a)/(b))
-#define luai_numunm(L,a) (-(a))
-#define luai_numeq(a,b) ((a)==(b))
-#define luai_numlt(L,a,b) ((a)<(b))
-#define luai_numle(L,a,b) ((a)<=(b))
-#define luai_numisnan(L,a) (!luai_numeq((a), (a)))
-#endif
-
-
-
-/*
-@@ LUA_INTEGER is the integral type used by lua_pushinteger/lua_tointeger.
-** CHANGE that if ptrdiff_t is not adequate on your machine. (On most
-** machines, ptrdiff_t gives a good choice between int or long.)
-*/
-#define LUA_INTEGER ptrdiff_t
-
-/*
-@@ LUA_UNSIGNED is the integral type used by lua_pushunsigned/lua_tounsigned.
-** It must have at least 32 bits.
-*/
-#define LUA_UNSIGNED uint64_t
-
-
-
-/*
-** Some tricks with doubles
-*/
-
-#if defined(LUA_NUMBER_DOUBLE) && !defined(LUA_ANSI) /* { */
-/*
-** The next definitions activate some tricks to speed up the
-** conversion from doubles to integer types, mainly to LUA_UNSIGNED.
-**
-@@ LUA_MSASMTRICK uses Microsoft assembler to avoid clashes with a
-** DirectX idiosyncrasy.
-**
-@@ LUA_IEEE754TRICK uses a trick that should work on any machine
-** using IEEE754 with a 32-bit integer type.
-**
-@@ LUA_IEEELL extends the trick to LUA_INTEGER; should only be
-** defined when LUA_INTEGER is a 32-bit integer.
-**
-@@ LUA_IEEEENDIAN is the endianness of doubles in your machine
-** (0 for little endian, 1 for big endian); if not defined, Lua will
-** check it dynamically for LUA_IEEE754TRICK (but not for LUA_NANTRICK).
-**
-@@ LUA_NANTRICK controls the use of a trick to pack all types into
-** a single double value, using NaN values to represent non-number
-** values. The trick only works on 32-bit machines (ints and pointers
-** are 32-bit values) with numbers represented as IEEE 754-2008 doubles
-** with conventional endianess (12345678 or 87654321), in CPUs that do
-** not produce signaling NaN values (all NaNs are quiet).
-*/
-
-/* Microsoft compiler on a Pentium (32 bit) ? */
-#if defined(LUA_WIN) && defined(_MSC_VER) && defined(_M_IX86) /* { */
-
-#define LUA_MSASMTRICK
-#define LUA_IEEEENDIAN 0
-#define LUA_NANTRICK
-
-
-/* pentium 32 bits? */
-#elif defined(__i386__) || defined(__i386) || defined(__X86__) /* }{ */
-
-#define LUA_IEEE754TRICK
-#define LUA_IEEELL
-#define LUA_IEEEENDIAN 0
-#define LUA_NANTRICK
-
-/* pentium 64 bits? */
-#elif defined(__x86_64) /* }{ */
-
-#define LUA_IEEE754TRICK
-#define LUA_IEEEENDIAN 0
-
-#elif defined(__POWERPC__) || defined(__ppc__) /* }{ */
-
-#define LUA_IEEE754TRICK
-#define LUA_IEEEENDIAN 1
-
-#else /* }{ */
-
-/* assume IEEE754 and a 32-bit integer type */
-#define LUA_IEEE754TRICK
-
-#endif /* } */
-
-#endif /* } */
-
-/* }================================================================== */
-
-
-
-
-/* =================================================================== */
-
-/*
-** Local configuration. You can use this space to add your redefinitions
-** without modifying the main part of the file.
-*/
-
-#define getlocaledecpoint() ('.')
-
-#define abs(x) (((x) < 0) ? -(x) : (x))
-
-#if !defined(UCHAR_MAX)
-#define UCHAR_MAX (0xff)
-#endif
-
-#endif
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h
@@ -1,55 +0,0 @@
-/*
-** $Id: lualib.h,v 1.43.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lua standard libraries
-** See Copyright Notice in lua.h
-*/
-
-
-#ifndef lualib_h
-#define lualib_h
-
-#include "lua.h"
-
-
-
-LUAMOD_API int (luaopen_base) (lua_State *L);
-
-#define LUA_COLIBNAME "coroutine"
-LUAMOD_API int (luaopen_coroutine) (lua_State *L);
-
-#define LUA_TABLIBNAME "table"
-LUAMOD_API int (luaopen_table) (lua_State *L);
-
-#define LUA_IOLIBNAME "io"
-LUAMOD_API int (luaopen_io) (lua_State *L);
-
-#define LUA_OSLIBNAME "os"
-LUAMOD_API int (luaopen_os) (lua_State *L);
-
-#define LUA_STRLIBNAME "string"
-LUAMOD_API int (luaopen_string) (lua_State *L);
-
-#define LUA_BITLIBNAME "bit32"
-LUAMOD_API int (luaopen_bit32) (lua_State *L);
-
-#define LUA_MATHLIBNAME "math"
-LUAMOD_API int (luaopen_math) (lua_State *L);
-
-#define LUA_DBLIBNAME "debug"
-LUAMOD_API int (luaopen_debug) (lua_State *L);
-
-#define LUA_LOADLIBNAME "package"
-LUAMOD_API int (luaopen_package) (lua_State *L);
-
-
-/* open all previous libraries */
-LUALIB_API void (luaL_openlibs) (lua_State *L);
-
-
-
-#if !defined(lua_assert)
-#define lua_assert(x) ((void)0)
-#endif
-
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h
@@ -1,28 +0,0 @@
-/*
-** $Id: lundump.h,v 1.39.1.1 2013/04/12 18:48:47 roberto Exp $
-** load precompiled Lua chunks
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lundump_h
-#define lundump_h
-
-#include "lobject.h"
-#include "lzio.h"
-
-/* load one chunk; from lundump.c */
-LUAI_FUNC Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name);
-
-/* make header; from lundump.c */
-LUAI_FUNC void luaU_header (lu_byte* h);
-
-/* dump one chunk; from ldump.c */
-LUAI_FUNC int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip);
-
-/* data to catch conversion errors */
-#define LUAC_TAIL "\x19\x93\r\n\x1a\n"
-
-/* size in bytes of header of binary files */
-#define LUAC_HEADERSIZE (sizeof(LUA_SIGNATURE)-sizeof(char)+2+6+sizeof(LUAC_TAIL)-sizeof(char))
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c
@@ -1,258 +0,0 @@
-/*
-** $Id: lundump.c,v 2.22.1.1 2013/04/12 18:48:47 roberto Exp $
-** load precompiled Lua chunks
-** See Copyright Notice in lua.h
-*/
-
-#include <sys/zfs_context.h>
-
-#define lundump_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lmem.h"
-#include "lobject.h"
-#include "lstring.h"
-#include "lundump.h"
-#include "lzio.h"
-
-typedef struct {
- lua_State* L;
- ZIO* Z;
- Mbuffer* b;
- const char* name;
-} LoadState;
-
-static l_noret error(LoadState* S, const char* why)
-{
- luaO_pushfstring(S->L,"%s: %s precompiled chunk",S->name,why);
- luaD_throw(S->L,LUA_ERRSYNTAX);
-}
-
-#define LoadMem(S,b,n,size) LoadBlock(S,b,(n)*(size))
-#define LoadByte(S) (lu_byte)LoadChar(S)
-#define LoadVar(S,x) LoadMem(S,&x,1,sizeof(x))
-#define LoadVector(S,b,n,size) LoadMem(S,b,n,size)
-
-#if !defined(luai_verifycode)
-#define luai_verifycode(L,b,f) /* empty */
-#endif
-
-static void LoadBlock(LoadState* S, void* b, size_t size)
-{
- if (luaZ_read(S->Z,b,size)!=0) error(S,"truncated");
-}
-
-static int LoadChar(LoadState* S)
-{
- char x;
- LoadVar(S,x);
- return x;
-}
-
-static int LoadInt(LoadState* S)
-{
- int x;
- LoadVar(S,x);
- if (x<0) error(S,"corrupted");
- return x;
-}
-
-static lua_Number LoadNumber(LoadState* S)
-{
- lua_Number x;
- LoadVar(S,x);
- return x;
-}
-
-static TString* LoadString(LoadState* S)
-{
- size_t size;
- LoadVar(S,size);
- if (size==0)
- return NULL;
- else
- {
- char* s=luaZ_openspace(S->L,S->b,size);
- LoadBlock(S,s,size*sizeof(char));
- return luaS_newlstr(S->L,s,size-1); /* remove trailing '\0' */
- }
-}
-
-static void LoadCode(LoadState* S, Proto* f)
-{
- int n=LoadInt(S);
- f->code=luaM_newvector(S->L,n,Instruction);
- f->sizecode=n;
- LoadVector(S,f->code,n,sizeof(Instruction));
-}
-
-static void LoadFunction(LoadState* S, Proto* f);
-
-static void LoadConstants(LoadState* S, Proto* f)
-{
- int i,n;
- n=LoadInt(S);
- f->k=luaM_newvector(S->L,n,TValue);
- f->sizek=n;
- for (i=0; i<n; i++) setnilvalue(&f->k[i]);
- for (i=0; i<n; i++)
- {
- TValue* o=&f->k[i];
- int t=LoadChar(S);
- switch (t)
- {
- case LUA_TNIL:
- setnilvalue(o);
- break;
- case LUA_TBOOLEAN:
- setbvalue(o,LoadChar(S));
- break;
- case LUA_TNUMBER:
- setnvalue(o,LoadNumber(S));
- break;
- case LUA_TSTRING:
- setsvalue2n(S->L,o,LoadString(S));
- break;
- default: lua_assert(0);
- }
- }
- n=LoadInt(S);
- f->p=luaM_newvector(S->L,n,Proto*);
- f->sizep=n;
- for (i=0; i<n; i++) f->p[i]=NULL;
- for (i=0; i<n; i++)
- {
- f->p[i]=luaF_newproto(S->L);
- LoadFunction(S,f->p[i]);
- }
-}
-
-static void LoadUpvalues(LoadState* S, Proto* f)
-{
- int i,n;
- n=LoadInt(S);
- f->upvalues=luaM_newvector(S->L,n,Upvaldesc);
- f->sizeupvalues=n;
- for (i=0; i<n; i++) f->upvalues[i].name=NULL;
- for (i=0; i<n; i++)
- {
- f->upvalues[i].instack=LoadByte(S);
- f->upvalues[i].idx=LoadByte(S);
- }
-}
-
-static void LoadDebug(LoadState* S, Proto* f)
-{
- int i,n;
- f->source=LoadString(S);
- n=LoadInt(S);
- f->lineinfo=luaM_newvector(S->L,n,int);
- f->sizelineinfo=n;
- LoadVector(S,f->lineinfo,n,sizeof(int));
- n=LoadInt(S);
- f->locvars=luaM_newvector(S->L,n,LocVar);
- f->sizelocvars=n;
- for (i=0; i<n; i++) f->locvars[i].varname=NULL;
- for (i=0; i<n; i++)
- {
- f->locvars[i].varname=LoadString(S);
- f->locvars[i].startpc=LoadInt(S);
- f->locvars[i].endpc=LoadInt(S);
- }
- n=LoadInt(S);
- for (i=0; i<n; i++) f->upvalues[i].name=LoadString(S);
-}
-
-static void LoadFunction(LoadState* S, Proto* f)
-{
- f->linedefined=LoadInt(S);
- f->lastlinedefined=LoadInt(S);
- f->numparams=LoadByte(S);
- f->is_vararg=LoadByte(S);
- f->maxstacksize=LoadByte(S);
- LoadCode(S,f);
- LoadConstants(S,f);
- LoadUpvalues(S,f);
- LoadDebug(S,f);
-}
-
-/* the code below must be consistent with the code in luaU_header */
-#define N0 LUAC_HEADERSIZE
-#define N1 (sizeof(LUA_SIGNATURE)-sizeof(char))
-#define N2 N1+2
-#define N3 N2+6
-
-static void LoadHeader(LoadState* S)
-{
- lu_byte h[LUAC_HEADERSIZE];
- lu_byte s[LUAC_HEADERSIZE];
- luaU_header(h);
- memcpy(s,h,sizeof(char)); /* first char already read */
- LoadBlock(S,s+sizeof(char),LUAC_HEADERSIZE-sizeof(char));
- if (memcmp(h,s,N0)==0) return;
- if (memcmp(h,s,N1)!=0) error(S,"not a");
- if (memcmp(h,s,N2)!=0) error(S,"version mismatch in");
- if (memcmp(h,s,N3)!=0) error(S,"incompatible"); else error(S,"corrupted");
-}
-
-/*
-** load precompiled chunk
-*/
-Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name)
-{
- LoadState S;
- Closure* cl;
- if (*name=='@' || *name=='=')
- S.name=name+1;
- else if (*name==LUA_SIGNATURE[0])
- S.name="binary string";
- else
- S.name=name;
- S.L=L;
- S.Z=Z;
- S.b=buff;
- LoadHeader(&S);
- cl=luaF_newLclosure(L,1);
- setclLvalue(L,L->top,cl); incr_top(L);
- cl->l.p=luaF_newproto(L);
- LoadFunction(&S,cl->l.p);
- if (cl->l.p->sizeupvalues != 1)
- {
- Proto* p=cl->l.p;
- cl=luaF_newLclosure(L,cl->l.p->sizeupvalues);
- cl->l.p=p;
- setclLvalue(L,L->top-1,cl);
- }
- luai_verifycode(L,buff,cl->l.p);
- return cl;
-}
-
-#define MYINT(s) (s[0]-'0')
-#define VERSION MYINT(LUA_VERSION_MAJOR)*16+MYINT(LUA_VERSION_MINOR)
-#define FORMAT 0 /* this is the official format */
-
-/*
-* make header for precompiled chunks
-* if you change the code below be sure to update LoadHeader and FORMAT above
-* and LUAC_HEADERSIZE in lundump.h
-*/
-void luaU_header (lu_byte* h)
-{
- int x=1;
- memcpy(h,LUA_SIGNATURE,sizeof(LUA_SIGNATURE)-sizeof(char));
- h+=sizeof(LUA_SIGNATURE)-sizeof(char);
- *h++=cast_byte(VERSION);
- *h++=cast_byte(FORMAT);
- *h++=cast_byte(*(char*)&x); /* endianness */
- *h++=cast_byte(sizeof(int));
- *h++=cast_byte(sizeof(size_t));
- *h++=cast_byte(sizeof(Instruction));
- *h++=cast_byte(sizeof(lua_Number));
- *h++=cast_byte(((lua_Number)0.5)==0); /* is lua_Number integral? */
- memcpy(h,LUAC_TAIL,sizeof(LUAC_TAIL)-sizeof(char));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h
@@ -1,44 +0,0 @@
-/*
-** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lua virtual machine
-** See Copyright Notice in lua.h
-*/
-
-#ifndef lvm_h
-#define lvm_h
-
-
-#include "ldo.h"
-#include "lobject.h"
-#include "ltm.h"
-
-
-#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o)))
-
-#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL))
-
-#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2))
-
-#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2)
-
-
-/* not to called directly */
-LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2);
-
-
-LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r);
-LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r);
-LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n);
-LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj);
-LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key,
- StkId val);
-LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key,
- StkId val);
-LUAI_FUNC void luaV_finishOp (lua_State *L);
-LUAI_FUNC void luaV_execute (lua_State *L);
-LUAI_FUNC void luaV_concat (lua_State *L, int total);
-LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
- const TValue *rc, TMS op);
-LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb);
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c
@@ -1,930 +0,0 @@
-/*
-** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $
-** Lua virtual machine
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define strcoll(l,r) (strcmp((l),(r)))
-
-#define lvm_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lgc.h"
-#include "lobject.h"
-#include "lopcodes.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "ltm.h"
-#include "lvm.h"
-
-
-
-/* limit for table tag-method chains (to avoid loops) */
-#define MAXTAGLOOP 100
-
-
-const TValue *luaV_tonumber (const TValue *obj, TValue *n) {
- lua_Number num;
- if (ttisnumber(obj)) return obj;
- if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) {
- setnvalue(n, num);
- return n;
- }
- else
- return NULL;
-}
-
-
-int luaV_tostring (lua_State *L, StkId obj) {
- if (!ttisnumber(obj))
- return 0;
- else {
- char s[LUAI_MAXNUMBER2STR];
- lua_Number n = nvalue(obj);
- int l = lua_number2str(s, n);
- setsvalue2s(L, obj, luaS_newlstr(L, s, l));
- return 1;
- }
-}
-
-
-static void traceexec (lua_State *L) {
- CallInfo *ci = L->ci;
- lu_byte mask = L->hookmask;
- int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0);
- if (counthook)
- resethookcount(L); /* reset count */
- if (ci->callstatus & CIST_HOOKYIELD) { /* called hook last time? */
- ci->callstatus &= ~CIST_HOOKYIELD; /* erase mark */
- return; /* do not call hook again (VM yielded, so it did not move) */
- }
- if (counthook)
- luaD_hook(L, LUA_HOOKCOUNT, -1); /* call count hook */
- if (mask & LUA_MASKLINE) {
- Proto *p = ci_func(ci)->p;
- int npc = pcRel(ci->u.l.savedpc, p);
- int newline = getfuncline(p, npc);
- if (npc == 0 || /* call linehook when enter a new function, */
- ci->u.l.savedpc <= L->oldpc || /* when jump back (loop), or when */
- newline != getfuncline(p, pcRel(L->oldpc, p))) /* enter a new line */
- luaD_hook(L, LUA_HOOKLINE, newline); /* call line hook */
- }
- L->oldpc = ci->u.l.savedpc;
- if (L->status == LUA_YIELD) { /* did hook yield? */
- if (counthook)
- L->hookcount = 1; /* undo decrement to zero */
- ci->u.l.savedpc--; /* undo increment (resume will increment it again) */
- ci->callstatus |= CIST_HOOKYIELD; /* mark that it yielded */
- ci->func = L->top - 1; /* protect stack below results */
- luaD_throw(L, LUA_YIELD);
- }
-}
-
-
-static void callTM (lua_State *L, const TValue *f, const TValue *p1,
- const TValue *p2, TValue *p3, int hasres) {
- ptrdiff_t result = savestack(L, p3);
- setobj2s(L, L->top++, f); /* push function */
- setobj2s(L, L->top++, p1); /* 1st argument */
- setobj2s(L, L->top++, p2); /* 2nd argument */
- if (!hasres) /* no result? 'p3' is third argument */
- setobj2s(L, L->top++, p3); /* 3rd argument */
- /* metamethod may yield only when called from Lua code */
- luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci));
- if (hasres) { /* if has result, move it to its place */
- p3 = restorestack(L, result);
- setobjs2s(L, p3, --L->top);
- }
-}
-
-
-void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) {
- int loop;
- for (loop = 0; loop < MAXTAGLOOP; loop++) {
- const TValue *tm;
- if (ttistable(t)) { /* `t' is a table? */
- Table *h = hvalue(t);
- const TValue *res = luaH_get(h, key); /* do a primitive get */
- if (!ttisnil(res) || /* result is not nil? */
- (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */
- setobj2s(L, val, res);
- return;
- }
- /* else will try the tag method */
- }
- else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX)))
- luaG_typeerror(L, t, "index");
- if (ttisfunction(tm)) {
- callTM(L, tm, t, key, val, 1);
- return;
- }
- t = tm; /* else repeat with 'tm' */
- }
- luaG_runerror(L, "loop in gettable");
-}
-
-
-void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) {
- int loop;
- for (loop = 0; loop < MAXTAGLOOP; loop++) {
- const TValue *tm;
- if (ttistable(t)) { /* `t' is a table? */
- Table *h = hvalue(t);
- TValue *oldval = cast(TValue *, luaH_get(h, key));
- /* if previous value is not nil, there must be a previous entry
- in the table; moreover, a metamethod has no relevance */
- if (!ttisnil(oldval) ||
- /* previous value is nil; must check the metamethod */
- ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL &&
- /* no metamethod; is there a previous entry in the table? */
- (oldval != luaO_nilobject ||
- /* no previous entry; must create one. (The next test is
- always true; we only need the assignment.) */
- (oldval = luaH_newkey(L, h, key), 1)))) {
- /* no metamethod and (now) there is an entry with given key */
- setobj2t(L, oldval, val); /* assign new value to that entry */
- invalidateTMcache(h);
- luaC_barrierback(L, obj2gco(h), val);
- return;
- }
- /* else will try the metamethod */
- }
- else /* not a table; check metamethod */
- if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX)))
- luaG_typeerror(L, t, "index");
- /* there is a metamethod */
- if (ttisfunction(tm)) {
- callTM(L, tm, t, key, val, 0);
- return;
- }
- t = tm; /* else repeat with 'tm' */
- }
- luaG_runerror(L, "loop in settable");
-}
-
-
-static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2,
- StkId res, TMS event) {
- const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */
- if (ttisnil(tm))
- tm = luaT_gettmbyobj(L, p2, event); /* try second operand */
- if (ttisnil(tm)) return 0;
- callTM(L, tm, p1, p2, res, 1);
- return 1;
-}
-
-
-static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2,
- TMS event) {
- const TValue *tm1 = fasttm(L, mt1, event);
- const TValue *tm2;
- if (tm1 == NULL) return NULL; /* no metamethod */
- if (mt1 == mt2) return tm1; /* same metatables => same metamethods */
- tm2 = fasttm(L, mt2, event);
- if (tm2 == NULL) return NULL; /* no metamethod */
- if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? */
- return tm1;
- return NULL;
-}
-
-
-static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2,
- TMS event) {
- if (!call_binTM(L, p1, p2, L->top, event))
- return -1; /* no metamethod */
- else
- return !l_isfalse(L->top);
-}
-
-
-static int l_strcmp (const TString *ls, const TString *rs) {
- const char *l = getstr(ls);
- size_t ll = ls->tsv.len;
- const char *r = getstr(rs);
- size_t lr = rs->tsv.len;
- for (;;) {
- int temp = strcoll(l, r);
- if (temp != 0) return temp;
- else { /* strings are equal up to a `\0' */
- size_t len = strlen(l); /* index of first `\0' in both strings */
- if (len == lr) /* r is finished? */
- return (len == ll) ? 0 : 1;
- else if (len == ll) /* l is finished? */
- return -1; /* l is smaller than r (because r is not finished) */
- /* both strings longer than `len'; go on comparing (after the `\0') */
- len++;
- l += len; ll -= len; r += len; lr -= len;
- }
- }
-}
-
-
-int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) {
- int res;
- if (ttisnumber(l) && ttisnumber(r))
- return luai_numlt(L, nvalue(l), nvalue(r));
- else if (ttisstring(l) && ttisstring(r))
- return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0;
- else if ((res = call_orderTM(L, l, r, TM_LT)) < 0)
- luaG_ordererror(L, l, r);
- return res;
-}
-
-
-int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) {
- int res;
- if (ttisnumber(l) && ttisnumber(r))
- return luai_numle(L, nvalue(l), nvalue(r));
- else if (ttisstring(l) && ttisstring(r))
- return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0;
- else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */
- return res;
- else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */
- luaG_ordererror(L, l, r);
- return !res;
-}
-
-
-/*
-** equality of Lua values. L == NULL means raw equality (no metamethods)
-*/
-int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) {
- const TValue *tm;
- lua_assert(ttisequal(t1, t2));
- switch (ttype(t1)) {
- case LUA_TNIL: return 1;
- case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2));
- case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! */
- case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2);
- case LUA_TLCF: return fvalue(t1) == fvalue(t2);
- case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2));
- case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2));
- case LUA_TUSERDATA: {
- if (uvalue(t1) == uvalue(t2)) return 1;
- else if (L == NULL) return 0;
- tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ);
- break; /* will try TM */
- }
- case LUA_TTABLE: {
- if (hvalue(t1) == hvalue(t2)) return 1;
- else if (L == NULL) return 0;
- tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ);
- break; /* will try TM */
- }
- default:
- lua_assert(iscollectable(t1));
- return gcvalue(t1) == gcvalue(t2);
- }
- if (tm == NULL) return 0; /* no TM? */
- callTM(L, tm, t1, t2, L->top, 1); /* call TM */
- return !l_isfalse(L->top);
-}
-
-
-void luaV_concat (lua_State *L, int total) {
- lua_assert(total >= 2);
- do {
- StkId top = L->top;
- int n = 2; /* number of elements handled in this pass (at least 2) */
- if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) {
- if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT))
- luaG_concaterror(L, top-2, top-1);
- }
- else if (tsvalue(top-1)->len == 0) /* second operand is empty? */
- (void)tostring(L, top - 2); /* result is first operand */
- else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) {
- setobjs2s(L, top - 2, top - 1); /* result is second op. */
- }
- else {
- /* at least two non-empty string values; get as many as possible */
- size_t tl = tsvalue(top-1)->len;
- char *buffer;
- int i;
- /* collect total length */
- for (i = 1; i < total && tostring(L, top-i-1); i++) {
- size_t l = tsvalue(top-i-1)->len;
- if (l >= (MAX_SIZET/sizeof(char)) - tl)
- luaG_runerror(L, "string length overflow");
- tl += l;
- }
- buffer = luaZ_openspace(L, &G(L)->buff, tl);
- tl = 0;
- n = i;
- do { /* concat all strings */
- size_t l = tsvalue(top-i)->len;
- memcpy(buffer+tl, svalue(top-i), l * sizeof(char));
- tl += l;
- } while (--i > 0);
- setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl));
- }
- total -= n-1; /* got 'n' strings to create 1 new */
- L->top -= n-1; /* popped 'n' strings and pushed one */
- } while (total > 1); /* repeat until only 1 result left */
-}
-
-
-void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) {
- const TValue *tm;
- switch (ttypenv(rb)) {
- case LUA_TTABLE: {
- Table *h = hvalue(rb);
- tm = fasttm(L, h->metatable, TM_LEN);
- if (tm) break; /* metamethod? break switch to call it */
- setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */
- return;
- }
- case LUA_TSTRING: {
- setnvalue(ra, cast_num(tsvalue(rb)->len));
- return;
- }
- default: { /* try metamethod */
- tm = luaT_gettmbyobj(L, rb, TM_LEN);
- if (ttisnil(tm)) /* no metamethod? */
- luaG_typeerror(L, rb, "get length of");
- break;
- }
- }
- callTM(L, tm, rb, rb, ra, 1);
-}
-
-/*
- * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle
- * div/mod by zero (instead of crashing, which is the default behavior in
- * Lua 5.2)
- */
-
-/*
-** Integer division; return 'm // n', that is, floor(m/n).
-** C division truncates its result (rounds towards zero).
-** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer,
-** otherwise 'floor(q) == trunc(q) - 1'.
-*/
-static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) {
- if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
- if (n == 0)
- luaG_runerror(L, "attempt to divide by zero");
- return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */
- }
- else {
- lua_Number q = m / n; /* perform C division */
- if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */
- q -= 1; /* correct result for different rounding */
- return q;
- }
-}
-
-
-/*
-** Integer modulus; return 'm % n'. (Assume that C '%' with
-** negative operands follows C99 behavior. See previous comment
-** about luaV_div.)
-*/
-static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) {
- if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
- if (n == 0)
- luaG_runerror(L, "attempt to perform 'n%%0'");
- return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */
- }
- else {
- lua_Number r = m % n;
- if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? */
- r += n; /* correct result for different rounding */
- return r;
- }
-}
-
-/*
- * End patch from 5.3.2
- */
-
-void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
- const TValue *rc, TMS op) {
- TValue tempb, tempc;
- const TValue *b, *c;
- if ((b = luaV_tonumber(rb, &tempb)) != NULL &&
- (c = luaV_tonumber(rc, &tempc)) != NULL) {
- /*
- * Patched: if dividing or modding, use patched functions from 5.3
- */
- lua_Number res;
- int lop = op - TM_ADD + LUA_OPADD;
- if (lop == LUA_OPDIV) {
- res = luaV_div(L, nvalue(b), nvalue(c));
- } else if (lop == LUA_OPMOD) {
- res = luaV_mod(L, nvalue(b), nvalue(c));
- } else {
- res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c));
- }
- setnvalue(ra, res);
- }
- else if (!call_binTM(L, rb, rc, ra, op))
- luaG_aritherror(L, rb, rc);
-}
-
-
-/*
-** check whether cached closure in prototype 'p' may be reused, that is,
-** whether there is a cached closure with the same upvalues needed by
-** new closure to be created.
-*/
-static Closure *getcached (Proto *p, UpVal **encup, StkId base) {
- Closure *c = p->cache;
- if (c != NULL) { /* is there a cached closure? */
- int nup = p->sizeupvalues;
- Upvaldesc *uv = p->upvalues;
- int i;
- for (i = 0; i < nup; i++) { /* check whether it has right upvalues */
- TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v;
- if (c->l.upvals[i]->v != v)
- return NULL; /* wrong upvalue; cannot reuse closure */
- }
- }
- return c; /* return cached closure (or NULL if no cached closure) */
-}
-
-
-/*
-** create a new Lua closure, push it in the stack, and initialize
-** its upvalues. Note that the call to 'luaC_barrierproto' must come
-** before the assignment to 'p->cache', as the function needs the
-** original value of that field.
-*/
-static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base,
- StkId ra) {
- int nup = p->sizeupvalues;
- Upvaldesc *uv = p->upvalues;
- int i;
- Closure *ncl = luaF_newLclosure(L, nup);
- ncl->l.p = p;
- setclLvalue(L, ra, ncl); /* anchor new closure in stack */
- for (i = 0; i < nup; i++) { /* fill in its upvalues */
- if (uv[i].instack) /* upvalue refers to local variable? */
- ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx);
- else /* get upvalue from enclosing function */
- ncl->l.upvals[i] = encup[uv[i].idx];
- }
- luaC_barrierproto(L, p, ncl);
- p->cache = ncl; /* save it on cache for reuse */
-}
-
-
-/*
-** finish execution of an opcode interrupted by an yield
-*/
-void luaV_finishOp (lua_State *L) {
- CallInfo *ci = L->ci;
- StkId base = ci->u.l.base;
- Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */
- OpCode op = GET_OPCODE(inst);
- switch (op) { /* finish its execution */
- case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV:
- case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN:
- case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: {
- setobjs2s(L, base + GETARG_A(inst), --L->top);
- break;
- }
- case OP_LE: case OP_LT: case OP_EQ: {
- int res = !l_isfalse(L->top - 1);
- L->top--;
- /* metamethod should not be called when operand is K */
- lua_assert(!ISK(GETARG_B(inst)));
- if (op == OP_LE && /* "<=" using "<" instead? */
- ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE)))
- res = !res; /* invert result */
- lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP);
- if (res != GETARG_A(inst)) /* condition failed? */
- ci->u.l.savedpc++; /* skip jump instruction */
- break;
- }
- case OP_CONCAT: {
- StkId top = L->top - 1; /* top when 'call_binTM' was called */
- int b = GETARG_B(inst); /* first element to concatenate */
- int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */
- setobj2s(L, top - 2, top); /* put TM result in proper position */
- if (total > 1) { /* are there elements to concat? */
- L->top = top - 1; /* top is one after last element (at top-2) */
- luaV_concat(L, total); /* concat them (may yield again) */
- }
- /* move final result to final position */
- setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1);
- L->top = ci->top; /* restore top */
- break;
- }
- case OP_TFORCALL: {
- lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP);
- L->top = ci->top; /* correct top */
- break;
- }
- case OP_CALL: {
- if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */
- L->top = ci->top; /* adjust results */
- break;
- }
- case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE:
- break;
- default: lua_assert(0);
- }
-}
-
-
-
-/*
-** some macros for common tasks in `luaV_execute'
-*/
-
-#if !defined luai_runtimecheck
-#define luai_runtimecheck(L, c) /* void */
-#endif
-
-
-#define RA(i) (base+GETARG_A(i))
-/* to be used after possible stack reallocation */
-#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i))
-#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i))
-#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \
- ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i))
-#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \
- ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i))
-#define KBx(i) \
- (k + (GETARG_Bx(i) != 0 ? GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++)))
-
-
-/* execute a jump instruction */
-#define dojump(ci,i,e) \
- { int a = GETARG_A(i); \
- if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \
- ci->u.l.savedpc += GETARG_sBx(i) + e; }
-
-/* for test instructions, execute the jump instruction that follows it */
-#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); }
-
-
-#define Protect(x) { {x;}; base = ci->u.l.base; }
-
-#define checkGC(L,c) \
- Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \
- luaC_step(L); \
- L->top = ci->top;}) /* restore top */ \
- luai_threadyield(L); )
-
-
-#define arith_op(op,tm) { \
- TValue *rb = RKB(i); \
- TValue *rc = RKC(i); \
- if (ttisnumber(rb) && ttisnumber(rc)) { \
- lua_Number nb = nvalue(rb), nc = nvalue(rc); \
- setnvalue(ra, op(L, nb, nc)); \
- } \
- else { Protect(luaV_arith(L, ra, rb, rc, tm)); } }
-
-
-#define vmdispatch(o) switch(o)
-#define vmcase(l,b) case l: {b} break;
-#define vmcasenb(l,b) case l: {b} /* nb = no break */
-
-void luaV_execute (lua_State *L) {
- CallInfo *ci = L->ci;
- LClosure *cl;
- TValue *k;
- StkId base;
- newframe: /* reentry point when frame changes (call/return) */
- lua_assert(ci == L->ci);
- cl = clLvalue(ci->func);
- k = cl->p->k;
- base = ci->u.l.base;
- /* main loop of interpreter */
- for (;;) {
- Instruction i = *(ci->u.l.savedpc++);
- StkId ra;
- if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) &&
- (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) {
- Protect(traceexec(L));
- }
- /* WARNING: several calls may realloc the stack and invalidate `ra' */
- ra = RA(i);
- lua_assert(base == ci->u.l.base);
- lua_assert(base <= L->top && L->top < L->stack + L->stacksize);
- vmdispatch (GET_OPCODE(i)) {
- vmcase(OP_MOVE,
- setobjs2s(L, ra, RB(i));
- )
- vmcase(OP_LOADK,
- TValue *rb = k + GETARG_Bx(i);
- setobj2s(L, ra, rb);
- )
- vmcase(OP_LOADKX,
- TValue *rb;
- lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
- rb = k + GETARG_Ax(*ci->u.l.savedpc++);
- setobj2s(L, ra, rb);
- )
- vmcase(OP_LOADBOOL,
- setbvalue(ra, GETARG_B(i));
- if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */
- )
- vmcase(OP_LOADNIL,
- int b = GETARG_B(i);
- do {
- setnilvalue(ra++);
- } while (b--);
- )
- vmcase(OP_GETUPVAL,
- int b = GETARG_B(i);
- setobj2s(L, ra, cl->upvals[b]->v);
- )
- vmcase(OP_GETTABUP,
- int b = GETARG_B(i);
- Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra));
- )
- vmcase(OP_GETTABLE,
- Protect(luaV_gettable(L, RB(i), RKC(i), ra));
- )
- vmcase(OP_SETTABUP,
- int a = GETARG_A(i);
- Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i)));
- )
- vmcase(OP_SETUPVAL,
- UpVal *uv = cl->upvals[GETARG_B(i)];
- setobj(L, uv->v, ra);
- luaC_barrier(L, uv, ra);
- )
- vmcase(OP_SETTABLE,
- Protect(luaV_settable(L, ra, RKB(i), RKC(i)));
- )
- vmcase(OP_NEWTABLE,
- int b = GETARG_B(i);
- int c = GETARG_C(i);
- Table *t = luaH_new(L);
- sethvalue(L, ra, t);
- if (b != 0 || c != 0)
- luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c));
- checkGC(L, ra + 1);
- )
- vmcase(OP_SELF,
- StkId rb = RB(i);
- setobjs2s(L, ra+1, rb);
- Protect(luaV_gettable(L, rb, RKC(i), ra));
- )
- vmcase(OP_ADD,
- arith_op(luai_numadd, TM_ADD);
- )
- vmcase(OP_SUB,
- arith_op(luai_numsub, TM_SUB);
- )
- vmcase(OP_MUL,
- arith_op(luai_nummul, TM_MUL);
- )
- /*
- * Patched: use luaV_* instead of luai_* to handle div/mod by 0
- */
- vmcase(OP_DIV,
- arith_op(luaV_div, TM_DIV);
- )
- vmcase(OP_MOD,
- arith_op(luaV_mod, TM_MOD);
- )
- vmcase(OP_POW,
- arith_op(luai_numpow, TM_POW);
- )
- vmcase(OP_UNM,
- TValue *rb = RB(i);
- if (ttisnumber(rb)) {
- lua_Number nb = nvalue(rb);
- setnvalue(ra, luai_numunm(L, nb));
- }
- else {
- Protect(luaV_arith(L, ra, rb, rb, TM_UNM));
- }
- )
- vmcase(OP_NOT,
- TValue *rb = RB(i);
- int res = l_isfalse(rb); /* next assignment may change this value */
- setbvalue(ra, res);
- )
- vmcase(OP_LEN,
- Protect(luaV_objlen(L, ra, RB(i)));
- )
- vmcase(OP_CONCAT,
- int b = GETARG_B(i);
- int c = GETARG_C(i);
- StkId rb;
- L->top = base + c + 1; /* mark the end of concat operands */
- Protect(luaV_concat(L, c - b + 1));
- ra = RA(i); /* 'luav_concat' may invoke TMs and move the stack */
- rb = b + base;
- setobjs2s(L, ra, rb);
- checkGC(L, (ra >= rb ? ra + 1 : rb));
- L->top = ci->top; /* restore top */
- )
- vmcase(OP_JMP,
- dojump(ci, i, 0);
- )
- vmcase(OP_EQ,
- TValue *rb = RKB(i);
- TValue *rc = RKC(i);
- Protect(
- if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i))
- ci->u.l.savedpc++;
- else
- donextjump(ci);
- )
- )
- vmcase(OP_LT,
- Protect(
- if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i))
- ci->u.l.savedpc++;
- else
- donextjump(ci);
- )
- )
- vmcase(OP_LE,
- Protect(
- if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i))
- ci->u.l.savedpc++;
- else
- donextjump(ci);
- )
- )
- vmcase(OP_TEST,
- if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra))
- ci->u.l.savedpc++;
- else
- donextjump(ci);
- )
- vmcase(OP_TESTSET,
- TValue *rb = RB(i);
- if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb))
- ci->u.l.savedpc++;
- else {
- setobjs2s(L, ra, rb);
- donextjump(ci);
- }
- )
- vmcase(OP_CALL,
- int b = GETARG_B(i);
- int nresults = GETARG_C(i) - 1;
- if (b != 0) L->top = ra+b; /* else previous instruction set top */
- if (luaD_precall(L, ra, nresults)) { /* C function? */
- if (nresults >= 0) L->top = ci->top; /* adjust results */
- base = ci->u.l.base;
- }
- else { /* Lua function */
- ci = L->ci;
- ci->callstatus |= CIST_REENTRY;
- goto newframe; /* restart luaV_execute over new Lua function */
- }
- )
- vmcase(OP_TAILCALL,
- int b = GETARG_B(i);
- if (b != 0) L->top = ra+b; /* else previous instruction set top */
- lua_assert(GETARG_C(i) - 1 == LUA_MULTRET);
- if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? */
- base = ci->u.l.base;
- else {
- /* tail call: put called frame (n) in place of caller one (o) */
- CallInfo *nci = L->ci; /* called frame */
- CallInfo *oci = nci->previous; /* caller frame */
- StkId nfunc = nci->func; /* called function */
- StkId ofunc = oci->func; /* caller function */
- /* last stack slot filled by 'precall' */
- StkId lim = nci->u.l.base + getproto(nfunc)->numparams;
- int aux;
- /* close all upvalues from previous call */
- if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base);
- /* move new frame into old one */
- for (aux = 0; nfunc + aux < lim; aux++)
- setobjs2s(L, ofunc + aux, nfunc + aux);
- oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */
- oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */
- oci->u.l.savedpc = nci->u.l.savedpc;
- oci->callstatus |= CIST_TAIL; /* function was tail called */
- ci = L->ci = oci; /* remove new frame */
- lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize);
- goto newframe; /* restart luaV_execute over new Lua function */
- }
- )
- vmcasenb(OP_RETURN,
- int b = GETARG_B(i);
- if (b != 0) L->top = ra+b-1;
- if (cl->p->sizep > 0) luaF_close(L, base);
- b = luaD_poscall(L, ra);
- if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */
- return; /* external invocation: return */
- else { /* invocation via reentry: continue execution */
- ci = L->ci;
- if (b) L->top = ci->top;
- lua_assert(isLua(ci));
- lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL);
- goto newframe; /* restart luaV_execute over new Lua function */
- }
- )
- vmcase(OP_FORLOOP,
- lua_Number step = nvalue(ra+2);
- lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */
- lua_Number limit = nvalue(ra+1);
- if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit)
- : luai_numle(L, limit, idx)) {
- ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
- setnvalue(ra, idx); /* update internal index... */
- setnvalue(ra+3, idx); /* ...and external index */
- }
- )
- vmcase(OP_FORPREP,
- const TValue *init = ra;
- const TValue *plimit = ra+1;
- const TValue *pstep = ra+2;
- if (!tonumber(init, ra))
- luaG_runerror(L, LUA_QL("for") " initial value must be a number");
- else if (!tonumber(plimit, ra+1))
- luaG_runerror(L, LUA_QL("for") " limit must be a number");
- else if (!tonumber(pstep, ra+2))
- luaG_runerror(L, LUA_QL("for") " step must be a number");
- setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep)));
- ci->u.l.savedpc += GETARG_sBx(i);
- )
- vmcasenb(OP_TFORCALL,
- StkId cb = ra + 3; /* call base */
- setobjs2s(L, cb+2, ra+2);
- setobjs2s(L, cb+1, ra+1);
- setobjs2s(L, cb, ra);
- L->top = cb + 3; /* func. + 2 args (state and index) */
- Protect(luaD_call(L, cb, GETARG_C(i), 1));
- L->top = ci->top;
- i = *(ci->u.l.savedpc++); /* go to next instruction */
- ra = RA(i);
- lua_assert(GET_OPCODE(i) == OP_TFORLOOP);
- goto l_tforloop;
- )
- vmcase(OP_TFORLOOP,
- l_tforloop:
- if (!ttisnil(ra + 1)) { /* continue loop? */
- setobjs2s(L, ra, ra + 1); /* save control variable */
- ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
- }
- )
- vmcase(OP_SETLIST,
- int n = GETARG_B(i);
- int c = GETARG_C(i);
- int last;
- Table *h;
- if (n == 0) n = cast_int(L->top - ra) - 1;
- if (c == 0) {
- lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
- c = GETARG_Ax(*ci->u.l.savedpc++);
- }
- luai_runtimecheck(L, ttistable(ra));
- h = hvalue(ra);
- last = ((c-1)*LFIELDS_PER_FLUSH) + n;
- if (last > h->sizearray) /* needs more space? */
- luaH_resizearray(L, h, last); /* pre-allocate it at once */
- for (; n > 0; n--) {
- TValue *val = ra+n;
- luaH_setint(L, h, last--, val);
- luaC_barrierback(L, obj2gco(h), val);
- }
- L->top = ci->top; /* correct top (in case of previous open call) */
- )
- vmcase(OP_CLOSURE,
- Proto *p = cl->p->p[GETARG_Bx(i)];
- Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */
- if (ncl == NULL) /* no match? */
- pushclosure(L, p, cl->upvals, base, ra); /* create a new one */
- else
- setclLvalue(L, ra, ncl); /* push cashed closure */
- checkGC(L, ra + 1);
- )
- vmcase(OP_VARARG,
- int b = GETARG_B(i) - 1;
- int j;
- int n = cast_int(base - ci->func) - cl->p->numparams - 1;
- if (b < 0) { /* B == 0? */
- b = n; /* get all var. arguments */
- Protect(luaD_checkstack(L, n));
- ra = RA(i); /* previous call may change the stack */
- L->top = ra + n;
- }
- for (j = 0; j < b; j++) {
- if (j < n) {
- setobjs2s(L, ra + j, base - n + j);
- }
- else {
- setnilvalue(ra + j);
- }
- }
- )
- vmcase(OP_EXTRAARG,
- lua_assert(0);
- )
- }
- }
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h
@@ -1,65 +0,0 @@
-/*
-** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $
-** Buffered streams
-** See Copyright Notice in lua.h
-*/
-
-
-#ifndef lzio_h
-#define lzio_h
-
-#include "lua.h"
-
-#include "lmem.h"
-
-
-#define EOZ (-1) /* end of stream */
-
-typedef struct Zio ZIO;
-
-#define zgetc(z) (((z)->n--)>0 ? cast_uchar(*(z)->p++) : luaZ_fill(z))
-
-
-typedef struct Mbuffer {
- char *buffer;
- size_t n;
- size_t buffsize;
-} Mbuffer;
-
-#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0)
-
-#define luaZ_buffer(buff) ((buff)->buffer)
-#define luaZ_sizebuffer(buff) ((buff)->buffsize)
-#define luaZ_bufflen(buff) ((buff)->n)
-
-#define luaZ_resetbuffer(buff) ((buff)->n = 0)
-
-
-#define luaZ_resizebuffer(L, buff, size) \
- (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \
- (buff)->buffsize = size)
-
-#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0)
-
-
-LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n);
-LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader,
- void *data);
-LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */
-
-
-
-/* --------- Private Part ------------------ */
-
-struct Zio {
- size_t n; /* bytes still unread */
- const char *p; /* current position in buffer */
- lua_Reader reader; /* reader function */
- void* data; /* additional data */
- lua_State *L; /* Lua state (for reader) */
-};
-
-
-LUAI_FUNC int luaZ_fill (ZIO *z);
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c
@@ -1,76 +0,0 @@
-/*
-** $Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $
-** Buffered streams
-** See Copyright Notice in lua.h
-*/
-
-
-#include <sys/zfs_context.h>
-
-#define lzio_c
-#define LUA_CORE
-
-#include "lua.h"
-
-#include "llimits.h"
-#include "lmem.h"
-#include "lstate.h"
-#include "lzio.h"
-
-
-int luaZ_fill (ZIO *z) {
- size_t size;
- lua_State *L = z->L;
- const char *buff;
- lua_unlock(L);
- buff = z->reader(L, z->data, &size);
- lua_lock(L);
- if (buff == NULL || size == 0)
- return EOZ;
- z->n = size - 1; /* discount char being returned */
- z->p = buff;
- return cast_uchar(*(z->p++));
-}
-
-
-void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) {
- z->L = L;
- z->reader = reader;
- z->data = data;
- z->n = 0;
- z->p = NULL;
-}
-
-
-/* --------------------------------------------------------------- read --- */
-size_t luaZ_read (ZIO *z, void *b, size_t n) {
- while (n) {
- size_t m;
- if (z->n == 0) { /* no bytes in buffer? */
- if (luaZ_fill(z) == EOZ) /* try to read more */
- return n; /* no more input; return number of missing bytes */
- else {
- z->n++; /* luaZ_fill consumed first byte; put it back */
- z->p--;
- }
- }
- m = (n <= z->n) ? n : z->n; /* min. between n and z->n */
- memcpy(b, z->p, m);
- z->n -= m;
- z->p += m;
- b = (char *)b + m;
- n -= m;
- }
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) {
- if (n > buff->buffsize) {
- if (n < LUA_MINBUFFER) n = LUA_MINBUFFER;
- luaZ_resizebuffer(L, buff, n);
- }
- return buff->buffer;
-}
-
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
@@ -1,129 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-/*
- * We keep our own copy of this algorithm for 3 main reasons:
- * 1. If we didn't, anyone modifying common/os/compress.c would
- * directly break our on disk format
- * 2. Our version of lzjb does not have a number of checks that the
- * common/os version needs and uses
- * 3. We initialize the lempel to ensure deterministic results,
- * so that identical blocks can always be deduplicated.
- * In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and returns the compressed length, or the
- * source length if compression would overflow the destination buffer.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/types.h>
-#include <sys/param.h>
-
-#define MATCH_BITS 6
-#define MATCH_MIN 3
-#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
-#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
-#define LEMPEL_SIZE 1024
-
-/*ARGSUSED*/
-size_t
-lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *cpy;
- uchar_t *copymap = NULL;
- int copymask = 1 << (NBBY - 1);
- int mlen, offset, hash;
- uint16_t *hp;
- uint16_t lempel[LEMPEL_SIZE] = { 0 };
-
- while (src < (uchar_t *)s_start + s_len) {
- if ((copymask <<= 1) == (1 << NBBY)) {
- if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
- return (s_len);
- copymask = 1;
- copymap = dst;
- *dst++ = 0;
- }
- if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
- *dst++ = *src++;
- continue;
- }
- hash = (src[0] << 16) + (src[1] << 8) + src[2];
- hash += hash >> 9;
- hash += hash >> 5;
- hp = &lempel[hash & (LEMPEL_SIZE - 1)];
- offset = (intptr_t)(src - *hp) & OFFSET_MASK;
- *hp = (uint16_t)(uintptr_t)src;
- cpy = src - offset;
- if (cpy >= (uchar_t *)s_start && cpy != src &&
- src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
- *copymap |= copymask;
- for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
- if (src[mlen] != cpy[mlen])
- break;
- *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
- (offset >> NBBY);
- *dst++ = (uchar_t)offset;
- src += mlen;
- } else {
- *dst++ = *src++;
- }
- }
- return (dst - (uchar_t *)d_start);
-}
-
-/*ARGSUSED*/
-int
-lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *d_end = (uchar_t *)d_start + d_len;
- uchar_t *cpy;
- uchar_t copymap = 0;
- int copymask = 1 << (NBBY - 1);
-
- while (dst < d_end) {
- if ((copymask <<= 1) == (1 << NBBY)) {
- copymask = 1;
- copymap = *src++;
- }
- if (copymap & copymask) {
- int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
- int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
- src += 2;
- if ((cpy = dst - offset) < (uchar_t *)d_start)
- return (-1);
- if (mlen > (d_end - dst))
- mlen = d_end - dst;
- while (--mlen >= 0)
- *dst++ = *cpy++;
- } else {
- *dst++ = *src++;
- }
- }
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -1,4624 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/space_map.h>
-#include <sys/metaslab_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/spa_impl.h>
-#include <sys/zfeature.h>
-#include <sys/vdev_indirect_mapping.h>
-#include <sys/zap.h>
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS metaslab");
-
-#define GANG_ALLOCATION(flags) \
- ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
-
-uint64_t metaslab_aliquot = 512ULL << 10;
-uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
-SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN,
- &metaslab_force_ganging, 0,
- "Force gang block allocation for blocks larger than or equal to this value");
-
-/*
- * Since we can touch multiple metaslabs (and their respective space maps)
- * with each transaction group, we benefit from having a smaller space map
- * block size since it allows us to issue more I/O operations scattered
- * around the disk.
- */
-int zfs_metaslab_sm_blksz = (1 << 12);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN,
- &zfs_metaslab_sm_blksz, 0,
- "Block size for metaslab DTL space map. Power of 2 and greater than 4096.");
-
-/*
- * The in-core space map representation is more compact than its on-disk form.
- * The zfs_condense_pct determines how much more compact the in-core
- * space map representation must be before we compact it on-disk.
- * Values should be greater than or equal to 100.
- */
-int zfs_condense_pct = 200;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
- &zfs_condense_pct, 0,
- "Condense on-disk spacemap when it is more than this many percents"
- " of in-memory counterpart");
-
-/*
- * Condensing a metaslab is not guaranteed to actually reduce the amount of
- * space used on disk. In particular, a space map uses data in increments of
- * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
- * same number of blocks after condensing. Since the goal of condensing is to
- * reduce the number of IOPs required to read the space map, we only want to
- * condense when we can be sure we will reduce the number of blocks used by the
- * space map. Unfortunately, we cannot precisely compute whether or not this is
- * the case in metaslab_should_condense since we are holding ms_lock. Instead,
- * we apply the following heuristic: do not condense a spacemap unless the
- * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
- * blocks.
- */
-int zfs_metaslab_condense_block_threshold = 4;
-
-/*
- * The zfs_mg_noalloc_threshold defines which metaslab groups should
- * be eligible for allocation. The value is defined as a percentage of
- * free space. Metaslab groups that have more free space than
- * zfs_mg_noalloc_threshold are always eligible for allocations. Once
- * a metaslab group's free space is less than or equal to the
- * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
- * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
- * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
- * groups are allowed to accept allocations. Gang blocks are always
- * eligible to allocate on any metaslab group. The default value of 0 means
- * no metaslab group will be excluded based on this criterion.
- */
-int zfs_mg_noalloc_threshold = 0;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
- &zfs_mg_noalloc_threshold, 0,
- "Percentage of metaslab group size that should be free"
- " to make it eligible for allocation");
-
-/*
- * Metaslab groups are considered eligible for allocations if their
- * fragmenation metric (measured as a percentage) is less than or equal to
- * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
- * then it will be skipped unless all metaslab groups within the metaslab
- * class have also crossed this threshold.
- */
-int zfs_mg_fragmentation_threshold = 85;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
- &zfs_mg_fragmentation_threshold, 0,
- "Percentage of metaslab group size that should be considered "
- "eligible for allocations unless all metaslab groups within the metaslab class "
- "have also crossed this threshold");
-
-/*
- * Allow metaslabs to keep their active state as long as their fragmentation
- * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
- * active metaslab that exceeds this threshold will no longer keep its active
- * status allowing better metaslabs to be selected.
- */
-int zfs_metaslab_fragmentation_threshold = 70;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
- &zfs_metaslab_fragmentation_threshold, 0,
- "Maximum percentage of metaslab fragmentation level to keep their active state");
-
-/*
- * When set will load all metaslabs when pool is first opened.
- */
-int metaslab_debug_load = 0;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
- &metaslab_debug_load, 0,
- "Load all metaslabs when pool is first opened");
-
-/*
- * When set will prevent metaslabs from being unloaded.
- */
-int metaslab_debug_unload = 0;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
- &metaslab_debug_unload, 0,
- "Prevent metaslabs from being unloaded");
-
-/*
- * Minimum size which forces the dynamic allocator to change
- * it's allocation strategy. Once the space map cannot satisfy
- * an allocation of this size then it switches to using more
- * aggressive strategy (i.e search by size rather than offset).
- */
-uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
-SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
- &metaslab_df_alloc_threshold, 0,
- "Minimum size which forces the dynamic allocator to change it's allocation strategy");
-
-/*
- * The minimum free space, in percent, which must be available
- * in a space map to continue allocations in a first-fit fashion.
- * Once the space map's free space drops below this level we dynamically
- * switch to using best-fit allocations.
- */
-int metaslab_df_free_pct = 4;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
- &metaslab_df_free_pct, 0,
- "The minimum free space, in percent, which must be available in a "
- "space map to continue allocations in a first-fit fashion");
-
-/*
- * A metaslab is considered "free" if it contains a contiguous
- * segment which is greater than metaslab_min_alloc_size.
- */
-uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
-SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
- &metaslab_min_alloc_size, 0,
- "A metaslab is considered \"free\" if it contains a contiguous "
- "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
-
-/*
- * Percentage of all cpus that can be used by the metaslab taskq.
- */
-int metaslab_load_pct = 50;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
- &metaslab_load_pct, 0,
- "Percentage of cpus that can be used by the metaslab taskq");
-
-/*
- * Determines how many txgs a metaslab may remain loaded without having any
- * allocations from it. As long as a metaslab continues to be used we will
- * keep it loaded.
- */
-int metaslab_unload_delay = TXG_SIZE * 2;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
- &metaslab_unload_delay, 0,
- "Number of TXGs that an unused metaslab can be kept in memory");
-
-/*
- * Max number of metaslabs per group to preload.
- */
-int metaslab_preload_limit = SPA_DVAS_PER_BP;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
- &metaslab_preload_limit, 0,
- "Max number of metaslabs per group to preload");
-
-/*
- * Enable/disable preloading of metaslab.
- */
-boolean_t metaslab_preload_enabled = B_TRUE;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
- &metaslab_preload_enabled, 0,
- "Max number of metaslabs per group to preload");
-
-/*
- * Enable/disable fragmentation weighting on metaslabs.
- */
-boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
- &metaslab_fragmentation_factor_enabled, 0,
- "Enable fragmentation weighting on metaslabs");
-
-/*
- * Enable/disable lba weighting (i.e. outer tracks are given preference).
- */
-boolean_t metaslab_lba_weighting_enabled = B_TRUE;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
- &metaslab_lba_weighting_enabled, 0,
- "Enable LBA weighting (i.e. outer tracks are given preference)");
-
-/*
- * Enable/disable metaslab group biasing.
- */
-boolean_t metaslab_bias_enabled = B_TRUE;
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
- &metaslab_bias_enabled, 0,
- "Enable metaslab group biasing");
-
-/*
- * Enable/disable remapping of indirect DVAs to their concrete vdevs.
- */
-boolean_t zfs_remap_blkptr_enable = B_TRUE;
-
-/*
- * Enable/disable segment-based metaslab selection.
- */
-boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
-
-/*
- * When using segment-based metaslab selection, we will continue
- * allocating from the active metaslab until we have exhausted
- * zfs_metaslab_switch_threshold of its buckets.
- */
-int zfs_metaslab_switch_threshold = 2;
-
-/*
- * Internal switch to enable/disable the metaslab allocation tracing
- * facility.
- */
-#ifdef _METASLAB_TRACING
-boolean_t metaslab_trace_enabled = B_TRUE;
-#endif
-
-/*
- * Maximum entries that the metaslab allocation tracing facility will keep
- * in a given list when running in non-debug mode. We limit the number
- * of entries in non-debug mode to prevent us from using up too much memory.
- * The limit should be sufficiently large that we don't expect any allocation
- * to every exceed this value. In debug mode, the system will panic if this
- * limit is ever reached allowing for further investigation.
- */
-#ifdef _METASLAB_TRACING
-uint64_t metaslab_trace_max_entries = 5000;
-#endif
-
-static uint64_t metaslab_weight(metaslab_t *);
-static void metaslab_set_fragmentation(metaslab_t *);
-static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
-static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
-static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
-static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
-#ifdef _METASLAB_TRACING
-kmem_cache_t *metaslab_alloc_trace_cache;
-#endif
-
-/*
- * ==========================================================================
- * Metaslab classes
- * ==========================================================================
- */
-metaslab_class_t *
-metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
-{
- metaslab_class_t *mc;
-
- mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
-
- mc->mc_spa = spa;
- mc->mc_rotor = NULL;
- mc->mc_ops = ops;
- mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
- mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
- sizeof (zfs_refcount_t), KM_SLEEP);
- mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
- sizeof (uint64_t), KM_SLEEP);
- for (int i = 0; i < spa->spa_alloc_count; i++)
- zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
-
- return (mc);
-}
-
-void
-metaslab_class_destroy(metaslab_class_t *mc)
-{
- ASSERT(mc->mc_rotor == NULL);
- ASSERT(mc->mc_alloc == 0);
- ASSERT(mc->mc_deferred == 0);
- ASSERT(mc->mc_space == 0);
- ASSERT(mc->mc_dspace == 0);
-
- for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
- zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
- kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
- sizeof (zfs_refcount_t));
- kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
- sizeof (uint64_t));
- mutex_destroy(&mc->mc_lock);
- kmem_free(mc, sizeof (metaslab_class_t));
-}
-
-int
-metaslab_class_validate(metaslab_class_t *mc)
-{
- metaslab_group_t *mg;
- vdev_t *vd;
-
- /*
- * Must hold one of the spa_config locks.
- */
- ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
- spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
-
- if ((mg = mc->mc_rotor) == NULL)
- return (0);
-
- do {
- vd = mg->mg_vd;
- ASSERT(vd->vdev_mg != NULL);
- ASSERT3P(vd->vdev_top, ==, vd);
- ASSERT3P(mg->mg_class, ==, mc);
- ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
- } while ((mg = mg->mg_next) != mc->mc_rotor);
-
- return (0);
-}
-
-static void
-metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
- int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
-{
- atomic_add_64(&mc->mc_alloc, alloc_delta);
- atomic_add_64(&mc->mc_deferred, defer_delta);
- atomic_add_64(&mc->mc_space, space_delta);
- atomic_add_64(&mc->mc_dspace, dspace_delta);
-}
-
-void
-metaslab_class_minblocksize_update(metaslab_class_t *mc)
-{
- metaslab_group_t *mg;
- vdev_t *vd;
- uint64_t minashift = UINT64_MAX;
-
- if ((mg = mc->mc_rotor) == NULL) {
- mc->mc_minblocksize = SPA_MINBLOCKSIZE;
- return;
- }
-
- do {
- vd = mg->mg_vd;
- if (vd->vdev_ashift < minashift)
- minashift = vd->vdev_ashift;
- } while ((mg = mg->mg_next) != mc->mc_rotor);
-
- mc->mc_minblocksize = 1ULL << minashift;
-}
-
-uint64_t
-metaslab_class_get_alloc(metaslab_class_t *mc)
-{
- return (mc->mc_alloc);
-}
-
-uint64_t
-metaslab_class_get_deferred(metaslab_class_t *mc)
-{
- return (mc->mc_deferred);
-}
-
-uint64_t
-metaslab_class_get_space(metaslab_class_t *mc)
-{
- return (mc->mc_space);
-}
-
-uint64_t
-metaslab_class_get_dspace(metaslab_class_t *mc)
-{
- return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
-}
-
-uint64_t
-metaslab_class_get_minblocksize(metaslab_class_t *mc)
-{
- return (mc->mc_minblocksize);
-}
-
-void
-metaslab_class_histogram_verify(metaslab_class_t *mc)
-{
- spa_t *spa = mc->mc_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t *mc_hist;
- int i;
-
- if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
- return;
-
- mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
- KM_SLEEP);
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
-
- /*
- * Skip any holes, uninitialized top-levels, or
- * vdevs that are not in this metalab class.
- */
- if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
- mg->mg_class != mc) {
- continue;
- }
-
- for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- mc_hist[i] += mg->mg_histogram[i];
- }
-
- for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
-
- kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
-}
-
-/*
- * Calculate the metaslab class's fragmentation metric. The metric
- * is weighted based on the space contribution of each metaslab group.
- * The return value will be a number between 0 and 100 (inclusive), or
- * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
- * zfs_frag_table for more information about the metric.
- */
-uint64_t
-metaslab_class_fragmentation(metaslab_class_t *mc)
-{
- vdev_t *rvd = mc->mc_spa->spa_root_vdev;
- uint64_t fragmentation = 0;
-
- spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
-
- /*
- * Skip any holes, uninitialized top-levels,
- * or vdevs that are not in this metalab class.
- */
- if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
- mg->mg_class != mc) {
- continue;
- }
-
- /*
- * If a metaslab group does not contain a fragmentation
- * metric then just bail out.
- */
- if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
- spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
- return (ZFS_FRAG_INVALID);
- }
-
- /*
- * Determine how much this metaslab_group is contributing
- * to the overall pool fragmentation metric.
- */
- fragmentation += mg->mg_fragmentation *
- metaslab_group_get_space(mg);
- }
- fragmentation /= metaslab_class_get_space(mc);
-
- ASSERT3U(fragmentation, <=, 100);
- spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
- return (fragmentation);
-}
-
-/*
- * Calculate the amount of expandable space that is available in
- * this metaslab class. If a device is expanded then its expandable
- * space will be the amount of allocatable space that is currently not
- * part of this metaslab class.
- */
-uint64_t
-metaslab_class_expandable_space(metaslab_class_t *mc)
-{
- vdev_t *rvd = mc->mc_spa->spa_root_vdev;
- uint64_t space = 0;
-
- spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
- for (int c = 0; c < rvd->vdev_children; c++) {
- uint64_t tspace;
- vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
-
- if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
- mg->mg_class != mc) {
- continue;
- }
-
- /*
- * Calculate if we have enough space to add additional
- * metaslabs. We report the expandable space in terms
- * of the metaslab size since that's the unit of expansion.
- * Adjust by efi system partition size.
- */
- tspace = tvd->vdev_max_asize - tvd->vdev_asize;
- if (tspace > mc->mc_spa->spa_bootsize) {
- tspace -= mc->mc_spa->spa_bootsize;
- }
- space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
- }
- spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
- return (space);
-}
-
-static int
-metaslab_compare(const void *x1, const void *x2)
-{
- const metaslab_t *m1 = (const metaslab_t *)x1;
- const metaslab_t *m2 = (const metaslab_t *)x2;
-
- int sort1 = 0;
- int sort2 = 0;
- if (m1->ms_allocator != -1 && m1->ms_primary)
- sort1 = 1;
- else if (m1->ms_allocator != -1 && !m1->ms_primary)
- sort1 = 2;
- if (m2->ms_allocator != -1 && m2->ms_primary)
- sort2 = 1;
- else if (m2->ms_allocator != -1 && !m2->ms_primary)
- sort2 = 2;
-
- /*
- * Sort inactive metaslabs first, then primaries, then secondaries. When
- * selecting a metaslab to allocate from, an allocator first tries its
- * primary, then secondary active metaslab. If it doesn't have active
- * metaslabs, or can't allocate from them, it searches for an inactive
- * metaslab to activate. If it can't find a suitable one, it will steal
- * a primary or secondary metaslab from another allocator.
- */
- if (sort1 < sort2)
- return (-1);
- if (sort1 > sort2)
- return (1);
-
- int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
- if (likely(cmp))
- return (cmp);
-
- IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
-
- return (AVL_CMP(m1->ms_start, m2->ms_start));
-}
-
-uint64_t
-metaslab_allocated_space(metaslab_t *msp)
-{
- return (msp->ms_allocated_space);
-}
-
-/*
- * Verify that the space accounting on disk matches the in-core range_trees.
- */
-static void
-metaslab_verify_space(metaslab_t *msp, uint64_t txg)
-{
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- uint64_t allocating = 0;
- uint64_t sm_free_space, msp_free_space;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(!msp->ms_condensing);
-
- if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
- return;
-
- /*
- * We can only verify the metaslab space when we're called
- * from syncing context with a loaded metaslab that has an
- * allocated space map. Calling this in non-syncing context
- * does not provide a consistent view of the metaslab since
- * we're performing allocations in the future.
- */
- if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
- !msp->ms_loaded)
- return;
-
- /*
- * Even though the smp_alloc field can get negative (e.g.
- * see vdev_checkpoint_sm), that should never be the case
- * when it come's to a metaslab's space map.
- */
- ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
-
- sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
-
- /*
- * Account for future allocations since we would have
- * already deducted that space from the ms_allocatable.
- */
- for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
- allocating +=
- range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
- }
-
- ASSERT3U(msp->ms_deferspace, ==,
- range_tree_space(msp->ms_defer[0]) +
- range_tree_space(msp->ms_defer[1]));
-
- msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
- msp->ms_deferspace + range_tree_space(msp->ms_freed);
-
- VERIFY3U(sm_free_space, ==, msp_free_space);
-}
-
-/*
- * ==========================================================================
- * Metaslab groups
- * ==========================================================================
- */
-/*
- * Update the allocatable flag and the metaslab group's capacity.
- * The allocatable flag is set to true if the capacity is below
- * the zfs_mg_noalloc_threshold or has a fragmentation value that is
- * greater than zfs_mg_fragmentation_threshold. If a metaslab group
- * transitions from allocatable to non-allocatable or vice versa then the
- * metaslab group's class is updated to reflect the transition.
- */
-static void
-metaslab_group_alloc_update(metaslab_group_t *mg)
-{
- vdev_t *vd = mg->mg_vd;
- metaslab_class_t *mc = mg->mg_class;
- vdev_stat_t *vs = &vd->vdev_stat;
- boolean_t was_allocatable;
- boolean_t was_initialized;
-
- ASSERT(vd == vd->vdev_top);
- ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
- SCL_ALLOC);
-
- mutex_enter(&mg->mg_lock);
- was_allocatable = mg->mg_allocatable;
- was_initialized = mg->mg_initialized;
-
- mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
- (vs->vs_space + 1);
-
- mutex_enter(&mc->mc_lock);
-
- /*
- * If the metaslab group was just added then it won't
- * have any space until we finish syncing out this txg.
- * At that point we will consider it initialized and available
- * for allocations. We also don't consider non-activated
- * metaslab groups (e.g. vdevs that are in the middle of being removed)
- * to be initialized, because they can't be used for allocation.
- */
- mg->mg_initialized = metaslab_group_initialized(mg);
- if (!was_initialized && mg->mg_initialized) {
- mc->mc_groups++;
- } else if (was_initialized && !mg->mg_initialized) {
- ASSERT3U(mc->mc_groups, >, 0);
- mc->mc_groups--;
- }
- if (mg->mg_initialized)
- mg->mg_no_free_space = B_FALSE;
-
- /*
- * A metaslab group is considered allocatable if it has plenty
- * of free space or is not heavily fragmented. We only take
- * fragmentation into account if the metaslab group has a valid
- * fragmentation metric (i.e. a value between 0 and 100).
- */
- mg->mg_allocatable = (mg->mg_activation_count > 0 &&
- mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
- (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
- mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
-
- /*
- * The mc_alloc_groups maintains a count of the number of
- * groups in this metaslab class that are still above the
- * zfs_mg_noalloc_threshold. This is used by the allocating
- * threads to determine if they should avoid allocations to
- * a given group. The allocator will avoid allocations to a group
- * if that group has reached or is below the zfs_mg_noalloc_threshold
- * and there are still other groups that are above the threshold.
- * When a group transitions from allocatable to non-allocatable or
- * vice versa we update the metaslab class to reflect that change.
- * When the mc_alloc_groups value drops to 0 that means that all
- * groups have reached the zfs_mg_noalloc_threshold making all groups
- * eligible for allocations. This effectively means that all devices
- * are balanced again.
- */
- if (was_allocatable && !mg->mg_allocatable)
- mc->mc_alloc_groups--;
- else if (!was_allocatable && mg->mg_allocatable)
- mc->mc_alloc_groups++;
- mutex_exit(&mc->mc_lock);
-
- mutex_exit(&mg->mg_lock);
-}
-
-metaslab_group_t *
-metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
-{
- metaslab_group_t *mg;
-
- mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
- mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
- mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
- KM_SLEEP);
- mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
- KM_SLEEP);
- avl_create(&mg->mg_metaslab_tree, metaslab_compare,
- sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
- mg->mg_vd = vd;
- mg->mg_class = mc;
- mg->mg_activation_count = 0;
- mg->mg_initialized = B_FALSE;
- mg->mg_no_free_space = B_TRUE;
- mg->mg_allocators = allocators;
-
- mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
- sizeof (zfs_refcount_t), KM_SLEEP);
- mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
- sizeof (uint64_t), KM_SLEEP);
- for (int i = 0; i < allocators; i++) {
- zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
- mg->mg_cur_max_alloc_queue_depth[i] = 0;
- }
-
- mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
- minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
-
- return (mg);
-}
-
-void
-metaslab_group_destroy(metaslab_group_t *mg)
-{
- ASSERT(mg->mg_prev == NULL);
- ASSERT(mg->mg_next == NULL);
- /*
- * We may have gone below zero with the activation count
- * either because we never activated in the first place or
- * because we're done, and possibly removing the vdev.
- */
- ASSERT(mg->mg_activation_count <= 0);
-
- taskq_destroy(mg->mg_taskq);
- avl_destroy(&mg->mg_metaslab_tree);
- kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
- kmem_free(mg->mg_secondaries, mg->mg_allocators *
- sizeof (metaslab_t *));
- mutex_destroy(&mg->mg_lock);
- mutex_destroy(&mg->mg_ms_initialize_lock);
- cv_destroy(&mg->mg_ms_initialize_cv);
-
- for (int i = 0; i < mg->mg_allocators; i++) {
- zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
- mg->mg_cur_max_alloc_queue_depth[i] = 0;
- }
- kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
- sizeof (zfs_refcount_t));
- kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
- sizeof (uint64_t));
-
- kmem_free(mg, sizeof (metaslab_group_t));
-}
-
-void
-metaslab_group_activate(metaslab_group_t *mg)
-{
- metaslab_class_t *mc = mg->mg_class;
- metaslab_group_t *mgprev, *mgnext;
-
- ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
-
- ASSERT(mc->mc_rotor != mg);
- ASSERT(mg->mg_prev == NULL);
- ASSERT(mg->mg_next == NULL);
- ASSERT(mg->mg_activation_count <= 0);
-
- if (++mg->mg_activation_count <= 0)
- return;
-
- mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
- metaslab_group_alloc_update(mg);
-
- if ((mgprev = mc->mc_rotor) == NULL) {
- mg->mg_prev = mg;
- mg->mg_next = mg;
- } else {
- mgnext = mgprev->mg_next;
- mg->mg_prev = mgprev;
- mg->mg_next = mgnext;
- mgprev->mg_next = mg;
- mgnext->mg_prev = mg;
- }
- mc->mc_rotor = mg;
- metaslab_class_minblocksize_update(mc);
-}
-
-/*
- * Passivate a metaslab group and remove it from the allocation rotor.
- * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
- * a metaslab group. This function will momentarily drop spa_config_locks
- * that are lower than the SCL_ALLOC lock (see comment below).
- */
-void
-metaslab_group_passivate(metaslab_group_t *mg)
-{
- metaslab_class_t *mc = mg->mg_class;
- spa_t *spa = mc->mc_spa;
- metaslab_group_t *mgprev, *mgnext;
- int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
-
- ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
- (SCL_ALLOC | SCL_ZIO));
-
- if (--mg->mg_activation_count != 0) {
- ASSERT(mc->mc_rotor != mg);
- ASSERT(mg->mg_prev == NULL);
- ASSERT(mg->mg_next == NULL);
- ASSERT(mg->mg_activation_count < 0);
- return;
- }
-
- /*
- * The spa_config_lock is an array of rwlocks, ordered as
- * follows (from highest to lowest):
- * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
- * SCL_ZIO > SCL_FREE > SCL_VDEV
- * (For more information about the spa_config_lock see spa_misc.c)
- * The higher the lock, the broader its coverage. When we passivate
- * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
- * config locks. However, the metaslab group's taskq might be trying
- * to preload metaslabs so we must drop the SCL_ZIO lock and any
- * lower locks to allow the I/O to complete. At a minimum,
- * we continue to hold the SCL_ALLOC lock, which prevents any future
- * allocations from taking place and any changes to the vdev tree.
- */
- spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
- taskq_wait(mg->mg_taskq);
- spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
- metaslab_group_alloc_update(mg);
- for (int i = 0; i < mg->mg_allocators; i++) {
- metaslab_t *msp = mg->mg_primaries[i];
- if (msp != NULL) {
- mutex_enter(&msp->ms_lock);
- metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
- mutex_exit(&msp->ms_lock);
- }
- msp = mg->mg_secondaries[i];
- if (msp != NULL) {
- mutex_enter(&msp->ms_lock);
- metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
- mutex_exit(&msp->ms_lock);
- }
- }
-
- mgprev = mg->mg_prev;
- mgnext = mg->mg_next;
-
- if (mg == mgnext) {
- mc->mc_rotor = NULL;
- } else {
- mc->mc_rotor = mgnext;
- mgprev->mg_next = mgnext;
- mgnext->mg_prev = mgprev;
- }
-
- mg->mg_prev = NULL;
- mg->mg_next = NULL;
- metaslab_class_minblocksize_update(mc);
-}
-
-boolean_t
-metaslab_group_initialized(metaslab_group_t *mg)
-{
- vdev_t *vd = mg->mg_vd;
- vdev_stat_t *vs = &vd->vdev_stat;
-
- return (vs->vs_space != 0 && mg->mg_activation_count > 0);
-}
-
-uint64_t
-metaslab_group_get_space(metaslab_group_t *mg)
-{
- return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
-}
-
-void
-metaslab_group_histogram_verify(metaslab_group_t *mg)
-{
- uint64_t *mg_hist;
- vdev_t *vd = mg->mg_vd;
- uint64_t ashift = vd->vdev_ashift;
- int i;
-
- if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
- return;
-
- mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
- KM_SLEEP);
-
- ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
- SPACE_MAP_HISTOGRAM_SIZE + ashift);
-
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- ASSERT(msp != NULL);
-
- /* skip if not active or not a member */
- if (msp->ms_sm == NULL || msp->ms_group != mg)
- continue;
-
- for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
- mg_hist[i + ashift] +=
- msp->ms_sm->sm_phys->smp_histogram[i];
- }
-
- for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
- VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
-
- kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
-}
-
-static void
-metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
-{
- metaslab_class_t *mc = mg->mg_class;
- uint64_t ashift = mg->mg_vd->vdev_ashift;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- if (msp->ms_sm == NULL)
- return;
-
- mutex_enter(&mg->mg_lock);
- for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
- mg->mg_histogram[i + ashift] +=
- msp->ms_sm->sm_phys->smp_histogram[i];
- mc->mc_histogram[i + ashift] +=
- msp->ms_sm->sm_phys->smp_histogram[i];
- }
- mutex_exit(&mg->mg_lock);
-}
-
-void
-metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
-{
- metaslab_class_t *mc = mg->mg_class;
- uint64_t ashift = mg->mg_vd->vdev_ashift;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- if (msp->ms_sm == NULL)
- return;
-
- mutex_enter(&mg->mg_lock);
- for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
- ASSERT3U(mg->mg_histogram[i + ashift], >=,
- msp->ms_sm->sm_phys->smp_histogram[i]);
- ASSERT3U(mc->mc_histogram[i + ashift], >=,
- msp->ms_sm->sm_phys->smp_histogram[i]);
-
- mg->mg_histogram[i + ashift] -=
- msp->ms_sm->sm_phys->smp_histogram[i];
- mc->mc_histogram[i + ashift] -=
- msp->ms_sm->sm_phys->smp_histogram[i];
- }
- mutex_exit(&mg->mg_lock);
-}
-
-static void
-metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
-{
- ASSERT(msp->ms_group == NULL);
- mutex_enter(&mg->mg_lock);
- msp->ms_group = mg;
- msp->ms_weight = 0;
- avl_add(&mg->mg_metaslab_tree, msp);
- mutex_exit(&mg->mg_lock);
-
- mutex_enter(&msp->ms_lock);
- metaslab_group_histogram_add(mg, msp);
- mutex_exit(&msp->ms_lock);
-}
-
-static void
-metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
-{
- mutex_enter(&msp->ms_lock);
- metaslab_group_histogram_remove(mg, msp);
- mutex_exit(&msp->ms_lock);
-
- mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_group = NULL;
- mutex_exit(&mg->mg_lock);
-}
-
-static void
-metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
-{
- ASSERT(MUTEX_HELD(&mg->mg_lock));
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_weight = weight;
- avl_add(&mg->mg_metaslab_tree, msp);
-
-}
-
-static void
-metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
-{
- /*
- * Although in principle the weight can be any value, in
- * practice we do not use values in the range [1, 511].
- */
- ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- mutex_enter(&mg->mg_lock);
- metaslab_group_sort_impl(mg, msp, weight);
- mutex_exit(&mg->mg_lock);
-}
-
-/*
- * Calculate the fragmentation for a given metaslab group. We can use
- * a simple average here since all metaslabs within the group must have
- * the same size. The return value will be a value between 0 and 100
- * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
- * group have a fragmentation metric.
- */
-uint64_t
-metaslab_group_fragmentation(metaslab_group_t *mg)
-{
- vdev_t *vd = mg->mg_vd;
- uint64_t fragmentation = 0;
- uint64_t valid_ms = 0;
-
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
- continue;
- if (msp->ms_group != mg)
- continue;
-
- valid_ms++;
- fragmentation += msp->ms_fragmentation;
- }
-
- if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
- return (ZFS_FRAG_INVALID);
-
- fragmentation /= valid_ms;
- ASSERT3U(fragmentation, <=, 100);
- return (fragmentation);
-}
-
-/*
- * Determine if a given metaslab group should skip allocations. A metaslab
- * group should avoid allocations if its free capacity is less than the
- * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
- * zfs_mg_fragmentation_threshold and there is at least one metaslab group
- * that can still handle allocations. If the allocation throttle is enabled
- * then we skip allocations to devices that have reached their maximum
- * allocation queue depth unless the selected metaslab group is the only
- * eligible group remaining.
- */
-static boolean_t
-metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
- uint64_t psize, int allocator, int d)
-{
- spa_t *spa = mg->mg_vd->vdev_spa;
- metaslab_class_t *mc = mg->mg_class;
-
- /*
- * We can only consider skipping this metaslab group if it's
- * in the normal metaslab class and there are other metaslab
- * groups to select from. Otherwise, we always consider it eligible
- * for allocations.
- */
- if ((mc != spa_normal_class(spa) &&
- mc != spa_special_class(spa) &&
- mc != spa_dedup_class(spa)) ||
- mc->mc_groups <= 1)
- return (B_TRUE);
-
- /*
- * If the metaslab group's mg_allocatable flag is set (see comments
- * in metaslab_group_alloc_update() for more information) and
- * the allocation throttle is disabled then allow allocations to this
- * device. However, if the allocation throttle is enabled then
- * check if we have reached our allocation limit (mg_alloc_queue_depth)
- * to determine if we should allow allocations to this metaslab group.
- * If all metaslab groups are no longer considered allocatable
- * (mc_alloc_groups == 0) or we're trying to allocate the smallest
- * gang block size then we allow allocations on this metaslab group
- * regardless of the mg_allocatable or throttle settings.
- */
- if (mg->mg_allocatable) {
- metaslab_group_t *mgp;
- int64_t qdepth;
- uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
-
- if (!mc->mc_alloc_throttle_enabled)
- return (B_TRUE);
-
- /*
- * If this metaslab group does not have any free space, then
- * there is no point in looking further.
- */
- if (mg->mg_no_free_space)
- return (B_FALSE);
-
- /*
- * Relax allocation throttling for ditto blocks. Due to
- * random imbalances in allocation it tends to push copies
- * to one vdev, that looks a bit better at the moment.
- */
- qmax = qmax * (4 + d) / 4;
-
- qdepth = zfs_refcount_count(
- &mg->mg_alloc_queue_depth[allocator]);
-
- /*
- * If this metaslab group is below its qmax or it's
- * the only allocatable metasable group, then attempt
- * to allocate from it.
- */
- if (qdepth < qmax || mc->mc_alloc_groups == 1)
- return (B_TRUE);
- ASSERT3U(mc->mc_alloc_groups, >, 1);
-
- /*
- * Since this metaslab group is at or over its qmax, we
- * need to determine if there are metaslab groups after this
- * one that might be able to handle this allocation. This is
- * racy since we can't hold the locks for all metaslab
- * groups at the same time when we make this check.
- */
- for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
- qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
- qmax = qmax * (4 + d) / 4;
- qdepth = zfs_refcount_count(
- &mgp->mg_alloc_queue_depth[allocator]);
-
- /*
- * If there is another metaslab group that
- * might be able to handle the allocation, then
- * we return false so that we skip this group.
- */
- if (qdepth < qmax && !mgp->mg_no_free_space)
- return (B_FALSE);
- }
-
- /*
- * We didn't find another group to handle the allocation
- * so we can't skip this metaslab group even though
- * we are at or over our qmax.
- */
- return (B_TRUE);
-
- } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * ==========================================================================
- * Range tree callbacks
- * ==========================================================================
- */
-
-/*
- * Comparison function for the private size-ordered tree. Tree is sorted
- * by size, larger sizes at the end of the tree.
- */
-static int
-metaslab_rangesize_compare(const void *x1, const void *x2)
-{
- const range_seg_t *r1 = x1;
- const range_seg_t *r2 = x2;
- uint64_t rs_size1 = r1->rs_end - r1->rs_start;
- uint64_t rs_size2 = r2->rs_end - r2->rs_start;
-
- int cmp = AVL_CMP(rs_size1, rs_size2);
- if (likely(cmp))
- return (cmp);
-
- return (AVL_CMP(r1->rs_start, r2->rs_start));
-}
-
-/*
- * ==========================================================================
- * Common allocator routines
- * ==========================================================================
- */
-
-/*
- * Return the maximum contiguous segment within the metaslab.
- */
-uint64_t
-metaslab_block_maxsize(metaslab_t *msp)
-{
- avl_tree_t *t = &msp->ms_allocatable_by_size;
- range_seg_t *rs;
-
- if (t == NULL || (rs = avl_last(t)) == NULL)
- return (0ULL);
-
- return (rs->rs_end - rs->rs_start);
-}
-
-static range_seg_t *
-metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
-{
- range_seg_t *rs, rsearch;
- avl_index_t where;
-
- rsearch.rs_start = start;
- rsearch.rs_end = start + size;
-
- rs = avl_find(t, &rsearch, &where);
- if (rs == NULL) {
- rs = avl_nearest(t, where, AVL_AFTER);
- }
-
- return (rs);
-}
-
-/*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
- */
-static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
- uint64_t align)
-{
- range_seg_t *rs = metaslab_block_find(t, *cursor, size);
-
- while (rs != NULL) {
- uint64_t offset = P2ROUNDUP(rs->rs_start, align);
-
- if (offset + size <= rs->rs_end) {
- *cursor = offset + size;
- return (offset);
- }
- rs = AVL_NEXT(t, rs);
- }
-
- /*
- * If we know we've searched the whole map (*cursor == 0), give up.
- * Otherwise, reset the cursor to the beginning and try again.
- */
- if (*cursor == 0)
- return (-1ULL);
-
- *cursor = 0;
- return (metaslab_block_picker(t, cursor, size, align));
-}
-
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
-static uint64_t
-metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
-{
- /*
- * Find the largest power of 2 block size that evenly divides the
- * requested size. This is used to try to allocate blocks with similar
- * alignment from the same area of the metaslab (i.e. same cursor
- * bucket) but it does not guarantee that other allocations sizes
- * may exist in the same region.
- */
- uint64_t align = size & -size;
- uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
- avl_tree_t *t = &msp->ms_allocatable->rt_root;
-
- return (metaslab_block_picker(t, cursor, size, align));
-}
-
-static metaslab_ops_t metaslab_ff_ops = {
- metaslab_ff_alloc
-};
-
-/*
- * ==========================================================================
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
- * ==========================================================================
- */
-static uint64_t
-metaslab_df_alloc(metaslab_t *msp, uint64_t size)
-{
- /*
- * Find the largest power of 2 block size that evenly divides the
- * requested size. This is used to try to allocate blocks with similar
- * alignment from the same area of the metaslab (i.e. same cursor
- * bucket) but it does not guarantee that other allocations sizes
- * may exist in the same region.
- */
- uint64_t align = size & -size;
- uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
- range_tree_t *rt = msp->ms_allocatable;
- avl_tree_t *t = &rt->rt_root;
- uint64_t max_size = metaslab_block_maxsize(msp);
- int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==,
- avl_numnodes(&msp->ms_allocatable_by_size));
-
- if (max_size < size)
- return (-1ULL);
-
- /*
- * If we're running low on space switch to using the size
- * sorted AVL tree (best-fit).
- */
- if (max_size < metaslab_df_alloc_threshold ||
- free_pct < metaslab_df_free_pct) {
- t = &msp->ms_allocatable_by_size;
- *cursor = 0;
- }
-
- return (metaslab_block_picker(t, cursor, size, 1ULL));
-}
-
-static metaslab_ops_t metaslab_df_ops = {
- metaslab_df_alloc
-};
-
-/*
- * ==========================================================================
- * Cursor fit block allocator -
- * Select the largest region in the metaslab, set the cursor to the beginning
- * of the range and the cursor_end to the end of the range. As allocations
- * are made advance the cursor. Continue allocating from the cursor until
- * the range is exhausted and then find a new range.
- * ==========================================================================
- */
-static uint64_t
-metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
-{
- range_tree_t *rt = msp->ms_allocatable;
- avl_tree_t *t = &msp->ms_allocatable_by_size;
- uint64_t *cursor = &msp->ms_lbas[0];
- uint64_t *cursor_end = &msp->ms_lbas[1];
- uint64_t offset = 0;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
-
- ASSERT3U(*cursor_end, >=, *cursor);
-
- if ((*cursor + size) > *cursor_end) {
- range_seg_t *rs;
-
- rs = avl_last(&msp->ms_allocatable_by_size);
- if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
- return (-1ULL);
-
- *cursor = rs->rs_start;
- *cursor_end = rs->rs_end;
- }
-
- offset = *cursor;
- *cursor += size;
-
- return (offset);
-}
-
-static metaslab_ops_t metaslab_cf_ops = {
- metaslab_cf_alloc
-};
-
-/*
- * ==========================================================================
- * New dynamic fit allocator -
- * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
- * contiguous blocks. If no region is found then just use the largest segment
- * that remains.
- * ==========================================================================
- */
-
-/*
- * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
- * to request from the allocator.
- */
-uint64_t metaslab_ndf_clump_shift = 4;
-
-static uint64_t
-metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
-{
- avl_tree_t *t = &msp->ms_allocatable->rt_root;
- avl_index_t where;
- range_seg_t *rs, rsearch;
- uint64_t hbit = highbit64(size);
- uint64_t *cursor = &msp->ms_lbas[hbit - 1];
- uint64_t max_size = metaslab_block_maxsize(msp);
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==,
- avl_numnodes(&msp->ms_allocatable_by_size));
-
- if (max_size < size)
- return (-1ULL);
-
- rsearch.rs_start = *cursor;
- rsearch.rs_end = *cursor + size;
-
- rs = avl_find(t, &rsearch, &where);
- if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
- t = &msp->ms_allocatable_by_size;
-
- rsearch.rs_start = 0;
- rsearch.rs_end = MIN(max_size,
- 1ULL << (hbit + metaslab_ndf_clump_shift));
- rs = avl_find(t, &rsearch, &where);
- if (rs == NULL)
- rs = avl_nearest(t, where, AVL_AFTER);
- ASSERT(rs != NULL);
- }
-
- if ((rs->rs_end - rs->rs_start) >= size) {
- *cursor = rs->rs_start + size;
- return (rs->rs_start);
- }
- return (-1ULL);
-}
-
-static metaslab_ops_t metaslab_ndf_ops = {
- metaslab_ndf_alloc
-};
-
-metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
-
-/*
- * ==========================================================================
- * Metaslabs
- * ==========================================================================
- */
-
-static void
-metaslab_aux_histograms_clear(metaslab_t *msp)
-{
- /*
- * Auxiliary histograms are only cleared when resetting them,
- * which can only happen while the metaslab is loaded.
- */
- ASSERT(msp->ms_loaded);
-
- bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
- for (int t = 0; t < TXG_DEFER_SIZE; t++)
- bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
-}
-
-static void
-metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
- range_tree_t *rt)
-{
- /*
- * This is modeled after space_map_histogram_add(), so refer to that
- * function for implementation details. We want this to work like
- * the space map histogram, and not the range tree histogram, as we
- * are essentially constructing a delta that will be later subtracted
- * from the space map histogram.
- */
- int idx = 0;
- for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
- ASSERT3U(i, >=, idx + shift);
- histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
-
- if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
- ASSERT3U(idx + shift, ==, i);
- idx++;
- ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
- }
- }
-}
-
-/*
- * Called at every sync pass that the metaslab gets synced.
- *
- * The reason is that we want our auxiliary histograms to be updated
- * wherever the metaslab's space map histogram is updated. This way
- * we stay consistent on which parts of the metaslab space map's
- * histogram are currently not available for allocations (e.g because
- * they are in the defer, freed, and freeing trees).
- */
-static void
-metaslab_aux_histograms_update(metaslab_t *msp)
-{
- space_map_t *sm = msp->ms_sm;
- ASSERT(sm != NULL);
-
- /*
- * This is similar to the metaslab's space map histogram updates
- * that take place in metaslab_sync(). The only difference is that
- * we only care about segments that haven't made it into the
- * ms_allocatable tree yet.
- */
- if (msp->ms_loaded) {
- metaslab_aux_histograms_clear(msp);
-
- metaslab_aux_histogram_add(msp->ms_synchist,
- sm->sm_shift, msp->ms_freed);
-
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- metaslab_aux_histogram_add(msp->ms_deferhist[t],
- sm->sm_shift, msp->ms_defer[t]);
- }
- }
-
- metaslab_aux_histogram_add(msp->ms_synchist,
- sm->sm_shift, msp->ms_freeing);
-}
-
-/*
- * Called every time we are done syncing (writing to) the metaslab,
- * i.e. at the end of each sync pass.
- * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
- */
-static void
-metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
-{
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- space_map_t *sm = msp->ms_sm;
-
- if (sm == NULL) {
- /*
- * We came here from metaslab_init() when creating/opening a
- * pool, looking at a metaslab that hasn't had any allocations
- * yet.
- */
- return;
- }
-
- /*
- * This is similar to the actions that we take for the ms_freed
- * and ms_defer trees in metaslab_sync_done().
- */
- uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
- if (defer_allowed) {
- bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
- sizeof (msp->ms_synchist));
- } else {
- bzero(msp->ms_deferhist[hist_index],
- sizeof (msp->ms_deferhist[hist_index]));
- }
- bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
-}
-
-/*
- * Ensure that the metaslab's weight and fragmentation are consistent
- * with the contents of the histogram (either the range tree's histogram
- * or the space map's depending whether the metaslab is loaded).
- */
-static void
-metaslab_verify_weight_and_frag(metaslab_t *msp)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
- return;
-
- /* see comment in metaslab_verify_unflushed_changes() */
- if (msp->ms_group == NULL)
- return;
-
- /*
- * Devices being removed always return a weight of 0 and leave
- * fragmentation and ms_max_size as is - there is nothing for
- * us to verify here.
- */
- vdev_t *vd = msp->ms_group->mg_vd;
- if (vd->vdev_removing)
- return;
-
- /*
- * If the metaslab is dirty it probably means that we've done
- * some allocations or frees that have changed our histograms
- * and thus the weight.
- */
- for (int t = 0; t < TXG_SIZE; t++) {
- if (txg_list_member(&vd->vdev_ms_list, msp, t))
- return;
- }
-
- /*
- * This verification checks that our in-memory state is consistent
- * with what's on disk. If the pool is read-only then there aren't
- * any changes and we just have the initially-loaded state.
- */
- if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
- return;
-
- /* some extra verification for in-core tree if you can */
- if (msp->ms_loaded) {
- range_tree_stat_verify(msp->ms_allocatable);
- VERIFY(space_map_histogram_verify(msp->ms_sm,
- msp->ms_allocatable));
- }
-
- uint64_t weight = msp->ms_weight;
- uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
- boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
- uint64_t frag = msp->ms_fragmentation;
- uint64_t max_segsize = msp->ms_max_size;
-
- msp->ms_weight = 0;
- msp->ms_fragmentation = 0;
- msp->ms_max_size = 0;
-
- /*
- * This function is used for verification purposes. Regardless of
- * whether metaslab_weight() thinks this metaslab should be active or
- * not, we want to ensure that the actual weight (and therefore the
- * value of ms_weight) would be the same if it was to be recalculated
- * at this point.
- */
- msp->ms_weight = metaslab_weight(msp) | was_active;
-
- VERIFY3U(max_segsize, ==, msp->ms_max_size);
-
- /*
- * If the weight type changed then there is no point in doing
- * verification. Revert fields to their original values.
- */
- if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
- (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
- msp->ms_fragmentation = frag;
- msp->ms_weight = weight;
- return;
- }
-
- VERIFY3U(msp->ms_fragmentation, ==, frag);
- VERIFY3U(msp->ms_weight, ==, weight);
-}
-
-/*
- * Wait for any in-progress metaslab loads to complete.
- */
-static void
-metaslab_load_wait(metaslab_t *msp)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- while (msp->ms_loading) {
- ASSERT(!msp->ms_loaded);
- cv_wait(&msp->ms_load_cv, &msp->ms_lock);
- }
-}
-
-static int
-metaslab_load_impl(metaslab_t *msp)
-{
- int error = 0;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(msp->ms_loading);
- ASSERT(!msp->ms_condensing);
-
- /*
- * We temporarily drop the lock to unblock other operations while we
- * are reading the space map. Therefore, metaslab_sync() and
- * metaslab_sync_done() can run at the same time as we do.
- *
- * metaslab_sync() can append to the space map while we are loading.
- * Therefore we load only entries that existed when we started the
- * load. Additionally, metaslab_sync_done() has to wait for the load
- * to complete because there are potential races like metaslab_load()
- * loading parts of the space map that are currently being appended
- * by metaslab_sync(). If we didn't, the ms_allocatable would have
- * entries that metaslab_sync_done() would try to re-add later.
- *
- * That's why before dropping the lock we remember the synced length
- * of the metaslab and read up to that point of the space map,
- * ignoring entries appended by metaslab_sync() that happen after we
- * drop the lock.
- */
- uint64_t length = msp->ms_synced_length;
- mutex_exit(&msp->ms_lock);
-
- if (msp->ms_sm != NULL) {
- error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
- SM_FREE, length);
- } else {
- /*
- * The space map has not been allocated yet, so treat
- * all the space in the metaslab as free and add it to the
- * ms_allocatable tree.
- */
- range_tree_add(msp->ms_allocatable,
- msp->ms_start, msp->ms_size);
- }
-
- /*
- * We need to grab the ms_sync_lock to prevent metaslab_sync() from
- * changing the ms_sm and the metaslab's range trees while we are
- * about to use them and populate the ms_allocatable. The ms_lock
- * is insufficient for this because metaslab_sync() doesn't hold
- * the ms_lock while writing the ms_checkpointing tree to disk.
- */
- mutex_enter(&msp->ms_sync_lock);
- mutex_enter(&msp->ms_lock);
- ASSERT(!msp->ms_condensing);
-
- if (error != 0) {
- mutex_exit(&msp->ms_sync_lock);
- return (error);
- }
-
- ASSERT3P(msp->ms_group, !=, NULL);
- msp->ms_loaded = B_TRUE;
-
- /*
- * The ms_allocatable contains the segments that exist in the
- * ms_defer trees [see ms_synced_length]. Thus we need to remove
- * them from ms_allocatable as they will be added again in
- * metaslab_sync_done().
- */
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defer[t],
- range_tree_remove, msp->ms_allocatable);
- }
-
- /*
- * Call metaslab_recalculate_weight_and_sort() now that the
- * metaslab is loaded so we get the metaslab's real weight.
- *
- * Unless this metaslab was created with older software and
- * has not yet been converted to use segment-based weight, we
- * expect the new weight to be better or equal to the weight
- * that the metaslab had while it was not loaded. This is
- * because the old weight does not take into account the
- * consolidation of adjacent segments between TXGs. [see
- * comment for ms_synchist and ms_deferhist[] for more info]
- */
- uint64_t weight = msp->ms_weight;
- metaslab_recalculate_weight_and_sort(msp);
- if (!WEIGHT_IS_SPACEBASED(weight))
- ASSERT3U(weight, <=, msp->ms_weight);
- msp->ms_max_size = metaslab_block_maxsize(msp);
-
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- metaslab_verify_space(msp, spa_syncing_txg(spa));
- mutex_exit(&msp->ms_sync_lock);
-
- return (0);
-}
-
-int
-metaslab_load(metaslab_t *msp)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- /*
- * There may be another thread loading the same metaslab, if that's
- * the case just wait until the other thread is done and return.
- */
- metaslab_load_wait(msp);
- if (msp->ms_loaded)
- return (0);
- VERIFY(!msp->ms_loading);
- ASSERT(!msp->ms_condensing);
-
- msp->ms_loading = B_TRUE;
- int error = metaslab_load_impl(msp);
- msp->ms_loading = B_FALSE;
- cv_broadcast(&msp->ms_load_cv);
-
- return (error);
-}
-
-void
-metaslab_unload(metaslab_t *msp)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- metaslab_verify_weight_and_frag(msp);
-
- range_tree_vacate(msp->ms_allocatable, NULL, NULL);
- msp->ms_loaded = B_FALSE;
-
- msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
- msp->ms_max_size = 0;
-
- /*
- * We explicitly recalculate the metaslab's weight based on its space
- * map (as it is now not loaded). We want unload metaslabs to always
- * have their weights calculated from the space map histograms, while
- * loaded ones have it calculated from their in-core range tree
- * [see metaslab_load()]. This way, the weight reflects the information
- * available in-core, whether it is loaded or not
- *
- * If ms_group == NULL means that we came here from metaslab_fini(),
- * at which point it doesn't make sense for us to do the recalculation
- * and the sorting.
- */
- if (msp->ms_group != NULL)
- metaslab_recalculate_weight_and_sort(msp);
-}
-
-static void
-metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
- int64_t defer_delta, int64_t space_delta)
-{
- vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
-
- ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
- vdev_deflated_space(vd, space_delta));
-}
-
-int
-metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
- metaslab_t **msp)
-{
- vdev_t *vd = mg->mg_vd;
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- metaslab_t *ms;
- int error;
-
- ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
- mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
-
- ms->ms_id = id;
- ms->ms_start = id << vd->vdev_ms_shift;
- ms->ms_size = 1ULL << vd->vdev_ms_shift;
- ms->ms_allocator = -1;
- ms->ms_new = B_TRUE;
-
- /*
- * We only open space map objects that already exist. All others
- * will be opened when we finally allocate an object for it.
- *
- * Note:
- * When called from vdev_expand(), we can't call into the DMU as
- * we are holding the spa_config_lock as a writer and we would
- * deadlock [see relevant comment in vdev_metaslab_init()]. in
- * that case, the object parameter is zero though, so we won't
- * call into the DMU.
- */
- if (object != 0) {
- error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
- ms->ms_size, vd->vdev_ashift);
-
- if (error != 0) {
- kmem_free(ms, sizeof (metaslab_t));
- return (error);
- }
-
- ASSERT(ms->ms_sm != NULL);
- ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
- ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
- }
-
- /*
- * We create the ms_allocatable here, but we don't create the
- * other range trees until metaslab_sync_done(). This serves
- * two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that
- * we'd data fault on any attempt to use this metaslab before
- * it's ready.
- */
- ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
- metaslab_rangesize_compare, 0);
- metaslab_group_add(mg, ms);
-
- metaslab_set_fragmentation(ms);
-
- /*
- * If we're opening an existing pool (txg == 0) or creating
- * a new one (txg == TXG_INITIAL), all space is available now.
- * If we're adding space to an existing pool, the new space
- * does not become available until after this txg has synced.
- * The metaslab's weight will also be initialized when we sync
- * out this txg. This ensures that we don't attempt to allocate
- * from it before we have initialized it completely.
- */
- if (txg <= TXG_INITIAL) {
- metaslab_sync_done(ms, 0);
- metaslab_space_update(vd, mg->mg_class,
- metaslab_allocated_space(ms), 0, 0);
- }
-
- /*
- * If metaslab_debug_load is set and we're initializing a metaslab
- * that has an allocated space map object then load the space map
- * so that we can verify frees.
- */
- if (metaslab_debug_load && ms->ms_sm != NULL) {
- mutex_enter(&ms->ms_lock);
- VERIFY0(metaslab_load(ms));
- mutex_exit(&ms->ms_lock);
- }
-
- if (txg != 0) {
- vdev_dirty(vd, 0, NULL, txg);
- vdev_dirty(vd, VDD_METASLAB, ms, txg);
- }
-
- *msp = ms;
-
- return (0);
-}
-
-void
-metaslab_fini(metaslab_t *msp)
-{
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
-
- metaslab_group_remove(mg, msp);
-
- mutex_enter(&msp->ms_lock);
- VERIFY(msp->ms_group == NULL);
- metaslab_space_update(vd, mg->mg_class,
- -metaslab_allocated_space(msp), 0, -msp->ms_size);
-
- space_map_close(msp->ms_sm);
-
- metaslab_unload(msp);
-
- range_tree_destroy(msp->ms_allocatable);
- range_tree_destroy(msp->ms_freeing);
- range_tree_destroy(msp->ms_freed);
-
- for (int t = 0; t < TXG_SIZE; t++) {
- range_tree_destroy(msp->ms_allocating[t]);
- }
-
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_destroy(msp->ms_defer[t]);
- }
- ASSERT0(msp->ms_deferspace);
-
- range_tree_destroy(msp->ms_checkpointing);
-
- for (int t = 0; t < TXG_SIZE; t++)
- ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
-
- mutex_exit(&msp->ms_lock);
- cv_destroy(&msp->ms_load_cv);
- mutex_destroy(&msp->ms_lock);
- mutex_destroy(&msp->ms_sync_lock);
- ASSERT3U(msp->ms_allocator, ==, -1);
-
- kmem_free(msp, sizeof (metaslab_t));
-}
-
-#define FRAGMENTATION_TABLE_SIZE 17
-
-/*
- * This table defines a segment size based fragmentation metric that will
- * allow each metaslab to derive its own fragmentation value. This is done
- * by calculating the space in each bucket of the spacemap histogram and
- * multiplying that by the fragmentation metric in this table. Doing
- * this for all buckets and dividing it by the total amount of free
- * space in this metaslab (i.e. the total free space in all buckets) gives
- * us the fragmentation metric. This means that a high fragmentation metric
- * equates to most of the free space being comprised of small segments.
- * Conversely, if the metric is low, then most of the free space is in
- * large segments. A 10% change in fragmentation equates to approximately
- * double the number of segments.
- *
- * This table defines 0% fragmented space using 16MB segments. Testing has
- * shown that segments that are greater than or equal to 16MB do not suffer
- * from drastic performance problems. Using this value, we derive the rest
- * of the table. Since the fragmentation value is never stored on disk, it
- * is possible to change these calculations in the future.
- */
-int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
- 100, /* 512B */
- 100, /* 1K */
- 98, /* 2K */
- 95, /* 4K */
- 90, /* 8K */
- 80, /* 16K */
- 70, /* 32K */
- 60, /* 64K */
- 50, /* 128K */
- 40, /* 256K */
- 30, /* 512K */
- 20, /* 1M */
- 15, /* 2M */
- 10, /* 4M */
- 5, /* 8M */
- 0 /* 16M */
-};
-
-/*
- * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
- * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
- * been upgraded and does not support this metric. Otherwise, the return
- * value should be in the range [0, 100].
- */
-static void
-metaslab_set_fragmentation(metaslab_t *msp)
-{
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- uint64_t fragmentation = 0;
- uint64_t total = 0;
- boolean_t feature_enabled = spa_feature_is_enabled(spa,
- SPA_FEATURE_SPACEMAP_HISTOGRAM);
-
- if (!feature_enabled) {
- msp->ms_fragmentation = ZFS_FRAG_INVALID;
- return;
- }
-
- /*
- * A null space map means that the entire metaslab is free
- * and thus is not fragmented.
- */
- if (msp->ms_sm == NULL) {
- msp->ms_fragmentation = 0;
- return;
- }
-
- /*
- * If this metaslab's space map has not been upgraded, flag it
- * so that we upgrade next time we encounter it.
- */
- if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
- uint64_t txg = spa_syncing_txg(spa);
- vdev_t *vd = msp->ms_group->mg_vd;
-
- /*
- * If we've reached the final dirty txg, then we must
- * be shutting down the pool. We don't want to dirty
- * any data past this point so skip setting the condense
- * flag. We can retry this action the next time the pool
- * is imported.
- */
- if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
- msp->ms_condense_wanted = B_TRUE;
- vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
- zfs_dbgmsg("txg %llu, requesting force condense: "
- "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
- vd->vdev_id);
- }
- msp->ms_fragmentation = ZFS_FRAG_INVALID;
- return;
- }
-
- for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
- uint64_t space = 0;
- uint8_t shift = msp->ms_sm->sm_shift;
-
- int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
- FRAGMENTATION_TABLE_SIZE - 1);
-
- if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
- continue;
-
- space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
- total += space;
-
- ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
- fragmentation += space * zfs_frag_table[idx];
- }
-
- if (total > 0)
- fragmentation /= total;
- ASSERT3U(fragmentation, <=, 100);
-
- msp->ms_fragmentation = fragmentation;
-}
-
-/*
- * Compute a weight -- a selection preference value -- for the given metaslab.
- * This is based on the amount of free space, the level of fragmentation,
- * the LBA range, and whether the metaslab is loaded.
- */
-static uint64_t
-metaslab_space_weight(metaslab_t *msp)
-{
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
- uint64_t weight, space;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(!vd->vdev_removing);
-
- /*
- * The baseline weight is the metaslab's free space.
- */
- space = msp->ms_size - metaslab_allocated_space(msp);
-
- if (metaslab_fragmentation_factor_enabled &&
- msp->ms_fragmentation != ZFS_FRAG_INVALID) {
- /*
- * Use the fragmentation information to inversely scale
- * down the baseline weight. We need to ensure that we
- * don't exclude this metaslab completely when it's 100%
- * fragmented. To avoid this we reduce the fragmented value
- * by 1.
- */
- space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
-
- /*
- * If space < SPA_MINBLOCKSIZE, then we will not allocate from
- * this metaslab again. The fragmentation metric may have
- * decreased the space to something smaller than
- * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
- * so that we can consume any remaining space.
- */
- if (space > 0 && space < SPA_MINBLOCKSIZE)
- space = SPA_MINBLOCKSIZE;
- }
- weight = space;
-
- /*
- * Modern disks have uniform bit density and constant angular velocity.
- * Therefore, the outer recording zones are faster (higher bandwidth)
- * than the inner zones by the ratio of outer to inner track diameter,
- * which is typically around 2:1. We account for this by assigning
- * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
- * In effect, this means that we'll select the metaslab with the most
- * free bandwidth rather than simply the one with the most free space.
- */
- if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
- weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
- ASSERT(weight >= space && weight <= 2 * space);
- }
-
- /*
- * If this metaslab is one we're actively using, adjust its
- * weight to make it preferable to any inactive metaslab so
- * we'll polish it off. If the fragmentation on this metaslab
- * has exceed our threshold, then don't mark it active.
- */
- if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
- msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
- weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
- }
-
- WEIGHT_SET_SPACEBASED(weight);
- return (weight);
-}
-
-/*
- * Return the weight of the specified metaslab, according to the segment-based
- * weighting algorithm. The metaslab must be loaded. This function can
- * be called within a sync pass since it relies only on the metaslab's
- * range tree which is always accurate when the metaslab is loaded.
- */
-static uint64_t
-metaslab_weight_from_range_tree(metaslab_t *msp)
-{
- uint64_t weight = 0;
- uint32_t segments = 0;
-
- ASSERT(msp->ms_loaded);
-
- for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
- i--) {
- uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
- int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
-
- segments <<= 1;
- segments += msp->ms_allocatable->rt_histogram[i];
-
- /*
- * The range tree provides more precision than the space map
- * and must be downgraded so that all values fit within the
- * space map's histogram. This allows us to compare loaded
- * vs. unloaded metaslabs to determine which metaslab is
- * considered "best".
- */
- if (i > max_idx)
- continue;
-
- if (segments != 0) {
- WEIGHT_SET_COUNT(weight, segments);
- WEIGHT_SET_INDEX(weight, i);
- WEIGHT_SET_ACTIVE(weight, 0);
- break;
- }
- }
- return (weight);
-}
-
-/*
- * Calculate the weight based on the on-disk histogram. This should only
- * be called after a sync pass has completely finished since the on-disk
- * information is updated in metaslab_sync().
- */
-static uint64_t
-metaslab_weight_from_spacemap(metaslab_t *msp)
-{
- space_map_t *sm = msp->ms_sm;
- ASSERT(!msp->ms_loaded);
- ASSERT(sm != NULL);
- ASSERT3U(space_map_object(sm), !=, 0);
- ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
-
- /*
- * Create a joint histogram from all the segments that have made
- * it to the metaslab's space map histogram, that are not yet
- * available for allocation because they are still in the freeing
- * pipeline (e.g. freeing, freed, and defer trees). Then subtract
- * these segments from the space map's histogram to get a more
- * accurate weight.
- */
- uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
- for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
- deferspace_histogram[i] += msp->ms_synchist[i];
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
- deferspace_histogram[i] += msp->ms_deferhist[t][i];
- }
- }
-
- uint64_t weight = 0;
- for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
- ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
- deferspace_histogram[i]);
- uint64_t count =
- sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
- if (count != 0) {
- WEIGHT_SET_COUNT(weight, count);
- WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
- WEIGHT_SET_ACTIVE(weight, 0);
- break;
- }
- }
- return (weight);
-}
-
-/*
- * Compute a segment-based weight for the specified metaslab. The weight
- * is determined by highest bucket in the histogram. The information
- * for the highest bucket is encoded into the weight value.
- */
-static uint64_t
-metaslab_segment_weight(metaslab_t *msp)
-{
- metaslab_group_t *mg = msp->ms_group;
- uint64_t weight = 0;
- uint8_t shift = mg->mg_vd->vdev_ashift;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- /*
- * The metaslab is completely free.
- */
- if (metaslab_allocated_space(msp) == 0) {
- int idx = highbit64(msp->ms_size) - 1;
- int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
-
- if (idx < max_idx) {
- WEIGHT_SET_COUNT(weight, 1ULL);
- WEIGHT_SET_INDEX(weight, idx);
- } else {
- WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
- WEIGHT_SET_INDEX(weight, max_idx);
- }
- WEIGHT_SET_ACTIVE(weight, 0);
- ASSERT(!WEIGHT_IS_SPACEBASED(weight));
-
- return (weight);
- }
-
- ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
-
- /*
- * If the metaslab is fully allocated then just make the weight 0.
- */
- if (metaslab_allocated_space(msp) == msp->ms_size)
- return (0);
- /*
- * If the metaslab is already loaded, then use the range tree to
- * determine the weight. Otherwise, we rely on the space map information
- * to generate the weight.
- */
- if (msp->ms_loaded) {
- weight = metaslab_weight_from_range_tree(msp);
- } else {
- weight = metaslab_weight_from_spacemap(msp);
- }
-
- /*
- * If the metaslab was active the last time we calculated its weight
- * then keep it active. We want to consume the entire region that
- * is associated with this weight.
- */
- if (msp->ms_activation_weight != 0 && weight != 0)
- WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
- return (weight);
-}
-
-/*
- * Determine if we should attempt to allocate from this metaslab. If the
- * metaslab has a maximum size then we can quickly determine if the desired
- * allocation size can be satisfied. Otherwise, if we're using segment-based
- * weighting then we can determine the maximum allocation that this metaslab
- * can accommodate based on the index encoded in the weight. If we're using
- * space-based weights then rely on the entire weight (excluding the weight
- * type bit).
- */
-boolean_t
-metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
-{
- boolean_t should_allocate;
-
- if (msp->ms_max_size != 0)
- return (msp->ms_max_size >= asize);
-
- if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
- /*
- * The metaslab segment weight indicates segments in the
- * range [2^i, 2^(i+1)), where i is the index in the weight.
- * Since the asize might be in the middle of the range, we
- * should attempt the allocation if asize < 2^(i+1).
- */
- should_allocate = (asize <
- 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
- } else {
- should_allocate = (asize <=
- (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
- }
- return (should_allocate);
-}
-
-static uint64_t
-metaslab_weight(metaslab_t *msp)
-{
- vdev_t *vd = msp->ms_group->mg_vd;
- spa_t *spa = vd->vdev_spa;
- uint64_t weight;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- /*
- * If this vdev is in the process of being removed, there is nothing
- * for us to do here.
- */
- if (vd->vdev_removing)
- return (0);
-
- metaslab_set_fragmentation(msp);
-
- /*
- * Update the maximum size if the metaslab is loaded. This will
- * ensure that we get an accurate maximum size if newly freed space
- * has been added back into the free tree.
- */
- if (msp->ms_loaded)
- msp->ms_max_size = metaslab_block_maxsize(msp);
- else
- ASSERT0(msp->ms_max_size);
-
- /*
- * Segment-based weighting requires space map histogram support.
- */
- if (zfs_metaslab_segment_weight_enabled &&
- spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
- (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
- sizeof (space_map_phys_t))) {
- weight = metaslab_segment_weight(msp);
- } else {
- weight = metaslab_space_weight(msp);
- }
- return (weight);
-}
-
-void
-metaslab_recalculate_weight_and_sort(metaslab_t *msp)
-{
- /* note: we preserve the mask (e.g. indication of primary, etc..) */
- uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
- metaslab_group_sort(msp->ms_group, msp,
- metaslab_weight(msp) | was_active);
-}
-
-static int
-metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
- int allocator, uint64_t activation_weight)
-{
- /*
- * If we're activating for the claim code, we don't want to actually
- * set the metaslab up for a specific allocator.
- */
- if (activation_weight == METASLAB_WEIGHT_CLAIM)
- return (0);
- metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
- mg->mg_primaries : mg->mg_secondaries);
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- mutex_enter(&mg->mg_lock);
- if (arr[allocator] != NULL) {
- mutex_exit(&mg->mg_lock);
- return (EEXIST);
- }
-
- arr[allocator] = msp;
- ASSERT3S(msp->ms_allocator, ==, -1);
- msp->ms_allocator = allocator;
- msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
- mutex_exit(&mg->mg_lock);
-
- return (0);
-}
-
-static int
-metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = metaslab_load(msp);
- if (error != 0) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
- }
- if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
- /*
- * The metaslab was activated for another allocator
- * while we were waiting, we should reselect.
- */
- return (EBUSY);
- }
- if ((error = metaslab_activate_allocator(msp->ms_group, msp,
- allocator, activation_weight)) != 0) {
- return (error);
- }
-
- msp->ms_activation_weight = msp->ms_weight;
- metaslab_group_sort(msp->ms_group, msp,
- msp->ms_weight | activation_weight);
- }
- ASSERT(msp->ms_loaded);
- ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
-
- return (0);
-}
-
-static void
-metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
- uint64_t weight)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
- metaslab_group_sort(mg, msp, weight);
- return;
- }
-
- mutex_enter(&mg->mg_lock);
- ASSERT3P(msp->ms_group, ==, mg);
- if (msp->ms_primary) {
- ASSERT3U(0, <=, msp->ms_allocator);
- ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
- ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
- ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
- mg->mg_primaries[msp->ms_allocator] = NULL;
- } else {
- ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
- ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
- mg->mg_secondaries[msp->ms_allocator] = NULL;
- }
- msp->ms_allocator = -1;
- metaslab_group_sort_impl(mg, msp, weight);
- mutex_exit(&mg->mg_lock);
-}
-
-static void
-metaslab_passivate(metaslab_t *msp, uint64_t weight)
-{
- uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
-
- /*
- * If size < SPA_MINBLOCKSIZE, then we will not allocate from
- * this metaslab again. In that case, it had better be empty,
- * or we would be leaving space on the table.
- */
- ASSERT(size >= SPA_MINBLOCKSIZE ||
- range_tree_is_empty(msp->ms_allocatable));
- ASSERT0(weight & METASLAB_ACTIVE_MASK);
-
- msp->ms_activation_weight = 0;
- metaslab_passivate_allocator(msp->ms_group, msp, weight);
- ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
-}
-
-/*
- * Segment-based metaslabs are activated once and remain active until
- * we either fail an allocation attempt (similar to space-based metaslabs)
- * or have exhausted the free space in zfs_metaslab_switch_threshold
- * buckets since the metaslab was activated. This function checks to see
- * if we've exhaused the zfs_metaslab_switch_threshold buckets in the
- * metaslab and passivates it proactively. This will allow us to select a
- * metaslabs with larger contiguous region if any remaining within this
- * metaslab group. If we're in sync pass > 1, then we continue using this
- * metaslab so that we don't dirty more block and cause more sync passes.
- */
-void
-metaslab_segment_may_passivate(metaslab_t *msp)
-{
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
-
- if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
- return;
-
- /*
- * Since we are in the middle of a sync pass, the most accurate
- * information that is accessible to us is the in-core range tree
- * histogram; calculate the new weight based on that information.
- */
- uint64_t weight = metaslab_weight_from_range_tree(msp);
- int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
- int current_idx = WEIGHT_GET_INDEX(weight);
-
- if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
- metaslab_passivate(msp, weight);
-}
-
-static void
-metaslab_preload(void *arg)
-{
- metaslab_t *msp = arg;
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
-
- ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
-
- mutex_enter(&msp->ms_lock);
- (void) metaslab_load(msp);
- msp->ms_selected_txg = spa_syncing_txg(spa);
- mutex_exit(&msp->ms_lock);
-}
-
-static void
-metaslab_group_preload(metaslab_group_t *mg)
-{
- spa_t *spa = mg->mg_vd->vdev_spa;
- metaslab_t *msp;
- avl_tree_t *t = &mg->mg_metaslab_tree;
- int m = 0;
-
- if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
- taskq_wait(mg->mg_taskq);
- return;
- }
-
- mutex_enter(&mg->mg_lock);
-
- /*
- * Load the next potential metaslabs
- */
- for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
- ASSERT3P(msp->ms_group, ==, mg);
-
- /*
- * We preload only the maximum number of metaslabs specified
- * by metaslab_preload_limit. If a metaslab is being forced
- * to condense then we preload it too. This will ensure
- * that force condensing happens in the next txg.
- */
- if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
- continue;
- }
-
- VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
- msp, TQ_SLEEP) != 0);
- }
- mutex_exit(&mg->mg_lock);
-}
-
-/*
- * Determine if the space map's on-disk footprint is past our tolerance
- * for inefficiency. We would like to use the following criteria to make
- * our decision:
- *
- * 1. The size of the space map object should not dramatically increase as a
- * result of writing out the free space range tree.
- *
- * 2. The minimal on-disk space map representation is zfs_condense_pct/100
- * times the size than the free space range tree representation
- * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
- *
- * 3. The on-disk size of the space map should actually decrease.
- *
- * Unfortunately, we cannot compute the on-disk size of the space map in this
- * context because we cannot accurately compute the effects of compression, etc.
- * Instead, we apply the heuristic described in the block comment for
- * zfs_metaslab_condense_block_threshold - we only condense if the space used
- * is greater than a threshold number of blocks.
- */
-static boolean_t
-metaslab_should_condense(metaslab_t *msp)
-{
- space_map_t *sm = msp->ms_sm;
- vdev_t *vd = msp->ms_group->mg_vd;
- uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
- uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(msp->ms_loaded);
-
- /*
- * Allocations and frees in early passes are generally more space
- * efficient (in terms of blocks described in space map entries)
- * than the ones in later passes (e.g. we don't compress after
- * sync pass 5) and condensing a metaslab multiple times in a txg
- * could degrade performance.
- *
- * Thus we prefer condensing each metaslab at most once every txg at
- * the earliest sync pass possible. If a metaslab is eligible for
- * condensing again after being considered for condensing within the
- * same txg, it will hopefully be dirty in the next txg where it will
- * be condensed at an earlier pass.
- */
- if (msp->ms_condense_checked_txg == current_txg)
- return (B_FALSE);
- msp->ms_condense_checked_txg = current_txg;
-
- /*
- * We always condense metaslabs that are empty and metaslabs for
- * which a condense request has been made.
- */
- if (avl_is_empty(&msp->ms_allocatable_by_size) ||
- msp->ms_condense_wanted)
- return (B_TRUE);
-
- uint64_t object_size = space_map_length(msp->ms_sm);
- uint64_t optimal_size = space_map_estimate_optimal_size(sm,
- msp->ms_allocatable, SM_NO_VDEVID);
-
- dmu_object_info_t doi;
- dmu_object_info_from_db(sm->sm_dbuf, &doi);
- uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
-
- return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
- object_size > zfs_metaslab_condense_block_threshold * record_size);
-}
-
-/*
- * Condense the on-disk space map representation to its minimized form.
- * The minimized form consists of a small number of allocations followed by
- * the entries of the free range tree.
- */
-static void
-metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
-{
- range_tree_t *condense_tree;
- space_map_t *sm = msp->ms_sm;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(msp->ms_loaded);
-
- zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
- "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
- msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
- msp->ms_group->mg_vd->vdev_spa->spa_name,
- space_map_length(msp->ms_sm),
- avl_numnodes(&msp->ms_allocatable->rt_root),
- msp->ms_condense_wanted ? "TRUE" : "FALSE");
-
- msp->ms_condense_wanted = B_FALSE;
-
- /*
- * Create an range tree that is 100% allocated. We remove segments
- * that have been freed in this txg, any deferred frees that exist,
- * and any allocation in the future. Removing segments should be
- * a relatively inexpensive operation since we expect these trees to
- * have a small number of nodes.
- */
- condense_tree = range_tree_create(NULL, NULL);
- range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
-
- range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
- range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
-
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defer[t],
- range_tree_remove, condense_tree);
- }
-
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
- range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
- range_tree_remove, condense_tree);
- }
-
- /*
- * We're about to drop the metaslab's lock thus allowing
- * other consumers to change it's content. Set the
- * metaslab's ms_condensing flag to ensure that
- * allocations on this metaslab do not occur while we're
- * in the middle of committing it to disk. This is only critical
- * for ms_allocatable as all other range trees use per txg
- * views of their content.
- */
- msp->ms_condensing = B_TRUE;
-
- mutex_exit(&msp->ms_lock);
- space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
-
- /*
- * While we would ideally like to create a space map representation
- * that consists only of allocation records, doing so can be
- * prohibitively expensive because the in-core free tree can be
- * large, and therefore computationally expensive to subtract
- * from the condense_tree. Instead we sync out two trees, a cheap
- * allocation only tree followed by the in-core free tree. While not
- * optimal, this is typically close to optimal, and much cheaper to
- * compute.
- */
- space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
- range_tree_vacate(condense_tree, NULL, NULL);
- range_tree_destroy(condense_tree);
-
- space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
- mutex_enter(&msp->ms_lock);
- msp->ms_condensing = B_FALSE;
-}
-
-/*
- * Write a metaslab to disk in the context of the specified transaction group.
- */
-void
-metaslab_sync(metaslab_t *msp, uint64_t txg)
-{
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa_meta_objset(spa);
- range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
- dmu_tx_t *tx;
- uint64_t object = space_map_object(msp->ms_sm);
-
- ASSERT(!vd->vdev_ishole);
-
- /*
- * This metaslab has just been added so there's no work to do now.
- */
- if (msp->ms_freeing == NULL) {
- ASSERT3P(alloctree, ==, NULL);
- return;
- }
-
- ASSERT3P(alloctree, !=, NULL);
- ASSERT3P(msp->ms_freeing, !=, NULL);
- ASSERT3P(msp->ms_freed, !=, NULL);
- ASSERT3P(msp->ms_checkpointing, !=, NULL);
-
- /*
- * Normally, we don't want to process a metaslab if there are no
- * allocations or frees to perform. However, if the metaslab is being
- * forced to condense and it's loaded, we need to let it through.
- */
- if (range_tree_is_empty(alloctree) &&
- range_tree_is_empty(msp->ms_freeing) &&
- range_tree_is_empty(msp->ms_checkpointing) &&
- !(msp->ms_loaded && msp->ms_condense_wanted))
- return;
-
-
- VERIFY(txg <= spa_final_dirty_txg(spa));
-
- /*
- * The only state that can actually be changing concurrently
- * with metaslab_sync() is the metaslab's ms_allocatable. No
- * other thread can be modifying this txg's alloc, freeing,
- * freed, or space_map_phys_t. We drop ms_lock whenever we
- * could call into the DMU, because the DMU can call down to
- * us (e.g. via zio_free()) at any time.
- *
- * The spa_vdev_remove_thread() can be reading metaslab state
- * concurrently, and it is locked out by the ms_sync_lock.
- * Note that the ms_lock is insufficient for this, because it
- * is dropped by space_map_write().
- */
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
-
- if (msp->ms_sm == NULL) {
- uint64_t new_object;
-
- new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
- VERIFY3U(new_object, !=, 0);
-
- VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
- msp->ms_start, msp->ms_size, vd->vdev_ashift));
-
- ASSERT(msp->ms_sm != NULL);
- ASSERT0(metaslab_allocated_space(msp));
- }
-
- if (!range_tree_is_empty(msp->ms_checkpointing) &&
- vd->vdev_checkpoint_sm == NULL) {
- ASSERT(spa_has_checkpoint(spa));
-
- uint64_t new_object = space_map_alloc(mos,
- vdev_standard_sm_blksz, tx);
- VERIFY3U(new_object, !=, 0);
-
- VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
- mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
- ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
-
- /*
- * We save the space map object as an entry in vdev_top_zap
- * so it can be retrieved when the pool is reopened after an
- * export or through zdb.
- */
- VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
- vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
- sizeof (new_object), 1, &new_object, tx));
- }
-
- mutex_enter(&msp->ms_sync_lock);
- mutex_enter(&msp->ms_lock);
-
- /*
- * Note: metaslab_condense() clears the space map's histogram.
- * Therefore we must verify and remove this histogram before
- * condensing.
- */
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
- metaslab_group_histogram_remove(mg, msp);
-
- if (msp->ms_loaded && metaslab_should_condense(msp)) {
- metaslab_condense(msp, txg, tx);
- } else {
- mutex_exit(&msp->ms_lock);
- space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
- SM_NO_VDEVID, tx);
- space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
- SM_NO_VDEVID, tx);
- mutex_enter(&msp->ms_lock);
- }
-
- msp->ms_allocated_space += range_tree_space(alloctree);
- ASSERT3U(msp->ms_allocated_space, >=,
- range_tree_space(msp->ms_freeing));
- msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
-
- if (!range_tree_is_empty(msp->ms_checkpointing)) {
- ASSERT(spa_has_checkpoint(spa));
- ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
-
- /*
- * Since we are doing writes to disk and the ms_checkpointing
- * tree won't be changing during that time, we drop the
- * ms_lock while writing to the checkpoint space map.
- */
- mutex_exit(&msp->ms_lock);
- space_map_write(vd->vdev_checkpoint_sm,
- msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
- mutex_enter(&msp->ms_lock);
-
- spa->spa_checkpoint_info.sci_dspace +=
- range_tree_space(msp->ms_checkpointing);
- vd->vdev_stat.vs_checkpoint_space +=
- range_tree_space(msp->ms_checkpointing);
- ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
- -space_map_allocated(vd->vdev_checkpoint_sm));
-
- range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
- }
-
- if (msp->ms_loaded) {
- /*
- * When the space map is loaded, we have an accurate
- * histogram in the range tree. This gives us an opportunity
- * to bring the space map's histogram up-to-date so we clear
- * it first before updating it.
- */
- space_map_histogram_clear(msp->ms_sm);
- space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
-
- /*
- * Since we've cleared the histogram we need to add back
- * any free space that has already been processed, plus
- * any deferred space. This allows the on-disk histogram
- * to accurately reflect all free space even if some space
- * is not yet available for allocation (i.e. deferred).
- */
- space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
-
- /*
- * Add back any deferred free space that has not been
- * added back into the in-core free tree yet. This will
- * ensure that we don't end up with a space map histogram
- * that is completely empty unless the metaslab is fully
- * allocated.
- */
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- space_map_histogram_add(msp->ms_sm,
- msp->ms_defer[t], tx);
- }
- }
-
- /*
- * Always add the free space from this sync pass to the space
- * map histogram. We want to make sure that the on-disk histogram
- * accounts for all free space. If the space map is not loaded,
- * then we will lose some accuracy but will correct it the next
- * time we load the space map.
- */
- space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
- metaslab_aux_histograms_update(msp);
-
- metaslab_group_histogram_add(mg, msp);
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- /*
- * For sync pass 1, we avoid traversing this txg's free range tree
- * and instead will just swap the pointers for freeing and freed.
- * We can safely do this since the freed_tree is guaranteed to be
- * empty on the initial pass.
- */
- if (spa_sync_pass(spa) == 1) {
- range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
- ASSERT0(msp->ms_allocated_this_txg);
- } else {
- range_tree_vacate(msp->ms_freeing,
- range_tree_add, msp->ms_freed);
- }
- msp->ms_allocated_this_txg += range_tree_space(alloctree);
- range_tree_vacate(alloctree, NULL, NULL);
-
- ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
- & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_freeing));
- ASSERT0(range_tree_space(msp->ms_checkpointing));
-
- mutex_exit(&msp->ms_lock);
-
- if (object != space_map_object(msp->ms_sm)) {
- object = space_map_object(msp->ms_sm);
- dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
- msp->ms_id, sizeof (uint64_t), &object, tx);
- }
- mutex_exit(&msp->ms_sync_lock);
- dmu_tx_commit(tx);
-}
-
-/*
- * Called after a transaction group has completely synced to mark
- * all of the metaslab's free space as usable.
- */
-void
-metaslab_sync_done(metaslab_t *msp, uint64_t txg)
-{
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
- spa_t *spa = vd->vdev_spa;
- range_tree_t **defer_tree;
- int64_t alloc_delta, defer_delta;
- boolean_t defer_allowed = B_TRUE;
-
- ASSERT(!vd->vdev_ishole);
-
- mutex_enter(&msp->ms_lock);
-
- /*
- * If this metaslab is just becoming available, initialize its
- * range trees and add its capacity to the vdev.
- */
- if (msp->ms_freed == NULL) {
- for (int t = 0; t < TXG_SIZE; t++) {
- ASSERT(msp->ms_allocating[t] == NULL);
-
- msp->ms_allocating[t] = range_tree_create(NULL, NULL);
- }
-
- ASSERT3P(msp->ms_freeing, ==, NULL);
- msp->ms_freeing = range_tree_create(NULL, NULL);
-
- ASSERT3P(msp->ms_freed, ==, NULL);
- msp->ms_freed = range_tree_create(NULL, NULL);
-
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- ASSERT(msp->ms_defer[t] == NULL);
-
- msp->ms_defer[t] = range_tree_create(NULL, NULL);
- }
-
- ASSERT3P(msp->ms_checkpointing, ==, NULL);
- msp->ms_checkpointing = range_tree_create(NULL, NULL);
-
- metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
- }
- ASSERT0(range_tree_space(msp->ms_freeing));
- ASSERT0(range_tree_space(msp->ms_checkpointing));
-
- defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
-
- uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
- metaslab_class_get_alloc(spa_normal_class(spa));
- if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
- defer_allowed = B_FALSE;
- }
-
- defer_delta = 0;
- alloc_delta = msp->ms_allocated_this_txg -
- range_tree_space(msp->ms_freed);
- if (defer_allowed) {
- defer_delta = range_tree_space(msp->ms_freed) -
- range_tree_space(*defer_tree);
- } else {
- defer_delta -= range_tree_space(*defer_tree);
- }
-
- metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
- defer_delta, 0);
-
- /*
- * If there's a metaslab_load() in progress, wait for it to complete
- * so that we have a consistent view of the in-core space map.
- */
- metaslab_load_wait(msp);
-
- /*
- * Move the frees from the defer_tree back to the free
- * range tree (if it's loaded). Swap the freed_tree and
- * the defer_tree -- this is safe to do because we've
- * just emptied out the defer_tree.
- */
- range_tree_vacate(*defer_tree,
- msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
- if (defer_allowed) {
- range_tree_swap(&msp->ms_freed, defer_tree);
- } else {
- range_tree_vacate(msp->ms_freed,
- msp->ms_loaded ? range_tree_add : NULL,
- msp->ms_allocatable);
- }
-
- msp->ms_synced_length = space_map_length(msp->ms_sm);
-
- msp->ms_deferspace += defer_delta;
- ASSERT3S(msp->ms_deferspace, >=, 0);
- ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
- if (msp->ms_deferspace != 0) {
- /*
- * Keep syncing this metaslab until all deferred frees
- * are back in circulation.
- */
- vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
- }
- metaslab_aux_histograms_update_done(msp, defer_allowed);
-
- if (msp->ms_new) {
- msp->ms_new = B_FALSE;
- mutex_enter(&mg->mg_lock);
- mg->mg_ms_ready++;
- mutex_exit(&mg->mg_lock);
- }
-
- /*
- * Re-sort metaslab within its group now that we've adjusted
- * its allocatable space.
- */
- metaslab_recalculate_weight_and_sort(msp);
-
- /*
- * If the metaslab is loaded and we've not tried to load or allocate
- * from it in 'metaslab_unload_delay' txgs, then unload it.
- */
- if (msp->ms_loaded &&
- msp->ms_initializing == 0 &&
- msp->ms_selected_txg + metaslab_unload_delay < txg) {
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
- VERIFY0(range_tree_space(
- msp->ms_allocating[(txg + t) & TXG_MASK]));
- }
- if (msp->ms_allocator != -1) {
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
- }
-
- if (!metaslab_debug_unload)
- metaslab_unload(msp);
- }
-
- ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_freeing));
- ASSERT0(range_tree_space(msp->ms_freed));
- ASSERT0(range_tree_space(msp->ms_checkpointing));
-
- msp->ms_allocated_this_txg = 0;
- mutex_exit(&msp->ms_lock);
-}
-
-void
-metaslab_sync_reassess(metaslab_group_t *mg)
-{
- spa_t *spa = mg->mg_class->mc_spa;
-
- spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
- metaslab_group_alloc_update(mg);
- mg->mg_fragmentation = metaslab_group_fragmentation(mg);
-
- /*
- * Preload the next potential metaslabs but only on active
- * metaslab groups. We can get into a state where the metaslab
- * is no longer active since we dirty metaslabs as we remove a
- * a device, thus potentially making the metaslab group eligible
- * for preloading.
- */
- if (mg->mg_activation_count > 0) {
- metaslab_group_preload(mg);
- }
- spa_config_exit(spa, SCL_ALLOC, FTAG);
-}
-
-/*
- * When writing a ditto block (i.e. more than one DVA for a given BP) on
- * the same vdev as an existing DVA of this BP, then try to allocate it
- * on a different metaslab than existing DVAs (i.e. a unique metaslab).
- */
-static boolean_t
-metaslab_is_unique(metaslab_t *msp, dva_t *dva)
-{
- uint64_t dva_ms_id;
-
- if (DVA_GET_ASIZE(dva) == 0)
- return (B_TRUE);
-
- if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
- return (B_TRUE);
-
- dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
-
- return (msp->ms_id != dva_ms_id);
-}
-
-/*
- * ==========================================================================
- * Metaslab allocation tracing facility
- * ==========================================================================
- */
-#ifdef _METASLAB_TRACING
-kstat_t *metaslab_trace_ksp;
-kstat_named_t metaslab_trace_over_limit;
-
-void
-metaslab_alloc_trace_init(void)
-{
- ASSERT(metaslab_alloc_trace_cache == NULL);
- metaslab_alloc_trace_cache = kmem_cache_create(
- "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
- metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
- "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
- if (metaslab_trace_ksp != NULL) {
- metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
- kstat_named_init(&metaslab_trace_over_limit,
- "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
- kstat_install(metaslab_trace_ksp);
- }
-}
-
-void
-metaslab_alloc_trace_fini(void)
-{
- if (metaslab_trace_ksp != NULL) {
- kstat_delete(metaslab_trace_ksp);
- metaslab_trace_ksp = NULL;
- }
- kmem_cache_destroy(metaslab_alloc_trace_cache);
- metaslab_alloc_trace_cache = NULL;
-}
-
-/*
- * Add an allocation trace element to the allocation tracing list.
- */
-static void
-metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
- metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
- int allocator)
-{
- if (!metaslab_trace_enabled)
- return;
-
- /*
- * When the tracing list reaches its maximum we remove
- * the second element in the list before adding a new one.
- * By removing the second element we preserve the original
- * entry as a clue to what allocations steps have already been
- * performed.
- */
- if (zal->zal_size == metaslab_trace_max_entries) {
- metaslab_alloc_trace_t *mat_next;
-#ifdef DEBUG
- panic("too many entries in allocation list");
-#endif
- atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
- zal->zal_size--;
- mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
- list_remove(&zal->zal_list, mat_next);
- kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
- }
-
- metaslab_alloc_trace_t *mat =
- kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
- list_link_init(&mat->mat_list_node);
- mat->mat_mg = mg;
- mat->mat_msp = msp;
- mat->mat_size = psize;
- mat->mat_dva_id = dva_id;
- mat->mat_offset = offset;
- mat->mat_weight = 0;
- mat->mat_allocator = allocator;
-
- if (msp != NULL)
- mat->mat_weight = msp->ms_weight;
-
- /*
- * The list is part of the zio so locking is not required. Only
- * a single thread will perform allocations for a given zio.
- */
- list_insert_tail(&zal->zal_list, mat);
- zal->zal_size++;
-
- ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
-}
-
-void
-metaslab_trace_init(zio_alloc_list_t *zal)
-{
- list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
- offsetof(metaslab_alloc_trace_t, mat_list_node));
- zal->zal_size = 0;
-}
-
-void
-metaslab_trace_fini(zio_alloc_list_t *zal)
-{
- metaslab_alloc_trace_t *mat;
-
- while ((mat = list_remove_head(&zal->zal_list)) != NULL)
- kmem_cache_free(metaslab_alloc_trace_cache, mat);
- list_destroy(&zal->zal_list);
- zal->zal_size = 0;
-}
-
-#else
-
-#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
-
-void
-metaslab_alloc_trace_init(void)
-{
-}
-
-void
-metaslab_alloc_trace_fini(void)
-{
-}
-
-void
-metaslab_trace_init(zio_alloc_list_t *zal)
-{
-}
-
-void
-metaslab_trace_fini(zio_alloc_list_t *zal)
-{
-}
-
-#endif /* _METASLAB_TRACING */
-
-/*
- * ==========================================================================
- * Metaslab block operations
- * ==========================================================================
- */
-
-static void
-metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
- int allocator)
-{
- if (!(flags & METASLAB_ASYNC_ALLOC) ||
- (flags & METASLAB_DONT_THROTTLE))
- return;
-
- metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
- if (!mg->mg_class->mc_alloc_throttle_enabled)
- return;
-
- (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
-}
-
-static void
-metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
-{
- uint64_t max = mg->mg_max_alloc_queue_depth;
- uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
- while (cur < max) {
- if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
- cur, cur + 1) == cur) {
- atomic_inc_64(
- &mg->mg_class->mc_alloc_max_slots[allocator]);
- return;
- }
- cur = mg->mg_cur_max_alloc_queue_depth[allocator];
- }
-}
-
-void
-metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
- int allocator, boolean_t io_complete)
-{
- if (!(flags & METASLAB_ASYNC_ALLOC) ||
- (flags & METASLAB_DONT_THROTTLE))
- return;
-
- metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
- if (!mg->mg_class->mc_alloc_throttle_enabled)
- return;
-
- (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
- if (io_complete)
- metaslab_group_increment_qdepth(mg, allocator);
-}
-
-void
-metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
- int allocator)
-{
-#ifdef ZFS_DEBUG
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
-
- for (int d = 0; d < ndvas; d++) {
- uint64_t vdev = DVA_GET_VDEV(&dva[d]);
- metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
- VERIFY(zfs_refcount_not_held(
- &mg->mg_alloc_queue_depth[allocator], tag));
- }
-#endif
-}
-
-static uint64_t
-metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
-{
- uint64_t start;
- range_tree_t *rt = msp->ms_allocatable;
- metaslab_class_t *mc = msp->ms_group->mg_class;
-
- VERIFY(!msp->ms_condensing);
- VERIFY0(msp->ms_initializing);
-
- start = mc->mc_ops->msop_alloc(msp, size);
- if (start != -1ULL) {
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
-
- VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
- range_tree_remove(rt, start, size);
-
- if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
- vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
-
- range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
-
- /* Track the last successful allocation */
- msp->ms_alloc_txg = txg;
- metaslab_verify_space(msp, txg);
- }
-
- /*
- * Now that we've attempted the allocation we need to update the
- * metaslab's maximum block size since it may have changed.
- */
- msp->ms_max_size = metaslab_block_maxsize(msp);
- return (start);
-}
-
-/*
- * Find the metaslab with the highest weight that is less than what we've
- * already tried. In the common case, this means that we will examine each
- * metaslab at most once. Note that concurrent callers could reorder metaslabs
- * by activation/passivation once we have dropped the mg_lock. If a metaslab is
- * activated by another thread, and we fail to allocate from the metaslab we
- * have selected, we may not try the newly-activated metaslab, and instead
- * activate another metaslab. This is not optimal, but generally does not cause
- * any problems (a possible exception being if every metaslab is completely full
- * except for the the newly-activated metaslab which we fail to examine).
- */
-static metaslab_t *
-find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
- dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
- zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
-{
- avl_index_t idx;
- avl_tree_t *t = &mg->mg_metaslab_tree;
- metaslab_t *msp = avl_find(t, search, &idx);
- if (msp == NULL)
- msp = avl_nearest(t, idx, AVL_AFTER);
-
- for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
- int i;
- if (!metaslab_should_allocate(msp, asize)) {
- metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_TOO_SMALL, allocator);
- continue;
- }
-
- /*
- * If the selected metaslab is condensing or being
- * initialized, skip it.
- */
- if (msp->ms_condensing || msp->ms_initializing > 0)
- continue;
-
- *was_active = msp->ms_allocator != -1;
- /*
- * If we're activating as primary, this is our first allocation
- * from this disk, so we don't need to check how close we are.
- * If the metaslab under consideration was already active,
- * we're getting desperate enough to steal another allocator's
- * metaslab, so we still don't care about distances.
- */
- if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
- break;
-
- for (i = 0; i < d; i++) {
- if (want_unique &&
- !metaslab_is_unique(msp, &dva[i]))
- break; /* try another metaslab */
- }
- if (i == d)
- break;
- }
-
- if (msp != NULL) {
- search->ms_weight = msp->ms_weight;
- search->ms_start = msp->ms_start + 1;
- search->ms_allocator = msp->ms_allocator;
- search->ms_primary = msp->ms_primary;
- }
- return (msp);
-}
-
-/* ARGSUSED */
-static uint64_t
-metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
- int d, int allocator)
-{
- metaslab_t *msp = NULL;
- uint64_t offset = -1ULL;
- uint64_t activation_weight;
-
- activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (int i = 0; i < d; i++) {
- if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
- DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
- activation_weight = METASLAB_WEIGHT_SECONDARY;
- } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
- DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
- activation_weight = METASLAB_WEIGHT_CLAIM;
- break;
- }
- }
-
- /*
- * If we don't have enough metaslabs active to fill the entire array, we
- * just use the 0th slot.
- */
- if (mg->mg_ms_ready < mg->mg_allocators * 3)
- allocator = 0;
-
- ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
-
- metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
- search->ms_weight = UINT64_MAX;
- search->ms_start = 0;
- /*
- * At the end of the metaslab tree are the already-active metaslabs,
- * first the primaries, then the secondaries. When we resume searching
- * through the tree, we need to consider ms_allocator and ms_primary so
- * we start in the location right after where we left off, and don't
- * accidentally loop forever considering the same metaslabs.
- */
- search->ms_allocator = -1;
- search->ms_primary = B_TRUE;
- for (;;) {
- boolean_t was_active = B_FALSE;
-
- mutex_enter(&mg->mg_lock);
-
- if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
- mg->mg_primaries[allocator] != NULL) {
- msp = mg->mg_primaries[allocator];
- was_active = B_TRUE;
- } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
- mg->mg_secondaries[allocator] != NULL) {
- msp = mg->mg_secondaries[allocator];
- was_active = B_TRUE;
- } else {
- msp = find_valid_metaslab(mg, activation_weight, dva, d,
- want_unique, asize, allocator, zal, search,
- &was_active);
- }
-
- mutex_exit(&mg->mg_lock);
- if (msp == NULL) {
- kmem_free(search, sizeof (*search));
- return (-1ULL);
- }
-
- mutex_enter(&msp->ms_lock);
- /*
- * Ensure that the metaslab we have selected is still
- * capable of handling our request. It's possible that
- * another thread may have changed the weight while we
- * were blocked on the metaslab lock. We check the
- * active status first to see if we need to reselect
- * a new metaslab.
- */
- if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- /*
- * If the metaslab is freshly activated for an allocator that
- * isn't the one we're allocating from, or if it's a primary and
- * we're seeking a secondary (or vice versa), we go back and
- * select a new metaslab.
- */
- if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
- (msp->ms_allocator != -1) &&
- (msp->ms_allocator != allocator || ((activation_weight ==
- METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
- activation_weight != METASLAB_WEIGHT_CLAIM) {
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_WEIGHT_CLAIM);
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if (metaslab_activate(msp, allocator, activation_weight) != 0) {
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- msp->ms_selected_txg = txg;
-
- /*
- * Now that we have the lock, recheck to see if we should
- * continue to use this metaslab for this allocation. The
- * the metaslab is now loaded so metaslab_should_allocate() can
- * accurately determine if the allocation attempt should
- * proceed.
- */
- if (!metaslab_should_allocate(msp, asize)) {
- /* Passivate this metaslab and select a new one. */
- metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_TOO_SMALL, allocator);
- goto next;
- }
-
- /*
- * If this metaslab is currently condensing then pick again as
- * we can't manipulate this metaslab until it's committed
- * to disk. If this metaslab is being initialized, we shouldn't
- * allocate from it since the allocated region might be
- * overwritten after allocation.
- */
- if (msp->ms_condensing) {
- metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_CONDENSING, allocator);
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
- mutex_exit(&msp->ms_lock);
- continue;
- } else if (msp->ms_initializing > 0) {
- metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_INITIALIZING, allocator);
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- offset = metaslab_block_alloc(msp, asize, txg);
- metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
-
- if (offset != -1ULL) {
- /* Proactively passivate the metaslab, if needed */
- metaslab_segment_may_passivate(msp);
- break;
- }
-next:
- ASSERT(msp->ms_loaded);
-
- /*
- * We were unable to allocate from this metaslab so determine
- * a new weight for this metaslab. Now that we have loaded
- * the metaslab we can provide a better hint to the metaslab
- * selector.
- *
- * For space-based metaslabs, we use the maximum block size.
- * This information is only available when the metaslab
- * is loaded and is more accurate than the generic free
- * space weight that was calculated by metaslab_weight().
- * This information allows us to quickly compare the maximum
- * available allocation in the metaslab to the allocation
- * size being requested.
- *
- * For segment-based metaslabs, determine the new weight
- * based on the highest bucket in the range tree. We
- * explicitly use the loaded segment weight (i.e. the range
- * tree histogram) since it contains the space that is
- * currently available for allocation and is accurate
- * even within a sync pass.
- */
- if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
- uint64_t weight = metaslab_block_maxsize(msp);
- WEIGHT_SET_SPACEBASED(weight);
- metaslab_passivate(msp, weight);
- } else {
- metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
- }
-
- /*
- * We have just failed an allocation attempt, check
- * that metaslab_should_allocate() agrees. Otherwise,
- * we may end up in an infinite loop retrying the same
- * metaslab.
- */
- ASSERT(!metaslab_should_allocate(msp, asize));
-
- mutex_exit(&msp->ms_lock);
- }
- mutex_exit(&msp->ms_lock);
- kmem_free(search, sizeof (*search));
- return (offset);
-}
-
-static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
- int d, int allocator)
-{
- uint64_t offset;
- ASSERT(mg->mg_initialized);
-
- offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
- dva, d, allocator);
-
- mutex_enter(&mg->mg_lock);
- if (offset == -1ULL) {
- mg->mg_failed_allocations++;
- metaslab_trace_add(zal, mg, NULL, asize, d,
- TRACE_GROUP_FAILURE, allocator);
- if (asize == SPA_GANGBLOCKSIZE) {
- /*
- * This metaslab group was unable to allocate
- * the minimum gang block size so it must be out of
- * space. We must notify the allocation throttle
- * to start skipping allocation attempts to this
- * metaslab group until more space becomes available.
- * Note: this failure cannot be caused by the
- * allocation throttle since the allocation throttle
- * is only responsible for skipping devices and
- * not failing block allocations.
- */
- mg->mg_no_free_space = B_TRUE;
- }
- }
- mg->mg_allocations++;
- mutex_exit(&mg->mg_lock);
- return (offset);
-}
-
-/*
- * Allocate a block for the specified i/o.
- */
-int
-metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
- dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
- zio_alloc_list_t *zal, int allocator)
-{
- metaslab_group_t *mg, *rotor;
- vdev_t *vd;
- boolean_t try_hard = B_FALSE;
-
- ASSERT(!DVA_IS_VALID(&dva[d]));
-
- /*
- * For testing, make some blocks above a certain size be gang blocks.
- * This will also test spilling from special to normal.
- */
- if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
- metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
- allocator);
- return (SET_ERROR(ENOSPC));
- }
-
- /*
- * Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_aliquot because
- * nothing actually breaks if we miss a few updates -- we just won't
- * allocate quite as evenly. It all balances out over time.
- *
- * If we are doing ditto or log blocks, try to spread them across
- * consecutive vdevs. If we're forced to reuse a vdev before we've
- * allocated all of our ditto blocks, then try and spread them out on
- * that vdev as much as possible. If it turns out to not be possible,
- * gradually lower our standards until anything becomes acceptable.
- * Also, allocating on consecutive vdevs (as opposed to random vdevs)
- * gives us hope of containing our fault domains to something we're
- * able to reason about. Otherwise, any two top-level vdev failures
- * will guarantee the loss of data. With consecutive allocation,
- * only two adjacent top-level vdev failures will result in data loss.
- *
- * If we are doing gang blocks (hintdva is non-NULL), try to keep
- * ourselves on the same vdev as our gang block header. That
- * way, we can hope for locality in vdev_cache, plus it makes our
- * fault domains something tractable.
- */
- if (hintdva) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
-
- /*
- * It's possible the vdev we're using as the hint no
- * longer exists or its mg has been closed (e.g. by
- * device removal). Consult the rotor when
- * all else fails.
- */
- if (vd != NULL && vd->vdev_mg != NULL) {
- mg = vd->vdev_mg;
-
- if (flags & METASLAB_HINTBP_AVOID &&
- mg->mg_next != NULL)
- mg = mg->mg_next;
- } else {
- mg = mc->mc_rotor;
- }
- } else if (d != 0) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
- mg = vd->vdev_mg->mg_next;
- } else {
- ASSERT(mc->mc_rotor != NULL);
- mg = mc->mc_rotor;
- }
-
- /*
- * If the hint put us into the wrong metaslab class, or into a
- * metaslab group that has been passivated, just follow the rotor.
- */
- if (mg->mg_class != mc || mg->mg_activation_count <= 0)
- mg = mc->mc_rotor;
-
- rotor = mg;
-top:
- do {
- boolean_t allocatable;
-
- ASSERT(mg->mg_activation_count == 1);
- vd = mg->mg_vd;
-
- /*
- * Don't allocate from faulted devices.
- */
- if (try_hard) {
- spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
- allocatable = vdev_allocatable(vd);
- spa_config_exit(spa, SCL_ZIO, FTAG);
- } else {
- allocatable = vdev_allocatable(vd);
- }
-
- /*
- * Determine if the selected metaslab group is eligible
- * for allocations. If we're ganging then don't allow
- * this metaslab group to skip allocations since that would
- * inadvertently return ENOSPC and suspend the pool
- * even though space is still available.
- */
- if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
- allocatable = metaslab_group_allocatable(mg, rotor,
- psize, allocator, d);
- }
-
- if (!allocatable) {
- metaslab_trace_add(zal, mg, NULL, psize, d,
- TRACE_NOT_ALLOCATABLE, allocator);
- goto next;
- }
-
- ASSERT(mg->mg_initialized);
-
- /*
- * Avoid writing single-copy data to a failing,
- * non-redundant vdev, unless we've already tried all
- * other vdevs.
- */
- if ((vd->vdev_stat.vs_write_errors > 0 ||
- vd->vdev_state < VDEV_STATE_HEALTHY) &&
- d == 0 && !try_hard && vd->vdev_children == 0) {
- metaslab_trace_add(zal, mg, NULL, psize, d,
- TRACE_VDEV_ERROR, allocator);
- goto next;
- }
-
- ASSERT(mg->mg_class == mc);
-
- uint64_t asize = vdev_psize_to_asize(vd, psize);
- ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
-
- /*
- * If we don't need to try hard, then require that the
- * block be on an different metaslab from any other DVAs
- * in this BP (unique=true). If we are trying hard, then
- * allow any metaslab to be used (unique=false).
- */
- uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- !try_hard, dva, d, allocator);
-
- if (offset != -1ULL) {
- /*
- * If we've just selected this metaslab group,
- * figure out whether the corresponding vdev is
- * over- or under-used relative to the pool,
- * and set an allocation bias to even it out.
- */
- if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
- vdev_stat_t *vs = &vd->vdev_stat;
- int64_t vu, cu;
-
- vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
- cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
-
- /*
- * Calculate how much more or less we should
- * try to allocate from this device during
- * this iteration around the rotor.
- * For example, if a device is 80% full
- * and the pool is 20% full then we should
- * reduce allocations by 60% on this device.
- *
- * mg_bias = (20 - 80) * 512K / 100 = -307K
- *
- * This reduces allocations by 307K for this
- * iteration.
- */
- mg->mg_bias = ((cu - vu) *
- (int64_t)mg->mg_aliquot) / 100;
- } else if (!metaslab_bias_enabled) {
- mg->mg_bias = 0;
- }
-
- if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
- mg->mg_aliquot + mg->mg_bias) {
- mc->mc_rotor = mg->mg_next;
- mc->mc_aliquot = 0;
- }
-
- DVA_SET_VDEV(&dva[d], vd->vdev_id);
- DVA_SET_OFFSET(&dva[d], offset);
- DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
- DVA_SET_ASIZE(&dva[d], asize);
-
- return (0);
- }
-next:
- mc->mc_rotor = mg->mg_next;
- mc->mc_aliquot = 0;
- } while ((mg = mg->mg_next) != rotor);
-
- /*
- * If we haven't tried hard, do so now.
- */
- if (!try_hard) {
- try_hard = B_TRUE;
- goto top;
- }
-
- bzero(&dva[d], sizeof (dva_t));
-
- metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
- return (SET_ERROR(ENOSPC));
-}
-
-void
-metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
- boolean_t checkpoint)
-{
- metaslab_t *msp;
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(vdev_is_concrete(vd));
- ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
- ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- VERIFY(!msp->ms_condensing);
- VERIFY3U(offset, >=, msp->ms_start);
- VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
- VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
- VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
-
- metaslab_check_free_impl(vd, offset, asize);
-
- mutex_enter(&msp->ms_lock);
- if (range_tree_is_empty(msp->ms_freeing) &&
- range_tree_is_empty(msp->ms_checkpointing)) {
- vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
- }
-
- if (checkpoint) {
- ASSERT(spa_has_checkpoint(spa));
- range_tree_add(msp->ms_checkpointing, offset, asize);
- } else {
- range_tree_add(msp->ms_freeing, offset, asize);
- }
- mutex_exit(&msp->ms_lock);
-}
-
-/* ARGSUSED */
-void
-metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
- uint64_t size, void *arg)
-{
- boolean_t *checkpoint = arg;
-
- ASSERT3P(checkpoint, !=, NULL);
-
- if (vd->vdev_ops->vdev_op_remap != NULL)
- vdev_indirect_mark_obsolete(vd, offset, size);
- else
- metaslab_free_impl(vd, offset, size, *checkpoint);
-}
-
-static void
-metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
- boolean_t checkpoint)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
-
- if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
- return;
-
- if (spa->spa_vdev_removal != NULL &&
- spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
- vdev_is_concrete(vd)) {
- /*
- * Note: we check if the vdev is concrete because when
- * we complete the removal, we first change the vdev to be
- * an indirect vdev (in open context), and then (in syncing
- * context) clear spa_vdev_removal.
- */
- free_from_removing_vdev(vd, offset, size);
- } else if (vd->vdev_ops->vdev_op_remap != NULL) {
- vdev_indirect_mark_obsolete(vd, offset, size);
- vd->vdev_ops->vdev_op_remap(vd, offset, size,
- metaslab_free_impl_cb, &checkpoint);
- } else {
- metaslab_free_concrete(vd, offset, size, checkpoint);
- }
-}
-
-typedef struct remap_blkptr_cb_arg {
- blkptr_t *rbca_bp;
- spa_remap_cb_t rbca_cb;
- vdev_t *rbca_remap_vd;
- uint64_t rbca_remap_offset;
- void *rbca_cb_arg;
-} remap_blkptr_cb_arg_t;
-
-void
-remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
- uint64_t size, void *arg)
-{
- remap_blkptr_cb_arg_t *rbca = arg;
- blkptr_t *bp = rbca->rbca_bp;
-
- /* We can not remap split blocks. */
- if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
- return;
- ASSERT0(inner_offset);
-
- if (rbca->rbca_cb != NULL) {
- /*
- * At this point we know that we are not handling split
- * blocks and we invoke the callback on the previous
- * vdev which must be indirect.
- */
- ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
-
- rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
- rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
-
- /* set up remap_blkptr_cb_arg for the next call */
- rbca->rbca_remap_vd = vd;
- rbca->rbca_remap_offset = offset;
- }
-
- /*
- * The phys birth time is that of dva[0]. This ensures that we know
- * when each dva was written, so that resilver can determine which
- * blocks need to be scrubbed (i.e. those written during the time
- * the vdev was offline). It also ensures that the key used in
- * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
- * we didn't change the phys_birth, a lookup in the ARC for a
- * remapped BP could find the data that was previously stored at
- * this vdev + offset.
- */
- vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
- DVA_GET_VDEV(&bp->blk_dva[0]));
- vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
- bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
- DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
-
- DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
- DVA_SET_OFFSET(&bp->blk_dva[0], offset);
-}
-
-/*
- * If the block pointer contains any indirect DVAs, modify them to refer to
- * concrete DVAs. Note that this will sometimes not be possible, leaving
- * the indirect DVA in place. This happens if the indirect DVA spans multiple
- * segments in the mapping (i.e. it is a "split block").
- *
- * If the BP was remapped, calls the callback on the original dva (note the
- * callback can be called multiple times if the original indirect DVA refers
- * to another indirect DVA, etc).
- *
- * Returns TRUE if the BP was remapped.
- */
-boolean_t
-spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
-{
- remap_blkptr_cb_arg_t rbca;
-
- if (!zfs_remap_blkptr_enable)
- return (B_FALSE);
-
- if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
- return (B_FALSE);
-
- /*
- * Dedup BP's can not be remapped, because ddt_phys_select() depends
- * on DVA[0] being the same in the BP as in the DDT (dedup table).
- */
- if (BP_GET_DEDUP(bp))
- return (B_FALSE);
-
- /*
- * Gang blocks can not be remapped, because
- * zio_checksum_gang_verifier() depends on the DVA[0] that's in
- * the BP used to read the gang block header (GBH) being the same
- * as the DVA[0] that we allocated for the GBH.
- */
- if (BP_IS_GANG(bp))
- return (B_FALSE);
-
- /*
- * Embedded BP's have no DVA to remap.
- */
- if (BP_GET_NDVAS(bp) < 1)
- return (B_FALSE);
-
- /*
- * Note: we only remap dva[0]. If we remapped other dvas, we
- * would no longer know what their phys birth txg is.
- */
- dva_t *dva = &bp->blk_dva[0];
-
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
-
- if (vd->vdev_ops->vdev_op_remap == NULL)
- return (B_FALSE);
-
- rbca.rbca_bp = bp;
- rbca.rbca_cb = callback;
- rbca.rbca_remap_vd = vd;
- rbca.rbca_remap_offset = offset;
- rbca.rbca_cb_arg = arg;
-
- /*
- * remap_blkptr_cb() will be called in order for each level of
- * indirection, until a concrete vdev is reached or a split block is
- * encountered. old_vd and old_offset are updated within the callback
- * as we go from the one indirect vdev to the next one (either concrete
- * or indirect again) in that order.
- */
- vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
-
- /* Check if the DVA wasn't remapped because it is a split block */
- if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Undo the allocation of a DVA which happened in the given transaction group.
- */
-void
-metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
-{
- metaslab_t *msp;
- vdev_t *vd;
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
-
- ASSERT(DVA_IS_VALID(dva));
- ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
-
- if (txg > spa_freeze_txg(spa))
- return;
-
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
- (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
- cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
- (u_longlong_t)vdev, (u_longlong_t)offset);
- ASSERT(0);
- return;
- }
-
- ASSERT(!vd->vdev_removing);
- ASSERT(vdev_is_concrete(vd));
- ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
-
- if (DVA_GET_GANG(dva))
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- mutex_enter(&msp->ms_lock);
- range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
- offset, size);
-
- VERIFY(!msp->ms_condensing);
- VERIFY3U(offset, >=, msp->ms_start);
- VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
- VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
- msp->ms_size);
- VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- range_tree_add(msp->ms_allocatable, offset, size);
- mutex_exit(&msp->ms_lock);
-}
-
-/*
- * Free the block represented by the given DVA.
- */
-void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
-{
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd = vdev_lookup_top(spa, vdev);
-
- ASSERT(DVA_IS_VALID(dva));
- ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
-
- if (DVA_GET_GANG(dva)) {
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
- }
-
- metaslab_free_impl(vd, offset, size, checkpoint);
-}
-
-/*
- * Reserve some allocation slots. The reservation system must be called
- * before we call into the allocator. If there aren't any available slots
- * then the I/O will be throttled until an I/O completes and its slots are
- * freed up. The function returns true if it was successful in placing
- * the reservation.
- */
-boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
- zio_t *zio, int flags)
-{
- uint64_t available_slots = 0;
- boolean_t slot_reserved = B_FALSE;
- uint64_t max = mc->mc_alloc_max_slots[allocator];
-
- ASSERT(mc->mc_alloc_throttle_enabled);
- mutex_enter(&mc->mc_lock);
-
- uint64_t reserved_slots =
- zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
- if (reserved_slots < max)
- available_slots = max - reserved_slots;
-
- if (slots <= available_slots || GANG_ALLOCATION(flags) ||
- flags & METASLAB_MUST_RESERVE) {
- /*
- * We reserve the slots individually so that we can unreserve
- * them individually when an I/O completes.
- */
- for (int d = 0; d < slots; d++) {
- reserved_slots =
- zfs_refcount_add(&mc->mc_alloc_slots[allocator],
- zio);
- }
- zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
- slot_reserved = B_TRUE;
- }
-
- mutex_exit(&mc->mc_lock);
- return (slot_reserved);
-}
-
-void
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
- int allocator, zio_t *zio)
-{
- ASSERT(mc->mc_alloc_throttle_enabled);
- mutex_enter(&mc->mc_lock);
- for (int d = 0; d < slots; d++) {
- (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
- zio);
- }
- mutex_exit(&mc->mc_lock);
-}
-
-static int
-metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
- uint64_t txg)
-{
- metaslab_t *msp;
- spa_t *spa = vd->vdev_spa;
- int error = 0;
-
- if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
- return (ENXIO);
-
- ASSERT3P(vd->vdev_ms, !=, NULL);
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- mutex_enter(&msp->ms_lock);
-
- if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
- error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
- /*
- * No need to fail in that case; someone else has activated the
- * metaslab, but that doesn't preclude us from using it.
- */
- if (error == EBUSY)
- error = 0;
-
- if (error == 0 &&
- !range_tree_contains(msp->ms_allocatable, offset, size))
- error = SET_ERROR(ENOENT);
-
- if (error || txg == 0) { /* txg == 0 indicates dry run */
- mutex_exit(&msp->ms_lock);
- return (error);
- }
-
- VERIFY(!msp->ms_condensing);
- VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
- msp->ms_size);
- range_tree_remove(msp->ms_allocatable, offset, size);
-
- if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
- if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
- range_tree_add(msp->ms_allocating[txg & TXG_MASK],
- offset, size);
- }
-
- mutex_exit(&msp->ms_lock);
-
- return (0);
-}
-
-typedef struct metaslab_claim_cb_arg_t {
- uint64_t mcca_txg;
- int mcca_error;
-} metaslab_claim_cb_arg_t;
-
-/* ARGSUSED */
-static void
-metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
- uint64_t size, void *arg)
-{
- metaslab_claim_cb_arg_t *mcca_arg = arg;
-
- if (mcca_arg->mcca_error == 0) {
- mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
- size, mcca_arg->mcca_txg);
- }
-}
-
-int
-metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
-{
- if (vd->vdev_ops->vdev_op_remap != NULL) {
- metaslab_claim_cb_arg_t arg;
-
- /*
- * Only zdb(1M) can claim on indirect vdevs. This is used
- * to detect leaks of mapped space (that are not accounted
- * for in the obsolete counts, spacemap, or bpobj).
- */
- ASSERT(!spa_writeable(vd->vdev_spa));
- arg.mcca_error = 0;
- arg.mcca_txg = txg;
-
- vd->vdev_ops->vdev_op_remap(vd, offset, size,
- metaslab_claim_impl_cb, &arg);
-
- if (arg.mcca_error == 0) {
- arg.mcca_error = metaslab_claim_concrete(vd,
- offset, size, txg);
- }
- return (arg.mcca_error);
- } else {
- return (metaslab_claim_concrete(vd, offset, size, txg));
- }
-}
-
-/*
- * Intent log support: upon opening the pool after a crash, notify the SPA
- * of blocks that the intent log has allocated for immediate write, but
- * which are still considered free by the SPA because the last transaction
- * group didn't commit yet.
- */
-static int
-metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
-{
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd;
-
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
- return (SET_ERROR(ENXIO));
- }
-
- ASSERT(DVA_IS_VALID(dva));
-
- if (DVA_GET_GANG(dva))
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- return (metaslab_claim_impl(vd, offset, size, txg));
-}
-
-int
-metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
- int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
- zio_alloc_list_t *zal, zio_t *zio, int allocator)
-{
- dva_t *dva = bp->blk_dva;
- dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
- int error = 0;
-
- ASSERT(bp->blk_birth == 0);
- ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
-
- spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
-
- if (mc->mc_rotor == NULL) { /* no vdevs in this class */
- spa_config_exit(spa, SCL_ALLOC, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
- ASSERT(BP_GET_NDVAS(bp) == 0);
- ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
- ASSERT3P(zal, !=, NULL);
-
- for (int d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
- txg, flags, zal, allocator);
- if (error != 0) {
- for (d--; d >= 0; d--) {
- metaslab_unalloc_dva(spa, &dva[d], txg);
- metaslab_group_alloc_decrement(spa,
- DVA_GET_VDEV(&dva[d]), zio, flags,
- allocator, B_FALSE);
- bzero(&dva[d], sizeof (dva_t));
- }
- spa_config_exit(spa, SCL_ALLOC, FTAG);
- return (error);
- } else {
- /*
- * Update the metaslab group's queue depth
- * based on the newly allocated dva.
- */
- metaslab_group_alloc_increment(spa,
- DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
- }
-
- }
- ASSERT(error == 0);
- ASSERT(BP_GET_NDVAS(bp) == ndvas);
-
- spa_config_exit(spa, SCL_ALLOC, FTAG);
-
- BP_SET_BIRTH(bp, txg, txg);
-
- return (0);
-}
-
-void
-metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
-
- ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
-
- /*
- * If we have a checkpoint for the pool we need to make sure that
- * the blocks that we free that are part of the checkpoint won't be
- * reused until the checkpoint is discarded or we revert to it.
- *
- * The checkpoint flag is passed down the metaslab_free code path
- * and is set whenever we want to add a block to the checkpoint's
- * accounting. That is, we "checkpoint" blocks that existed at the
- * time the checkpoint was created and are therefore referenced by
- * the checkpointed uberblock.
- *
- * Note that, we don't checkpoint any blocks if the current
- * syncing txg <= spa_checkpoint_txg. We want these frees to sync
- * normally as they will be referenced by the checkpointed uberblock.
- */
- boolean_t checkpoint = B_FALSE;
- if (bp->blk_birth <= spa->spa_checkpoint_txg &&
- spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
- /*
- * At this point, if the block is part of the checkpoint
- * there is no way it was created in the current txg.
- */
- ASSERT(!now);
- ASSERT3U(spa_syncing_txg(spa), ==, txg);
- checkpoint = B_TRUE;
- }
-
- spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
-
- for (int d = 0; d < ndvas; d++) {
- if (now) {
- metaslab_unalloc_dva(spa, &dva[d], txg);
- } else {
- ASSERT3U(txg, ==, spa_syncing_txg(spa));
- metaslab_free_dva(spa, &dva[d], checkpoint);
- }
- }
-
- spa_config_exit(spa, SCL_FREE, FTAG);
-}
-
-int
-metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
- int error = 0;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- if (txg != 0) {
- /*
- * First do a dry run to make sure all DVAs are claimable,
- * so we don't have to unwind from partial failures below.
- */
- if ((error = metaslab_claim(spa, bp, 0)) != 0)
- return (error);
- }
-
- spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
-
- for (int d = 0; d < ndvas; d++) {
- error = metaslab_claim_dva(spa, &dva[d], txg);
- if (error != 0)
- break;
- }
-
- spa_config_exit(spa, SCL_ALLOC, FTAG);
-
- ASSERT(error == 0 || txg == 0);
-
- return (error);
-}
-
-/* ARGSUSED */
-static void
-metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
- uint64_t size, void *arg)
-{
- if (vd->vdev_ops == &vdev_indirect_ops)
- return;
-
- metaslab_check_free_impl(vd, offset, size);
-}
-
-static void
-metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
-{
- metaslab_t *msp;
- spa_t *spa = vd->vdev_spa;
-
- if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
- return;
-
- if (vd->vdev_ops->vdev_op_remap != NULL) {
- vd->vdev_ops->vdev_op_remap(vd, offset, size,
- metaslab_check_free_impl_cb, NULL);
- return;
- }
-
- ASSERT(vdev_is_concrete(vd));
- ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
- ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- mutex_enter(&msp->ms_lock);
- if (msp->ms_loaded) {
- range_tree_verify_not_present(msp->ms_allocatable,
- offset, size);
- }
-
- range_tree_verify_not_present(msp->ms_freeing, offset, size);
- range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
- range_tree_verify_not_present(msp->ms_freed, offset, size);
- for (int j = 0; j < TXG_DEFER_SIZE; j++)
- range_tree_verify_not_present(msp->ms_defer[j], offset, size);
- mutex_exit(&msp->ms_lock);
-}
-
-void
-metaslab_check_free(spa_t *spa, const blkptr_t *bp)
-{
- if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
- return;
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
- uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
- vdev_t *vd = vdev_lookup_top(spa, vdev);
- uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
- uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
-
- if (DVA_GET_GANG(&bp->blk_dva[i]))
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- ASSERT3P(vd, !=, NULL);
-
- metaslab_check_free_impl(vd, offset, size);
- }
- spa_config_exit(spa, SCL_VDEV, FTAG);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c
@@ -1,750 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
- * Copyright 2019 Joyent, Inc.
- */
-
-#include <sys/abd.h>
-#include <sys/mmp.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/time.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/zfs_context.h>
-#include <sys/callb.h>
-
-/*
- * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
- * or opening a pool on more than one host at a time. In particular, it
- * prevents "zpool import -f" on a host from succeeding while the pool is
- * already imported on another host. There are many other ways in which a
- * device could be used by two hosts for different purposes at the same time
- * resulting in pool damage. This implementation does not attempt to detect
- * those cases.
- *
- * MMP operates by ensuring there are frequent visible changes on disk (a
- * "heartbeat") at all times. And by altering the import process to check
- * for these changes and failing the import when they are detected. This
- * functionality is enabled by setting the 'multihost' pool property to on.
- *
- * Uberblocks written by the txg_sync thread always go into the first
- * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
- * They are used to hold uberblocks which are exactly the same as the last
- * synced uberblock except that the ub_timestamp and mmp_config are frequently
- * updated. Like all other uberblocks, the slot is written with an embedded
- * checksum, and slots with invalid checksums are ignored. This provides the
- * "heartbeat", with no risk of overwriting good uberblocks that must be
- * preserved, e.g. previous txgs and associated block pointers.
- *
- * Three optional fields are added to uberblock structure; ub_mmp_magic,
- * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell
- * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells
- * the importing host the settings of zfs_multihost_interval and
- * zfs_multihost_fail_intervals on the host which last had (or currently has)
- * the pool imported. These determine how long a host must wait to detect
- * activity in the pool, before concluding the pool is not in use. The
- * mmp_delay field is a decaying average of the amount of time between
- * completion of successive MMP writes, in nanoseconds. It indicates whether
- * MMP is enabled.
- *
- * During import an activity test may now be performed to determine if
- * the pool is in use. The activity test is typically required if the
- * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
- * POOL_STATE_ACTIVE, and the pool is not a root pool.
- *
- * The activity test finds the "best" uberblock (highest txg, timestamp, and, if
- * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits
- * some time, and finds the "best" uberblock again. If any of the mentioned
- * fields have different values in the newly read uberblock, the pool is in use
- * by another host and the import fails. In order to assure the accuracy of the
- * activity test, the default values result in an activity test duration of 20x
- * the mmp write interval.
- *
- * The duration of the "zpool import" activity test depends on the information
- * available in the "best" uberblock:
- *
- * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0:
- * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2
- *
- * In this case, a weak guarantee is provided. Since the host which last had
- * the pool imported will suspend the pool if no mmp writes land within
- * fail_intervals * multihost_interval ms, the absence of writes during that
- * time means either the pool is not imported, or it is imported but the pool
- * is suspended and no further writes will occur.
- *
- * Note that resuming the suspended pool on the remote host would invalidate
- * this guarantee, and so it is not allowed.
- *
- * The factor of 2 provides a conservative safety factor and derives from
- * MMP_IMPORT_SAFETY_FACTOR;
- *
- * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0:
- * (ub_mmp_config.multihost_interval + ub_mmp_delay) *
- * zfs_multihost_import_intervals
- *
- * In this case no guarantee can provided. However, as long as some devices
- * are healthy and connected, it is likely that at least one write will land
- * within (multihost_interval + mmp_delay) because multihost_interval is
- * enough time for a write to be attempted to each leaf vdev, and mmp_delay
- * is enough for one to land, based on past delays. Multiplying by
- * zfs_multihost_import_intervals provides a conservative safety factor.
- *
- * 3) If uberblock was written by zfs-0.7:
- * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
- *
- * The same logic as case #2 applies, but we do not know remote tunables.
- *
- * We use the local value for zfs_multihost_interval because the original MMP
- * did not record this value in the uberblock.
- *
- * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host
- * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect
- * that. We will have waited enough time for zfs_multihost_import_intervals
- * writes to be issued and all but one to land.
- *
- * single device pool example delays
- *
- * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay
- * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
- * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval,
- * no I/O delay
- * 100 device pool example delays
- *
- * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay
- * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
- * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval,
- * no I/O delay
- *
- * 4) Otherwise, this uberblock was written by a pre-MMP zfs:
- * zfs_multihost_import_intervals * zfs_multihost_interval
- *
- * In this case local tunables are used. By default this product = 10s, long
- * enough for a pool with any activity at all to write at least one
- * uberblock. No guarantee can be provided.
- *
- * Additionally, the duration is then extended by a random 25% to attempt to to
- * detect simultaneous imports. For example, if both partner hosts are rebooted
- * at the same time and automatically attempt to import the pool.
- */
-
-/*
- * Used to control the frequency of mmp writes which are performed when the
- * 'multihost' pool property is on. This is one factor used to determine the
- * length of the activity check during import.
- *
- * On average an mmp write will be issued for each leaf vdev every
- * zfs_multihost_interval milliseconds. In practice, the observed period can
- * vary with the I/O load and this observed value is the ub_mmp_delay which is
- * stored in the uberblock. The minimum allowed value is 100 ms.
- */
-ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
-#ifdef __FreeBSD__
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, multihost_interval, CTLFLAG_RWTUN,
- &zfs_multihost_interval, 0, "Interval between MMP writes, milliseconds");
-#endif
-
-/*
- * Used to control the duration of the activity test on import. Smaller values
- * of zfs_multihost_import_intervals will reduce the import time but increase
- * the risk of failing to detect an active pool. The total activity check time
- * is never allowed to drop below one second. A value of 0 is ignored and
- * treated as if it was set to 1.
- */
-uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
-#ifdef __FreeBSD__
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_import_intervals, CTLFLAG_RWTUN,
- &zfs_multihost_import_intervals, 0,
- "MMP activity check period for pool import, "
- "in units of multihost_interval");
-#endif
-
-/*
- * Controls the behavior of the pool when mmp write failures or delays are
- * detected.
- *
- * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are
- * ignored. The failures will still be reported to the ZED which depending on
- * its configuration may take action such as suspending the pool or taking a
- * device offline.
- *
- * When zfs_multihost_fail_intervals > 0, the pool will be suspended if
- * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass
- * without a successful mmp write. This guarantees the activity test will see
- * mmp writes if the pool is imported. A value of 1 is ignored and treated as
- * if it was set to 2, because a single leaf vdev pool will issue a write once
- * per multihost_interval and thus any variation in latency would cause the
- * pool to be suspended.
- */
-uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
-#ifdef __FreeBSD__
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_fail_intervals, CTLFLAG_RWTUN,
- &zfs_multihost_fail_intervals, 0,
- "How long to tolerate MMP write failures before suspending a pool, "
- "in units of multihost_interval");
-#endif
-
-char *mmp_tag = "mmp_write_uberblock";
-static void mmp_thread(void *arg);
-
-void
-mmp_init(spa_t *spa)
-{
- mmp_thread_t *mmp = &spa->spa_mmp;
-
- mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
- mmp->mmp_kstat_id = 1;
-
- /*
- * mmp_write_done() calculates mmp_delay based on prior mmp_delay and
- * the elapsed time since the last write. For the first mmp write,
- * there is no "last write", so we start with fake non-zero values.
- */
- mmp->mmp_last_write = gethrtime();
- mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
-}
-
-void
-mmp_fini(spa_t *spa)
-{
- mmp_thread_t *mmp = &spa->spa_mmp;
-
- mutex_destroy(&mmp->mmp_thread_lock);
- cv_destroy(&mmp->mmp_thread_cv);
- mutex_destroy(&mmp->mmp_io_lock);
-}
-
-static void
-mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
-{
- CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
- mutex_enter(&mmp->mmp_thread_lock);
-}
-
-static void
-mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
-{
- ASSERT(*mpp != NULL);
- *mpp = NULL;
- cv_broadcast(&mmp->mmp_thread_cv);
- CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */
- thread_exit();
-}
-
-void
-mmp_thread_start(spa_t *spa)
-{
- mmp_thread_t *mmp = &spa->spa_mmp;
-
- if (spa_writeable(spa)) {
- mutex_enter(&mmp->mmp_thread_lock);
- if (!mmp->mmp_thread) {
- mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
- spa, 0, &p0, TS_RUN, minclsyspri);
- zfs_dbgmsg("MMP thread started pool '%s' "
- "gethrtime %llu", spa_name(spa), gethrtime());
- }
- mutex_exit(&mmp->mmp_thread_lock);
- }
-}
-
-void
-mmp_thread_stop(spa_t *spa)
-{
- mmp_thread_t *mmp = &spa->spa_mmp;
-
- mutex_enter(&mmp->mmp_thread_lock);
- mmp->mmp_thread_exiting = 1;
- cv_broadcast(&mmp->mmp_thread_cv);
-
- while (mmp->mmp_thread) {
- cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
- }
- mutex_exit(&mmp->mmp_thread_lock);
- zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
- spa_name(spa), gethrtime());
-
- ASSERT(mmp->mmp_thread == NULL);
- mmp->mmp_thread_exiting = 0;
-}
-
-typedef enum mmp_vdev_state_flag {
- MMP_FAIL_NOT_WRITABLE = (1 << 0),
- MMP_FAIL_WRITE_PENDING = (1 << 1),
-} mmp_vdev_state_flag_t;
-
-/*
- * Find a leaf vdev to write an MMP block to. It must not have an outstanding
- * mmp write (if so a new write will also likely block). If there is no usable
- * leaf, a nonzero error value is returned. The error value returned is a bit
- * field.
- *
- * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an
- * outstanding MMP write.
- * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable.
- */
-
-static int
-mmp_next_leaf(spa_t *spa)
-{
- vdev_t *leaf;
- vdev_t *starting_leaf;
- int fail_mask = 0;
-
- ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
- ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
- ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
- ASSERT(!list_is_empty(&spa->spa_leaf_list));
-
- if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
- spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
- spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
- }
-
- leaf = spa->spa_mmp.mmp_last_leaf;
- if (leaf == NULL)
- leaf = list_head(&spa->spa_leaf_list);
- starting_leaf = leaf;
-
- do {
- leaf = list_next(&spa->spa_leaf_list, leaf);
- if (leaf == NULL)
- leaf = list_head(&spa->spa_leaf_list);
-
- if (!vdev_writeable(leaf)) {
- fail_mask |= MMP_FAIL_NOT_WRITABLE;
- } else if (leaf->vdev_mmp_pending != 0) {
- fail_mask |= MMP_FAIL_WRITE_PENDING;
- } else {
- spa->spa_mmp.mmp_last_leaf = leaf;
- return (0);
- }
- } while (leaf != starting_leaf);
-
- ASSERT(fail_mask);
-
- return (fail_mask);
-}
-
-/*
- * MMP writes are issued on a fixed schedule, but may complete at variable,
- * much longer, intervals. The mmp_delay captures long periods between
- * successful writes for any reason, including disk latency, scheduling delays,
- * etc.
- *
- * The mmp_delay is usually calculated as a decaying average, but if the latest
- * delay is higher we do not average it, so that we do not hide sudden spikes
- * which the importing host must wait for.
- *
- * If writes are occurring frequently, such as due to a high rate of txg syncs,
- * the mmp_delay could become very small. Since those short delays depend on
- * activity we cannot count on, we never allow mmp_delay to get lower than rate
- * expected if only mmp_thread writes occur.
- *
- * If an mmp write was skipped or fails, and we have already waited longer than
- * mmp_delay, we need to update it so the next write reflects the longer delay.
- *
- * Do not set mmp_delay if the multihost property is not on, so as not to
- * trigger an activity check on import.
- */
-static void
-mmp_delay_update(spa_t *spa, boolean_t write_completed)
-{
- mmp_thread_t *mts = &spa->spa_mmp;
- hrtime_t delay = gethrtime() - mts->mmp_last_write;
-
- ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
-
- if (spa_multihost(spa) == B_FALSE) {
- mts->mmp_delay = 0;
- return;
- }
-
- if (delay > mts->mmp_delay)
- mts->mmp_delay = delay;
-
- if (write_completed == B_FALSE)
- return;
-
- mts->mmp_last_write = gethrtime();
-
- /*
- * strictly less than, in case delay was changed above.
- */
- if (delay < mts->mmp_delay) {
- hrtime_t min_delay =
- MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) /
- MAX(1, vdev_count_leaves(spa));
- mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
- min_delay);
- }
-}
-
-static void
-mmp_write_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- vdev_t *vd = zio->io_vd;
- mmp_thread_t *mts = zio->io_private;
-
- mutex_enter(&mts->mmp_io_lock);
- uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
- hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
-
- mmp_delay_update(spa, (zio->io_error == 0));
-
- vd->vdev_mmp_pending = 0;
- vd->vdev_mmp_kstat_id = 0;
-
- mutex_exit(&mts->mmp_io_lock);
- spa_config_exit(spa, SCL_STATE, mmp_tag);
-
- abd_free(zio->io_abd);
-}
-
-/*
- * When the uberblock on-disk is updated by a spa_sync,
- * creating a new "best" uberblock, update the one stored
- * in the mmp thread state, used for mmp writes.
- */
-void
-mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
-{
- mmp_thread_t *mmp = &spa->spa_mmp;
-
- mutex_enter(&mmp->mmp_io_lock);
- mmp->mmp_ub = *ub;
- mmp->mmp_seq = 1;
- mmp->mmp_ub.ub_timestamp = gethrestime_sec();
- mmp_delay_update(spa, B_TRUE);
- mutex_exit(&mmp->mmp_io_lock);
-}
-
-/*
- * Choose a random vdev, label, and MMP block, and write over it
- * with a copy of the last-synced uberblock, whose timestamp
- * has been updated to reflect that the pool is in use.
- */
-static void
-mmp_write_uberblock(spa_t *spa)
-{
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
- mmp_thread_t *mmp = &spa->spa_mmp;
- uberblock_t *ub;
- vdev_t *vd = NULL;
- int label, error;
- uint64_t offset;
-
- hrtime_t lock_acquire_time = gethrtime();
- spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
- lock_acquire_time = gethrtime() - lock_acquire_time;
- if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
- zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
- "gethrtime %llu", spa_name(spa), lock_acquire_time,
- gethrtime());
-
- mutex_enter(&mmp->mmp_io_lock);
-
- error = mmp_next_leaf(spa);
-
- /*
- * spa_mmp_history has two types of entries:
- * Issued MMP write: records time issued, error status, etc.
- * Skipped MMP write: an MMP write could not be issued because no
- * suitable leaf vdev was available. See comment above struct
- * spa_mmp_history for details.
- */
-
- if (error) {
- mmp_delay_update(spa, B_FALSE);
- if (mmp->mmp_skip_error == error) {
- /*
- * ZoL porting note: the following is TBD
- * spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
- */
- } else {
- mmp->mmp_skip_error = error;
- /*
- * ZoL porting note: the following is TBD
- * spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
- * gethrestime_sec(), mmp->mmp_delay, NULL, 0,
- * mmp->mmp_kstat_id++, error);
- */
- zfs_dbgmsg("MMP error choosing leaf pool '%s' "
- "gethrtime %llu fail_mask %#x", spa_name(spa),
- gethrtime(), error);
- }
- mutex_exit(&mmp->mmp_io_lock);
- spa_config_exit(spa, SCL_STATE, mmp_tag);
- return;
- }
-
- vd = spa->spa_mmp.mmp_last_leaf;
- if (mmp->mmp_skip_error != 0) {
- mmp->mmp_skip_error = 0;
- zfs_dbgmsg("MMP write after skipping due to unavailable "
- "leaves, pool '%s' gethrtime %llu leaf %#llu",
- spa_name(spa), gethrtime(), vd->vdev_guid);
- }
-
- if (mmp->mmp_zio_root == NULL)
- mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
- flags | ZIO_FLAG_GODFATHER);
-
- if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) {
- /*
- * Want to reset mmp_seq when timestamp advances because after
- * an mmp_seq wrap new values will not be chosen by
- * uberblock_compare() as the "best".
- */
- mmp->mmp_ub.ub_timestamp = gethrestime_sec();
- mmp->mmp_seq = 1;
- }
-
- ub = &mmp->mmp_ub;
- ub->ub_mmp_magic = MMP_MAGIC;
- ub->ub_mmp_delay = mmp->mmp_delay;
- ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) |
- MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) |
- MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK(
- zfs_multihost_fail_intervals));
- vd->vdev_mmp_pending = gethrtime();
- vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;
-
- zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
- abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
- abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
- abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
-
- mmp->mmp_seq++;
- mmp->mmp_kstat_id++;
- mutex_exit(&mmp->mmp_io_lock);
-
- offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
- MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));
-
- label = spa_get_random(VDEV_LABELS);
- vdev_label_write(zio, vd, label, ub_abd, offset,
- VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
- flags | ZIO_FLAG_DONT_PROPAGATE);
-
- /*
- * ZoL porting note: the following is TBD
- * (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
- * ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
- */
-
- zio_nowait(zio);
-}
-
-static void
-mmp_thread(void *arg)
-{
- spa_t *spa = (spa_t *)arg;
- mmp_thread_t *mmp = &spa->spa_mmp;
- boolean_t suspended = spa_suspended(spa);
- boolean_t multihost = spa_multihost(spa);
- uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
- zfs_multihost_interval));
- uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
- zfs_multihost_fail_intervals);
- hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
- boolean_t last_spa_suspended = suspended;
- boolean_t last_spa_multihost = multihost;
- uint64_t last_mmp_interval = mmp_interval;
- uint32_t last_mmp_fail_intervals = mmp_fail_intervals;
- hrtime_t last_mmp_fail_ns = mmp_fail_ns;
- callb_cpr_t cpr;
- int skip_wait = 0;
-
- mmp_thread_enter(mmp, &cpr);
-
- while (!mmp->mmp_thread_exiting) {
- hrtime_t next_time = gethrtime() +
- MSEC2NSEC(MMP_DEFAULT_INTERVAL);
- int leaves = MAX(vdev_count_leaves(spa), 1);
-
- /* Detect changes in tunables or state */
-
- last_spa_suspended = suspended;
- last_spa_multihost = multihost;
- suspended = spa_suspended(spa);
- multihost = spa_multihost(spa);
-
- last_mmp_interval = mmp_interval;
- last_mmp_fail_intervals = mmp_fail_intervals;
- last_mmp_fail_ns = mmp_fail_ns;
- mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
- zfs_multihost_interval));
- mmp_fail_intervals = MMP_FAIL_INTVS_OK(
- zfs_multihost_fail_intervals);
-
- /* Smooth so pool is not suspended when reducing tunables */
- if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) {
- mmp_fail_ns = (mmp_fail_ns * 31 +
- mmp_fail_intervals * mmp_interval) / 32;
- } else {
- mmp_fail_ns = mmp_fail_intervals *
- mmp_interval;
- }
-
- if (mmp_interval != last_mmp_interval ||
- mmp_fail_intervals != last_mmp_fail_intervals) {
- /*
- * We want other hosts to see new tunables as quickly as
- * possible. Write out at higher frequency than usual.
- */
- skip_wait += leaves;
- }
-
- if (multihost)
- next_time = gethrtime() + mmp_interval / leaves;
-
- if (mmp_fail_ns != last_mmp_fail_ns) {
- zfs_dbgmsg("MMP interval change pool '%s' "
- "gethrtime %llu last_mmp_interval %llu "
- "mmp_interval %llu last_mmp_fail_intervals %u "
- "mmp_fail_intervals %u mmp_fail_ns %llu "
- "skip_wait %d leaves %d next_time %llu",
- spa_name(spa), gethrtime(), last_mmp_interval,
- mmp_interval, last_mmp_fail_intervals,
- mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves,
- next_time);
- }
-
- /*
- * MMP off => on, or suspended => !suspended:
- * No writes occurred recently. Update mmp_last_write to give
- * us some time to try.
- */
- if ((!last_spa_multihost && multihost) ||
- (last_spa_suspended && !suspended)) {
- zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
- "last_spa_multihost %u multihost %u "
- "last_spa_suspended %u suspended %u",
- spa_name(spa), last_spa_multihost, multihost,
- last_spa_suspended, suspended);
- mutex_enter(&mmp->mmp_io_lock);
- mmp->mmp_last_write = gethrtime();
- mmp->mmp_delay = mmp_interval;
- mutex_exit(&mmp->mmp_io_lock);
- }
-
- /*
- * MMP on => off:
- * mmp_delay == 0 tells importing node to skip activity check.
- */
- if (last_spa_multihost && !multihost) {
- mutex_enter(&mmp->mmp_io_lock);
- mmp->mmp_delay = 0;
- mutex_exit(&mmp->mmp_io_lock);
- }
-
- /*
- * Suspend the pool if no MMP write has succeeded in over
- * mmp_interval * mmp_fail_intervals nanoseconds.
- */
- if (multihost && !suspended && mmp_fail_intervals &&
- (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
- zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
- "mmp_last_write %llu mmp_interval %llu "
- "mmp_fail_intervals %llu mmp_fail_ns %llu",
- spa_name(spa), (u_longlong_t)gethrtime(),
- (u_longlong_t)mmp->mmp_last_write,
- (u_longlong_t)mmp_interval,
- (u_longlong_t)mmp_fail_intervals,
- (u_longlong_t)mmp_fail_ns);
- cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
- "succeeded in over %llu ms; suspending pool. "
- "Hrtime %llu",
- spa_name(spa),
- NSEC2MSEC(gethrtime() - mmp->mmp_last_write),
- gethrtime());
- zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
- }
-
- if (multihost && !suspended)
- mmp_write_uberblock(spa);
-
- if (skip_wait > 0) {
- next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) /
- leaves;
- skip_wait--;
- }
-
- CALLB_CPR_SAFE_BEGIN(&cpr);
-#if defined(illumos)
- (void) cv_timedwait_sig_hrtime(&mmp->mmp_thread_cv,
- &mmp->mmp_thread_lock, next_time);
-#elif defined(_KERNEL)
- (void) cv_timedwait_sig_sbt(&mmp->mmp_thread_cv,
- &mmp->mmp_thread_lock, nstosbt(next_time),
- 100 * SBT_1US, C_ABSOLUTE);
-#else
- (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
- &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
- CALLOUT_FLAG_ABSOLUTE);
-#endif
- CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
- }
-
- /* Outstanding writes are allowed to complete. */
- if (mmp->mmp_zio_root)
- zio_wait(mmp->mmp_zio_root);
-
- mmp->mmp_zio_root = NULL;
- mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
-}
-
-/*
- * Signal the MMP thread to wake it, when it is sleeping on
- * its cv. Used when some module parameter has changed and
- * we want the thread to know about it.
- * Only signal if the pool is active and mmp thread is
- * running, otherwise there is no thread to wake.
- */
-static void
-mmp_signal_thread(spa_t *spa)
-{
- mmp_thread_t *mmp = &spa->spa_mmp;
-
- mutex_enter(&mmp->mmp_thread_lock);
- if (mmp->mmp_thread)
- cv_broadcast(&mmp->mmp_thread_cv);
- mutex_exit(&mmp->mmp_thread_lock);
-}
-
-void
-mmp_signal_all_threads(void)
-{
- spa_t *spa = NULL;
-
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa))) {
- if (spa->spa_state == POOL_STATE_ACTIVE)
- mmp_signal_thread(spa);
- }
- mutex_exit(&spa_namespace_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
@@ -1,423 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/multilist.h>
-
-/* needed for spa_get_random() */
-#include <sys/spa.h>
-
-/*
- * This overrides the number of sublists in each multilist_t, which defaults
- * to the number of CPUs in the system (see multilist_create()).
- */
-int zfs_multilist_num_sublists = 0;
-
-/*
- * Given the object contained on the list, return a pointer to the
- * object's multilist_node_t structure it contains.
- */
-static multilist_node_t *
-multilist_d2l(multilist_t *ml, void *obj)
-{
- return ((multilist_node_t *)((char *)obj + ml->ml_offset));
-}
-
-/*
- * Initialize a new mutlilist using the parameters specified.
- *
- * - 'size' denotes the size of the structure containing the
- * multilist_node_t.
- * - 'offset' denotes the byte offset of the mutlilist_node_t within
- * the structure that contains it.
- * - 'num' specifies the number of internal sublists to create.
- * - 'index_func' is used to determine which sublist to insert into
- * when the multilist_insert() function is called; as well as which
- * sublist to remove from when multilist_remove() is called. The
- * requirements this function must meet, are the following:
- *
- * - It must always return the same value when called on the same
- * object (to ensure the object is removed from the list it was
- * inserted into).
- *
- * - It must return a value in the range [0, number of sublists).
- * The multilist_get_num_sublists() function may be used to
- * determine the number of sublists in the multilist.
- *
- * Also, in order to reduce internal contention between the sublists
- * during insertion and removal, this function should choose evenly
- * between all available sublists when inserting. This isn't a hard
- * requirement, but a general rule of thumb in order to garner the
- * best multi-threaded performance out of the data structure.
- */
-static multilist_t *
-multilist_create_impl(size_t size, size_t offset,
- unsigned int num, multilist_sublist_index_func_t *index_func)
-{
- ASSERT3U(size, >, 0);
- ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
- ASSERT3U(num, >, 0);
- ASSERT3P(index_func, !=, NULL);
-
- multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
- ml->ml_offset = offset;
- ml->ml_num_sublists = num;
- ml->ml_index_func = index_func;
-
- ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
- ml->ml_num_sublists, KM_SLEEP);
-
- ASSERT3P(ml->ml_sublists, !=, NULL);
-
- for (int i = 0; i < ml->ml_num_sublists; i++) {
- multilist_sublist_t *mls = &ml->ml_sublists[i];
- mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&mls->mls_list, size, offset);
- }
- return (ml);
-}
-
-/*
- * Allocate a new multilist, using the default number of sublists
- * (the number of CPUs, or at least 4, or the tunable
- * zfs_multilist_num_sublists).
- */
-multilist_t *
-multilist_create(size_t size, size_t offset,
- multilist_sublist_index_func_t *index_func)
-{
- int num_sublists;
-
- if (zfs_multilist_num_sublists > 0) {
- num_sublists = zfs_multilist_num_sublists;
- } else {
- num_sublists = MAX(max_ncpus, 4);
- }
-
- return (multilist_create_impl(size, offset, num_sublists, index_func));
-}
-
-/*
- * Destroy the given multilist object, and free up any memory it holds.
- */
-void
-multilist_destroy(multilist_t *ml)
-{
- ASSERT(multilist_is_empty(ml));
-
- for (int i = 0; i < ml->ml_num_sublists; i++) {
- multilist_sublist_t *mls = &ml->ml_sublists[i];
-
- ASSERT(list_is_empty(&mls->mls_list));
-
- list_destroy(&mls->mls_list);
- mutex_destroy(&mls->mls_lock);
- }
-
- ASSERT3P(ml->ml_sublists, !=, NULL);
- kmem_free(ml->ml_sublists,
- sizeof (multilist_sublist_t) * ml->ml_num_sublists);
-
- ml->ml_num_sublists = 0;
- ml->ml_offset = 0;
- kmem_free(ml, sizeof (multilist_t));
-}
-
-/*
- * Insert the given object into the multilist.
- *
- * This function will insert the object specified into the sublist
- * determined using the function given at multilist creation time.
- *
- * The sublist locks are automatically acquired if not already held, to
- * ensure consistency when inserting and removing from multiple threads.
- */
-void
-multilist_insert(multilist_t *ml, void *obj)
-{
- unsigned int sublist_idx = ml->ml_index_func(ml, obj);
- multilist_sublist_t *mls;
- boolean_t need_lock;
-
- DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
- unsigned int, sublist_idx, void *, obj);
-
- ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
-
- mls = &ml->ml_sublists[sublist_idx];
-
- /*
- * Note: Callers may already hold the sublist lock by calling
- * multilist_sublist_lock(). Here we rely on MUTEX_HELD()
- * returning TRUE if and only if the current thread holds the
- * lock. While it's a little ugly to make the lock recursive in
- * this way, it works and allows the calling code to be much
- * simpler -- otherwise it would have to pass around a flag
- * indicating that it already has the lock.
- */
- need_lock = !MUTEX_HELD(&mls->mls_lock);
-
- if (need_lock)
- mutex_enter(&mls->mls_lock);
-
- ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
-
- multilist_sublist_insert_head(mls, obj);
-
- if (need_lock)
- mutex_exit(&mls->mls_lock);
-}
-
-/*
- * Remove the given object from the multilist.
- *
- * This function will remove the object specified from the sublist
- * determined using the function given at multilist creation time.
- *
- * The necessary sublist locks are automatically acquired, to ensure
- * consistency when inserting and removing from multiple threads.
- */
-void
-multilist_remove(multilist_t *ml, void *obj)
-{
- unsigned int sublist_idx = ml->ml_index_func(ml, obj);
- multilist_sublist_t *mls;
- boolean_t need_lock;
-
- DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
- unsigned int, sublist_idx, void *, obj);
-
- ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
-
- mls = &ml->ml_sublists[sublist_idx];
- /* See comment in multilist_insert(). */
- need_lock = !MUTEX_HELD(&mls->mls_lock);
-
- if (need_lock)
- mutex_enter(&mls->mls_lock);
-
- ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
-
- multilist_sublist_remove(mls, obj);
-
- if (need_lock)
- mutex_exit(&mls->mls_lock);
-}
-
-/*
- * Check to see if this multilist object is empty.
- *
- * This will return TRUE if it finds all of the sublists of this
- * multilist to be empty, and FALSE otherwise. Each sublist lock will be
- * automatically acquired as necessary.
- *
- * If concurrent insertions and removals are occurring, the semantics
- * of this function become a little fuzzy. Instead of locking all
- * sublists for the entire call time of the function, each sublist is
- * only locked as it is individually checked for emptiness. Thus, it's
- * possible for this function to return TRUE with non-empty sublists at
- * the time the function returns. This would be due to another thread
- * inserting into a given sublist, after that specific sublist was check
- * and deemed empty, but before all sublists have been checked.
- */
-int
-multilist_is_empty(multilist_t *ml)
-{
- for (int i = 0; i < ml->ml_num_sublists; i++) {
- multilist_sublist_t *mls = &ml->ml_sublists[i];
- /* See comment in multilist_insert(). */
- boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
-
- if (need_lock)
- mutex_enter(&mls->mls_lock);
-
- if (!list_is_empty(&mls->mls_list)) {
- if (need_lock)
- mutex_exit(&mls->mls_lock);
-
- return (FALSE);
- }
-
- if (need_lock)
- mutex_exit(&mls->mls_lock);
- }
-
- return (TRUE);
-}
-
-/* Return the number of sublists composing this multilist */
-unsigned int
-multilist_get_num_sublists(multilist_t *ml)
-{
- return (ml->ml_num_sublists);
-}
-
-/* Return a randomly selected, valid sublist index for this multilist */
-unsigned int
-multilist_get_random_index(multilist_t *ml)
-{
- return (spa_get_random(ml->ml_num_sublists));
-}
-
-/* Lock and return the sublist specified at the given index */
-multilist_sublist_t *
-multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
-{
- multilist_sublist_t *mls;
-
- ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
- mls = &ml->ml_sublists[sublist_idx];
- mutex_enter(&mls->mls_lock);
-
- return (mls);
-}
-
-/* Lock and return the sublist that would be used to store the specified obj */
-multilist_sublist_t *
-multilist_sublist_lock_obj(multilist_t *ml, void *obj)
-{
- return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
-}
-
-void
-multilist_sublist_unlock(multilist_sublist_t *mls)
-{
- mutex_exit(&mls->mls_lock);
-}
-
-/*
- * We're allowing any object to be inserted into this specific sublist,
- * but this can lead to trouble if multilist_remove() is called to
- * remove this object. Specifically, if calling ml_index_func on this
- * object returns an index for sublist different than what is passed as
- * a parameter here, any call to multilist_remove() with this newly
- * inserted object is undefined! (the call to multilist_remove() will
- * remove the object from a list that it isn't contained in)
- */
-void
-multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- list_insert_head(&mls->mls_list, obj);
-}
-
-/* please see comment above multilist_sublist_insert_head */
-void
-multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- list_insert_tail(&mls->mls_list, obj);
-}
-
-/*
- * Move the object one element forward in the list.
- *
- * This function will move the given object forward in the list (towards
- * the head) by one object. So, in essence, it will swap its position in
- * the list with its "prev" pointer. If the given object is already at the
- * head of the list, it cannot be moved forward any more than it already
- * is, so no action is taken.
- *
- * NOTE: This function **must not** remove any object from the list other
- * than the object given as the parameter. This is relied upon in
- * arc_evict_state_impl().
- */
-void
-multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
-{
- void *prev = list_prev(&mls->mls_list, obj);
-
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- ASSERT(!list_is_empty(&mls->mls_list));
-
- /* 'obj' must be at the head of the list, nothing to do */
- if (prev == NULL)
- return;
-
- list_remove(&mls->mls_list, obj);
- list_insert_before(&mls->mls_list, prev, obj);
-}
-
-void
-multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- list_remove(&mls->mls_list, obj);
-}
-
-int
-multilist_sublist_is_empty(multilist_sublist_t *mls)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- return (list_is_empty(&mls->mls_list));
-}
-
-int
-multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
-{
- multilist_sublist_t *mls;
- int empty;
-
- ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
- mls = &ml->ml_sublists[sublist_idx];
- ASSERT(!MUTEX_HELD(&mls->mls_lock));
- mutex_enter(&mls->mls_lock);
- empty = list_is_empty(&mls->mls_list);
- mutex_exit(&mls->mls_lock);
- return (empty);
-}
-
-void *
-multilist_sublist_head(multilist_sublist_t *mls)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- return (list_head(&mls->mls_list));
-}
-
-void *
-multilist_sublist_tail(multilist_sublist_t *mls)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- return (list_tail(&mls->mls_list));
-}
-
-void *
-multilist_sublist_next(multilist_sublist_t *mls, void *obj)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- return (list_next(&mls->mls_list, obj));
-}
-
-void *
-multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
-{
- ASSERT(MUTEX_HELD(&mls->mls_lock));
- return (list_prev(&mls->mls_list, obj));
-}
-
-void
-multilist_link_init(multilist_node_t *link)
-{
- list_link_init(link);
-}
-
-int
-multilist_link_active(multilist_node_t *link)
-{
- return (list_link_active(link));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
@@ -1,670 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/dnode.h>
-#include <sys/zio.h>
-#include <sys/range_tree.h>
-
-/*
- * Range trees are tree-based data structures that can be used to
- * track free space or generally any space allocation information.
- * A range tree keeps track of individual segments and automatically
- * provides facilities such as adjacent extent merging and extent
- * splitting in response to range add/remove requests.
- *
- * A range tree starts out completely empty, with no segments in it.
- * Adding an allocation via range_tree_add to the range tree can either:
- * 1) create a new extent
- * 2) extend an adjacent extent
- * 3) merge two adjacent extents
- * Conversely, removing an allocation via range_tree_remove can:
- * 1) completely remove an extent
- * 2) shorten an extent (if the allocation was near one of its ends)
- * 3) split an extent into two extents, in effect punching a hole
- *
- * A range tree is also capable of 'bridging' gaps when adding
- * allocations. This is useful for cases when close proximity of
- * allocations is an important detail that needs to be represented
- * in the range tree. See range_tree_set_gap(). The default behavior
- * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
- *
- * In order to traverse a range tree, use either the range_tree_walk()
- * or range_tree_vacate() functions.
- *
- * To obtain more accurate information on individual segment
- * operations that the range tree performs "under the hood", you can
- * specify a set of callbacks by passing a range_tree_ops_t structure
- * to the range_tree_create function. Any callbacks that are non-NULL
- * are then called at the appropriate times.
- *
- * The range tree code also supports a special variant of range trees
- * that can bridge small gaps between segments. This kind of tree is used
- * by the dsl scanning code to group I/Os into mostly sequential chunks to
- * optimize disk performance. The code here attempts to do this with as
- * little memory and computational overhead as possible. One limitation of
- * this implementation is that segments of range trees with gaps can only
- * support removing complete segments.
- */
-
-kmem_cache_t *range_seg_cache;
-
-/* Generic ops for managing an AVL tree alongside a range tree */
-struct range_tree_ops rt_avl_ops = {
- .rtop_create = rt_avl_create,
- .rtop_destroy = rt_avl_destroy,
- .rtop_add = rt_avl_add,
- .rtop_remove = rt_avl_remove,
- .rtop_vacate = rt_avl_vacate,
-};
-
-void
-range_tree_init(void)
-{
- ASSERT(range_seg_cache == NULL);
- range_seg_cache = kmem_cache_create("range_seg_cache",
- sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-range_tree_fini(void)
-{
- kmem_cache_destroy(range_seg_cache);
- range_seg_cache = NULL;
-}
-
-void
-range_tree_stat_verify(range_tree_t *rt)
-{
- range_seg_t *rs;
- uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
- int i;
-
- for (rs = avl_first(&rt->rt_root); rs != NULL;
- rs = AVL_NEXT(&rt->rt_root, rs)) {
- uint64_t size = rs->rs_end - rs->rs_start;
- int idx = highbit64(size) - 1;
-
- hist[idx]++;
- ASSERT3U(hist[idx], !=, 0);
- }
-
- for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
- if (hist[i] != rt->rt_histogram[i]) {
- zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
- i, hist, hist[i], rt->rt_histogram[i]);
- }
- VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
- }
-}
-
-static void
-range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
-{
- uint64_t size = rs->rs_end - rs->rs_start;
- int idx = highbit64(size) - 1;
-
- ASSERT(size != 0);
- ASSERT3U(idx, <,
- sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
-
- rt->rt_histogram[idx]++;
- ASSERT3U(rt->rt_histogram[idx], !=, 0);
-}
-
-static void
-range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
-{
- uint64_t size = rs->rs_end - rs->rs_start;
- int idx = highbit64(size) - 1;
-
- ASSERT(size != 0);
- ASSERT3U(idx, <,
- sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
-
- ASSERT3U(rt->rt_histogram[idx], !=, 0);
- rt->rt_histogram[idx]--;
-}
-
-/*
- * NOTE: caller is responsible for all locking.
- */
-static int
-range_tree_seg_compare(const void *x1, const void *x2)
-{
- const range_seg_t *r1 = (const range_seg_t *)x1;
- const range_seg_t *r2 = (const range_seg_t *)x2;
-
- ASSERT3U(r1->rs_start, <=, r1->rs_end);
- ASSERT3U(r2->rs_start, <=, r2->rs_end);
-
- return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
-}
-
-range_tree_t *
-range_tree_create_impl(range_tree_ops_t *ops, void *arg,
- int (*avl_compare) (const void *, const void *), uint64_t gap)
-{
- range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
-
- avl_create(&rt->rt_root, range_tree_seg_compare,
- sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
-
- rt->rt_ops = ops;
- rt->rt_arg = arg;
- rt->rt_gap = gap;
- rt->rt_avl_compare = avl_compare;
-
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
- rt->rt_ops->rtop_create(rt, rt->rt_arg);
-
- return (rt);
-}
-
-range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg)
-{
- return (range_tree_create_impl(ops, arg, NULL, 0));
-}
-
-void
-range_tree_destroy(range_tree_t *rt)
-{
- VERIFY0(rt->rt_space);
-
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
- rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
-
- avl_destroy(&rt->rt_root);
- kmem_free(rt, sizeof (*rt));
-}
-
-void
-range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
-{
- ASSERT3U(rs->rs_fill + delta, !=, 0);
- ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
-
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
- rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
- rs->rs_fill += delta;
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
- rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
-}
-
-static void
-range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
-{
- range_tree_t *rt = arg;
- avl_index_t where;
- range_seg_t rsearch, *rs_before, *rs_after, *rs;
- uint64_t end = start + size, gap = rt->rt_gap;
- uint64_t bridge_size = 0;
- boolean_t merge_before, merge_after;
-
- ASSERT3U(size, !=, 0);
- ASSERT3U(fill, <=, size);
-
- rsearch.rs_start = start;
- rsearch.rs_end = end;
- rs = avl_find(&rt->rt_root, &rsearch, &where);
-
- if (gap == 0 && rs != NULL &&
- rs->rs_start <= start && rs->rs_end >= end) {
- zfs_panic_recover("zfs: allocating allocated segment"
- "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
- (longlong_t)start, (longlong_t)size,
- (longlong_t)rs->rs_start,
- (longlong_t)rs->rs_end - rs->rs_start);
- return;
- }
-
- /*
- * If this is a gap-supporting range tree, it is possible that we
- * are inserting into an existing segment. In this case simply
- * bump the fill count and call the remove / add callbacks. If the
- * new range will extend an existing segment, we remove the
- * existing one, apply the new extent to it and re-insert it using
- * the normal code paths.
- */
- if (rs != NULL) {
- ASSERT3U(gap, !=, 0);
- if (rs->rs_start <= start && rs->rs_end >= end) {
- range_tree_adjust_fill(rt, rs, fill);
- return;
- }
-
- avl_remove(&rt->rt_root, rs);
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
- rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
-
- range_tree_stat_decr(rt, rs);
- rt->rt_space -= rs->rs_end - rs->rs_start;
-
- fill += rs->rs_fill;
- start = MIN(start, rs->rs_start);
- end = MAX(end, rs->rs_end);
- size = end - start;
-
- range_tree_add_impl(rt, start, size, fill);
-
- kmem_cache_free(range_seg_cache, rs);
- return;
- }
-
- ASSERT3P(rs, ==, NULL);
-
- /*
- * Determine whether or not we will have to merge with our neighbors.
- * If gap != 0, we might need to merge with our neighbors even if we
- * aren't directly touching.
- */
- rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
- rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
-
- merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
- merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
-
- if (merge_before && gap != 0)
- bridge_size += start - rs_before->rs_end;
- if (merge_after && gap != 0)
- bridge_size += rs_after->rs_start - end;
-
- if (merge_before && merge_after) {
- avl_remove(&rt->rt_root, rs_before);
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
- rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
- rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
- }
-
- range_tree_stat_decr(rt, rs_before);
- range_tree_stat_decr(rt, rs_after);
-
- rs_after->rs_fill += rs_before->rs_fill + fill;
- rs_after->rs_start = rs_before->rs_start;
- kmem_cache_free(range_seg_cache, rs_before);
- rs = rs_after;
- } else if (merge_before) {
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
- rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
-
- range_tree_stat_decr(rt, rs_before);
-
- rs_before->rs_fill += fill;
- rs_before->rs_end = end;
- rs = rs_before;
- } else if (merge_after) {
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
- rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
-
- range_tree_stat_decr(rt, rs_after);
-
- rs_after->rs_fill += fill;
- rs_after->rs_start = start;
- rs = rs_after;
- } else {
- rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
-
- rs->rs_fill = fill;
- rs->rs_start = start;
- rs->rs_end = end;
- avl_insert(&rt->rt_root, rs, where);
- }
-
- if (gap != 0)
- ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
- else
- ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
-
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
- rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
-
- range_tree_stat_incr(rt, rs);
- rt->rt_space += size + bridge_size;
-}
-
-void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
-{
- range_tree_add_impl(arg, start, size, size);
-}
-
-static void
-range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
- boolean_t do_fill)
-{
- avl_index_t where;
- range_seg_t rsearch, *rs, *newseg;
- uint64_t end = start + size;
- boolean_t left_over, right_over;
-
- VERIFY3U(size, !=, 0);
- VERIFY3U(size, <=, rt->rt_space);
-
- rsearch.rs_start = start;
- rsearch.rs_end = end;
- rs = avl_find(&rt->rt_root, &rsearch, &where);
-
- /* Make sure we completely overlap with someone */
- if (rs == NULL) {
- zfs_panic_recover("zfs: freeing free segment "
- "(offset=%llu size=%llu)",
- (longlong_t)start, (longlong_t)size);
- return;
- }
-
- /*
- * Range trees with gap support must only remove complete segments
- * from the tree. This allows us to maintain accurate fill accounting
- * and to ensure that bridged sections are not leaked. If we need to
- * remove less than the full segment, we can only adjust the fill count.
- */
- if (rt->rt_gap != 0) {
- if (do_fill) {
- if (rs->rs_fill == size) {
- start = rs->rs_start;
- end = rs->rs_end;
- size = end - start;
- } else {
- range_tree_adjust_fill(rt, rs, -size);
- return;
- }
- } else if (rs->rs_start != start || rs->rs_end != end) {
- zfs_panic_recover("zfs: freeing partial segment of "
- "gap tree (offset=%llu size=%llu) of "
- "(offset=%llu size=%llu)",
- (longlong_t)start, (longlong_t)size,
- (longlong_t)rs->rs_start,
- (longlong_t)rs->rs_end - rs->rs_start);
- return;
- }
- }
-
- VERIFY3U(rs->rs_start, <=, start);
- VERIFY3U(rs->rs_end, >=, end);
-
- left_over = (rs->rs_start != start);
- right_over = (rs->rs_end != end);
-
- range_tree_stat_decr(rt, rs);
-
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
- rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
-
- if (left_over && right_over) {
- newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
- newseg->rs_start = end;
- newseg->rs_end = rs->rs_end;
- newseg->rs_fill = newseg->rs_end - newseg->rs_start;
- range_tree_stat_incr(rt, newseg);
-
- rs->rs_end = start;
-
- avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
- rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
- } else if (left_over) {
- rs->rs_end = start;
- } else if (right_over) {
- rs->rs_start = end;
- } else {
- avl_remove(&rt->rt_root, rs);
- kmem_cache_free(range_seg_cache, rs);
- rs = NULL;
- }
-
- if (rs != NULL) {
- /*
- * The fill of the leftover segment will always be equal to
- * the size, since we do not support removing partial segments
- * of range trees with gaps.
- */
- rs->rs_fill = rs->rs_end - rs->rs_start;
- range_tree_stat_incr(rt, rs);
-
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
- rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
- }
-
- rt->rt_space -= size;
-}
-
-void
-range_tree_remove(void *arg, uint64_t start, uint64_t size)
-{
- range_tree_remove_impl(arg, start, size, B_FALSE);
-}
-
-void
-range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
-{
- range_tree_remove_impl(rt, start, size, B_TRUE);
-}
-
-void
-range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
- uint64_t newstart, uint64_t newsize)
-{
- int64_t delta = newsize - (rs->rs_end - rs->rs_start);
-
- range_tree_stat_decr(rt, rs);
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
- rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
-
- rs->rs_start = newstart;
- rs->rs_end = newstart + newsize;
-
- range_tree_stat_incr(rt, rs);
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
- rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
-
- rt->rt_space += delta;
-}
-
-static range_seg_t *
-range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
-{
- range_seg_t rsearch;
- uint64_t end = start + size;
-
- VERIFY(size != 0);
-
- rsearch.rs_start = start;
- rsearch.rs_end = end;
- return (avl_find(&rt->rt_root, &rsearch, NULL));
-}
-
-range_seg_t *
-range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
-{
- range_seg_t *rs = range_tree_find_impl(rt, start, size);
- if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
- return (rs);
- return (NULL);
-}
-
-void
-range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
-{
- range_seg_t *rs = range_tree_find(rt, off, size);
- if (rs != NULL)
- panic("segment already in tree; rs=%p", (void *)rs);
-}
-
-boolean_t
-range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
-{
- return (range_tree_find(rt, start, size) != NULL);
-}
-
-/*
- * Ensure that this range is not in the tree, regardless of whether
- * it is currently in the tree.
- */
-void
-range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
-{
- range_seg_t *rs;
-
- if (size == 0)
- return;
-
- while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
- uint64_t free_start = MAX(rs->rs_start, start);
- uint64_t free_end = MIN(rs->rs_end, start + size);
- range_tree_remove(rt, free_start, free_end - free_start);
- }
-}
-
-void
-range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
-{
- range_tree_t *rt;
-
- ASSERT0(range_tree_space(*rtdst));
- ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
-
- rt = *rtsrc;
- *rtsrc = *rtdst;
- *rtdst = rt;
-}
-
-void
-range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
-{
- range_seg_t *rs;
- void *cookie = NULL;
-
-
- if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
- rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
-
- while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
- if (func != NULL)
- func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
- kmem_cache_free(range_seg_cache, rs);
- }
-
- bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
- rt->rt_space = 0;
-}
-
-void
-range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
-{
- range_seg_t *rs;
-
- for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
- func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
-}
-
-range_seg_t *
-range_tree_first(range_tree_t *rt)
-{
- return (avl_first(&rt->rt_root));
-}
-
-uint64_t
-range_tree_space(range_tree_t *rt)
-{
- return (rt->rt_space);
-}
-
-/* Generic range tree functions for maintaining segments in an AVL tree. */
-void
-rt_avl_create(range_tree_t *rt, void *arg)
-{
- avl_tree_t *tree = arg;
-
- avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
- offsetof(range_seg_t, rs_pp_node));
-}
-
-void
-rt_avl_destroy(range_tree_t *rt, void *arg)
-{
- avl_tree_t *tree = arg;
-
- ASSERT0(avl_numnodes(tree));
- avl_destroy(tree);
-}
-
-void
-rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
- avl_tree_t *tree = arg;
- avl_add(tree, rs);
-}
-
-void
-rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
- avl_tree_t *tree = arg;
- avl_remove(tree, rs);
-}
-
-void
-rt_avl_vacate(range_tree_t *rt, void *arg)
-{
- /*
- * Normally one would walk the tree freeing nodes along the way.
- * Since the nodes are shared with the range trees we can avoid
- * walking all nodes and just reinitialize the avl tree. The nodes
- * will be freed by the range tree, so we don't want to free them here.
- */
- rt_avl_create(rt, arg);
-}
-
-boolean_t
-range_tree_is_empty(range_tree_t *rt)
-{
- ASSERT(rt != NULL);
- return (range_tree_space(rt) == 0);
-}
-
-uint64_t
-range_tree_min(range_tree_t *rt)
-{
- range_seg_t *rs = avl_first(&rt->rt_root);
- return (rs != NULL ? rs->rs_start : 0);
-}
-
-uint64_t
-range_tree_max(range_tree_t *rt)
-{
- range_seg_t *rs = avl_last(&rt->rt_root);
- return (rs != NULL ? rs->rs_end : 0);
-}
-
-uint64_t
-range_tree_span(range_tree_t *rt)
-{
- return (range_tree_max(rt) - range_tree_min(rt));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -1,321 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-#ifdef ZFS_DEBUG
-
-#ifdef _KERNEL
-int reference_tracking_enable = FALSE; /* runs out of memory too easily */
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
- &reference_tracking_enable, 0,
- "Track reference holders to refcount_t objects, used mostly by ZFS");
-#else
-int reference_tracking_enable = TRUE;
-#endif
-int reference_history = 3; /* tunable */
-
-static kmem_cache_t *reference_cache;
-static kmem_cache_t *reference_history_cache;
-
-void
-zfs_refcount_init(void)
-{
- reference_cache = kmem_cache_create("reference_cache",
- sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
- reference_history_cache = kmem_cache_create("reference_history_cache",
- sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-zfs_refcount_fini(void)
-{
- kmem_cache_destroy(reference_cache);
- kmem_cache_destroy(reference_history_cache);
-}
-
-void
-zfs_refcount_create(zfs_refcount_t *rc)
-{
- mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
- list_create(&rc->rc_list, sizeof (reference_t),
- offsetof(reference_t, ref_link));
- list_create(&rc->rc_removed, sizeof (reference_t),
- offsetof(reference_t, ref_link));
- rc->rc_count = 0;
- rc->rc_removed_count = 0;
- rc->rc_tracked = reference_tracking_enable;
-}
-
-void
-zfs_refcount_create_tracked(zfs_refcount_t *rc)
-{
- zfs_refcount_create(rc);
- rc->rc_tracked = B_TRUE;
-}
-
-void
-zfs_refcount_create_untracked(zfs_refcount_t *rc)
-{
- zfs_refcount_create(rc);
- rc->rc_tracked = B_FALSE;
-}
-
-void
-zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
-{
- reference_t *ref;
-
- ASSERT(rc->rc_count == number);
- while (ref = list_head(&rc->rc_list)) {
- list_remove(&rc->rc_list, ref);
- kmem_cache_free(reference_cache, ref);
- }
- list_destroy(&rc->rc_list);
-
- while (ref = list_head(&rc->rc_removed)) {
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache, ref->ref_removed);
- kmem_cache_free(reference_cache, ref);
- }
- list_destroy(&rc->rc_removed);
- mutex_destroy(&rc->rc_mtx);
-}
-
-void
-zfs_refcount_destroy(zfs_refcount_t *rc)
-{
- zfs_refcount_destroy_many(rc, 0);
-}
-
-int
-zfs_refcount_is_zero(zfs_refcount_t *rc)
-{
- return (rc->rc_count == 0);
-}
-
-int64_t
-zfs_refcount_count(zfs_refcount_t *rc)
-{
- return (rc->rc_count);
-}
-
-int64_t
-zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, void *holder)
-{
- reference_t *ref = NULL;
- int64_t count;
-
- if (rc->rc_tracked) {
- ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
- ref->ref_holder = holder;
- ref->ref_number = number;
- }
- mutex_enter(&rc->rc_mtx);
- ASSERT(rc->rc_count >= 0);
- if (rc->rc_tracked)
- list_insert_head(&rc->rc_list, ref);
- rc->rc_count += number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
-
- return (count);
-}
-
-int64_t
-zfs_refcount_add(zfs_refcount_t *rc, void *holder)
-{
- return (zfs_refcount_add_many(rc, 1, holder));
-}
-
-int64_t
-zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, void *holder)
-{
- reference_t *ref;
- int64_t count;
-
- mutex_enter(&rc->rc_mtx);
- ASSERT(rc->rc_count >= number);
-
- if (!rc->rc_tracked) {
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
- }
-
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder && ref->ref_number == number) {
- list_remove(&rc->rc_list, ref);
- if (reference_history > 0) {
- ref->ref_removed =
- kmem_cache_alloc(reference_history_cache,
- KM_SLEEP);
- list_insert_head(&rc->rc_removed, ref);
- rc->rc_removed_count++;
- if (rc->rc_removed_count > reference_history) {
- ref = list_tail(&rc->rc_removed);
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache,
- ref->ref_removed);
- kmem_cache_free(reference_cache, ref);
- rc->rc_removed_count--;
- }
- } else {
- kmem_cache_free(reference_cache, ref);
- }
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
- }
- }
- panic("No such hold %p on refcount %llx", holder,
- (u_longlong_t)(uintptr_t)rc);
- return (-1);
-}
-
-int64_t
-zfs_refcount_remove(zfs_refcount_t *rc, void *holder)
-{
- return (zfs_refcount_remove_many(rc, 1, holder));
-}
-
-void
-zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
-{
- int64_t count, removed_count;
- list_t list, removed;
-
- list_create(&list, sizeof (reference_t),
- offsetof(reference_t, ref_link));
- list_create(&removed, sizeof (reference_t),
- offsetof(reference_t, ref_link));
-
- mutex_enter(&src->rc_mtx);
- count = src->rc_count;
- removed_count = src->rc_removed_count;
- src->rc_count = 0;
- src->rc_removed_count = 0;
- list_move_tail(&list, &src->rc_list);
- list_move_tail(&removed, &src->rc_removed);
- mutex_exit(&src->rc_mtx);
-
- mutex_enter(&dst->rc_mtx);
- dst->rc_count += count;
- dst->rc_removed_count += removed_count;
- list_move_tail(&dst->rc_list, &list);
- list_move_tail(&dst->rc_removed, &removed);
- mutex_exit(&dst->rc_mtx);
-
- list_destroy(&list);
- list_destroy(&removed);
-}
-
-void
-zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder,
- void *new_holder)
-{
- reference_t *ref;
- boolean_t found = B_FALSE;
-
- mutex_enter(&rc->rc_mtx);
- if (!rc->rc_tracked) {
- mutex_exit(&rc->rc_mtx);
- return;
- }
-
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == current_holder) {
- ref->ref_holder = new_holder;
- found = B_TRUE;
- break;
- }
- }
- ASSERT(found);
- mutex_exit(&rc->rc_mtx);
-}
-
-/*
- * If tracking is enabled, return true if a reference exists that matches
- * the "holder" tag. If tracking is disabled, then return true if a reference
- * might be held.
- */
-boolean_t
-zfs_refcount_held(zfs_refcount_t *rc, void *holder)
-{
- reference_t *ref;
-
- mutex_enter(&rc->rc_mtx);
-
- if (!rc->rc_tracked) {
- mutex_exit(&rc->rc_mtx);
- return (rc->rc_count > 0);
- }
-
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder) {
- mutex_exit(&rc->rc_mtx);
- return (B_TRUE);
- }
- }
- mutex_exit(&rc->rc_mtx);
- return (B_FALSE);
-}
-
-/*
- * If tracking is enabled, return true if a reference does not exist that
- * matches the "holder" tag. If tracking is disabled, always return true
- * since the reference might not be held.
- */
-boolean_t
-zfs_refcount_not_held(zfs_refcount_t *rc, void *holder)
-{
- reference_t *ref;
-
- mutex_enter(&rc->rc_mtx);
-
- if (!rc->rc_tracked) {
- mutex_exit(&rc->rc_mtx);
- return (B_TRUE);
- }
-
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder) {
- mutex_exit(&rc->rc_mtx);
- return (B_FALSE);
- }
- }
- mutex_exit(&rc->rc_mtx);
- return (B_TRUE);
-}
-#endif /* ZFS_DEBUG */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
@@ -1,396 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#include <sys/refcount.h>
-#include <sys/rrwlock.h>
-
-/*
- * This file contains the implementation of a re-entrant read
- * reader/writer lock (aka "rrwlock").
- *
- * This is a normal reader/writer lock with the additional feature
- * of allowing threads who have already obtained a read lock to
- * re-enter another read lock (re-entrant read) - even if there are
- * waiting writers.
- *
- * Callers who have not obtained a read lock give waiting writers priority.
- *
- * The rrwlock_t lock does not allow re-entrant writers, nor does it
- * allow a re-entrant mix of reads and writes (that is, it does not
- * allow a caller who has already obtained a read lock to be able to
- * then grab a write lock without first dropping all read locks, and
- * vice versa).
- *
- * The rrwlock_t uses tsd (thread specific data) to keep a list of
- * nodes (rrw_node_t), where each node keeps track of which specific
- * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
- * should be rare, a thread that grabs multiple reads on the same rrwlock_t
- * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the
- * tsd list can represent a different rrwlock_t. This allows a thread
- * to enter multiple and unique rrwlock_ts for read locks at the same time.
- *
- * Since using tsd exposes some overhead, the rrwlock_t only needs to
- * keep tsd data when writers are waiting. If no writers are waiting, then
- * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
- * is needed. Once a writer attempts to grab the lock, readers then
- * keep tsd data and bump the linked readers count (rr_linked_rcount).
- *
- * If there are waiting writers and there are anonymous readers, then a
- * reader doesn't know if it is a re-entrant lock. But since it may be one,
- * we allow the read to proceed (otherwise it could deadlock). Since once
- * waiting writers are active, readers no longer bump the anonymous count,
- * the anonymous readers will eventually flush themselves out. At this point,
- * readers will be able to tell if they are a re-entrant lock (have a
- * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
- * we must let the proceed. If they are not, then the reader blocks for the
- * waiting writers. Hence, we do not starve writers.
- */
-
-/* global key for TSD */
-uint_t rrw_tsd_key;
-
-typedef struct rrw_node {
- struct rrw_node *rn_next;
- rrwlock_t *rn_rrl;
- void *rn_tag;
-} rrw_node_t;
-
-static rrw_node_t *
-rrn_find(rrwlock_t *rrl)
-{
- rrw_node_t *rn;
-
- if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
- return (NULL);
-
- for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
- if (rn->rn_rrl == rrl)
- return (rn);
- }
- return (NULL);
-}
-
-/*
- * Add a node to the head of the singly linked list.
- */
-static void
-rrn_add(rrwlock_t *rrl, void *tag)
-{
- rrw_node_t *rn;
-
- rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
- rn->rn_rrl = rrl;
- rn->rn_next = tsd_get(rrw_tsd_key);
- rn->rn_tag = tag;
- VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
-}
-
-/*
- * If a node is found for 'rrl', then remove the node from this
- * thread's list and return TRUE; otherwise return FALSE.
- */
-static boolean_t
-rrn_find_and_remove(rrwlock_t *rrl, void *tag)
-{
- rrw_node_t *rn;
- rrw_node_t *prev = NULL;
-
- if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
- return (B_FALSE);
-
- for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
- if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
- if (prev)
- prev->rn_next = rn->rn_next;
- else
- VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
- kmem_free(rn, sizeof (*rn));
- return (B_TRUE);
- }
- prev = rn;
- }
- return (B_FALSE);
-}
-
-void
-rrw_init(rrwlock_t *rrl, boolean_t track_all)
-{
- mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
- rrl->rr_writer = NULL;
- zfs_refcount_create(&rrl->rr_anon_rcount);
- zfs_refcount_create(&rrl->rr_linked_rcount);
- rrl->rr_writer_wanted = B_FALSE;
- rrl->rr_track_all = track_all;
-}
-
-void
-rrw_destroy(rrwlock_t *rrl)
-{
- mutex_destroy(&rrl->rr_lock);
- cv_destroy(&rrl->rr_cv);
- ASSERT(rrl->rr_writer == NULL);
- zfs_refcount_destroy(&rrl->rr_anon_rcount);
- zfs_refcount_destroy(&rrl->rr_linked_rcount);
-}
-
-static void
-rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
-{
- mutex_enter(&rrl->rr_lock);
-#if !defined(DEBUG) && defined(_KERNEL)
- if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
- !rrl->rr_track_all) {
- rrl->rr_anon_rcount.rc_count++;
- mutex_exit(&rrl->rr_lock);
- return;
- }
- DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
-#endif
- ASSERT(rrl->rr_writer != curthread);
- ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0);
-
- while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
- zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
- rrn_find(rrl) == NULL))
- cv_wait(&rrl->rr_cv, &rrl->rr_lock);
-
- if (rrl->rr_writer_wanted || rrl->rr_track_all) {
- /* may or may not be a re-entrant enter */
- rrn_add(rrl, tag);
- (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag);
- } else {
- (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag);
- }
- ASSERT(rrl->rr_writer == NULL);
- mutex_exit(&rrl->rr_lock);
-}
-
-void
-rrw_enter_read(rrwlock_t *rrl, void *tag)
-{
- rrw_enter_read_impl(rrl, B_FALSE, tag);
-}
-
-/*
- * take a read lock even if there are pending write lock requests. if we want
- * to take a lock reentrantly, but from different threads (that have a
- * relationship to each other), the normal detection mechanism to overrule
- * the pending writer does not work, so we have to give an explicit hint here.
- */
-void
-rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
-{
- rrw_enter_read_impl(rrl, B_TRUE, tag);
-}
-
-
-void
-rrw_enter_write(rrwlock_t *rrl)
-{
- mutex_enter(&rrl->rr_lock);
- ASSERT(rrl->rr_writer != curthread);
-
- while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 ||
- zfs_refcount_count(&rrl->rr_linked_rcount) > 0 ||
- rrl->rr_writer != NULL) {
- rrl->rr_writer_wanted = B_TRUE;
- cv_wait(&rrl->rr_cv, &rrl->rr_lock);
- }
- rrl->rr_writer_wanted = B_FALSE;
- rrl->rr_writer = curthread;
- mutex_exit(&rrl->rr_lock);
-}
-
-void
-rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
-{
- if (rw == RW_READER)
- rrw_enter_read(rrl, tag);
- else
- rrw_enter_write(rrl);
-}
-
-void
-rrw_exit(rrwlock_t *rrl, void *tag)
-{
- mutex_enter(&rrl->rr_lock);
-#if !defined(DEBUG) && defined(_KERNEL)
- if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
- rrl->rr_anon_rcount.rc_count--;
- if (rrl->rr_anon_rcount.rc_count == 0)
- cv_broadcast(&rrl->rr_cv);
- mutex_exit(&rrl->rr_lock);
- return;
- }
- DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
-#endif
- ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
- !zfs_refcount_is_zero(&rrl->rr_linked_rcount) ||
- rrl->rr_writer != NULL);
-
- if (rrl->rr_writer == NULL) {
- int64_t count;
- if (rrn_find_and_remove(rrl, tag)) {
- count = zfs_refcount_remove(
- &rrl->rr_linked_rcount, tag);
- } else {
- ASSERT(!rrl->rr_track_all);
- count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag);
- }
- if (count == 0)
- cv_broadcast(&rrl->rr_cv);
- } else {
- ASSERT(rrl->rr_writer == curthread);
- ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) &&
- zfs_refcount_is_zero(&rrl->rr_linked_rcount));
- rrl->rr_writer = NULL;
- cv_broadcast(&rrl->rr_cv);
- }
- mutex_exit(&rrl->rr_lock);
-}
-
-/*
- * If the lock was created with track_all, rrw_held(RW_READER) will return
- * B_TRUE iff the current thread has the lock for reader. Otherwise it may
- * return B_TRUE if any thread has the lock for reader.
- */
-boolean_t
-rrw_held(rrwlock_t *rrl, krw_t rw)
-{
- boolean_t held;
-
- mutex_enter(&rrl->rr_lock);
- if (rw == RW_WRITER) {
- held = (rrl->rr_writer == curthread);
- } else {
- held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
- rrn_find(rrl) != NULL);
- }
- mutex_exit(&rrl->rr_lock);
-
- return (held);
-}
-
-void
-rrw_tsd_destroy(void *arg)
-{
- rrw_node_t *rn = arg;
- if (rn != NULL) {
- panic("thread %p terminating with rrw lock %p held",
- (void *)curthread, (void *)rn->rn_rrl);
- }
-}
-
-/*
- * A reader-mostly lock implementation, tuning above reader-writer locks
- * for hightly parallel read acquisitions, while pessimizing writes.
- *
- * The idea is to split single busy lock into array of locks, so that
- * each reader can lock only one of them for read, depending on result
- * of simple hash function. That proportionally reduces lock congestion.
- * Writer same time has to sequentially aquire write on all the locks.
- * That makes write aquisition proportionally slower, but in places where
- * it is used (filesystem unmount) performance is not critical.
- *
- * All the functions below are direct wrappers around functions above.
- */
-void
-rrm_init(rrmlock_t *rrl, boolean_t track_all)
-{
- int i;
-
- for (i = 0; i < RRM_NUM_LOCKS; i++)
- rrw_init(&rrl->locks[i], track_all);
-}
-
-void
-rrm_destroy(rrmlock_t *rrl)
-{
- int i;
-
- for (i = 0; i < RRM_NUM_LOCKS; i++)
- rrw_destroy(&rrl->locks[i]);
-}
-
-void
-rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
-{
- if (rw == RW_READER)
- rrm_enter_read(rrl, tag);
- else
- rrm_enter_write(rrl);
-}
-
-/*
- * This maps the current thread to a specific lock. Note that the lock
- * must be released by the same thread that acquired it. We do this
- * mapping by taking the thread pointer mod a prime number. We examine
- * only the low 32 bits of the thread pointer, because 32-bit division
- * is faster than 64-bit division, and the high 32 bits have little
- * entropy anyway.
- */
-#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
-
-void
-rrm_enter_read(rrmlock_t *rrl, void *tag)
-{
- rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
-}
-
-void
-rrm_enter_write(rrmlock_t *rrl)
-{
- int i;
-
- for (i = 0; i < RRM_NUM_LOCKS; i++)
- rrw_enter_write(&rrl->locks[i]);
-}
-
-void
-rrm_exit(rrmlock_t *rrl, void *tag)
-{
- int i;
-
- if (rrl->locks[0].rr_writer == curthread) {
- for (i = 0; i < RRM_NUM_LOCKS; i++)
- rrw_exit(&rrl->locks[i], tag);
- } else {
- rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
- }
-}
-
-boolean_t
-rrm_held(rrmlock_t *rrl, krw_t rw)
-{
- if (rw == RW_WRITER) {
- return (rrw_held(&rrl->locks[0], rw));
- } else {
- return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
@@ -1,2012 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Portions Copyright 2011 iXsystems, Inc
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/zfs_context.h>
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zap.h>
-#include <sys/sa.h>
-#include <sys/sunddi.h>
-#include <sys/sa_impl.h>
-#include <sys/dnode.h>
-#include <sys/errno.h>
-#include <sys/zfs_context.h>
-
-/*
- * ZFS System attributes:
- *
- * A generic mechanism to allow for arbitrary attributes
- * to be stored in a dnode. The data will be stored in the bonus buffer of
- * the dnode and if necessary a special "spill" block will be used to handle
- * overflow situations. The spill block will be sized to fit the data
- * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the
- * spill block is stored at the end of the current bonus buffer. Any
- * attributes that would be in the way of the blkptr_t will be relocated
- * into the spill block.
- *
- * Attribute registration:
- *
- * Stored persistently on a per dataset basis
- * a mapping between attribute "string" names and their actual attribute
- * numeric values, length, and byteswap function. The names are only used
- * during registration. All attributes are known by their unique attribute
- * id value. If an attribute can have a variable size then the value
- * 0 will be used to indicate this.
- *
- * Attribute Layout:
- *
- * Attribute layouts are a way to compactly store multiple attributes, but
- * without taking the overhead associated with managing each attribute
- * individually. Since you will typically have the same set of attributes
- * stored in the same order a single table will be used to represent that
- * layout. The ZPL for example will usually have only about 10 different
- * layouts (regular files, device files, symlinks,
- * regular files + scanstamp, files/dir with extended attributes, and then
- * you have the possibility of all of those minus ACL, because it would
- * be kicked out into the spill block)
- *
- * Layouts are simply an array of the attributes and their
- * ordering i.e. [0, 1, 4, 5, 2]
- *
- * Each distinct layout is given a unique layout number and that is whats
- * stored in the header at the beginning of the SA data buffer.
- *
- * A layout only covers a single dbuf (bonus or spill). If a set of
- * attributes is split up between the bonus buffer and a spill buffer then
- * two different layouts will be used. This allows us to byteswap the
- * spill without looking at the bonus buffer and keeps the on disk format of
- * the bonus and spill buffer the same.
- *
- * Adding a single attribute will cause the entire set of attributes to
- * be rewritten and could result in a new layout number being constructed
- * as part of the rewrite if no such layout exists for the new set of
- * attribues. The new attribute will be appended to the end of the already
- * existing attributes.
- *
- * Both the attribute registration and attribute layout information are
- * stored in normal ZAP attributes. Their should be a small number of
- * known layouts and the set of attributes is assumed to typically be quite
- * small.
- *
- * The registered attributes and layout "table" information is maintained
- * in core and a special "sa_os_t" is attached to the objset_t.
- *
- * A special interface is provided to allow for quickly applying
- * a large set of attributes at once. sa_replace_all_by_template() is
- * used to set an array of attributes. This is used by the ZPL when
- * creating a brand new file. The template that is passed into the function
- * specifies the attribute, size for variable length attributes, location of
- * data and special "data locator" function if the data isn't in a contiguous
- * location.
- *
- * Byteswap implications:
- *
- * Since the SA attributes are not entirely self describing we can't do
- * the normal byteswap processing. The special ZAP layout attribute and
- * attribute registration attributes define the byteswap function and the
- * size of the attributes, unless it is variable sized.
- * The normal ZFS byteswapping infrastructure assumes you don't need
- * to read any objects in order to do the necessary byteswapping. Whereas
- * SA attributes can only be properly byteswapped if the dataset is opened
- * and the layout/attribute ZAP attributes are available. Because of this
- * the SA attributes will be byteswapped when they are first accessed by
- * the SA code that will read the SA data.
- */
-
-typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
- uint16_t length, int length_idx, boolean_t, void *userp);
-
-static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
-static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
-static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
- sa_hdr_phys_t *hdr);
-static void sa_idx_tab_rele(objset_t *os, void *arg);
-static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
- int buflen);
-static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
- sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
- uint16_t buflen, dmu_tx_t *tx);
-
-arc_byteswap_func_t *sa_bswap_table[] = {
- byteswap_uint64_array,
- byteswap_uint32_array,
- byteswap_uint16_array,
- byteswap_uint8_array,
- zfs_acl_byteswap,
-};
-
-#define SA_COPY_DATA(f, s, t, l) \
- { \
- if (f == NULL) { \
- if (l == 8) { \
- *(uint64_t *)t = *(uint64_t *)s; \
- } else if (l == 16) { \
- *(uint64_t *)t = *(uint64_t *)s; \
- *(uint64_t *)((uintptr_t)t + 8) = \
- *(uint64_t *)((uintptr_t)s + 8); \
- } else { \
- bcopy(s, t, l); \
- } \
- } else \
- sa_copy_data(f, s, t, l); \
- }
-
-/*
- * This table is fixed and cannot be changed. Its purpose is to
- * allow the SA code to work with both old/new ZPL file systems.
- * It contains the list of legacy attributes. These attributes aren't
- * stored in the "attribute" registry zap objects, since older ZPL file systems
- * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
- * use this static table.
- */
-sa_attr_reg_t sa_legacy_attrs[] = {
- {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
- {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
- {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
- {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
- {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
- {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
- {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
- {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
- {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
- {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
- {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
- {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
- {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
- {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
- {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
- {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
-};
-
-/*
- * This is only used for objects of type DMU_OT_ZNODE
- */
-sa_attr_type_t sa_legacy_zpl_layout[] = {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-};
-
-/*
- * Special dummy layout used for buffers with no attributes.
- */
-sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
-
-static int sa_legacy_attr_count = 16;
-static kmem_cache_t *sa_cache = NULL;
-
-/*ARGSUSED*/
-static int
-sa_cache_constructor(void *buf, void *unused, int kmflag)
-{
- sa_handle_t *hdl = buf;
-
- mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-sa_cache_destructor(void *buf, void *unused)
-{
- sa_handle_t *hdl = buf;
- mutex_destroy(&hdl->sa_lock);
-}
-
-void
-sa_cache_init(void)
-{
- sa_cache = kmem_cache_create("sa_cache",
- sizeof (sa_handle_t), 0, sa_cache_constructor,
- sa_cache_destructor, NULL, NULL, NULL, 0);
-}
-
-void
-sa_cache_fini(void)
-{
- if (sa_cache)
- kmem_cache_destroy(sa_cache);
-}
-
-static int
-layout_num_compare(const void *arg1, const void *arg2)
-{
- const sa_lot_t *node1 = (const sa_lot_t *)arg1;
- const sa_lot_t *node2 = (const sa_lot_t *)arg2;
-
- return (AVL_CMP(node1->lot_num, node2->lot_num));
-}
-
-static int
-layout_hash_compare(const void *arg1, const void *arg2)
-{
- const sa_lot_t *node1 = (const sa_lot_t *)arg1;
- const sa_lot_t *node2 = (const sa_lot_t *)arg2;
-
- int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash);
- if (likely(cmp))
- return (cmp);
-
- return (AVL_CMP(node1->lot_instance, node2->lot_instance));
-}
-
-boolean_t
-sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
-{
- int i;
-
- if (count != tbf->lot_attr_count)
- return (1);
-
- for (i = 0; i != count; i++) {
- if (attrs[i] != tbf->lot_attrs[i])
- return (1);
- }
- return (0);
-}
-
-#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
-
-static uint64_t
-sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
-{
- int i;
- uint64_t crc = -1ULL;
-
- for (i = 0; i != attr_count; i++)
- crc ^= SA_ATTR_HASH(attrs[i]);
-
- return (crc);
-}
-
-static int
-sa_get_spill(sa_handle_t *hdl)
-{
- int rc;
- if (hdl->sa_spill == NULL) {
- if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
- &hdl->sa_spill)) == 0)
- VERIFY(0 == sa_build_index(hdl, SA_SPILL));
- } else {
- rc = 0;
- }
-
- return (rc);
-}
-
-/*
- * Main attribute lookup/update function
- * returns 0 for success or non zero for failures
- *
- * Operates on bulk array, first failure will abort further processing
- */
-int
-sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
- sa_data_op_t data_op, dmu_tx_t *tx)
-{
- sa_os_t *sa = hdl->sa_os->os_sa;
- int i;
- int error = 0;
- sa_buf_type_t buftypes;
-
- buftypes = 0;
-
- ASSERT(count > 0);
- for (i = 0; i != count; i++) {
- ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
-
- bulk[i].sa_addr = NULL;
- /* First check the bonus buffer */
-
- if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
- hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
- SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
- SA_GET_HDR(hdl, SA_BONUS),
- bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
- if (tx && !(buftypes & SA_BONUS)) {
- dmu_buf_will_dirty(hdl->sa_bonus, tx);
- buftypes |= SA_BONUS;
- }
- }
- if (bulk[i].sa_addr == NULL &&
- ((error = sa_get_spill(hdl)) == 0)) {
- if (TOC_ATTR_PRESENT(
- hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
- SA_ATTR_INFO(sa, hdl->sa_spill_tab,
- SA_GET_HDR(hdl, SA_SPILL),
- bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
- if (tx && !(buftypes & SA_SPILL) &&
- bulk[i].sa_size == bulk[i].sa_length) {
- dmu_buf_will_dirty(hdl->sa_spill, tx);
- buftypes |= SA_SPILL;
- }
- }
- }
- if (error && error != ENOENT) {
- return ((error == ECKSUM) ? EIO : error);
- }
-
- switch (data_op) {
- case SA_LOOKUP:
- if (bulk[i].sa_addr == NULL)
- return (SET_ERROR(ENOENT));
- if (bulk[i].sa_data) {
- SA_COPY_DATA(bulk[i].sa_data_func,
- bulk[i].sa_addr, bulk[i].sa_data,
- bulk[i].sa_size);
- }
- continue;
-
- case SA_UPDATE:
- /* existing rewrite of attr */
- if (bulk[i].sa_addr &&
- bulk[i].sa_size == bulk[i].sa_length) {
- SA_COPY_DATA(bulk[i].sa_data_func,
- bulk[i].sa_data, bulk[i].sa_addr,
- bulk[i].sa_length);
- continue;
- } else if (bulk[i].sa_addr) { /* attr size change */
- error = sa_modify_attrs(hdl, bulk[i].sa_attr,
- SA_REPLACE, bulk[i].sa_data_func,
- bulk[i].sa_data, bulk[i].sa_length, tx);
- } else { /* adding new attribute */
- error = sa_modify_attrs(hdl, bulk[i].sa_attr,
- SA_ADD, bulk[i].sa_data_func,
- bulk[i].sa_data, bulk[i].sa_length, tx);
- }
- if (error)
- return (error);
- break;
- }
- }
- return (error);
-}
-
-static sa_lot_t *
-sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
- uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
-{
- sa_os_t *sa = os->os_sa;
- sa_lot_t *tb, *findtb;
- int i;
- avl_index_t loc;
-
- ASSERT(MUTEX_HELD(&sa->sa_lock));
- tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
- tb->lot_attr_count = attr_count;
- tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
- KM_SLEEP);
- bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
- tb->lot_num = lot_num;
- tb->lot_hash = hash;
- tb->lot_instance = 0;
-
- if (zapadd) {
- char attr_name[8];
-
- if (sa->sa_layout_attr_obj == 0) {
- sa->sa_layout_attr_obj = zap_create_link(os,
- DMU_OT_SA_ATTR_LAYOUTS,
- sa->sa_master_obj, SA_LAYOUTS, tx);
- }
-
- (void) snprintf(attr_name, sizeof (attr_name),
- "%d", (int)lot_num);
- VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
- attr_name, 2, attr_count, attrs, tx));
- }
-
- list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
- offsetof(sa_idx_tab_t, sa_next));
-
- for (i = 0; i != attr_count; i++) {
- if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
- tb->lot_var_sizes++;
- }
-
- avl_add(&sa->sa_layout_num_tree, tb);
-
- /* verify we don't have a hash collision */
- if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
- for (; findtb && findtb->lot_hash == hash;
- findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
- if (findtb->lot_instance != tb->lot_instance)
- break;
- tb->lot_instance++;
- }
- }
- avl_add(&sa->sa_layout_hash_tree, tb);
- return (tb);
-}
-
-static void
-sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
- int count, dmu_tx_t *tx, sa_lot_t **lot)
-{
- sa_lot_t *tb, tbsearch;
- avl_index_t loc;
- sa_os_t *sa = os->os_sa;
- boolean_t found = B_FALSE;
-
- mutex_enter(&sa->sa_lock);
- tbsearch.lot_hash = hash;
- tbsearch.lot_instance = 0;
- tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
- if (tb) {
- for (; tb && tb->lot_hash == hash;
- tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
- if (sa_layout_equal(tb, attrs, count) == 0) {
- found = B_TRUE;
- break;
- }
- }
- }
- if (!found) {
- tb = sa_add_layout_entry(os, attrs, count,
- avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
- }
- mutex_exit(&sa->sa_lock);
- *lot = tb;
-}
-
-static int
-sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
-{
- int error;
- uint32_t blocksize;
-
- if (size == 0) {
- blocksize = SPA_MINBLOCKSIZE;
- } else if (size > SPA_OLD_MAXBLOCKSIZE) {
- ASSERT(0);
- return (SET_ERROR(EFBIG));
- } else {
- blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
- }
-
- error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
- ASSERT(error == 0);
- return (error);
-}
-
-static void
-sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
-{
- if (func == NULL) {
- bcopy(datastart, target, buflen);
- } else {
- boolean_t start;
- int bytes;
- void *dataptr;
- void *saptr = target;
- uint32_t length;
-
- start = B_TRUE;
- bytes = 0;
- while (bytes < buflen) {
- func(&dataptr, &length, buflen, start, datastart);
- bcopy(dataptr, saptr, length);
- saptr = (void *)((caddr_t)saptr + length);
- bytes += length;
- start = B_FALSE;
- }
- }
-}
-
-/*
- * Determine several different sizes
- * first the sa header size
- * the number of bytes to be stored
- * if spill would occur the index in the attribute array is returned
- *
- * the boolean will_spill will be set when spilling is necessary. It
- * is only set when the buftype is SA_BONUS
- */
-static int
-sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
- dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
- int *total, boolean_t *will_spill)
-{
- int var_size = 0;
- int i;
- int hdrsize;
- int extra_hdrsize;
-
- if (buftype == SA_BONUS && sa->sa_force_spill) {
- *total = 0;
- *index = 0;
- *will_spill = B_TRUE;
- return (0);
- }
-
- *index = -1;
- *total = 0;
- *will_spill = B_FALSE;
-
- extra_hdrsize = 0;
- hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
- sizeof (sa_hdr_phys_t);
-
- ASSERT(IS_P2ALIGNED(full_space, 8));
-
- for (i = 0; i != attr_count; i++) {
- boolean_t is_var_sz;
-
- *total = P2ROUNDUP(*total, 8);
- *total += attr_desc[i].sa_length;
- if (*will_spill)
- continue;
-
- is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
- if (is_var_sz) {
- var_size++;
- }
-
- if (is_var_sz && var_size > 1) {
- /*
- * Don't worry that the spill block might overflow.
- * It will be resized if needed in sa_build_layouts().
- */
- if (buftype == SA_SPILL ||
- P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
- *total < full_space) {
- /*
- * Account for header space used by array of
- * optional sizes of variable-length attributes.
- * Record the extra header size in case this
- * increase needs to be reversed due to
- * spill-over.
- */
- hdrsize += sizeof (uint16_t);
- if (*index != -1)
- extra_hdrsize += sizeof (uint16_t);
- } else {
- ASSERT(buftype == SA_BONUS);
- if (*index == -1)
- *index = i;
- *will_spill = B_TRUE;
- continue;
- }
- }
-
- /*
- * find index of where spill *could* occur.
- * Then continue to count of remainder attribute
- * space. The sum is used later for sizing bonus
- * and spill buffer.
- */
- if (buftype == SA_BONUS && *index == -1 &&
- (*total + P2ROUNDUP(hdrsize, 8)) >
- (full_space - sizeof (blkptr_t))) {
- *index = i;
- }
-
- if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
- buftype == SA_BONUS)
- *will_spill = B_TRUE;
- }
-
- if (*will_spill)
- hdrsize -= extra_hdrsize;
-
- hdrsize = P2ROUNDUP(hdrsize, 8);
- return (hdrsize);
-}
-
-#define BUF_SPACE_NEEDED(total, header) (total + header)
-
-/*
- * Find layout that corresponds to ordering of attributes
- * If not found a new layout number is created and added to
- * persistent layout tables.
- */
-static int
-sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
- dmu_tx_t *tx)
-{
- sa_os_t *sa = hdl->sa_os->os_sa;
- uint64_t hash;
- sa_buf_type_t buftype;
- sa_hdr_phys_t *sahdr;
- void *data_start;
- int buf_space;
- sa_attr_type_t *attrs, *attrs_start;
- int i, lot_count;
- int dnodesize;
- int hdrsize;
- int spillhdrsize = 0;
- int used;
- dmu_object_type_t bonustype;
- sa_lot_t *lot;
- int len_idx;
- int spill_used;
- int bonuslen;
- boolean_t spilling;
-
- dmu_buf_will_dirty(hdl->sa_bonus, tx);
- bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
- dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
- bonuslen = DN_BONUS_SIZE(dnodesize);
-
- dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
- bonuslen = DN_BONUS_SIZE(dnodesize);
-
- /* first determine bonus header size and sum of all attributes */
- hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
- SA_BONUS, bonuslen, &i, &used, &spilling);
-
- if (used > SPA_OLD_MAXBLOCKSIZE)
- return (SET_ERROR(EFBIG));
-
- VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
- MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
- used + hdrsize, tx));
-
- ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
- bonustype == DMU_OT_SA);
-
- /* setup and size spill buffer when needed */
- if (spilling) {
- boolean_t dummy;
-
- if (hdl->sa_spill == NULL) {
- VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
- &hdl->sa_spill) == 0);
- }
- dmu_buf_will_dirty(hdl->sa_spill, tx);
-
- spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
- attr_count - i, hdl->sa_spill, SA_SPILL,
- hdl->sa_spill->db_size, &i, &spill_used, &dummy);
-
- if (spill_used > SPA_OLD_MAXBLOCKSIZE)
- return (SET_ERROR(EFBIG));
-
- buf_space = hdl->sa_spill->db_size - spillhdrsize;
- if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
- hdl->sa_spill->db_size)
- VERIFY(0 == sa_resize_spill(hdl,
- BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
- }
-
- /* setup starting pointers to lay down data */
- data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
- sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
- buftype = SA_BONUS;
-
- if (spilling)
- buf_space = (sa->sa_force_spill) ?
- 0 : SA_BLKPTR_SPACE - hdrsize;
- else
- buf_space = hdl->sa_bonus->db_size - hdrsize;
-
- attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
- KM_SLEEP);
- lot_count = 0;
-
- for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
- uint16_t length;
-
- ASSERT(IS_P2ALIGNED(data_start, 8));
- ASSERT(IS_P2ALIGNED(buf_space, 8));
- attrs[i] = attr_desc[i].sa_attr;
- length = SA_REGISTERED_LEN(sa, attrs[i]);
- if (length == 0)
- length = attr_desc[i].sa_length;
- else
- VERIFY(length == attr_desc[i].sa_length);
-
- if (buf_space < length) { /* switch to spill buffer */
- VERIFY(spilling);
- VERIFY(bonustype == DMU_OT_SA);
- if (buftype == SA_BONUS && !sa->sa_force_spill) {
- sa_find_layout(hdl->sa_os, hash, attrs_start,
- lot_count, tx, &lot);
- SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
- }
-
- buftype = SA_SPILL;
- hash = -1ULL;
- len_idx = 0;
-
- sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
- sahdr->sa_magic = SA_MAGIC;
- data_start = (void *)((uintptr_t)sahdr +
- spillhdrsize);
- attrs_start = &attrs[i];
- buf_space = hdl->sa_spill->db_size - spillhdrsize;
- lot_count = 0;
- }
- hash ^= SA_ATTR_HASH(attrs[i]);
- attr_desc[i].sa_addr = data_start;
- attr_desc[i].sa_size = length;
- SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
- data_start, length);
- if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
- sahdr->sa_lengths[len_idx++] = length;
- }
- VERIFY((uintptr_t)data_start % 8 == 0);
- data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
- length), 8);
- buf_space -= P2ROUNDUP(length, 8);
- lot_count++;
- }
-
- sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
-
- /*
- * Verify that old znodes always have layout number 0.
- * Must be DMU_OT_SA for arbitrary layouts
- */
- VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
- (bonustype == DMU_OT_SA && lot->lot_num > 1));
-
- if (bonustype == DMU_OT_SA) {
- SA_SET_HDR(sahdr, lot->lot_num,
- buftype == SA_BONUS ? hdrsize : spillhdrsize);
- }
-
- kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
- if (hdl->sa_bonus_tab) {
- sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
- hdl->sa_bonus_tab = NULL;
- }
- if (!sa->sa_force_spill)
- VERIFY(0 == sa_build_index(hdl, SA_BONUS));
- if (hdl->sa_spill) {
- sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
- if (!spilling) {
- /*
- * remove spill block that is no longer needed.
- */
- dmu_buf_rele(hdl->sa_spill, NULL);
- hdl->sa_spill = NULL;
- hdl->sa_spill_tab = NULL;
- VERIFY(0 == dmu_rm_spill(hdl->sa_os,
- sa_handle_object(hdl), tx));
- } else {
- VERIFY(0 == sa_build_index(hdl, SA_SPILL));
- }
- }
-
- return (0);
-}
-
-static void
-sa_free_attr_table(sa_os_t *sa)
-{
- int i;
-
- if (sa->sa_attr_table == NULL)
- return;
-
- for (i = 0; i != sa->sa_num_attrs; i++) {
- if (sa->sa_attr_table[i].sa_name)
- kmem_free(sa->sa_attr_table[i].sa_name,
- strlen(sa->sa_attr_table[i].sa_name) + 1);
- }
-
- kmem_free(sa->sa_attr_table,
- sizeof (sa_attr_table_t) * sa->sa_num_attrs);
-
- sa->sa_attr_table = NULL;
-}
-
-static int
-sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
-{
- sa_os_t *sa = os->os_sa;
- uint64_t sa_attr_count = 0;
- uint64_t sa_reg_count = 0;
- int error = 0;
- uint64_t attr_value;
- sa_attr_table_t *tb;
- zap_cursor_t zc;
- zap_attribute_t za;
- int registered_count = 0;
- int i;
- dmu_objset_type_t ostype = dmu_objset_type(os);
-
- sa->sa_user_table =
- kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
- sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
-
- if (sa->sa_reg_attr_obj != 0) {
- error = zap_count(os, sa->sa_reg_attr_obj,
- &sa_attr_count);
-
- /*
- * Make sure we retrieved a count and that it isn't zero
- */
- if (error || (error == 0 && sa_attr_count == 0)) {
- if (error == 0)
- error = SET_ERROR(EINVAL);
- goto bail;
- }
- sa_reg_count = sa_attr_count;
- }
-
- if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
- sa_attr_count += sa_legacy_attr_count;
-
- /* Allocate attribute numbers for attributes that aren't registered */
- for (i = 0; i != count; i++) {
- boolean_t found = B_FALSE;
- int j;
-
- if (ostype == DMU_OST_ZFS) {
- for (j = 0; j != sa_legacy_attr_count; j++) {
- if (strcmp(reg_attrs[i].sa_name,
- sa_legacy_attrs[j].sa_name) == 0) {
- sa->sa_user_table[i] =
- sa_legacy_attrs[j].sa_attr;
- found = B_TRUE;
- }
- }
- }
- if (found)
- continue;
-
- if (sa->sa_reg_attr_obj)
- error = zap_lookup(os, sa->sa_reg_attr_obj,
- reg_attrs[i].sa_name, 8, 1, &attr_value);
- else
- error = SET_ERROR(ENOENT);
- switch (error) {
- case ENOENT:
- sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
- sa_attr_count++;
- break;
- case 0:
- sa->sa_user_table[i] = ATTR_NUM(attr_value);
- break;
- default:
- goto bail;
- }
- }
-
- sa->sa_num_attrs = sa_attr_count;
- tb = sa->sa_attr_table =
- kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
-
- /*
- * Attribute table is constructed from requested attribute list,
- * previously foreign registered attributes, and also the legacy
- * ZPL set of attributes.
- */
-
- if (sa->sa_reg_attr_obj) {
- for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
- (error = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- uint64_t value;
- value = za.za_first_integer;
-
- registered_count++;
- tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
- tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
- tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
- tb[ATTR_NUM(value)].sa_registered = B_TRUE;
-
- if (tb[ATTR_NUM(value)].sa_name) {
- continue;
- }
- tb[ATTR_NUM(value)].sa_name =
- kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
- (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
- strlen(za.za_name) +1);
- }
- zap_cursor_fini(&zc);
- /*
- * Make sure we processed the correct number of registered
- * attributes
- */
- if (registered_count != sa_reg_count) {
- ASSERT(error != 0);
- goto bail;
- }
-
- }
-
- if (ostype == DMU_OST_ZFS) {
- for (i = 0; i != sa_legacy_attr_count; i++) {
- if (tb[i].sa_name)
- continue;
- tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
- tb[i].sa_length = sa_legacy_attrs[i].sa_length;
- tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
- tb[i].sa_registered = B_FALSE;
- tb[i].sa_name =
- kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
- KM_SLEEP);
- (void) strlcpy(tb[i].sa_name,
- sa_legacy_attrs[i].sa_name,
- strlen(sa_legacy_attrs[i].sa_name) + 1);
- }
- }
-
- for (i = 0; i != count; i++) {
- sa_attr_type_t attr_id;
-
- attr_id = sa->sa_user_table[i];
- if (tb[attr_id].sa_name)
- continue;
-
- tb[attr_id].sa_length = reg_attrs[i].sa_length;
- tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
- tb[attr_id].sa_attr = attr_id;
- tb[attr_id].sa_name =
- kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
- (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
- strlen(reg_attrs[i].sa_name) + 1);
- }
-
- sa->sa_need_attr_registration =
- (sa_attr_count != registered_count);
-
- return (0);
-bail:
- kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
- sa->sa_user_table = NULL;
- sa_free_attr_table(sa);
- return ((error != 0) ? error : EINVAL);
-}
-
-int
-sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
- sa_attr_type_t **user_table)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- sa_os_t *sa;
- dmu_objset_type_t ostype = dmu_objset_type(os);
- sa_attr_type_t *tb;
- int error;
-
- mutex_enter(&os->os_user_ptr_lock);
- if (os->os_sa) {
- mutex_enter(&os->os_sa->sa_lock);
- mutex_exit(&os->os_user_ptr_lock);
- tb = os->os_sa->sa_user_table;
- mutex_exit(&os->os_sa->sa_lock);
- *user_table = tb;
- return (0);
- }
-
- sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
- mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
- sa->sa_master_obj = sa_obj;
-
- os->os_sa = sa;
- mutex_enter(&sa->sa_lock);
- mutex_exit(&os->os_user_ptr_lock);
- avl_create(&sa->sa_layout_num_tree, layout_num_compare,
- sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
- avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
- sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
-
- if (sa_obj) {
- error = zap_lookup(os, sa_obj, SA_LAYOUTS,
- 8, 1, &sa->sa_layout_attr_obj);
- if (error != 0 && error != ENOENT)
- goto fail;
- error = zap_lookup(os, sa_obj, SA_REGISTRY,
- 8, 1, &sa->sa_reg_attr_obj);
- if (error != 0 && error != ENOENT)
- goto fail;
- }
-
- if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
- goto fail;
-
- if (sa->sa_layout_attr_obj != 0) {
- uint64_t layout_count;
-
- error = zap_count(os, sa->sa_layout_attr_obj,
- &layout_count);
-
- /*
- * Layout number count should be > 0
- */
- if (error || (error == 0 && layout_count == 0)) {
- if (error == 0)
- error = SET_ERROR(EINVAL);
- goto fail;
- }
-
- for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
- (error = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- sa_attr_type_t *lot_attrs;
- uint64_t lot_num;
-
- lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
- za.za_num_integers, KM_SLEEP);
-
- if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
- za.za_name, 2, za.za_num_integers,
- lot_attrs))) != 0) {
- kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
- za.za_num_integers);
- break;
- }
- VERIFY(ddi_strtoull(za.za_name, NULL, 10,
- (unsigned long long *)&lot_num) == 0);
-
- (void) sa_add_layout_entry(os, lot_attrs,
- za.za_num_integers, lot_num,
- sa_layout_info_hash(lot_attrs,
- za.za_num_integers), B_FALSE, NULL);
- kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
- za.za_num_integers);
- }
- zap_cursor_fini(&zc);
-
- /*
- * Make sure layout count matches number of entries added
- * to AVL tree
- */
- if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
- ASSERT(error != 0);
- goto fail;
- }
- }
-
- /* Add special layout number for old ZNODES */
- if (ostype == DMU_OST_ZFS) {
- (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
- sa_legacy_attr_count, 0,
- sa_layout_info_hash(sa_legacy_zpl_layout,
- sa_legacy_attr_count), B_FALSE, NULL);
-
- (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
- 0, B_FALSE, NULL);
- }
- *user_table = os->os_sa->sa_user_table;
- mutex_exit(&sa->sa_lock);
- return (0);
-fail:
- os->os_sa = NULL;
- sa_free_attr_table(sa);
- if (sa->sa_user_table)
- kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
- mutex_exit(&sa->sa_lock);
- avl_destroy(&sa->sa_layout_hash_tree);
- avl_destroy(&sa->sa_layout_num_tree);
- mutex_destroy(&sa->sa_lock);
- kmem_free(sa, sizeof (sa_os_t));
- return ((error == ECKSUM) ? EIO : error);
-}
-
-void
-sa_tear_down(objset_t *os)
-{
- sa_os_t *sa = os->os_sa;
- sa_lot_t *layout;
- void *cookie;
-
- kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
-
- /* Free up attr table */
-
- sa_free_attr_table(sa);
-
- cookie = NULL;
- while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
- sa_idx_tab_t *tab;
- while (tab = list_head(&layout->lot_idx_tab)) {
- ASSERT(zfs_refcount_count(&tab->sa_refcount));
- sa_idx_tab_rele(os, tab);
- }
- }
-
- cookie = NULL;
- while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
- kmem_free(layout->lot_attrs,
- sizeof (sa_attr_type_t) * layout->lot_attr_count);
- kmem_free(layout, sizeof (sa_lot_t));
- }
-
- avl_destroy(&sa->sa_layout_hash_tree);
- avl_destroy(&sa->sa_layout_num_tree);
- mutex_destroy(&sa->sa_lock);
-
- kmem_free(sa, sizeof (sa_os_t));
- os->os_sa = NULL;
-}
-
-void
-sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
- uint16_t length, int length_idx, boolean_t var_length, void *userp)
-{
- sa_idx_tab_t *idx_tab = userp;
-
- if (var_length) {
- ASSERT(idx_tab->sa_variable_lengths);
- idx_tab->sa_variable_lengths[length_idx] = length;
- }
- TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
- (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
-}
-
-static void
-sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
- sa_iterfunc_t func, sa_lot_t *tab, void *userp)
-{
- void *data_start;
- sa_lot_t *tb = tab;
- sa_lot_t search;
- avl_index_t loc;
- sa_os_t *sa = os->os_sa;
- int i;
- uint16_t *length_start = NULL;
- uint8_t length_idx = 0;
-
- if (tab == NULL) {
- search.lot_num = SA_LAYOUT_NUM(hdr, type);
- tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
- ASSERT(tb);
- }
-
- if (IS_SA_BONUSTYPE(type)) {
- data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
- offsetof(sa_hdr_phys_t, sa_lengths) +
- (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
- length_start = hdr->sa_lengths;
- } else {
- data_start = hdr;
- }
-
- for (i = 0; i != tb->lot_attr_count; i++) {
- int attr_length, reg_length;
- uint8_t idx_len;
-
- reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
- if (reg_length) {
- attr_length = reg_length;
- idx_len = 0;
- } else {
- attr_length = length_start[length_idx];
- idx_len = length_idx++;
- }
-
- func(hdr, data_start, tb->lot_attrs[i], attr_length,
- idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
-
- data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
- attr_length), 8);
- }
-}
-
-/*ARGSUSED*/
-void
-sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
- uint16_t length, int length_idx, boolean_t variable_length, void *userp)
-{
- sa_handle_t *hdl = userp;
- sa_os_t *sa = hdl->sa_os->os_sa;
-
- sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
-}
-
-void
-sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
-{
- sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
- dmu_buf_impl_t *db;
- sa_os_t *sa = hdl->sa_os->os_sa;
- int num_lengths = 1;
- int i;
-
- ASSERT(MUTEX_HELD(&sa->sa_lock));
- if (sa_hdr_phys->sa_magic == SA_MAGIC)
- return;
-
- db = SA_GET_DB(hdl, buftype);
-
- if (buftype == SA_SPILL) {
- arc_release(db->db_buf, NULL);
- arc_buf_thaw(db->db_buf);
- }
-
- sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
- sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
-
- /*
- * Determine number of variable lenghts in header
- * The standard 8 byte header has one for free and a
- * 16 byte header would have 4 + 1;
- */
- if (SA_HDR_SIZE(sa_hdr_phys) > 8)
- num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
- for (i = 0; i != num_lengths; i++)
- sa_hdr_phys->sa_lengths[i] =
- BSWAP_16(sa_hdr_phys->sa_lengths[i]);
-
- sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
- sa_byteswap_cb, NULL, hdl);
-
- if (buftype == SA_SPILL)
- arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
-}
-
-static int
-sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
-{
- sa_hdr_phys_t *sa_hdr_phys;
- dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
- dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
- sa_os_t *sa = hdl->sa_os->os_sa;
- sa_idx_tab_t *idx_tab;
-
- sa_hdr_phys = SA_GET_HDR(hdl, buftype);
-
- mutex_enter(&sa->sa_lock);
-
- /* Do we need to byteswap? */
-
- /* only check if not old znode */
- if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
- sa_hdr_phys->sa_magic != 0) {
- VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
- sa_byteswap(hdl, buftype);
- }
-
- idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
-
- if (buftype == SA_BONUS)
- hdl->sa_bonus_tab = idx_tab;
- else
- hdl->sa_spill_tab = idx_tab;
-
- mutex_exit(&sa->sa_lock);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-sa_evict_sync(void *dbu)
-{
- panic("evicting sa dbuf\n");
-}
-
-static void
-sa_idx_tab_rele(objset_t *os, void *arg)
-{
- sa_os_t *sa = os->os_sa;
- sa_idx_tab_t *idx_tab = arg;
-
- if (idx_tab == NULL)
- return;
-
- mutex_enter(&sa->sa_lock);
- if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
- list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
- if (idx_tab->sa_variable_lengths)
- kmem_free(idx_tab->sa_variable_lengths,
- sizeof (uint16_t) *
- idx_tab->sa_layout->lot_var_sizes);
- zfs_refcount_destroy(&idx_tab->sa_refcount);
- kmem_free(idx_tab->sa_idx_tab,
- sizeof (uint32_t) * sa->sa_num_attrs);
- kmem_free(idx_tab, sizeof (sa_idx_tab_t));
- }
- mutex_exit(&sa->sa_lock);
-}
-
-static void
-sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
-{
- sa_os_t *sa = os->os_sa;
-
- ASSERT(MUTEX_HELD(&sa->sa_lock));
- (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL);
-}
-
-void
-sa_handle_destroy(sa_handle_t *hdl)
-{
- dmu_buf_t *db = hdl->sa_bonus;
-
- mutex_enter(&hdl->sa_lock);
- (void) dmu_buf_remove_user(db, &hdl->sa_dbu);
-
- if (hdl->sa_bonus_tab)
- sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
-
- if (hdl->sa_spill_tab)
- sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
-
- dmu_buf_rele(hdl->sa_bonus, NULL);
-
- if (hdl->sa_spill)
- dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
- mutex_exit(&hdl->sa_lock);
-
- kmem_cache_free(sa_cache, hdl);
-}
-
-int
-sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
- sa_handle_type_t hdl_type, sa_handle_t **handlepp)
-{
- int error = 0;
- dmu_object_info_t doi;
- sa_handle_t *handle = NULL;
-
-#ifdef ZFS_DEBUG
- dmu_object_info_from_db(db, &doi);
- ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
- doi.doi_bonus_type == DMU_OT_ZNODE);
-#endif
- /* find handle, if it exists */
- /* if one doesn't exist then create a new one, and initialize it */
-
- if (hdl_type == SA_HDL_SHARED)
- handle = dmu_buf_get_user(db);
-
- if (handle == NULL) {
- sa_handle_t *winner = NULL;
-
- handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
- handle->sa_dbu.dbu_evict_func_sync = NULL;
- handle->sa_dbu.dbu_evict_func_async = NULL;
- handle->sa_userp = userp;
- handle->sa_bonus = db;
- handle->sa_os = os;
- handle->sa_spill = NULL;
- handle->sa_bonus_tab = NULL;
- handle->sa_spill_tab = NULL;
-
- error = sa_build_index(handle, SA_BONUS);
-
- if (hdl_type == SA_HDL_SHARED) {
- dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL,
- NULL);
- winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
- }
-
- if (winner != NULL) {
- kmem_cache_free(sa_cache, handle);
- handle = winner;
- }
- }
- *handlepp = handle;
-
- return (error);
-}
-
-int
-sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
- sa_handle_type_t hdl_type, sa_handle_t **handlepp)
-{
- dmu_buf_t *db;
- int error;
-
- if (error = dmu_bonus_hold(objset, objid, NULL, &db))
- return (error);
-
- return (sa_handle_get_from_db(objset, db, userp, hdl_type,
- handlepp));
-}
-
-int
-sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
-{
- return (dmu_bonus_hold(objset, obj_num, tag, db));
-}
-
-void
-sa_buf_rele(dmu_buf_t *db, void *tag)
-{
- dmu_buf_rele(db, tag);
-}
-
-int
-sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
-{
- ASSERT(hdl);
- ASSERT(MUTEX_HELD(&hdl->sa_lock));
- return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
-}
-
-int
-sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
-{
- int error;
- sa_bulk_attr_t bulk;
-
- bulk.sa_attr = attr;
- bulk.sa_data = buf;
- bulk.sa_length = buflen;
- bulk.sa_data_func = NULL;
-
- ASSERT(hdl);
- mutex_enter(&hdl->sa_lock);
- error = sa_lookup_impl(hdl, &bulk, 1);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
-#ifdef _KERNEL
-int
-sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
-{
- int error;
- sa_bulk_attr_t bulk;
-
- bulk.sa_data = NULL;
- bulk.sa_attr = attr;
- bulk.sa_data_func = NULL;
-
- ASSERT(hdl);
-
- mutex_enter(&hdl->sa_lock);
- if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
- error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
- uio->uio_resid), UIO_READ, uio);
- }
- mutex_exit(&hdl->sa_lock);
- return (error);
-
-}
-#endif
-
-static sa_idx_tab_t *
-sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr)
-{
- sa_idx_tab_t *idx_tab;
- sa_os_t *sa = os->os_sa;
- sa_lot_t *tb, search;
- avl_index_t loc;
-
- /*
- * Deterimine layout number. If SA node and header == 0 then
- * force the index table to the dummy "1" empty layout.
- *
- * The layout number would only be zero for a newly created file
- * that has not added any attributes yet, or with crypto enabled which
- * doesn't write any attributes to the bonus buffer.
- */
-
- search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
-
- tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
-
- /* Verify header size is consistent with layout information */
- ASSERT(tb);
- ASSERT(IS_SA_BONUSTYPE(bonustype) &&
- SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
- (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
-
- /*
- * See if any of the already existing TOC entries can be reused?
- */
-
- for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
- idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
- boolean_t valid_idx = B_TRUE;
- int i;
-
- if (tb->lot_var_sizes != 0 &&
- idx_tab->sa_variable_lengths != NULL) {
- for (i = 0; i != tb->lot_var_sizes; i++) {
- if (hdr->sa_lengths[i] !=
- idx_tab->sa_variable_lengths[i]) {
- valid_idx = B_FALSE;
- break;
- }
- }
- }
- if (valid_idx) {
- sa_idx_tab_hold(os, idx_tab);
- return (idx_tab);
- }
- }
-
- /* No such luck, create a new entry */
- idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
- idx_tab->sa_idx_tab =
- kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
- idx_tab->sa_layout = tb;
- zfs_refcount_create(&idx_tab->sa_refcount);
- if (tb->lot_var_sizes)
- idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
- tb->lot_var_sizes, KM_SLEEP);
-
- sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
- tb, idx_tab);
- sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
- sa_idx_tab_hold(os, idx_tab); /* one for layout */
- list_insert_tail(&tb->lot_idx_tab, idx_tab);
- return (idx_tab);
-}
-
-void
-sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
- boolean_t start, void *userdata)
-{
- ASSERT(start);
-
- *dataptr = userdata;
- *len = total_len;
-}
-
-static void
-sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
-{
- uint64_t attr_value = 0;
- sa_os_t *sa = hdl->sa_os->os_sa;
- sa_attr_table_t *tb = sa->sa_attr_table;
- int i;
-
- mutex_enter(&sa->sa_lock);
-
- if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
- mutex_exit(&sa->sa_lock);
- return;
- }
-
- if (sa->sa_reg_attr_obj == 0) {
- sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
- DMU_OT_SA_ATTR_REGISTRATION,
- sa->sa_master_obj, SA_REGISTRY, tx);
- }
- for (i = 0; i != sa->sa_num_attrs; i++) {
- if (sa->sa_attr_table[i].sa_registered)
- continue;
- ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
- tb[i].sa_byteswap);
- VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
- tb[i].sa_name, 8, 1, &attr_value, tx));
- tb[i].sa_registered = B_TRUE;
- }
- sa->sa_need_attr_registration = B_FALSE;
- mutex_exit(&sa->sa_lock);
-}
-
-/*
- * Replace all attributes with attributes specified in template.
- * If dnode had a spill buffer then those attributes will be
- * also be replaced, possibly with just an empty spill block
- *
- * This interface is intended to only be used for bulk adding of
- * attributes for a new file. It will also be used by the ZPL
- * when converting and old formatted znode to native SA support.
- */
-int
-sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
- int attr_count, dmu_tx_t *tx)
-{
- sa_os_t *sa = hdl->sa_os->os_sa;
-
- if (sa->sa_need_attr_registration)
- sa_attr_register_sync(hdl, tx);
- return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
-}
-
-int
-sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
- int attr_count, dmu_tx_t *tx)
-{
- int error;
-
- mutex_enter(&hdl->sa_lock);
- error = sa_replace_all_by_template_locked(hdl, attr_desc,
- attr_count, tx);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
-/*
- * Add/remove a single attribute or replace a variable-sized attribute value
- * with a value of a different size, and then rewrite the entire set
- * of attributes.
- * Same-length attribute value replacement (including fixed-length attributes)
- * is handled more efficiently by the upper layers.
- */
-static int
-sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
- sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
- uint16_t buflen, dmu_tx_t *tx)
-{
- sa_os_t *sa = hdl->sa_os->os_sa;
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
- dnode_t *dn;
- sa_bulk_attr_t *attr_desc;
- void *old_data[2];
- int bonus_attr_count = 0;
- int bonus_data_size = 0;
- int spill_data_size = 0;
- int spill_attr_count = 0;
- int error;
- uint16_t length, reg_length;
- int i, j, k, length_idx;
- sa_hdr_phys_t *hdr;
- sa_idx_tab_t *idx_tab;
- int attr_count;
- int count;
-
- ASSERT(MUTEX_HELD(&hdl->sa_lock));
-
- /* First make of copy of the old data */
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- if (dn->dn_bonuslen != 0) {
- bonus_data_size = hdl->sa_bonus->db_size;
- old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
- bcopy(hdl->sa_bonus->db_data, old_data[0],
- hdl->sa_bonus->db_size);
- bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
- } else {
- old_data[0] = NULL;
- }
- DB_DNODE_EXIT(db);
-
- /* Bring spill buffer online if it isn't currently */
-
- if ((error = sa_get_spill(hdl)) == 0) {
- spill_data_size = hdl->sa_spill->db_size;
- old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
- bcopy(hdl->sa_spill->db_data, old_data[1],
- hdl->sa_spill->db_size);
- spill_attr_count =
- hdl->sa_spill_tab->sa_layout->lot_attr_count;
- } else if (error && error != ENOENT) {
- if (old_data[0])
- kmem_free(old_data[0], bonus_data_size);
- return (error);
- } else {
- old_data[1] = NULL;
- }
-
- /* build descriptor of all attributes */
-
- attr_count = bonus_attr_count + spill_attr_count;
- if (action == SA_ADD)
- attr_count++;
- else if (action == SA_REMOVE)
- attr_count--;
-
- attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
-
- /*
- * loop through bonus and spill buffer if it exists, and
- * build up new attr_descriptor to reset the attributes
- */
- k = j = 0;
- count = bonus_attr_count;
- hdr = SA_GET_HDR(hdl, SA_BONUS);
- idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
- for (; k != 2; k++) {
- /*
- * Iterate over each attribute in layout. Fetch the
- * size of variable-length attributes needing rewrite
- * from sa_lengths[].
- */
- for (i = 0, length_idx = 0; i != count; i++) {
- sa_attr_type_t attr;
-
- attr = idx_tab->sa_layout->lot_attrs[i];
- reg_length = SA_REGISTERED_LEN(sa, attr);
- if (reg_length == 0) {
- length = hdr->sa_lengths[length_idx];
- length_idx++;
- } else {
- length = reg_length;
- }
- if (attr == newattr) {
- /*
- * There is nothing to do for SA_REMOVE,
- * so it is just skipped.
- */
- if (action == SA_REMOVE)
- continue;
-
- /*
- * Duplicate attributes are not allowed, so the
- * action can not be SA_ADD here.
- */
- ASSERT3S(action, ==, SA_REPLACE);
-
- /*
- * Only a variable-sized attribute can be
- * replaced here, and its size must be changing.
- */
- ASSERT3U(reg_length, ==, 0);
- ASSERT3U(length, !=, buflen);
- SA_ADD_BULK_ATTR(attr_desc, j, attr,
- locator, datastart, buflen);
- } else {
- SA_ADD_BULK_ATTR(attr_desc, j, attr,
- NULL, (void *)
- (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
- (uintptr_t)old_data[k]), length);
- }
- }
- if (k == 0 && hdl->sa_spill) {
- hdr = SA_GET_HDR(hdl, SA_SPILL);
- idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
- count = spill_attr_count;
- } else {
- break;
- }
- }
- if (action == SA_ADD) {
- reg_length = SA_REGISTERED_LEN(sa, newattr);
- IMPLY(reg_length != 0, reg_length == buflen);
- SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
- datastart, buflen);
- }
- ASSERT3U(j, ==, attr_count);
-
- error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
-
- if (old_data[0])
- kmem_free(old_data[0], bonus_data_size);
- if (old_data[1])
- kmem_free(old_data[1], spill_data_size);
- kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
-
- return (error);
-}
-
-static int
-sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
- dmu_tx_t *tx)
-{
- int error;
- sa_os_t *sa = hdl->sa_os->os_sa;
- dmu_object_type_t bonustype;
-
- bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
-
- ASSERT(hdl);
- ASSERT(MUTEX_HELD(&hdl->sa_lock));
-
- /* sync out registration table if necessary */
- if (sa->sa_need_attr_registration)
- sa_attr_register_sync(hdl, tx);
-
- error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
- if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
- sa->sa_update_cb(hdl, tx);
-
- return (error);
-}
-
-/*
- * update or add new attribute
- */
-int
-sa_update(sa_handle_t *hdl, sa_attr_type_t type,
- void *buf, uint32_t buflen, dmu_tx_t *tx)
-{
- int error;
- sa_bulk_attr_t bulk;
-
- bulk.sa_attr = type;
- bulk.sa_data_func = NULL;
- bulk.sa_length = buflen;
- bulk.sa_data = buf;
-
- mutex_enter(&hdl->sa_lock);
- error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
-int
-sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
- uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
-{
- int error;
- sa_bulk_attr_t bulk;
-
- bulk.sa_attr = attr;
- bulk.sa_data = userdata;
- bulk.sa_data_func = locator;
- bulk.sa_length = buflen;
-
- mutex_enter(&hdl->sa_lock);
- error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
-/*
- * Return size of an attribute
- */
-
-int
-sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
-{
- sa_bulk_attr_t bulk;
- int error;
-
- bulk.sa_data = NULL;
- bulk.sa_attr = attr;
- bulk.sa_data_func = NULL;
-
- ASSERT(hdl);
- mutex_enter(&hdl->sa_lock);
- if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
- mutex_exit(&hdl->sa_lock);
- return (error);
- }
- *size = bulk.sa_size;
-
- mutex_exit(&hdl->sa_lock);
- return (0);
-}
-
-int
-sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
-{
- ASSERT(hdl);
- ASSERT(MUTEX_HELD(&hdl->sa_lock));
- return (sa_lookup_impl(hdl, attrs, count));
-}
-
-int
-sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
-{
- int error;
-
- ASSERT(hdl);
- mutex_enter(&hdl->sa_lock);
- error = sa_bulk_lookup_locked(hdl, attrs, count);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
-int
-sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
-{
- int error;
-
- ASSERT(hdl);
- mutex_enter(&hdl->sa_lock);
- error = sa_bulk_update_impl(hdl, attrs, count, tx);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
-int
-sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
-{
- int error;
-
- mutex_enter(&hdl->sa_lock);
- error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
- NULL, 0, tx);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
-void
-sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
-{
- dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
-}
-
-void
-sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
-{
- dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
- blksize, nblocks);
-}
-
-void
-sa_set_userp(sa_handle_t *hdl, void *ptr)
-{
- hdl->sa_userp = ptr;
-}
-
-dmu_buf_t *
-sa_get_db(sa_handle_t *hdl)
-{
- return ((dmu_buf_t *)hdl->sa_bonus);
-}
-
-void *
-sa_get_userdata(sa_handle_t *hdl)
-{
- return (hdl->sa_userp);
-}
-
-void
-sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
-{
- ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
- os->os_sa->sa_update_cb = func;
-}
-
-void
-sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
-{
-
- mutex_enter(&os->os_sa->sa_lock);
- sa_register_update_callback_locked(os, func);
- mutex_exit(&os->os_sa->sa_lock);
-}
-
-uint64_t
-sa_handle_object(sa_handle_t *hdl)
-{
- return (hdl->sa_bonus->db_object);
-}
-
-boolean_t
-sa_enabled(objset_t *os)
-{
- return (os->os_sa == NULL);
-}
-
-int
-sa_set_sa_object(objset_t *os, uint64_t sa_object)
-{
- sa_os_t *sa = os->os_sa;
-
- if (sa->sa_master_obj)
- return (1);
-
- sa->sa_master_obj = sa_object;
-
- return (0);
-}
-
-int
-sa_hdrsize(void *arg)
-{
- sa_hdr_phys_t *hdr = arg;
-
- return (SA_HDR_SIZE(hdr));
-}
-
-void
-sa_handle_lock(sa_handle_t *hdl)
-{
- ASSERT(hdl);
- mutex_enter(&hdl->sa_lock);
-}
-
-void
-sa_handle_unlock(sa_handle_t *hdl)
-{
- ASSERT(hdl);
- mutex_exit(&hdl->sa_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -1,105 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#ifdef _KERNEL
-#include <crypto/sha2/sha256.h>
-#include <crypto/sha2/sha512t.h>
-#else
-#include <sha256.h>
-#include <sha512t.h>
-#endif
-#include <sys/abd.h>
-
-static int
-sha256_incremental(void *buf, size_t size, void *arg)
-{
- SHA256_CTX *ctx = arg;
- SHA256_Update(ctx, buf, size);
- return (0);
-}
-
-static int
-sha512_incremental(void *buf, size_t size, void *arg)
-{
- SHA512_CTX *ctx = arg;
- SHA512_256_Update(ctx, buf, size);
- return (0);
-}
-
-/*ARGSUSED*/
-void
-abd_checksum_SHA256(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- SHA256_CTX ctx;
- zio_cksum_t tmp;
-
- SHA256_Init(&ctx);
- (void) abd_iterate_func(abd, 0, size, sha256_incremental, &ctx);
- SHA256_Final((unsigned char *)&tmp, &ctx);
-
- /*
- * A prior implementation of this function had a
- * private SHA256 implementation always wrote things out in
- * Big Endian and there wasn't a byteswap variant of it.
- * To preserve on disk compatibility we need to force that
- * behavior.
- */
- zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
- zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
- zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
- zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
-}
-
-/*ARGSUSED*/
-void
-abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- SHA512_CTX ctx;
-
- SHA512_256_Init(&ctx);
- (void) abd_iterate_func(abd, 0, size, sha512_incremental, &ctx);
- SHA512_256_Final((unsigned char *)zcp, &ctx);
-}
-
-/*ARGSUSED*/
-void
-abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- zio_cksum_t tmp;
-
- abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
- zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
- zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
- zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
- zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
@@ -1,105 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://opensource.org/licenses/CDDL-1.0.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#ifdef _KERNEL
-#include <crypto/skein/skein.h>
-#else
-#include <skein.h>
-#endif
-#include <sys/abd.h>
-
-static int
-skein_incremental(void *buf, size_t size, void *arg)
-{
- Skein_512_Ctxt_t *ctx = arg;
- (void) Skein_512_Update(ctx, buf, size);
- return (0);
-}
-
-/*
- * Computes a native 256-bit skein MAC checksum. Please note that this
- * function requires the presence of a ctx_template that should be allocated
- * using abd_checksum_skein_tmpl_init.
- */
-/*ARGSUSED*/
-void
-abd_checksum_skein_native(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- Skein_512_Ctxt_t ctx;
-
- ASSERT(ctx_template != NULL);
- bcopy(ctx_template, &ctx, sizeof (ctx));
- (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
- (void) Skein_512_Final(&ctx, (uint8_t *)zcp);
- bzero(&ctx, sizeof (ctx));
-}
-
-/*
- * Byteswapped version of abd_checksum_skein_native. This just invokes
- * the native checksum function and byteswaps the resulting checksum (since
- * skein is internally endian-insensitive).
- */
-void
-abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- zio_cksum_t tmp;
-
- abd_checksum_skein_native(abd, size, ctx_template, &tmp);
- zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
- zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
- zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
- zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
-}
-
-/*
- * Allocates a skein MAC template suitable for using in skein MAC checksum
- * computations and returns a pointer to it.
- */
-void *
-abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
-{
- Skein_512_Ctxt_t *ctx;
-
- ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
- (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0,
- salt->zcs_bytes, sizeof (salt->zcs_bytes));
- return (ctx);
-}
-
-/*
- * Frees a skein context template previously allocated using
- * abd_checksum_skein_tmpl_init.
- */
-void
-abd_checksum_skein_tmpl_free(void *ctx_template)
-{
- Skein_512_Ctxt_t *ctx = ctx_template;
-
- bzero(ctx, sizeof (*ctx));
- kmem_free(ctx, sizeof (*ctx));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -1,8972 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Toomas Soome <tsoome@me.com>
- * Copyright 2018 Joyent, Inc.
- * Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2017 Datto Inc.
- * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
- * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- */
-
-/*
- * SPA: Storage Pool Allocator
- *
- * This file contains all the routines used when modifying on-disk SPA state.
- * This includes opening, importing, destroying, exporting a pool, and syncing a
- * pool.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/ddt.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_removal.h>
-#include <sys/vdev_indirect_mapping.h>
-#include <sys/vdev_indirect_births.h>
-#include <sys/vdev_initialize.h>
-#include <sys/metaslab.h>
-#include <sys/metaslab_impl.h>
-#include <sys/mmp.h>
-#include <sys/uberblock_impl.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/bpobj.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dmu_objset.h>
-#include <sys/unique.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/fs/zfs.h>
-#include <sys/arc.h>
-#include <sys/callb.h>
-#include <sys/spa_boot.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/dsl_scan.h>
-#include <sys/dmu_send.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_userhold.h>
-#include <sys/zfeature.h>
-#include <sys/zvol.h>
-#include <sys/trim_map.h>
-#include <sys/abd.h>
-
-#ifdef _KERNEL
-#include <sys/callb.h>
-#include <sys/cpupart.h>
-#include <sys/zone.h>
-#endif /* _KERNEL */
-
-#include "zfs_prop.h"
-#include "zfs_comutil.h"
-
-/* Check hostid on import? */
-static int check_hostid = 1;
-
-/*
- * The interval, in seconds, at which failed configuration cache file writes
- * should be retried.
- */
-int zfs_ccw_retry_interval = 300;
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
- "Check hostid on import?");
-TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
- &zfs_ccw_retry_interval, 0,
- "Configuration cache file write, retry after failure, interval (seconds)");
-
-typedef enum zti_modes {
- ZTI_MODE_FIXED, /* value is # of threads (min 1) */
- ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
- ZTI_MODE_NULL, /* don't create a taskq */
- ZTI_NMODES
-} zti_modes_t;
-
-#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
-#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
-#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
-
-#define ZTI_N(n) ZTI_P(n, 1)
-#define ZTI_ONE ZTI_N(1)
-
-typedef struct zio_taskq_info {
- zti_modes_t zti_mode;
- uint_t zti_value;
- uint_t zti_count;
-} zio_taskq_info_t;
-
-static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
- "issue", "issue_high", "intr", "intr_high"
-};
-
-/*
- * This table defines the taskq settings for each ZFS I/O type. When
- * initializing a pool, we use this table to create an appropriately sized
- * taskq. Some operations are low volume and therefore have a small, static
- * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
- * macro causes us to create a taskq oriented for throughput. Some operations
- * are so high frequency and short-lived that the taskq itself can become a a
- * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
- * additional degree of parallelism specified by the number of threads per-
- * taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random.
- *
- * The different taskq priorities are to handle the different contexts (issue
- * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
- * need to be handled with minimum delay.
- */
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
- /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
- { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
- { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */
- { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
-};
-
-static void spa_sync_version(void *arg, dmu_tx_t *tx);
-static void spa_sync_props(void *arg, dmu_tx_t *tx);
-static boolean_t spa_has_active_shared_spare(spa_t *spa);
-static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
-static void spa_vdev_resilver_done(spa_t *spa);
-
-uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
-#ifdef PSRSET_BIND
-id_t zio_taskq_psrset_bind = PS_NONE;
-#endif
-#ifdef SYSDC
-boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
-uint_t zio_taskq_basedc = 80; /* base duty cycle */
-#endif
-
-#ifdef _KERNEL
-#define SPA_PROCESS
-#endif
-boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
-
-extern int zfs_sync_pass_deferred_free;
-
-/*
- * Report any spa_load_verify errors found, but do not fail spa_load.
- * This is used by zdb to analyze non-idle pools.
- */
-boolean_t spa_load_verify_dryrun = B_FALSE;
-
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define TRYIMPORT_NAME "$import"
-
-/*
- * For debugging purposes: print out vdev tree during pool import.
- */
-int spa_load_print_vdev_tree = B_FALSE;
-
-/*
- * A non-zero value for zfs_max_missing_tvds means that we allow importing
- * pools with missing top-level vdevs. This is strictly intended for advanced
- * pool recovery cases since missing data is almost inevitable. Pools with
- * missing devices can only be imported read-only for safety reasons, and their
- * fail-mode will be automatically set to "continue".
- *
- * With 1 missing vdev we should be able to import the pool and mount all
- * datasets. User data that was not modified after the missing device has been
- * added should be recoverable. This means that snapshots created prior to the
- * addition of that device should be completely intact.
- *
- * With 2 missing vdevs, some datasets may fail to mount since there are
- * dataset statistics that are stored as regular metadata. Some data might be
- * recoverable if those vdevs were added recently.
- *
- * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
- * may be missing entirely. Chances of data recovery are very low. Note that
- * there are also risks of performing an inadvertent rewind as we might be
- * missing all the vdevs with the latest uberblocks.
- */
-uint64_t zfs_max_missing_tvds = 0;
-
-/*
- * The parameters below are similar to zfs_max_missing_tvds but are only
- * intended for a preliminary open of the pool with an untrusted config which
- * might be incomplete or out-dated.
- *
- * We are more tolerant for pools opened from a cachefile since we could have
- * an out-dated cachefile where a device removal was not registered.
- * We could have set the limit arbitrarily high but in the case where devices
- * are really missing we would want to return the proper error codes; we chose
- * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
- * and we get a chance to retrieve the trusted config.
- */
-uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
-
-/*
- * In the case where config was assembled by scanning device paths (/dev/dsks
- * by default) we are less tolerant since all the existing devices should have
- * been detected and we want spa_load to return the right error codes.
- */
-uint64_t zfs_max_missing_tvds_scan = 0;
-
-
-SYSCTL_DECL(_vfs_zfs_zio);
-SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_batch_pct, CTLFLAG_RDTUN,
- &zio_taskq_batch_pct, 0,
- "Percentage of CPUs to run an IO worker thread");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN,
- &spa_load_print_vdev_tree, 0,
- "print out vdev tree during pool import");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN,
- &zfs_max_missing_tvds, 0,
- "allow importing pools with missing top-level vdevs");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN,
- &zfs_max_missing_tvds_cachefile, 0,
- "allow importing pools with missing top-level vdevs in cache file");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN,
- &zfs_max_missing_tvds_scan, 0,
- "allow importing pools with missing top-level vdevs during scan");
-
-/*
- * Debugging aid that pauses spa_sync() towards the end.
- */
-boolean_t zfs_pause_spa_sync = B_FALSE;
-
-/*
- * ==========================================================================
- * SPA properties routines
- * ==========================================================================
- */
-
-/*
- * Add a (source=src, propname=propval) list to an nvlist.
- */
-static void
-spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
- uint64_t intval, zprop_source_t src)
-{
- const char *propname = zpool_prop_to_name(prop);
- nvlist_t *propval;
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
-
- if (strval != NULL)
- VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
- else
- VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
-
- VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
- nvlist_free(propval);
-}
-
-/*
- * Get property values from the spa configuration.
- */
-static void
-spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- dsl_pool_t *pool = spa->spa_dsl_pool;
- uint64_t size, alloc, cap, version;
- zprop_source_t src = ZPROP_SRC_NONE;
- spa_config_dirent_t *dp;
- metaslab_class_t *mc = spa_normal_class(spa);
-
- ASSERT(MUTEX_HELD(&spa->spa_props_lock));
-
- if (rvd != NULL) {
- alloc = metaslab_class_get_alloc(mc);
- alloc += metaslab_class_get_alloc(spa_special_class(spa));
- alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
-
- size = metaslab_class_get_space(mc);
- size += metaslab_class_get_space(spa_special_class(spa));
- size += metaslab_class_get_space(spa_dedup_class(spa));
-
- spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
- size - alloc, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
- spa->spa_checkpoint_info.sci_dspace, src);
-
- spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
- metaslab_class_fragmentation(mc), src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
- metaslab_class_expandable_space(mc), src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
- (spa_mode(spa) == FREAD), src);
-
- cap = (size == 0) ? 0 : (alloc * 100 / size);
- spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
-
- spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
- ddt_get_pool_dedup_ratio(spa), src);
-
- spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
- rvd->vdev_state, src);
-
- version = spa_version(spa);
- if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
- src = ZPROP_SRC_DEFAULT;
- else
- src = ZPROP_SRC_LOCAL;
- spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
- }
-
- if (pool != NULL) {
- /*
- * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
- * when opening pools before this version freedir will be NULL.
- */
- if (pool->dp_free_dir != NULL) {
- spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
- dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
- src);
- } else {
- spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
- NULL, 0, src);
- }
-
- if (pool->dp_leak_dir != NULL) {
- spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
- dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
- src);
- } else {
- spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
- NULL, 0, src);
- }
- }
-
- spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
-
- if (spa->spa_comment != NULL) {
- spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
- 0, ZPROP_SRC_LOCAL);
- }
-
- if (spa->spa_root != NULL)
- spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
- 0, ZPROP_SRC_LOCAL);
-
- if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
- spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
- MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
- } else {
- spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
- SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
- }
-
- if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
- spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
- DNODE_MAX_SIZE, ZPROP_SRC_NONE);
- } else {
- spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
- DNODE_MIN_SIZE, ZPROP_SRC_NONE);
- }
-
- if ((dp = list_head(&spa->spa_config_list)) != NULL) {
- if (dp->scd_path == NULL) {
- spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
- "none", 0, ZPROP_SRC_LOCAL);
- } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
- spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
- dp->scd_path, 0, ZPROP_SRC_LOCAL);
- }
- }
-}
-
-/*
- * Get zpool property values.
- */
-int
-spa_prop_get(spa_t *spa, nvlist_t **nvp)
-{
- objset_t *mos = spa->spa_meta_objset;
- zap_cursor_t zc;
- zap_attribute_t za;
- int err;
-
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- mutex_enter(&spa->spa_props_lock);
-
- /*
- * Get properties from the spa config.
- */
- spa_prop_get_config(spa, nvp);
-
- /* If no pool property object, no more prop to get. */
- if (mos == NULL || spa->spa_pool_props_object == 0) {
- mutex_exit(&spa->spa_props_lock);
- return (0);
- }
-
- /*
- * Get properties from the MOS pool property object.
- */
- for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
- (err = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- uint64_t intval = 0;
- char *strval = NULL;
- zprop_source_t src = ZPROP_SRC_DEFAULT;
- zpool_prop_t prop;
-
- if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
- continue;
-
- switch (za.za_integer_length) {
- case 8:
- /* integer property */
- if (za.za_first_integer !=
- zpool_prop_default_numeric(prop))
- src = ZPROP_SRC_LOCAL;
-
- if (prop == ZPOOL_PROP_BOOTFS) {
- dsl_pool_t *dp;
- dsl_dataset_t *ds = NULL;
-
- dp = spa_get_dsl(spa);
- dsl_pool_config_enter(dp, FTAG);
- err = dsl_dataset_hold_obj(dp,
- za.za_first_integer, FTAG, &ds);
- if (err != 0) {
- dsl_pool_config_exit(dp, FTAG);
- break;
- }
-
- strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
- KM_SLEEP);
- dsl_dataset_name(ds, strval);
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_config_exit(dp, FTAG);
- } else {
- strval = NULL;
- intval = za.za_first_integer;
- }
-
- spa_prop_add_list(*nvp, prop, strval, intval, src);
-
- if (strval != NULL)
- kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
-
- break;
-
- case 1:
- /* string property */
- strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
- err = zap_lookup(mos, spa->spa_pool_props_object,
- za.za_name, 1, za.za_num_integers, strval);
- if (err) {
- kmem_free(strval, za.za_num_integers);
- break;
- }
- spa_prop_add_list(*nvp, prop, strval, 0, src);
- kmem_free(strval, za.za_num_integers);
- break;
-
- default:
- break;
- }
- }
- zap_cursor_fini(&zc);
- mutex_exit(&spa->spa_props_lock);
-out:
- if (err && err != ENOENT) {
- nvlist_free(*nvp);
- *nvp = NULL;
- return (err);
- }
-
- return (0);
-}
-
-/*
- * Validate the given pool properties nvlist and modify the list
- * for the property values to be set.
- */
-static int
-spa_prop_validate(spa_t *spa, nvlist_t *props)
-{
- nvpair_t *elem;
- int error = 0, reset_bootfs = 0;
- uint64_t objnum = 0;
- boolean_t has_feature = B_FALSE;
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
- uint64_t intval;
- char *strval, *slash, *check, *fname;
- const char *propname = nvpair_name(elem);
- zpool_prop_t prop = zpool_name_to_prop(propname);
-
- switch (prop) {
- case ZPOOL_PROP_INVAL:
- if (!zpool_prop_feature(propname)) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- /*
- * Sanitize the input.
- */
- if (nvpair_type(elem) != DATA_TYPE_UINT64) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- if (nvpair_value_uint64(elem, &intval) != 0) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- if (intval != 0) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- fname = strchr(propname, '@') + 1;
- if (zfeature_lookup_name(fname, NULL) != 0) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- has_feature = B_TRUE;
- break;
-
- case ZPOOL_PROP_VERSION:
- error = nvpair_value_uint64(elem, &intval);
- if (!error &&
- (intval < spa_version(spa) ||
- intval > SPA_VERSION_BEFORE_FEATURES ||
- has_feature))
- error = SET_ERROR(EINVAL);
- break;
-
- case ZPOOL_PROP_DELEGATION:
- case ZPOOL_PROP_AUTOREPLACE:
- case ZPOOL_PROP_LISTSNAPS:
- case ZPOOL_PROP_AUTOEXPAND:
- error = nvpair_value_uint64(elem, &intval);
- if (!error && intval > 1)
- error = SET_ERROR(EINVAL);
- break;
-
- case ZPOOL_PROP_MULTIHOST:
- error = nvpair_value_uint64(elem, &intval);
- if (!error && intval > 1)
- error = SET_ERROR(EINVAL);
-
- if (!error && !spa_get_hostid())
- error = SET_ERROR(ENOTSUP);
-
- break;
-
- case ZPOOL_PROP_BOOTFS:
- /*
- * If the pool version is less than SPA_VERSION_BOOTFS,
- * or the pool is still being created (version == 0),
- * the bootfs property cannot be set.
- */
- if (spa_version(spa) < SPA_VERSION_BOOTFS) {
- error = SET_ERROR(ENOTSUP);
- break;
- }
-
- /*
- * Make sure the vdev config is bootable
- */
- if (!vdev_is_bootable(spa->spa_root_vdev)) {
- error = SET_ERROR(ENOTSUP);
- break;
- }
-
- reset_bootfs = 1;
-
- error = nvpair_value_string(elem, &strval);
-
- if (!error) {
- objset_t *os;
- uint64_t propval;
-
- if (strval == NULL || strval[0] == '\0') {
- objnum = zpool_prop_default_numeric(
- ZPOOL_PROP_BOOTFS);
- break;
- }
-
- error = dmu_objset_hold(strval, FTAG, &os);
- if (error != 0)
- break;
-
- /*
- * Must be ZPL, and its property settings
- * must be supported.
- */
-
- if (dmu_objset_type(os) != DMU_OST_ZFS) {
- error = SET_ERROR(ENOTSUP);
- } else if ((error =
- dsl_prop_get_int_ds(dmu_objset_ds(os),
- zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- &propval)) == 0 &&
- !BOOTFS_COMPRESS_VALID(propval)) {
- error = SET_ERROR(ENOTSUP);
- } else {
- objnum = dmu_objset_id(os);
- }
- dmu_objset_rele(os, FTAG);
- }
- break;
-
- case ZPOOL_PROP_FAILUREMODE:
- error = nvpair_value_uint64(elem, &intval);
- if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
- intval > ZIO_FAILURE_MODE_PANIC))
- error = SET_ERROR(EINVAL);
-
- /*
- * This is a special case which only occurs when
- * the pool has completely failed. This allows
- * the user to change the in-core failmode property
- * without syncing it out to disk (I/Os might
- * currently be blocked). We do this by returning
- * EIO to the caller (spa_prop_set) to trick it
- * into thinking we encountered a property validation
- * error.
- */
- if (!error && spa_suspended(spa)) {
- spa->spa_failmode = intval;
- error = SET_ERROR(EIO);
- }
- break;
-
- case ZPOOL_PROP_CACHEFILE:
- if ((error = nvpair_value_string(elem, &strval)) != 0)
- break;
-
- if (strval[0] == '\0')
- break;
-
- if (strcmp(strval, "none") == 0)
- break;
-
- if (strval[0] != '/') {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- slash = strrchr(strval, '/');
- ASSERT(slash != NULL);
-
- if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
- strcmp(slash, "/..") == 0)
- error = SET_ERROR(EINVAL);
- break;
-
- case ZPOOL_PROP_COMMENT:
- if ((error = nvpair_value_string(elem, &strval)) != 0)
- break;
- for (check = strval; *check != '\0'; check++) {
- /*
- * The kernel doesn't have an easy isprint()
- * check. For this kernel check, we merely
- * check ASCII apart from DEL. Fix this if
- * there is an easy-to-use kernel isprint().
- */
- if (*check >= 0x7f) {
- error = SET_ERROR(EINVAL);
- break;
- }
- }
- if (strlen(strval) > ZPROP_MAX_COMMENT)
- error = E2BIG;
- break;
-
- case ZPOOL_PROP_DEDUPDITTO:
- if (spa_version(spa) < SPA_VERSION_DEDUP)
- error = SET_ERROR(ENOTSUP);
- else
- error = nvpair_value_uint64(elem, &intval);
- if (error == 0 &&
- intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
- error = SET_ERROR(EINVAL);
- break;
- }
-
- if (error)
- break;
- }
-
- if (!error && reset_bootfs) {
- error = nvlist_remove(props,
- zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
-
- if (!error) {
- error = nvlist_add_uint64(props,
- zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
- }
- }
-
- return (error);
-}
-
-void
-spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
-{
- char *cachefile;
- spa_config_dirent_t *dp;
-
- if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
- &cachefile) != 0)
- return;
-
- dp = kmem_alloc(sizeof (spa_config_dirent_t),
- KM_SLEEP);
-
- if (cachefile[0] == '\0')
- dp->scd_path = spa_strdup(spa_config_path);
- else if (strcmp(cachefile, "none") == 0)
- dp->scd_path = NULL;
- else
- dp->scd_path = spa_strdup(cachefile);
-
- list_insert_head(&spa->spa_config_list, dp);
- if (need_sync)
- spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
-}
-
-int
-spa_prop_set(spa_t *spa, nvlist_t *nvp)
-{
- int error;
- nvpair_t *elem = NULL;
- boolean_t need_sync = B_FALSE;
-
- if ((error = spa_prop_validate(spa, nvp)) != 0)
- return (error);
-
- while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
- zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
-
- if (prop == ZPOOL_PROP_CACHEFILE ||
- prop == ZPOOL_PROP_ALTROOT ||
- prop == ZPOOL_PROP_READONLY)
- continue;
-
- if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
- uint64_t ver;
-
- if (prop == ZPOOL_PROP_VERSION) {
- VERIFY(nvpair_value_uint64(elem, &ver) == 0);
- } else {
- ASSERT(zpool_prop_feature(nvpair_name(elem)));
- ver = SPA_VERSION_FEATURES;
- need_sync = B_TRUE;
- }
-
- /* Save time if the version is already set. */
- if (ver == spa_version(spa))
- continue;
-
- /*
- * In addition to the pool directory object, we might
- * create the pool properties object, the features for
- * read object, the features for write object, or the
- * feature descriptions object.
- */
- error = dsl_sync_task(spa->spa_name, NULL,
- spa_sync_version, &ver,
- 6, ZFS_SPACE_CHECK_RESERVED);
- if (error)
- return (error);
- continue;
- }
-
- need_sync = B_TRUE;
- break;
- }
-
- if (need_sync) {
- return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
- nvp, 6, ZFS_SPACE_CHECK_RESERVED));
- }
-
- return (0);
-}
-
-/*
- * If the bootfs property value is dsobj, clear it.
- */
-void
-spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
-{
- if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
- VERIFY(zap_remove(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
- spa->spa_bootfs = 0;
- }
-}
-
-/*ARGSUSED*/
-static int
-spa_change_guid_check(void *arg, dmu_tx_t *tx)
-{
- uint64_t *newguid = arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t vdev_state;
-
- if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
- int error = (spa_has_checkpoint(spa)) ?
- ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
- return (SET_ERROR(error));
- }
-
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- vdev_state = rvd->vdev_state;
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- if (vdev_state != VDEV_STATE_HEALTHY)
- return (SET_ERROR(ENXIO));
-
- ASSERT3U(spa_guid(spa), !=, *newguid);
-
- return (0);
-}
-
-static void
-spa_change_guid_sync(void *arg, dmu_tx_t *tx)
-{
- uint64_t *newguid = arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- uint64_t oldguid;
- vdev_t *rvd = spa->spa_root_vdev;
-
- oldguid = spa_guid(spa);
-
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- rvd->vdev_guid = *newguid;
- rvd->vdev_guid_sum += (*newguid - oldguid);
- vdev_config_dirty(rvd);
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
- oldguid, *newguid);
-}
-
-/*
- * Change the GUID for the pool. This is done so that we can later
- * re-import a pool built from a clone of our own vdevs. We will modify
- * the root vdev's guid, our own pool guid, and then mark all of our
- * vdevs dirty. Note that we must make sure that all our vdevs are
- * online when we do this, or else any vdevs that weren't present
- * would be orphaned from our pool. We are also going to issue a
- * sysevent to update any watchers.
- */
-int
-spa_change_guid(spa_t *spa)
-{
- int error;
- uint64_t guid;
-
- mutex_enter(&spa->spa_vdev_top_lock);
- mutex_enter(&spa_namespace_lock);
- guid = spa_generate_guid(NULL);
-
- error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
- spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
-
- if (error == 0) {
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
- }
-
- mutex_exit(&spa_namespace_lock);
- mutex_exit(&spa->spa_vdev_top_lock);
-
- return (error);
-}
-
-/*
- * ==========================================================================
- * SPA state manipulation (open/create/destroy/import/export)
- * ==========================================================================
- */
-
-static int
-spa_error_entry_compare(const void *a, const void *b)
-{
- const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
- const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
- int ret;
-
- ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
- sizeof (zbookmark_phys_t));
-
- return (AVL_ISIGN(ret));
-}
-
-/*
- * Utility function which retrieves copies of the current logs and
- * re-initializes them in the process.
- */
-void
-spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
-{
- ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
-
- bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
- bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
-
- avl_create(&spa->spa_errlist_scrub,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
- avl_create(&spa->spa_errlist_last,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
-}
-
-static void
-spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
-{
- const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
- enum zti_modes mode = ztip->zti_mode;
- uint_t value = ztip->zti_value;
- uint_t count = ztip->zti_count;
- spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
- char name[32];
- uint_t flags = 0;
- boolean_t batch = B_FALSE;
-
- if (mode == ZTI_MODE_NULL) {
- tqs->stqs_count = 0;
- tqs->stqs_taskq = NULL;
- return;
- }
-
- ASSERT3U(count, >, 0);
-
- tqs->stqs_count = count;
- tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
-
- switch (mode) {
- case ZTI_MODE_FIXED:
- ASSERT3U(value, >=, 1);
- value = MAX(value, 1);
- break;
-
- case ZTI_MODE_BATCH:
- batch = B_TRUE;
- flags |= TASKQ_THREADS_CPU_PCT;
- value = zio_taskq_batch_pct;
- break;
-
- default:
- panic("unrecognized mode for %s_%s taskq (%u:%u) in "
- "spa_activate()",
- zio_type_name[t], zio_taskq_types[q], mode, value);
- break;
- }
-
- for (uint_t i = 0; i < count; i++) {
- taskq_t *tq;
-
- if (count > 1) {
- (void) snprintf(name, sizeof (name), "%s_%s_%u",
- zio_type_name[t], zio_taskq_types[q], i);
- } else {
- (void) snprintf(name, sizeof (name), "%s_%s",
- zio_type_name[t], zio_taskq_types[q]);
- }
-
-#ifdef SYSDC
- if (zio_taskq_sysdc && spa->spa_proc != &p0) {
- if (batch)
- flags |= TASKQ_DC_BATCH;
-
- tq = taskq_create_sysdc(name, value, 50, INT_MAX,
- spa->spa_proc, zio_taskq_basedc, flags);
- } else {
-#endif
- pri_t pri = maxclsyspri;
- /*
- * The write issue taskq can be extremely CPU
- * intensive. Run it at slightly lower priority
- * than the other taskqs.
- * FreeBSD notes:
- * - numerically higher priorities are lower priorities;
- * - if priorities divided by four (RQ_PPQ) are equal
- * then a difference between them is insignificant.
- */
- if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
-#ifdef illumos
- pri--;
-#else
- pri += 4;
-#endif
-
- tq = taskq_create_proc(name, value, pri, 50,
- INT_MAX, spa->spa_proc, flags);
-#ifdef SYSDC
- }
-#endif
-
- tqs->stqs_taskq[i] = tq;
- }
-}
-
-static void
-spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
-{
- spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
-
- if (tqs->stqs_taskq == NULL) {
- ASSERT0(tqs->stqs_count);
- return;
- }
-
- for (uint_t i = 0; i < tqs->stqs_count; i++) {
- ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
- taskq_destroy(tqs->stqs_taskq[i]);
- }
-
- kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
- tqs->stqs_taskq = NULL;
-}
-
-/*
- * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
- * Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
- */
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
-{
- spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
- taskq_t *tq;
-
- ASSERT3P(tqs->stqs_taskq, !=, NULL);
- ASSERT3U(tqs->stqs_count, !=, 0);
-
- if (tqs->stqs_count == 1) {
- tq = tqs->stqs_taskq[0];
- } else {
-#ifdef _KERNEL
- tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) %
- tqs->stqs_count];
-#else
- tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
-#endif
- }
-
- taskq_dispatch_ent(tq, func, arg, flags, ent);
-}
-
-static void
-spa_create_zio_taskqs(spa_t *spa)
-{
- for (int t = 0; t < ZIO_TYPES; t++) {
- for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- spa_taskqs_init(spa, t, q);
- }
- }
-}
-
-#ifdef SPA_PROCESS
-static int
-newproc(void (*pc)(void *), void *arg, id_t cid, int pri,
- void **ct, pid_t pid)
-{
- va_list ap;
- spa_t *spa = (spa_t *)arg; /* XXX */
- struct proc *newp;
- struct thread *td;
- int error;
-
- ASSERT(ct == NULL);
- ASSERT(pid == 0);
- ASSERT(cid == syscid);
-
- error = kproc_create(pc, arg, &newp, 0, 0, "zpool-%s", spa->spa_name);
- if (error != 0)
- return (error);
- td = FIRST_THREAD_IN_PROC(newp);
- thread_lock(td);
- sched_prio(td, pri);
- thread_unlock(td);
- return (0);
-}
-
-static void
-spa_thread(void *arg)
-{
- callb_cpr_t cprinfo;
-
- spa_t *spa = arg;
-#ifdef illumos
- user_t *pu = PTOU(curproc);
-#endif
- CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
- spa->spa_name);
-
- ASSERT(curproc != &p0);
-#ifdef illumos
- (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
- "zpool-%s", spa->spa_name);
- (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
-#endif
-
-#ifdef PSRSET_BIND
- /* bind this thread to the requested psrset */
- if (zio_taskq_psrset_bind != PS_NONE) {
- pool_lock();
- mutex_enter(&cpu_lock);
- mutex_enter(&pidlock);
- mutex_enter(&curproc->p_lock);
-
- if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
- 0, NULL, NULL) == 0) {
- curthread->t_bind_pset = zio_taskq_psrset_bind;
- } else {
- cmn_err(CE_WARN,
- "Couldn't bind process for zfs pool \"%s\" to "
- "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
- }
-
- mutex_exit(&curproc->p_lock);
- mutex_exit(&pidlock);
- mutex_exit(&cpu_lock);
- pool_unlock();
- }
-#endif
-
-#ifdef SYSDC
- if (zio_taskq_sysdc) {
- sysdc_thread_enter(curthread, 100, 0);
- }
-#endif
-
- spa->spa_proc = curproc;
- spa->spa_did = curthread->t_did;
-
- spa_create_zio_taskqs(spa);
-
- mutex_enter(&spa->spa_proc_lock);
- ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
-
- spa->spa_proc_state = SPA_PROC_ACTIVE;
- cv_broadcast(&spa->spa_proc_cv);
-
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- while (spa->spa_proc_state == SPA_PROC_ACTIVE)
- cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
- CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
-
- ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
- spa->spa_proc_state = SPA_PROC_GONE;
- spa->spa_proc = &p0;
- cv_broadcast(&spa->spa_proc_cv);
- CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
-
-#ifdef illumos
- mutex_enter(&curproc->p_lock);
- lwp_exit();
-#else
- kthread_exit();
-#endif
-}
-#endif /* SPA_PROCESS */
-
-/*
- * Activate an uninitialized pool.
- */
-static void
-spa_activate(spa_t *spa, int mode)
-{
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
-
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_mode = mode;
-
- spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
- spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
- spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
-
- /* Try to create a covering process */
- mutex_enter(&spa->spa_proc_lock);
- ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
- ASSERT(spa->spa_proc == &p0);
- spa->spa_did = 0;
-
-#ifdef SPA_PROCESS
- /* Only create a process if we're going to be around a while. */
- if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
- if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
- NULL, 0) == 0) {
- spa->spa_proc_state = SPA_PROC_CREATED;
- while (spa->spa_proc_state == SPA_PROC_CREATED) {
- cv_wait(&spa->spa_proc_cv,
- &spa->spa_proc_lock);
- }
- ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
- ASSERT(spa->spa_proc != &p0);
- ASSERT(spa->spa_did != 0);
- } else {
-#ifdef _KERNEL
- cmn_err(CE_WARN,
- "Couldn't create process for zfs pool \"%s\"\n",
- spa->spa_name);
-#endif
- }
- }
-#endif /* SPA_PROCESS */
- mutex_exit(&spa->spa_proc_lock);
-
- /* If we didn't create a process, we need to create our taskqs. */
-#ifndef SPA_PROCESS
- ASSERT(spa->spa_proc == &p0);
-#endif /* SPA_PROCESS */
- if (spa->spa_proc == &p0) {
- spa_create_zio_taskqs(spa);
- }
-
- /*
- * Start TRIM thread.
- */
- trim_thread_create(spa);
-
- /*
- * This taskq is used to perform zvol-minor-related tasks
- * asynchronously. This has several advantages, including easy
- * resolution of various deadlocks (zfsonlinux bug #3681).
- *
- * The taskq must be single threaded to ensure tasks are always
- * processed in the order in which they were dispatched.
- *
- * A taskq per pool allows one to keep the pools independent.
- * This way if one pool is suspended, it will not impact another.
- *
- * The preferred location to dispatch a zvol minor task is a sync
- * task. In this context, there is easy access to the spa_t and minimal
- * error handling is required because the sync task must succeed.
- */
- spa->spa_zvol_taskq = taskq_create("z_zvol", 1, minclsyspri,
- 1, INT_MAX, 0);
-
- for (size_t i = 0; i < TXG_SIZE; i++) {
- spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
- }
-
- list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
- offsetof(vdev_t, vdev_config_dirty_node));
- list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
- offsetof(objset_t, os_evicting_node));
- list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
- offsetof(vdev_t, vdev_state_dirty_node));
-
- txg_list_create(&spa->spa_vdev_txg_list, spa,
- offsetof(struct vdev, vdev_txg_node));
-
- avl_create(&spa->spa_errlist_scrub,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
- avl_create(&spa->spa_errlist_last,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
-}
-
-/*
- * Opposite of spa_activate().
- */
-static void
-spa_deactivate(spa_t *spa)
-{
- ASSERT(spa->spa_sync_on == B_FALSE);
- ASSERT(spa->spa_dsl_pool == NULL);
- ASSERT(spa->spa_root_vdev == NULL);
- ASSERT(spa->spa_async_zio_root == NULL);
- ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
-
- /*
- * Stop TRIM thread in case spa_unload() wasn't called directly
- * before spa_deactivate().
- */
- trim_thread_destroy(spa);
-
- spa_evicting_os_wait(spa);
-
- if (spa->spa_zvol_taskq) {
- taskq_destroy(spa->spa_zvol_taskq);
- spa->spa_zvol_taskq = NULL;
- }
-
- txg_list_destroy(&spa->spa_vdev_txg_list);
-
- list_destroy(&spa->spa_config_dirty_list);
- list_destroy(&spa->spa_evicting_os_list);
- list_destroy(&spa->spa_state_dirty_list);
-
- for (int t = 0; t < ZIO_TYPES; t++) {
- for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- spa_taskqs_fini(spa, t, q);
- }
- }
-
- for (size_t i = 0; i < TXG_SIZE; i++) {
- ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
- VERIFY0(zio_wait(spa->spa_txg_zio[i]));
- spa->spa_txg_zio[i] = NULL;
- }
-
- metaslab_class_destroy(spa->spa_normal_class);
- spa->spa_normal_class = NULL;
-
- metaslab_class_destroy(spa->spa_log_class);
- spa->spa_log_class = NULL;
-
- metaslab_class_destroy(spa->spa_special_class);
- spa->spa_special_class = NULL;
-
- metaslab_class_destroy(spa->spa_dedup_class);
- spa->spa_dedup_class = NULL;
-
- /*
- * If this was part of an import or the open otherwise failed, we may
- * still have errors left in the queues. Empty them just in case.
- */
- spa_errlog_drain(spa);
-
- avl_destroy(&spa->spa_errlist_scrub);
- avl_destroy(&spa->spa_errlist_last);
-
- spa->spa_state = POOL_STATE_UNINITIALIZED;
-
- mutex_enter(&spa->spa_proc_lock);
- if (spa->spa_proc_state != SPA_PROC_NONE) {
- ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
- spa->spa_proc_state = SPA_PROC_DEACTIVATE;
- cv_broadcast(&spa->spa_proc_cv);
- while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
- ASSERT(spa->spa_proc != &p0);
- cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
- }
- ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
- spa->spa_proc_state = SPA_PROC_NONE;
- }
- ASSERT(spa->spa_proc == &p0);
- mutex_exit(&spa->spa_proc_lock);
-
-#ifdef SPA_PROCESS
-#ifdef illumos
- /*
- * We want to make sure spa_thread() has actually exited the ZFS
- * module, so that the module can't be unloaded out from underneath
- * it.
- */
- if (spa->spa_did != 0) {
- thread_join(spa->spa_did);
- spa->spa_did = 0;
- }
-#endif
-#endif /* SPA_PROCESS */
-}
-
-/*
- * Verify a pool configuration, and construct the vdev tree appropriately. This
- * will create all the necessary vdevs in the appropriate layout, with each vdev
- * in the CLOSED state. This will prep the pool before open/creation/import.
- * All vdev validation is done by the vdev_alloc() routine.
- */
-static int
-spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
- uint_t id, int atype)
-{
- nvlist_t **child;
- uint_t children;
- int error;
-
- if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
- return (error);
-
- if ((*vdp)->vdev_ops->vdev_op_leaf)
- return (0);
-
- error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children);
-
- if (error == ENOENT)
- return (0);
-
- if (error) {
- vdev_free(*vdp);
- *vdp = NULL;
- return (SET_ERROR(EINVAL));
- }
-
- for (int c = 0; c < children; c++) {
- vdev_t *vd;
- if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
- atype)) != 0) {
- vdev_free(*vdp);
- *vdp = NULL;
- return (error);
- }
- }
-
- ASSERT(*vdp != NULL);
-
- return (0);
-}
-
-/*
- * Opposite of spa_load().
- */
-static void
-spa_unload(spa_t *spa)
-{
- int i;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa_load_note(spa, "UNLOADING");
-
- /*
- * Stop TRIM thread.
- */
- trim_thread_destroy(spa);
-
- /*
- * Stop async tasks.
- */
- spa_async_suspend(spa);
-
- if (spa->spa_root_vdev) {
- vdev_initialize_stop_all(spa->spa_root_vdev,
- VDEV_INITIALIZE_ACTIVE);
- }
-
- /*
- * Stop syncing.
- */
- if (spa->spa_sync_on) {
- txg_sync_stop(spa->spa_dsl_pool);
- spa->spa_sync_on = B_FALSE;
- }
-
- /*
- * Even though vdev_free() also calls vdev_metaslab_fini, we need
- * to call it earlier, before we wait for async i/o to complete.
- * This ensures that there is no async metaslab prefetching, by
- * calling taskq_wait(mg_taskq).
- */
- if (spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
- for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
- vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
- spa_config_exit(spa, SCL_ALL, spa);
- }
-
- if (spa->spa_mmp.mmp_thread)
- mmp_thread_stop(spa);
-
- /*
- * Wait for any outstanding async I/O to complete.
- */
- if (spa->spa_async_zio_root != NULL) {
- for (int i = 0; i < max_ncpus; i++)
- (void) zio_wait(spa->spa_async_zio_root[i]);
- kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
- spa->spa_async_zio_root = NULL;
- }
-
- if (spa->spa_vdev_removal != NULL) {
- spa_vdev_removal_destroy(spa->spa_vdev_removal);
- spa->spa_vdev_removal = NULL;
- }
-
- if (spa->spa_condense_zthr != NULL) {
- zthr_destroy(spa->spa_condense_zthr);
- spa->spa_condense_zthr = NULL;
- }
-
- if (spa->spa_checkpoint_discard_zthr != NULL) {
- zthr_destroy(spa->spa_checkpoint_discard_zthr);
- spa->spa_checkpoint_discard_zthr = NULL;
- }
-
- spa_condense_fini(spa);
-
- bpobj_close(&spa->spa_deferred_bpobj);
-
- spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
-
- /*
- * Close all vdevs.
- */
- if (spa->spa_root_vdev)
- vdev_free(spa->spa_root_vdev);
- ASSERT(spa->spa_root_vdev == NULL);
-
- /*
- * Close the dsl pool.
- */
- if (spa->spa_dsl_pool) {
- dsl_pool_close(spa->spa_dsl_pool);
- spa->spa_dsl_pool = NULL;
- spa->spa_meta_objset = NULL;
- }
-
- ddt_unload(spa);
-
- /*
- * Drop and purge level 2 cache
- */
- spa_l2cache_drop(spa);
-
- for (i = 0; i < spa->spa_spares.sav_count; i++)
- vdev_free(spa->spa_spares.sav_vdevs[i]);
- if (spa->spa_spares.sav_vdevs) {
- kmem_free(spa->spa_spares.sav_vdevs,
- spa->spa_spares.sav_count * sizeof (void *));
- spa->spa_spares.sav_vdevs = NULL;
- }
- if (spa->spa_spares.sav_config) {
- nvlist_free(spa->spa_spares.sav_config);
- spa->spa_spares.sav_config = NULL;
- }
- spa->spa_spares.sav_count = 0;
-
- for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
- vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
- vdev_free(spa->spa_l2cache.sav_vdevs[i]);
- }
- if (spa->spa_l2cache.sav_vdevs) {
- kmem_free(spa->spa_l2cache.sav_vdevs,
- spa->spa_l2cache.sav_count * sizeof (void *));
- spa->spa_l2cache.sav_vdevs = NULL;
- }
- if (spa->spa_l2cache.sav_config) {
- nvlist_free(spa->spa_l2cache.sav_config);
- spa->spa_l2cache.sav_config = NULL;
- }
- spa->spa_l2cache.sav_count = 0;
-
- spa->spa_async_suspended = 0;
-
- spa->spa_indirect_vdevs_loaded = B_FALSE;
-
- if (spa->spa_comment != NULL) {
- spa_strfree(spa->spa_comment);
- spa->spa_comment = NULL;
- }
-
- spa_config_exit(spa, SCL_ALL, spa);
-}
-
-/*
- * Load (or re-load) the current list of vdevs describing the active spares for
- * this pool. When this is called, we have some form of basic information in
- * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
- * then re-generate a more complete list including status information.
- */
-void
-spa_load_spares(spa_t *spa)
-{
- nvlist_t **spares;
- uint_t nspares;
- int i;
- vdev_t *vd, *tvd;
-
-#ifndef _KERNEL
- /*
- * zdb opens both the current state of the pool and the
- * checkpointed state (if present), with a different spa_t.
- *
- * As spare vdevs are shared among open pools, we skip loading
- * them when we load the checkpointed state of the pool.
- */
- if (!spa_writeable(spa))
- return;
-#endif
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- /*
- * First, close and free any existing spare vdevs.
- */
- for (i = 0; i < spa->spa_spares.sav_count; i++) {
- vd = spa->spa_spares.sav_vdevs[i];
-
- /* Undo the call to spa_activate() below */
- if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
- B_FALSE)) != NULL && tvd->vdev_isspare)
- spa_spare_remove(tvd);
- vdev_close(vd);
- vdev_free(vd);
- }
-
- if (spa->spa_spares.sav_vdevs)
- kmem_free(spa->spa_spares.sav_vdevs,
- spa->spa_spares.sav_count * sizeof (void *));
-
- if (spa->spa_spares.sav_config == NULL)
- nspares = 0;
- else
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
-
- spa->spa_spares.sav_count = (int)nspares;
- spa->spa_spares.sav_vdevs = NULL;
-
- if (nspares == 0)
- return;
-
- /*
- * Construct the array of vdevs, opening them to get status in the
- * process. For each spare, there is potentially two different vdev_t
- * structures associated with it: one in the list of spares (used only
- * for basic validation purposes) and one in the active vdev
- * configuration (if it's spared in). During this phase we open and
- * validate each vdev on the spare list. If the vdev also exists in the
- * active configuration, then we also mark this vdev as an active spare.
- */
- spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
- KM_SLEEP);
- for (i = 0; i < spa->spa_spares.sav_count; i++) {
- VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
- VDEV_ALLOC_SPARE) == 0);
- ASSERT(vd != NULL);
-
- spa->spa_spares.sav_vdevs[i] = vd;
-
- if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
- B_FALSE)) != NULL) {
- if (!tvd->vdev_isspare)
- spa_spare_add(tvd);
-
- /*
- * We only mark the spare active if we were successfully
- * able to load the vdev. Otherwise, importing a pool
- * with a bad active spare would result in strange
- * behavior, because multiple pool would think the spare
- * is actively in use.
- *
- * There is a vulnerability here to an equally bizarre
- * circumstance, where a dead active spare is later
- * brought back to life (onlined or otherwise). Given
- * the rarity of this scenario, and the extra complexity
- * it adds, we ignore the possibility.
- */
- if (!vdev_is_dead(tvd))
- spa_spare_activate(tvd);
- }
-
- vd->vdev_top = vd;
- vd->vdev_aux = &spa->spa_spares;
-
- if (vdev_open(vd) != 0)
- continue;
-
- if (vdev_validate_aux(vd) == 0)
- spa_spare_add(vd);
- }
-
- /*
- * Recompute the stashed list of spares, with status information
- * this time.
- */
- VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
- DATA_TYPE_NVLIST_ARRAY) == 0);
-
- spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
- KM_SLEEP);
- for (i = 0; i < spa->spa_spares.sav_count; i++)
- spares[i] = vdev_config_generate(spa,
- spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
- VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
- for (i = 0; i < spa->spa_spares.sav_count; i++)
- nvlist_free(spares[i]);
- kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
-}
-
-/*
- * Load (or re-load) the current list of vdevs describing the active l2cache for
- * this pool. When this is called, we have some form of basic information in
- * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
- * then re-generate a more complete list including status information.
- * Devices which are already active have their details maintained, and are
- * not re-opened.
- */
-void
-spa_load_l2cache(spa_t *spa)
-{
- nvlist_t **l2cache;
- uint_t nl2cache;
- int i, j, oldnvdevs;
- uint64_t guid;
- vdev_t *vd, **oldvdevs, **newvdevs;
- spa_aux_vdev_t *sav = &spa->spa_l2cache;
-
-#ifndef _KERNEL
- /*
- * zdb opens both the current state of the pool and the
- * checkpointed state (if present), with a different spa_t.
- *
- * As L2 caches are part of the ARC which is shared among open
- * pools, we skip loading them when we load the checkpointed
- * state of the pool.
- */
- if (!spa_writeable(spa))
- return;
-#endif
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- if (sav->sav_config != NULL) {
- VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
- ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
- newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
- } else {
- nl2cache = 0;
- newvdevs = NULL;
- }
-
- oldvdevs = sav->sav_vdevs;
- oldnvdevs = sav->sav_count;
- sav->sav_vdevs = NULL;
- sav->sav_count = 0;
-
- /*
- * Process new nvlist of vdevs.
- */
- for (i = 0; i < nl2cache; i++) {
- VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
- &guid) == 0);
-
- newvdevs[i] = NULL;
- for (j = 0; j < oldnvdevs; j++) {
- vd = oldvdevs[j];
- if (vd != NULL && guid == vd->vdev_guid) {
- /*
- * Retain previous vdev for add/remove ops.
- */
- newvdevs[i] = vd;
- oldvdevs[j] = NULL;
- break;
- }
- }
-
- if (newvdevs[i] == NULL) {
- /*
- * Create new vdev
- */
- VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
- VDEV_ALLOC_L2CACHE) == 0);
- ASSERT(vd != NULL);
- newvdevs[i] = vd;
-
- /*
- * Commit this vdev as an l2cache device,
- * even if it fails to open.
- */
- spa_l2cache_add(vd);
-
- vd->vdev_top = vd;
- vd->vdev_aux = sav;
-
- spa_l2cache_activate(vd);
-
- if (vdev_open(vd) != 0)
- continue;
-
- (void) vdev_validate_aux(vd);
-
- if (!vdev_is_dead(vd))
- l2arc_add_vdev(spa, vd);
- }
- }
-
- /*
- * Purge vdevs that were dropped
- */
- for (i = 0; i < oldnvdevs; i++) {
- uint64_t pool;
-
- vd = oldvdevs[i];
- if (vd != NULL) {
- ASSERT(vd->vdev_isl2cache);
-
- if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
- pool != 0ULL && l2arc_vdev_present(vd))
- l2arc_remove_vdev(vd);
- vdev_clear_stats(vd);
- vdev_free(vd);
- }
- }
-
- if (oldvdevs)
- kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
-
- if (sav->sav_config == NULL)
- goto out;
-
- sav->sav_vdevs = newvdevs;
- sav->sav_count = (int)nl2cache;
-
- /*
- * Recompute the stashed list of l2cache devices, with status
- * information this time.
- */
- VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
- DATA_TYPE_NVLIST_ARRAY) == 0);
-
- l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
- for (i = 0; i < sav->sav_count; i++)
- l2cache[i] = vdev_config_generate(spa,
- sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
- VERIFY(nvlist_add_nvlist_array(sav->sav_config,
- ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
-out:
- for (i = 0; i < sav->sav_count; i++)
- nvlist_free(l2cache[i]);
- if (sav->sav_count)
- kmem_free(l2cache, sav->sav_count * sizeof (void *));
-}
-
-static int
-load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
-{
- dmu_buf_t *db;
- char *packed = NULL;
- size_t nvsize = 0;
- int error;
- *value = NULL;
-
- error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
- if (error != 0)
- return (error);
-
- nvsize = *(uint64_t *)db->db_data;
- dmu_buf_rele(db, FTAG);
-
- packed = kmem_alloc(nvsize, KM_SLEEP);
- error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
- DMU_READ_PREFETCH);
- if (error == 0)
- error = nvlist_unpack(packed, nvsize, value, 0);
- kmem_free(packed, nvsize);
-
- return (error);
-}
-
-/*
- * Concrete top-level vdevs that are not missing and are not logs. At every
- * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
- */
-static uint64_t
-spa_healthy_core_tvds(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t tvds = 0;
-
- for (uint64_t i = 0; i < rvd->vdev_children; i++) {
- vdev_t *vd = rvd->vdev_child[i];
- if (vd->vdev_islog)
- continue;
- if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
- tvds++;
- }
-
- return (tvds);
-}
-
-/*
- * Checks to see if the given vdev could not be opened, in which case we post a
- * sysevent to notify the autoreplace code that the device has been removed.
- */
-static void
-spa_check_removed(vdev_t *vd)
-{
- for (uint64_t c = 0; c < vd->vdev_children; c++)
- spa_check_removed(vd->vdev_child[c]);
-
- if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
- vdev_is_concrete(vd)) {
- zfs_post_autoreplace(vd->vdev_spa, vd);
- spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
- }
-}
-
-static int
-spa_check_for_missing_logs(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- /*
- * If we're doing a normal import, then build up any additional
- * diagnostic information about missing log devices.
- * We'll pass this up to the user for further processing.
- */
- if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
- nvlist_t **child, *nv;
- uint64_t idx = 0;
-
- child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
- KM_SLEEP);
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
-
- /*
- * We consider a device as missing only if it failed
- * to open (i.e. offline or faulted is not considered
- * as missing).
- */
- if (tvd->vdev_islog &&
- tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
- child[idx++] = vdev_config_generate(spa, tvd,
- B_FALSE, VDEV_CONFIG_MISSING);
- }
- }
-
- if (idx > 0) {
- fnvlist_add_nvlist_array(nv,
- ZPOOL_CONFIG_CHILDREN, child, idx);
- fnvlist_add_nvlist(spa->spa_load_info,
- ZPOOL_CONFIG_MISSING_DEVICES, nv);
-
- for (uint64_t i = 0; i < idx; i++)
- nvlist_free(child[i]);
- }
- nvlist_free(nv);
- kmem_free(child, rvd->vdev_children * sizeof (char **));
-
- if (idx > 0) {
- spa_load_failed(spa, "some log devices are missing");
- vdev_dbgmsg_print_tree(rvd, 2);
- return (SET_ERROR(ENXIO));
- }
- } else {
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
-
- if (tvd->vdev_islog &&
- tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
- spa_set_log_state(spa, SPA_LOG_CLEAR);
- spa_load_note(spa, "some log devices are "
- "missing, ZIL is dropped.");
- vdev_dbgmsg_print_tree(rvd, 2);
- break;
- }
- }
- }
-
- return (0);
-}
-
-/*
- * Check for missing log devices
- */
-static boolean_t
-spa_check_logs(spa_t *spa)
-{
- boolean_t rv = B_FALSE;
- dsl_pool_t *dp = spa_get_dsl(spa);
-
- switch (spa->spa_log_state) {
- case SPA_LOG_MISSING:
- /* need to recheck in case slog has been restored */
- case SPA_LOG_UNKNOWN:
- rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
- if (rv)
- spa_set_log_state(spa, SPA_LOG_MISSING);
- break;
- }
- return (rv);
-}
-
-static boolean_t
-spa_passivate_log(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- boolean_t slog_found = B_FALSE;
-
- ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
-
- if (!spa_has_slogs(spa))
- return (B_FALSE);
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
-
- if (tvd->vdev_islog) {
- metaslab_group_passivate(mg);
- slog_found = B_TRUE;
- }
- }
-
- return (slog_found);
-}
-
-static void
-spa_activate_log(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
-
- if (tvd->vdev_islog)
- metaslab_group_activate(mg);
- }
-}
-
-int
-spa_reset_logs(spa_t *spa)
-{
- int error;
-
- error = dmu_objset_find(spa_name(spa), zil_reset,
- NULL, DS_FIND_CHILDREN);
- if (error == 0) {
- /*
- * We successfully offlined the log device, sync out the
- * current txg so that the "stubby" block can be removed
- * by zil_sync().
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- }
- return (error);
-}
-
-static void
-spa_aux_check_removed(spa_aux_vdev_t *sav)
-{
- int i;
-
- for (i = 0; i < sav->sav_count; i++)
- spa_check_removed(sav->sav_vdevs[i]);
-}
-
-void
-spa_claim_notify(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
-
- if (zio->io_error)
- return;
-
- mutex_enter(&spa->spa_props_lock); /* any mutex will do */
- if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
- spa->spa_claim_max_txg = zio->io_bp->blk_birth;
- mutex_exit(&spa->spa_props_lock);
-}
-
-typedef struct spa_load_error {
- uint64_t sle_meta_count;
- uint64_t sle_data_count;
-} spa_load_error_t;
-
-static void
-spa_load_verify_done(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- spa_load_error_t *sle = zio->io_private;
- dmu_object_type_t type = BP_GET_TYPE(bp);
- int error = zio->io_error;
- spa_t *spa = zio->io_spa;
-
- abd_free(zio->io_abd);
- if (error) {
- if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
- type != DMU_OT_INTENT_LOG)
- atomic_inc_64(&sle->sle_meta_count);
- else
- atomic_inc_64(&sle->sle_data_count);
- }
-
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_load_verify_ios--;
- cv_broadcast(&spa->spa_scrub_io_cv);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-/*
- * Maximum number of concurrent scrub i/os to create while verifying
- * a pool while importing it.
- */
-int spa_load_verify_maxinflight = 10000;
-boolean_t spa_load_verify_metadata = B_TRUE;
-boolean_t spa_load_verify_data = B_TRUE;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
- &spa_load_verify_maxinflight, 0,
- "Maximum number of concurrent scrub I/Os to create while verifying a "
- "pool while importing it");
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
- &spa_load_verify_metadata, 0,
- "Check metadata on import?");
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
- &spa_load_verify_data, 0,
- "Check user data on import?");
-
-/*ARGSUSED*/
-static int
-spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
- return (0);
- /*
- * Note: normally this routine will not be called if
- * spa_load_verify_metadata is not set. However, it may be useful
- * to manually set the flag after the traversal has begun.
- */
- if (!spa_load_verify_metadata)
- return (0);
- if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
- return (0);
-
- zio_t *rio = arg;
- size_t size = BP_GET_PSIZE(bp);
-
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_load_verify_ios++;
- mutex_exit(&spa->spa_scrub_lock);
-
- zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
- spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
- return (0);
-}
-
-/* ARGSUSED */
-int
-verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
-{
- if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
- return (SET_ERROR(ENAMETOOLONG));
-
- return (0);
-}
-
-static int
-spa_load_verify(spa_t *spa)
-{
- zio_t *rio;
- spa_load_error_t sle = { 0 };
- zpool_load_policy_t policy;
- boolean_t verify_ok = B_FALSE;
- int error = 0;
-
- zpool_get_load_policy(spa->spa_config, &policy);
-
- if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
- return (0);
-
- dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
- error = dmu_objset_find_dp(spa->spa_dsl_pool,
- spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
- DS_FIND_CHILDREN);
- dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
- if (error != 0)
- return (error);
-
- rio = zio_root(spa, NULL, &sle,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
-
- if (spa_load_verify_metadata) {
- if (spa->spa_extreme_rewind) {
- spa_load_note(spa, "performing a complete scan of the "
- "pool since extreme rewind is on. This may take "
- "a very long time.\n (spa_load_verify_data=%u, "
- "spa_load_verify_metadata=%u)",
- spa_load_verify_data, spa_load_verify_metadata);
- }
- error = traverse_pool(spa, spa->spa_verify_min_txg,
- TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
- spa_load_verify_cb, rio);
- }
-
- (void) zio_wait(rio);
-
- spa->spa_load_meta_errors = sle.sle_meta_count;
- spa->spa_load_data_errors = sle.sle_data_count;
-
- if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
- spa_load_note(spa, "spa_load_verify found %llu metadata errors "
- "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
- (u_longlong_t)sle.sle_data_count);
- }
-
- if (spa_load_verify_dryrun ||
- (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
- sle.sle_data_count <= policy.zlp_maxdata)) {
- int64_t loss = 0;
-
- verify_ok = B_TRUE;
- spa->spa_load_txg = spa->spa_uberblock.ub_txg;
- spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
-
- loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
- VERIFY(nvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
- VERIFY(nvlist_add_int64(spa->spa_load_info,
- ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
- VERIFY(nvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
- } else {
- spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
- }
-
- if (spa_load_verify_dryrun)
- return (0);
-
- if (error) {
- if (error != ENXIO && error != EIO)
- error = SET_ERROR(EIO);
- return (error);
- }
-
- return (verify_ok ? 0 : EIO);
-}
-
-/*
- * Find a value in the pool props object.
- */
-static void
-spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
-{
- (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
- zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
-}
-
-/*
- * Find a value in the pool directory object.
- */
-static int
-spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
-{
- int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- name, sizeof (uint64_t), 1, val);
-
- if (error != 0 && (error != ENOENT || log_enoent)) {
- spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
- "[error=%d]", name, error);
- }
-
- return (error);
-}
-
-static int
-spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
-{
- vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
- return (SET_ERROR(err));
-}
-
-static void
-spa_spawn_aux_threads(spa_t *spa)
-{
- ASSERT(spa_writeable(spa));
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa_start_indirect_condensing_thread(spa);
-
- ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
- spa->spa_checkpoint_discard_zthr =
- zthr_create(spa_checkpoint_discard_thread_check,
- spa_checkpoint_discard_thread, spa);
-}
-
-/*
- * Fix up config after a partly-completed split. This is done with the
- * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
- * pool have that entry in their config, but only the splitting one contains
- * a list of all the guids of the vdevs that are being split off.
- *
- * This function determines what to do with that list: either rejoin
- * all the disks to the pool, or complete the splitting process. To attempt
- * the rejoin, each disk that is offlined is marked online again, and
- * we do a reopen() call. If the vdev label for every disk that was
- * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
- * then we call vdev_split() on each disk, and complete the split.
- *
- * Otherwise we leave the config alone, with all the vdevs in place in
- * the original pool.
- */
-static void
-spa_try_repair(spa_t *spa, nvlist_t *config)
-{
- uint_t extracted;
- uint64_t *glist;
- uint_t i, gcount;
- nvlist_t *nvl;
- vdev_t **vd;
- boolean_t attempt_reopen;
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
- return;
-
- /* check that the config is complete */
- if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
- &glist, &gcount) != 0)
- return;
-
- vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
-
- /* attempt to online all the vdevs & validate */
- attempt_reopen = B_TRUE;
- for (i = 0; i < gcount; i++) {
- if (glist[i] == 0) /* vdev is hole */
- continue;
-
- vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
- if (vd[i] == NULL) {
- /*
- * Don't bother attempting to reopen the disks;
- * just do the split.
- */
- attempt_reopen = B_FALSE;
- } else {
- /* attempt to re-online it */
- vd[i]->vdev_offline = B_FALSE;
- }
- }
-
- if (attempt_reopen) {
- vdev_reopen(spa->spa_root_vdev);
-
- /* check each device to see what state it's in */
- for (extracted = 0, i = 0; i < gcount; i++) {
- if (vd[i] != NULL &&
- vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
- break;
- ++extracted;
- }
- }
-
- /*
- * If every disk has been moved to the new pool, or if we never
- * even attempted to look at them, then we split them off for
- * good.
- */
- if (!attempt_reopen || gcount == extracted) {
- for (i = 0; i < gcount; i++)
- if (vd[i] != NULL)
- vdev_split(vd[i]);
- vdev_reopen(spa->spa_root_vdev);
- }
-
- kmem_free(vd, gcount * sizeof (vdev_t *));
-}
-
-static int
-spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
-{
- char *ereport = FM_EREPORT_ZFS_POOL;
- int error;
-
- spa->spa_load_state = state;
-
- gethrestime(&spa->spa_loaded_ts);
- error = spa_load_impl(spa, type, &ereport);
-
- /*
- * Don't count references from objsets that are already closed
- * and are making their way through the eviction process.
- */
- spa_evicting_os_wait(spa);
- spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
- if (error) {
- if (error != EEXIST) {
- spa->spa_loaded_ts.tv_sec = 0;
- spa->spa_loaded_ts.tv_nsec = 0;
- }
- if (error != EBADF) {
- zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
- }
- }
- spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
- spa->spa_ena = 0;
-
- return (error);
-}
-
-/*
- * Count the number of per-vdev ZAPs associated with all of the vdevs in the
- * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
- * spa's per-vdev ZAP list.
- */
-static uint64_t
-vdev_count_verify_zaps(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- uint64_t total = 0;
- if (vd->vdev_top_zap != 0) {
- total++;
- ASSERT0(zap_lookup_int(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, vd->vdev_top_zap));
- }
- if (vd->vdev_leaf_zap != 0) {
- total++;
- ASSERT0(zap_lookup_int(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
- }
-
- for (uint64_t i = 0; i < vd->vdev_children; i++) {
- total += vdev_count_verify_zaps(vd->vdev_child[i]);
- }
-
- return (total);
-}
-
-/*
- * Determine whether the activity check is required.
- */
-static boolean_t
-spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
- nvlist_t *config)
-{
- uint64_t state = 0;
- uint64_t hostid = 0;
- uint64_t tryconfig_txg = 0;
- uint64_t tryconfig_timestamp = 0;
- uint16_t tryconfig_mmp_seq = 0;
- nvlist_t *nvinfo;
-
- if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
- nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
- (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
- &tryconfig_txg);
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
- &tryconfig_timestamp);
- (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
- &tryconfig_mmp_seq);
- }
-
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
-
- /*
- * Disable the MMP activity check - This is used by zdb which
- * is intended to be used on potentially active pools.
- */
- if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
- return (B_FALSE);
-
- /*
- * Skip the activity check when the MMP feature is disabled.
- */
- if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
- return (B_FALSE);
-
- /*
- * If the tryconfig_ values are nonzero, they are the results of an
- * earlier tryimport. If they all match the uberblock we just found,
- * then the pool has not changed and we return false so we do not test
- * a second time.
- */
- if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
- tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
- tryconfig_mmp_seq && tryconfig_mmp_seq ==
- (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
- return (B_FALSE);
-
- /*
- * Allow the activity check to be skipped when importing the pool
- * on the same host which last imported it. Since the hostid from
- * configuration may be stale use the one read from the label.
- */
- if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
- hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
-
- if (hostid == spa_get_hostid())
- return (B_FALSE);
-
- /*
- * Skip the activity test when the pool was cleanly exported.
- */
- if (state != POOL_STATE_ACTIVE)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Nanoseconds the activity check must watch for changes on-disk.
- */
-static uint64_t
-spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
-{
- uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
- uint64_t multihost_interval = MSEC2NSEC(
- MMP_INTERVAL_OK(zfs_multihost_interval));
- uint64_t import_delay = MAX(NANOSEC, import_intervals *
- multihost_interval);
-
- /*
- * Local tunables determine a minimum duration except for the case
- * where we know when the remote host will suspend the pool if MMP
- * writes do not land.
- *
- * See Big Theory comment at the top of mmp.c for the reasoning behind
- * these cases and times.
- */
-
- ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
-
- if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
- MMP_FAIL_INT(ub) > 0) {
-
- /* MMP on remote host will suspend pool after failed writes */
- import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
- MMP_IMPORT_SAFETY_FACTOR / 100;
-
- zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
- "mmp_fails=%llu ub_mmp mmp_interval=%llu "
- "import_intervals=%u", import_delay, MMP_FAIL_INT(ub),
- MMP_INTERVAL(ub), import_intervals);
-
- } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
- MMP_FAIL_INT(ub) == 0) {
-
- /* MMP on remote host will never suspend pool */
- import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
- ub->ub_mmp_delay) * import_intervals);
-
- zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
- "mmp_interval=%llu ub_mmp_delay=%llu "
- "import_intervals=%u", import_delay, MMP_INTERVAL(ub),
- ub->ub_mmp_delay, import_intervals);
-
- } else if (MMP_VALID(ub)) {
- /*
- * zfs-0.7 compatability case
- */
-
- import_delay = MAX(import_delay, (multihost_interval +
- ub->ub_mmp_delay) * import_intervals);
-
- zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
- "import_intervals=%u leaves=%u", import_delay,
- ub->ub_mmp_delay, import_intervals,
- vdev_count_leaves(spa));
- } else {
- /* Using local tunings is the only reasonable option */
- zfs_dbgmsg("pool last imported on non-MMP aware "
- "host using import_delay=%llu multihost_interval=%llu "
- "import_intervals=%u", import_delay, multihost_interval,
- import_intervals);
- }
-
- return (import_delay);
-}
-
-/*
- * Perform the import activity check. If the user canceled the import or
- * we detected activity then fail.
- */
-static int
-spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
-{
- uint64_t txg = ub->ub_txg;
- uint64_t timestamp = ub->ub_timestamp;
- uint64_t mmp_config = ub->ub_mmp_config;
- uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
- uint64_t import_delay;
- hrtime_t import_expire;
- nvlist_t *mmp_label = NULL;
- vdev_t *rvd = spa->spa_root_vdev;
- kcondvar_t cv;
- kmutex_t mtx;
- int error = 0;
-
- cv_init(&cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_enter(&mtx);
-
- /*
- * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
- * during the earlier tryimport. If the txg recorded there is 0 then
- * the pool is known to be active on another host.
- *
- * Otherwise, the pool might be in use on another host. Check for
- * changes in the uberblocks on disk if necessary.
- */
- if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
- nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_LOAD_INFO);
-
- if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
- fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
- vdev_uberblock_load(rvd, ub, &mmp_label);
- error = SET_ERROR(EREMOTEIO);
- goto out;
- }
- }
-
- import_delay = spa_activity_check_duration(spa, ub);
-
- /* Add a small random factor in case of simultaneous imports (0-25%) */
- import_delay += import_delay * spa_get_random(250) / 1000;
-
- import_expire = gethrtime() + import_delay;
-
- while (gethrtime() < import_expire) {
- vdev_uberblock_load(rvd, ub, &mmp_label);
-
- if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
- mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
- zfs_dbgmsg("multihost activity detected "
- "txg %llu ub_txg %llu "
- "timestamp %llu ub_timestamp %llu "
- "mmp_config %#llx ub_mmp_config %#llx",
- txg, ub->ub_txg, timestamp, ub->ub_timestamp,
- mmp_config, ub->ub_mmp_config);
-
- error = SET_ERROR(EREMOTEIO);
- break;
- }
-
- if (mmp_label) {
- nvlist_free(mmp_label);
- mmp_label = NULL;
- }
- error = cv_timedwait_sig(&cv, &mtx, hz);
-#if defined(illumos) || !defined(_KERNEL)
- if (error != -1) {
-#else
- if (error != EWOULDBLOCK) {
-#endif
- error = SET_ERROR(EINTR);
- break;
- }
- error = 0;
- }
-
-out:
- mutex_exit(&mtx);
- mutex_destroy(&mtx);
- cv_destroy(&cv);
-
- /*
- * If the pool is determined to be active store the status in the
- * spa->spa_load_info nvlist. If the remote hostname or hostid are
- * available from configuration read from disk store them as well.
- * This allows 'zpool import' to generate a more useful message.
- *
- * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory)
- * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
- * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
- */
- if (error == EREMOTEIO) {
- char *hostname = "<unknown>";
- uint64_t hostid = 0;
-
- if (mmp_label) {
- if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
- hostname = fnvlist_lookup_string(mmp_label,
- ZPOOL_CONFIG_HOSTNAME);
- fnvlist_add_string(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
- }
-
- if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
- hostid = fnvlist_lookup_uint64(mmp_label,
- ZPOOL_CONFIG_HOSTID);
- fnvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_HOSTID, hostid);
- }
- }
-
- fnvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
- fnvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_TXG, 0);
-
- error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
- }
-
- if (mmp_label)
- nvlist_free(mmp_label);
-
- return (error);
-}
-
-static int
-spa_verify_host(spa_t *spa, nvlist_t *mos_config)
-{
- uint64_t hostid;
- char *hostname;
- uint64_t myhostid = 0;
-
- if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
- ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
- hostname = fnvlist_lookup_string(mos_config,
- ZPOOL_CONFIG_HOSTNAME);
-
- myhostid = zone_get_hostid(NULL);
-
- if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
- cmn_err(CE_WARN, "pool '%s' could not be "
- "loaded as it was last accessed by "
- "another system (host: %s hostid: 0x%llx). "
- "See: http://illumos.org/msg/ZFS-8000-EY",
- spa_name(spa), hostname, (u_longlong_t)hostid);
- spa_load_failed(spa, "hostid verification failed: pool "
- "last accessed by host: %s (hostid: 0x%llx)",
- hostname, (u_longlong_t)hostid);
- return (SET_ERROR(EBADF));
- }
- }
-
- return (0);
-}
-
-static int
-spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
-{
- int error = 0;
- nvlist_t *nvtree, *nvl, *config = spa->spa_config;
- int parse;
- vdev_t *rvd;
- uint64_t pool_guid;
- char *comment;
-
- /*
- * Versioning wasn't explicitly added to the label until later, so if
- * it's not present treat it as the initial version.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &spa->spa_ubsync.ub_version) != 0)
- spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
- spa_load_failed(spa, "invalid config provided: '%s' missing",
- ZPOOL_CONFIG_POOL_GUID);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * If we are doing an import, ensure that the pool is not already
- * imported by checking if its pool guid already exists in the
- * spa namespace.
- *
- * The only case that we allow an already imported pool to be
- * imported again, is when the pool is checkpointed and we want to
- * look at its checkpointed state from userland tools like zdb.
- */
-#ifdef _KERNEL
- if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
- spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
- spa_guid_exists(pool_guid, 0)) {
-#else
- if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
- spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
- spa_guid_exists(pool_guid, 0) &&
- !spa_importing_readonly_checkpoint(spa)) {
-#endif
- spa_load_failed(spa, "a pool with guid %llu is already open",
- (u_longlong_t)pool_guid);
- return (SET_ERROR(EEXIST));
- }
-
- spa->spa_config_guid = pool_guid;
-
- nvlist_free(spa->spa_load_info);
- spa->spa_load_info = fnvlist_alloc();
-
- ASSERT(spa->spa_comment == NULL);
- if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
- spa->spa_comment = spa_strdup(comment);
-
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &spa->spa_config_txg);
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
- spa->spa_config_splitting = fnvlist_dup(nvl);
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
- spa_load_failed(spa, "invalid config provided: '%s' missing",
- ZPOOL_CONFIG_VDEV_TREE);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Create "The Godfather" zio to hold all async IOs
- */
- spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
- KM_SLEEP);
- for (int i = 0; i < max_ncpus; i++) {
- spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
- ZIO_FLAG_GODFATHER);
- }
-
- /*
- * Parse the configuration into a vdev tree. We explicitly set the
- * value that will be returned by spa_version() since parsing the
- * configuration requires knowing the version number.
- */
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- parse = (type == SPA_IMPORT_EXISTING ?
- VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
- error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- if (error != 0) {
- spa_load_failed(spa, "unable to parse config [error=%d]",
- error);
- return (error);
- }
-
- ASSERT(spa->spa_root_vdev == rvd);
- ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
- ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
-
- if (type != SPA_IMPORT_ASSEMBLE) {
- ASSERT(spa_guid(spa) == pool_guid);
- }
-
- return (0);
-}
-
-/*
- * Recursively open all vdevs in the vdev tree. This function is called twice:
- * first with the untrusted config, then with the trusted config.
- */
-static int
-spa_ld_open_vdevs(spa_t *spa)
-{
- int error = 0;
-
- /*
- * spa_missing_tvds_allowed defines how many top-level vdevs can be
- * missing/unopenable for the root vdev to be still considered openable.
- */
- if (spa->spa_trust_config) {
- spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
- } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
- spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
- } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
- spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
- } else {
- spa->spa_missing_tvds_allowed = 0;
- }
-
- spa->spa_missing_tvds_allowed =
- MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_open(spa->spa_root_vdev);
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- if (spa->spa_missing_tvds != 0) {
- spa_load_note(spa, "vdev tree has %lld missing top-level "
- "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
- if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
- /*
- * Although theoretically we could allow users to open
- * incomplete pools in RW mode, we'd need to add a lot
- * of extra logic (e.g. adjust pool space to account
- * for missing vdevs).
- * This limitation also prevents users from accidentally
- * opening the pool in RW mode during data recovery and
- * damaging it further.
- */
- spa_load_note(spa, "pools with missing top-level "
- "vdevs can only be opened in read-only mode.");
- error = SET_ERROR(ENXIO);
- } else {
- spa_load_note(spa, "current settings allow for maximum "
- "%lld missing top-level vdevs at this stage.",
- (u_longlong_t)spa->spa_missing_tvds_allowed);
- }
- }
- if (error != 0) {
- spa_load_failed(spa, "unable to open vdev tree [error=%d]",
- error);
- }
- if (spa->spa_missing_tvds != 0 || error != 0)
- vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
-
- return (error);
-}
-
-/*
- * We need to validate the vdev labels against the configuration that
- * we have in hand. This function is called twice: first with an untrusted
- * config, then with a trusted config. The validation is more strict when the
- * config is trusted.
- */
-static int
-spa_ld_validate_vdevs(spa_t *spa)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- if (error != 0) {
- spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
- return (error);
- }
-
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- spa_load_failed(spa, "cannot open vdev tree after invalidating "
- "some vdevs");
- vdev_dbgmsg_print_tree(rvd, 2);
- return (SET_ERROR(ENXIO));
- }
-
- return (0);
-}
-
-static void
-spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
-{
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
- TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
- spa->spa_first_txg = spa->spa_last_ubsync_txg ?
- spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
- spa->spa_claim_max_txg = spa->spa_first_txg;
- spa->spa_prev_software_version = ub->ub_software_version;
-}
-
-static int
-spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- nvlist_t *label;
- uberblock_t *ub = &spa->spa_uberblock;
- boolean_t activity_check = B_FALSE;
-
- /*
- * If we are opening the checkpointed state of the pool by
- * rewinding to it, at this point we will have written the
- * checkpointed uberblock to the vdev labels, so searching
- * the labels will find the right uberblock. However, if
- * we are opening the checkpointed state read-only, we have
- * not modified the labels. Therefore, we must ignore the
- * labels and continue using the spa_uberblock that was set
- * by spa_ld_checkpoint_rewind.
- *
- * Note that it would be fine to ignore the labels when
- * rewinding (opening writeable) as well. However, if we
- * crash just after writing the labels, we will end up
- * searching the labels. Doing so in the common case means
- * that this code path gets exercised normally, rather than
- * just in the edge case.
- */
- if (ub->ub_checkpoint_txg != 0 &&
- spa_importing_readonly_checkpoint(spa)) {
- spa_ld_select_uberblock_done(spa, ub);
- return (0);
- }
-
- /*
- * Find the best uberblock.
- */
- vdev_uberblock_load(rvd, ub, &label);
-
- /*
- * If we weren't able to find a single valid uberblock, return failure.
- */
- if (ub->ub_txg == 0) {
- nvlist_free(label);
- spa_load_failed(spa, "no valid uberblock found");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
- }
-
- spa_load_note(spa, "using uberblock with txg=%llu",
- (u_longlong_t)ub->ub_txg);
-
- /*
- * For pools which have the multihost property on determine if the
- * pool is truly inactive and can be safely imported. Prevent
- * hosts which don't have a hostid set from importing the pool.
- */
- activity_check = spa_activity_check_required(spa, ub, label,
- spa->spa_config);
- if (activity_check) {
- if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
- spa_get_hostid() == 0) {
- nvlist_free(label);
- fnvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
- return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
- }
-
- int error = spa_activity_check(spa, ub, spa->spa_config);
- if (error) {
- nvlist_free(label);
- return (error);
- }
-
- fnvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
- fnvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
- fnvlist_add_uint16(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_SEQ,
- (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
- }
-
- /*
- * If the pool has an unsupported version we can't open it.
- */
- if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
- nvlist_free(label);
- spa_load_failed(spa, "version %llu is not supported",
- (u_longlong_t)ub->ub_version);
- return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
- }
-
- if (ub->ub_version >= SPA_VERSION_FEATURES) {
- nvlist_t *features;
-
- /*
- * If we weren't able to find what's necessary for reading the
- * MOS in the label, return failure.
- */
- if (label == NULL) {
- spa_load_failed(spa, "label config unavailable");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
- ENXIO));
- }
-
- if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
- &features) != 0) {
- nvlist_free(label);
- spa_load_failed(spa, "invalid label: '%s' missing",
- ZPOOL_CONFIG_FEATURES_FOR_READ);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
- ENXIO));
- }
-
- /*
- * Update our in-core representation with the definitive values
- * from the label.
- */
- nvlist_free(spa->spa_label_features);
- VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
- }
-
- nvlist_free(label);
-
- /*
- * Look through entries in the label nvlist's features_for_read. If
- * there is a feature listed there which we don't understand then we
- * cannot open a pool.
- */
- if (ub->ub_version >= SPA_VERSION_FEATURES) {
- nvlist_t *unsup_feat;
-
- VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
- 0);
-
- for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
- NULL); nvp != NULL;
- nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
- if (!zfeature_is_supported(nvpair_name(nvp))) {
- VERIFY(nvlist_add_string(unsup_feat,
- nvpair_name(nvp), "") == 0);
- }
- }
-
- if (!nvlist_empty(unsup_feat)) {
- VERIFY(nvlist_add_nvlist(spa->spa_load_info,
- ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
- nvlist_free(unsup_feat);
- spa_load_failed(spa, "some features are unsupported");
- return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
- ENOTSUP));
- }
-
- nvlist_free(unsup_feat);
- }
-
- if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_try_repair(spa, spa->spa_config);
- spa_config_exit(spa, SCL_ALL, FTAG);
- nvlist_free(spa->spa_config_splitting);
- spa->spa_config_splitting = NULL;
- }
-
- /*
- * Initialize internal SPA structures.
- */
- spa_ld_select_uberblock_done(spa, ub);
-
- return (0);
-}
-
-static int
-spa_ld_open_rootbp(spa_t *spa)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
- if (error != 0) {
- spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
- "[error=%d]", error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
- spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
-
- return (0);
-}
-
-static int
-spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
- boolean_t reloading)
-{
- vdev_t *mrvd, *rvd = spa->spa_root_vdev;
- nvlist_t *nv, *mos_config, *policy;
- int error = 0, copy_error;
- uint64_t healthy_tvds, healthy_tvds_mos;
- uint64_t mos_config_txg;
-
- if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
- != 0)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- /*
- * If we're assembling a pool from a split, the config provided is
- * already trusted so there is nothing to do.
- */
- if (type == SPA_IMPORT_ASSEMBLE)
- return (0);
-
- healthy_tvds = spa_healthy_core_tvds(spa);
-
- if (load_nvlist(spa, spa->spa_config_object, &mos_config)
- != 0) {
- spa_load_failed(spa, "unable to retrieve MOS config");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- /*
- * If we are doing an open, pool owner wasn't verified yet, thus do
- * the verification here.
- */
- if (spa->spa_load_state == SPA_LOAD_OPEN) {
- error = spa_verify_host(spa, mos_config);
- if (error != 0) {
- nvlist_free(mos_config);
- return (error);
- }
- }
-
- nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
- /*
- * Build a new vdev tree from the trusted config
- */
- VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
-
- /*
- * Vdev paths in the MOS may be obsolete. If the untrusted config was
- * obtained by scanning /dev/dsk, then it will have the right vdev
- * paths. We update the trusted MOS config with this information.
- * We first try to copy the paths with vdev_copy_path_strict, which
- * succeeds only when both configs have exactly the same vdev tree.
- * If that fails, we fall back to a more flexible method that has a
- * best effort policy.
- */
- copy_error = vdev_copy_path_strict(rvd, mrvd);
- if (copy_error != 0 || spa_load_print_vdev_tree) {
- spa_load_note(spa, "provided vdev tree:");
- vdev_dbgmsg_print_tree(rvd, 2);
- spa_load_note(spa, "MOS vdev tree:");
- vdev_dbgmsg_print_tree(mrvd, 2);
- }
- if (copy_error != 0) {
- spa_load_note(spa, "vdev_copy_path_strict failed, falling "
- "back to vdev_copy_path_relaxed");
- vdev_copy_path_relaxed(rvd, mrvd);
- }
-
- vdev_close(rvd);
- vdev_free(rvd);
- spa->spa_root_vdev = mrvd;
- rvd = mrvd;
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- /*
- * We will use spa_config if we decide to reload the spa or if spa_load
- * fails and we rewind. We must thus regenerate the config using the
- * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
- * pass settings on how to load the pool and is not stored in the MOS.
- * We copy it over to our new, trusted config.
- */
- mos_config_txg = fnvlist_lookup_uint64(mos_config,
- ZPOOL_CONFIG_POOL_TXG);
- nvlist_free(mos_config);
- mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
- if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
- &policy) == 0)
- fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
- spa_config_set(spa, mos_config);
- spa->spa_config_source = SPA_CONFIG_SRC_MOS;
-
- /*
- * Now that we got the config from the MOS, we should be more strict
- * in checking blkptrs and can make assumptions about the consistency
- * of the vdev tree. spa_trust_config must be set to true before opening
- * vdevs in order for them to be writeable.
- */
- spa->spa_trust_config = B_TRUE;
-
- /*
- * Open and validate the new vdev tree
- */
- error = spa_ld_open_vdevs(spa);
- if (error != 0)
- return (error);
-
- error = spa_ld_validate_vdevs(spa);
- if (error != 0)
- return (error);
-
- if (copy_error != 0 || spa_load_print_vdev_tree) {
- spa_load_note(spa, "final vdev tree:");
- vdev_dbgmsg_print_tree(rvd, 2);
- }
-
- if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
- !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
- /*
- * Sanity check to make sure that we are indeed loading the
- * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
- * in the config provided and they happened to be the only ones
- * to have the latest uberblock, we could involuntarily perform
- * an extreme rewind.
- */
- healthy_tvds_mos = spa_healthy_core_tvds(spa);
- if (healthy_tvds_mos - healthy_tvds >=
- SPA_SYNC_MIN_VDEVS) {
- spa_load_note(spa, "config provided misses too many "
- "top-level vdevs compared to MOS (%lld vs %lld). ",
- (u_longlong_t)healthy_tvds,
- (u_longlong_t)healthy_tvds_mos);
- spa_load_note(spa, "vdev tree:");
- vdev_dbgmsg_print_tree(rvd, 2);
- if (reloading) {
- spa_load_failed(spa, "config was already "
- "provided from MOS. Aborting.");
- return (spa_vdev_err(rvd,
- VDEV_AUX_CORRUPT_DATA, EIO));
- }
- spa_load_note(spa, "spa must be reloaded using MOS "
- "config");
- return (SET_ERROR(EAGAIN));
- }
- }
-
- error = spa_check_for_missing_logs(spa);
- if (error != 0)
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
-
- if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
- spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
- "guid sum (%llu != %llu)",
- (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
- (u_longlong_t)rvd->vdev_guid_sum);
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
- ENXIO));
- }
-
- return (0);
-}
-
-static int
-spa_ld_open_indirect_vdev_metadata(spa_t *spa)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- /*
- * Everything that we read before spa_remove_init() must be stored
- * on concreted vdevs. Therefore we do this as early as possible.
- */
- error = spa_remove_init(spa);
- if (error != 0) {
- spa_load_failed(spa, "spa_remove_init failed [error=%d]",
- error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- /*
- * Retrieve information needed to condense indirect vdev mappings.
- */
- error = spa_condense_init(spa);
- if (error != 0) {
- spa_load_failed(spa, "spa_condense_init failed [error=%d]",
- error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
- }
-
- return (0);
-}
-
-static int
-spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- if (spa_version(spa) >= SPA_VERSION_FEATURES) {
- boolean_t missing_feat_read = B_FALSE;
- nvlist_t *unsup_feat, *enabled_feat;
-
- if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
- &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
- &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
- &spa->spa_feat_desc_obj, B_TRUE) != 0) {
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- enabled_feat = fnvlist_alloc();
- unsup_feat = fnvlist_alloc();
-
- if (!spa_features_check(spa, B_FALSE,
- unsup_feat, enabled_feat))
- missing_feat_read = B_TRUE;
-
- if (spa_writeable(spa) ||
- spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
- if (!spa_features_check(spa, B_TRUE,
- unsup_feat, enabled_feat)) {
- *missing_feat_writep = B_TRUE;
- }
- }
-
- fnvlist_add_nvlist(spa->spa_load_info,
- ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
-
- if (!nvlist_empty(unsup_feat)) {
- fnvlist_add_nvlist(spa->spa_load_info,
- ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
- }
-
- fnvlist_free(enabled_feat);
- fnvlist_free(unsup_feat);
-
- if (!missing_feat_read) {
- fnvlist_add_boolean(spa->spa_load_info,
- ZPOOL_CONFIG_CAN_RDONLY);
- }
-
- /*
- * If the state is SPA_LOAD_TRYIMPORT, our objective is
- * twofold: to determine whether the pool is available for
- * import in read-write mode and (if it is not) whether the
- * pool is available for import in read-only mode. If the pool
- * is available for import in read-write mode, it is displayed
- * as available in userland; if it is not available for import
- * in read-only mode, it is displayed as unavailable in
- * userland. If the pool is available for import in read-only
- * mode but not read-write mode, it is displayed as unavailable
- * in userland with a special note that the pool is actually
- * available for open in read-only mode.
- *
- * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
- * missing a feature for write, we must first determine whether
- * the pool can be opened read-only before returning to
- * userland in order to know whether to display the
- * abovementioned note.
- */
- if (missing_feat_read || (*missing_feat_writep &&
- spa_writeable(spa))) {
- spa_load_failed(spa, "pool uses unsupported features");
- return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
- ENOTSUP));
- }
-
- /*
- * Load refcounts for ZFS features from disk into an in-memory
- * cache during SPA initialization.
- */
- for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
- uint64_t refcount;
-
- error = feature_get_refcount_from_disk(spa,
- &spa_feature_table[i], &refcount);
- if (error == 0) {
- spa->spa_feat_refcount_cache[i] = refcount;
- } else if (error == ENOTSUP) {
- spa->spa_feat_refcount_cache[i] =
- SPA_FEATURE_DISABLED;
- } else {
- spa_load_failed(spa, "error getting refcount "
- "for feature %s [error=%d]",
- spa_feature_table[i].fi_guid, error);
- return (spa_vdev_err(rvd,
- VDEV_AUX_CORRUPT_DATA, EIO));
- }
- }
- }
-
- if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
- if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
- &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- return (0);
-}
-
-static int
-spa_ld_load_special_directories(spa_t *spa)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- spa->spa_is_initializing = B_TRUE;
- error = dsl_pool_open(spa->spa_dsl_pool);
- spa->spa_is_initializing = B_FALSE;
- if (error != 0) {
- spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- return (0);
-}
-
-static int
-spa_ld_get_props(spa_t *spa)
-{
- int error = 0;
- uint64_t obj;
- vdev_t *rvd = spa->spa_root_vdev;
-
- /* Grab the secret checksum salt from the MOS. */
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_CHECKSUM_SALT, 1,
- sizeof (spa->spa_cksum_salt.zcs_bytes),
- spa->spa_cksum_salt.zcs_bytes);
- if (error == ENOENT) {
- /* Generate a new salt for subsequent use */
- (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
- sizeof (spa->spa_cksum_salt.zcs_bytes));
- } else if (error != 0) {
- spa_load_failed(spa, "unable to retrieve checksum salt from "
- "MOS [error=%d]", error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
- if (error != 0) {
- spa_load_failed(spa, "error opening deferred-frees bpobj "
- "[error=%d]", error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- /*
- * Load the bit that tells us to use the new accounting function
- * (raid-z deflation). If we have an older pool, this will not
- * be present.
- */
- error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
- if (error != 0 && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
- &spa->spa_creation_version, B_FALSE);
- if (error != 0 && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- /*
- * Load the persistent error log. If we have an older pool, this will
- * not be present.
- */
- error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
- B_FALSE);
- if (error != 0 && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
- &spa->spa_errlog_scrub, B_FALSE);
- if (error != 0 && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- /*
- * Load the history object. If we have an older pool, this
- * will not be present.
- */
- error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
- if (error != 0 && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- /*
- * Load the per-vdev ZAP map. If we have an older pool, this will not
- * be present; in this case, defer its creation to a later time to
- * avoid dirtying the MOS this early / out of sync context. See
- * spa_sync_config_object.
- */
-
- /* The sentinel is only available in the MOS config. */
- nvlist_t *mos_config;
- if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
- spa_load_failed(spa, "unable to retrieve MOS config");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
- &spa->spa_all_vdev_zaps, B_FALSE);
-
- if (error == ENOENT) {
- VERIFY(!nvlist_exists(mos_config,
- ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
- spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
- ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
- } else if (error != 0) {
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
- /*
- * An older version of ZFS overwrote the sentinel value, so
- * we have orphaned per-vdev ZAPs in the MOS. Defer their
- * destruction to later; see spa_sync_config_object.
- */
- spa->spa_avz_action = AVZ_ACTION_DESTROY;
- /*
- * We're assuming that no vdevs have had their ZAPs created
- * before this. Better be sure of it.
- */
- ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
- }
- nvlist_free(mos_config);
-
- spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
-
- error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
- B_FALSE);
- if (error && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- if (error == 0) {
- uint64_t autoreplace;
-
- spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
- spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
- spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
- spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
- spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
- spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
- spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
- &spa->spa_dedup_ditto);
-
- spa->spa_autoreplace = (autoreplace != 0);
- }
-
- /*
- * If we are importing a pool with missing top-level vdevs,
- * we enforce that the pool doesn't panic or get suspended on
- * error since the likelihood of missing data is extremely high.
- */
- if (spa->spa_missing_tvds > 0 &&
- spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
- spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
- spa_load_note(spa, "forcing failmode to 'continue' "
- "as some top level vdevs are missing");
- spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
- }
-
- return (0);
-}
-
-static int
-spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- /*
- * If we're assembling the pool from the split-off vdevs of
- * an existing pool, we don't want to attach the spares & cache
- * devices.
- */
-
- /*
- * Load any hot spares for this pool.
- */
- error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
- B_FALSE);
- if (error != 0 && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
- ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
- if (load_nvlist(spa, spa->spa_spares.sav_object,
- &spa->spa_spares.sav_config) != 0) {
- spa_load_failed(spa, "error loading spares nvlist");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_load_spares(spa);
- spa_config_exit(spa, SCL_ALL, FTAG);
- } else if (error == 0) {
- spa->spa_spares.sav_sync = B_TRUE;
- }
-
- /*
- * Load any level 2 ARC devices for this pool.
- */
- error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
- &spa->spa_l2cache.sav_object, B_FALSE);
- if (error != 0 && error != ENOENT)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
- ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
- if (load_nvlist(spa, spa->spa_l2cache.sav_object,
- &spa->spa_l2cache.sav_config) != 0) {
- spa_load_failed(spa, "error loading l2cache nvlist");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_load_l2cache(spa);
- spa_config_exit(spa, SCL_ALL, FTAG);
- } else if (error == 0) {
- spa->spa_l2cache.sav_sync = B_TRUE;
- }
-
- return (0);
-}
-
-static int
-spa_ld_load_vdev_metadata(spa_t *spa)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- /*
- * If the 'multihost' property is set, then never allow a pool to
- * be imported when the system hostid is zero. The exception to
- * this rule is zdb which is always allowed to access pools.
- */
- if (spa_multihost(spa) && spa_get_hostid() == 0 &&
- (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
- fnvlist_add_uint64(spa->spa_load_info,
- ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
- return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
- }
-
- /*
- * If the 'autoreplace' property is set, then post a resource notifying
- * the ZFS DE that it should not issue any faults for unopenable
- * devices. We also iterate over the vdevs, and post a sysevent for any
- * unopenable vdevs so that the normal autoreplace handler can take
- * over.
- */
- if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
- spa_check_removed(spa->spa_root_vdev);
- /*
- * For the import case, this is done in spa_import(), because
- * at this point we're using the spare definitions from
- * the MOS config, not necessarily from the userland config.
- */
- if (spa->spa_load_state != SPA_LOAD_IMPORT) {
- spa_aux_check_removed(&spa->spa_spares);
- spa_aux_check_removed(&spa->spa_l2cache);
- }
- }
-
- /*
- * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
- */
- error = vdev_load(rvd);
- if (error != 0) {
- spa_load_failed(spa, "vdev_load failed [error=%d]", error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
- }
-
- /*
- * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
- */
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- return (0);
-}
-
-static int
-spa_ld_load_dedup_tables(spa_t *spa)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- error = ddt_load(spa);
- if (error != 0) {
- spa_load_failed(spa, "ddt_load failed [error=%d]", error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- return (0);
-}
-
-static int
-spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
- boolean_t missing = spa_check_logs(spa);
- if (missing) {
- if (spa->spa_missing_tvds != 0) {
- spa_load_note(spa, "spa_check_logs failed "
- "so dropping the logs");
- } else {
- *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
- spa_load_failed(spa, "spa_check_logs failed");
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
- ENXIO));
- }
- }
- }
-
- return (0);
-}
-
-static int
-spa_ld_verify_pool_data(spa_t *spa)
-{
- int error = 0;
- vdev_t *rvd = spa->spa_root_vdev;
-
- /*
- * We've successfully opened the pool, verify that we're ready
- * to start pushing transactions.
- */
- if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
- error = spa_load_verify(spa);
- if (error != 0) {
- spa_load_failed(spa, "spa_load_verify failed "
- "[error=%d]", error);
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
- error));
- }
- }
-
- return (0);
-}
-
-static void
-spa_ld_claim_log_blocks(spa_t *spa)
-{
- dmu_tx_t *tx;
- dsl_pool_t *dp = spa_get_dsl(spa);
-
- /*
- * Claim log blocks that haven't been committed yet.
- * This must all happen in a single txg.
- * Note: spa_claim_max_txg is updated by spa_claim_notify(),
- * invoked from zil_claim_log_block()'s i/o done callback.
- * Price of rollback is that we abandon the log.
- */
- spa->spa_claiming = B_TRUE;
-
- tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
- (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- zil_claim, tx, DS_FIND_CHILDREN);
- dmu_tx_commit(tx);
-
- spa->spa_claiming = B_FALSE;
-
- spa_set_log_state(spa, SPA_LOG_GOOD);
-}
-
-static void
-spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
- boolean_t update_config_cache)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- int need_update = B_FALSE;
-
- /*
- * If the config cache is stale, or we have uninitialized
- * metaslabs (see spa_vdev_add()), then update the config.
- *
- * If this is a verbatim import, trust the current
- * in-core spa_config and update the disk labels.
- */
- if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
- spa->spa_load_state == SPA_LOAD_IMPORT ||
- spa->spa_load_state == SPA_LOAD_RECOVER ||
- (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
- need_update = B_TRUE;
-
- for (int c = 0; c < rvd->vdev_children; c++)
- if (rvd->vdev_child[c]->vdev_ms_array == 0)
- need_update = B_TRUE;
-
- /*
- * Update the config cache asychronously in case we're the
- * root pool, in which case the config cache isn't writable yet.
- */
- if (need_update)
- spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
-}
-
-static void
-spa_ld_prepare_for_reload(spa_t *spa)
-{
- int mode = spa->spa_mode;
- int async_suspended = spa->spa_async_suspended;
-
- spa_unload(spa);
- spa_deactivate(spa);
- spa_activate(spa, mode);
-
- /*
- * We save the value of spa_async_suspended as it gets reset to 0 by
- * spa_unload(). We want to restore it back to the original value before
- * returning as we might be calling spa_async_resume() later.
- */
- spa->spa_async_suspended = async_suspended;
-}
-
-static int
-spa_ld_read_checkpoint_txg(spa_t *spa)
-{
- uberblock_t checkpoint;
- int error = 0;
-
- ASSERT0(spa->spa_checkpoint_txg);
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
- sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
-
- if (error == ENOENT)
- return (0);
-
- if (error != 0)
- return (error);
-
- ASSERT3U(checkpoint.ub_txg, !=, 0);
- ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
- ASSERT3U(checkpoint.ub_timestamp, !=, 0);
- spa->spa_checkpoint_txg = checkpoint.ub_txg;
- spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
-
- return (0);
-}
-
-static int
-spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
-{
- int error = 0;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
-
- /*
- * Never trust the config that is provided unless we are assembling
- * a pool following a split.
- * This means don't trust blkptrs and the vdev tree in general. This
- * also effectively puts the spa in read-only mode since
- * spa_writeable() checks for spa_trust_config to be true.
- * We will later load a trusted config from the MOS.
- */
- if (type != SPA_IMPORT_ASSEMBLE)
- spa->spa_trust_config = B_FALSE;
-
- /*
- * Parse the config provided to create a vdev tree.
- */
- error = spa_ld_parse_config(spa, type);
- if (error != 0)
- return (error);
-
- /*
- * Now that we have the vdev tree, try to open each vdev. This involves
- * opening the underlying physical device, retrieving its geometry and
- * probing the vdev with a dummy I/O. The state of each vdev will be set
- * based on the success of those operations. After this we'll be ready
- * to read from the vdevs.
- */
- error = spa_ld_open_vdevs(spa);
- if (error != 0)
- return (error);
-
- /*
- * Read the label of each vdev and make sure that the GUIDs stored
- * there match the GUIDs in the config provided.
- * If we're assembling a new pool that's been split off from an
- * existing pool, the labels haven't yet been updated so we skip
- * validation for now.
- */
- if (type != SPA_IMPORT_ASSEMBLE) {
- error = spa_ld_validate_vdevs(spa);
- if (error != 0)
- return (error);
- }
-
- /*
- * Read all vdev labels to find the best uberblock (i.e. latest,
- * unless spa_load_max_txg is set) and store it in spa_uberblock. We
- * get the list of features required to read blkptrs in the MOS from
- * the vdev label with the best uberblock and verify that our version
- * of zfs supports them all.
- */
- error = spa_ld_select_uberblock(spa, type);
- if (error != 0)
- return (error);
-
- /*
- * Pass that uberblock to the dsl_pool layer which will open the root
- * blkptr. This blkptr points to the latest version of the MOS and will
- * allow us to read its contents.
- */
- error = spa_ld_open_rootbp(spa);
- if (error != 0)
- return (error);
-
- return (0);
-}
-
-static int
-spa_ld_checkpoint_rewind(spa_t *spa)
-{
- uberblock_t checkpoint;
- int error = 0;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
-
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
- sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
-
- if (error != 0) {
- spa_load_failed(spa, "unable to retrieve checkpointed "
- "uberblock from the MOS config [error=%d]", error);
-
- if (error == ENOENT)
- error = ZFS_ERR_NO_CHECKPOINT;
-
- return (error);
- }
-
- ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
- ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
-
- /*
- * We need to update the txg and timestamp of the checkpointed
- * uberblock to be higher than the latest one. This ensures that
- * the checkpointed uberblock is selected if we were to close and
- * reopen the pool right after we've written it in the vdev labels.
- * (also see block comment in vdev_uberblock_compare)
- */
- checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
- checkpoint.ub_timestamp = gethrestime_sec();
-
- /*
- * Set current uberblock to be the checkpointed uberblock.
- */
- spa->spa_uberblock = checkpoint;
-
- /*
- * If we are doing a normal rewind, then the pool is open for
- * writing and we sync the "updated" checkpointed uberblock to
- * disk. Once this is done, we've basically rewound the whole
- * pool and there is no way back.
- *
- * There are cases when we don't want to attempt and sync the
- * checkpointed uberblock to disk because we are opening a
- * pool as read-only. Specifically, verifying the checkpointed
- * state with zdb, and importing the checkpointed state to get
- * a "preview" of its content.
- */
- if (spa_writeable(spa)) {
- vdev_t *rvd = spa->spa_root_vdev;
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
- int svdcount = 0;
- int children = rvd->vdev_children;
- int c0 = spa_get_random(children);
-
- for (int c = 0; c < children; c++) {
- vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
-
- /* Stop when revisiting the first vdev */
- if (c > 0 && svd[0] == vd)
- break;
-
- if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
- !vdev_is_concrete(vd))
- continue;
-
- svd[svdcount++] = vd;
- if (svdcount == SPA_SYNC_MIN_VDEVS)
- break;
- }
- error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
- if (error == 0)
- spa->spa_last_synced_guid = rvd->vdev_guid;
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- if (error != 0) {
- spa_load_failed(spa, "failed to write checkpointed "
- "uberblock to the vdev labels [error=%d]", error);
- return (error);
- }
- }
-
- return (0);
-}
-
-static int
-spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
- boolean_t *update_config_cache)
-{
- int error;
-
- /*
- * Parse the config for pool, open and validate vdevs,
- * select an uberblock, and use that uberblock to open
- * the MOS.
- */
- error = spa_ld_mos_init(spa, type);
- if (error != 0)
- return (error);
-
- /*
- * Retrieve the trusted config stored in the MOS and use it to create
- * a new, exact version of the vdev tree, then reopen all vdevs.
- */
- error = spa_ld_trusted_config(spa, type, B_FALSE);
- if (error == EAGAIN) {
- if (update_config_cache != NULL)
- *update_config_cache = B_TRUE;
-
- /*
- * Redo the loading process with the trusted config if it is
- * too different from the untrusted config.
- */
- spa_ld_prepare_for_reload(spa);
- spa_load_note(spa, "RELOADING");
- error = spa_ld_mos_init(spa, type);
- if (error != 0)
- return (error);
-
- error = spa_ld_trusted_config(spa, type, B_TRUE);
- if (error != 0)
- return (error);
-
- } else if (error != 0) {
- return (error);
- }
-
- return (0);
-}
-
-/*
- * Load an existing storage pool, using the config provided. This config
- * describes which vdevs are part of the pool and is later validated against
- * partial configs present in each vdev's label and an entire copy of the
- * config stored in the MOS.
- */
-static int
-spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
-{
- int error = 0;
- boolean_t missing_feat_write = B_FALSE;
- boolean_t checkpoint_rewind =
- (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
- boolean_t update_config_cache = B_FALSE;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
-
- spa_load_note(spa, "LOADING");
-
- error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
- if (error != 0)
- return (error);
-
- /*
- * If we are rewinding to the checkpoint then we need to repeat
- * everything we've done so far in this function but this time
- * selecting the checkpointed uberblock and using that to open
- * the MOS.
- */
- if (checkpoint_rewind) {
- /*
- * If we are rewinding to the checkpoint update config cache
- * anyway.
- */
- update_config_cache = B_TRUE;
-
- /*
- * Extract the checkpointed uberblock from the current MOS
- * and use this as the pool's uberblock from now on. If the
- * pool is imported as writeable we also write the checkpoint
- * uberblock to the labels, making the rewind permanent.
- */
- error = spa_ld_checkpoint_rewind(spa);
- if (error != 0)
- return (error);
-
- /*
- * Redo the loading process process again with the
- * checkpointed uberblock.
- */
- spa_ld_prepare_for_reload(spa);
- spa_load_note(spa, "LOADING checkpointed uberblock");
- error = spa_ld_mos_with_trusted_config(spa, type, NULL);
- if (error != 0)
- return (error);
- }
-
- /*
- * Retrieve the checkpoint txg if the pool has a checkpoint.
- */
- error = spa_ld_read_checkpoint_txg(spa);
- if (error != 0)
- return (error);
-
- /*
- * Retrieve the mapping of indirect vdevs. Those vdevs were removed
- * from the pool and their contents were re-mapped to other vdevs. Note
- * that everything that we read before this step must have been
- * rewritten on concrete vdevs after the last device removal was
- * initiated. Otherwise we could be reading from indirect vdevs before
- * we have loaded their mappings.
- */
- error = spa_ld_open_indirect_vdev_metadata(spa);
- if (error != 0)
- return (error);
-
- /*
- * Retrieve the full list of active features from the MOS and check if
- * they are all supported.
- */
- error = spa_ld_check_features(spa, &missing_feat_write);
- if (error != 0)
- return (error);
-
- /*
- * Load several special directories from the MOS needed by the dsl_pool
- * layer.
- */
- error = spa_ld_load_special_directories(spa);
- if (error != 0)
- return (error);
-
- /*
- * Retrieve pool properties from the MOS.
- */
- error = spa_ld_get_props(spa);
- if (error != 0)
- return (error);
-
- /*
- * Retrieve the list of auxiliary devices - cache devices and spares -
- * and open them.
- */
- error = spa_ld_open_aux_vdevs(spa, type);
- if (error != 0)
- return (error);
-
- /*
- * Load the metadata for all vdevs. Also check if unopenable devices
- * should be autoreplaced.
- */
- error = spa_ld_load_vdev_metadata(spa);
- if (error != 0)
- return (error);
-
- error = spa_ld_load_dedup_tables(spa);
- if (error != 0)
- return (error);
-
- /*
- * Verify the logs now to make sure we don't have any unexpected errors
- * when we claim log blocks later.
- */
- error = spa_ld_verify_logs(spa, type, ereport);
- if (error != 0)
- return (error);
-
- if (missing_feat_write) {
- ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
-
- /*
- * At this point, we know that we can open the pool in
- * read-only mode but not read-write mode. We now have enough
- * information and can return to userland.
- */
- return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
- ENOTSUP));
- }
-
- /*
- * Traverse the last txgs to make sure the pool was left off in a safe
- * state. When performing an extreme rewind, we verify the whole pool,
- * which can take a very long time.
- */
- error = spa_ld_verify_pool_data(spa);
- if (error != 0)
- return (error);
-
- /*
- * Calculate the deflated space for the pool. This must be done before
- * we write anything to the pool because we'd need to update the space
- * accounting using the deflated sizes.
- */
- spa_update_dspace(spa);
-
- /*
- * We have now retrieved all the information we needed to open the
- * pool. If we are importing the pool in read-write mode, a few
- * additional steps must be performed to finish the import.
- */
- if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
- spa->spa_load_max_txg == UINT64_MAX)) {
- uint64_t config_cache_txg = spa->spa_config_txg;
-
- ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
-
- /*
- * In case of a checkpoint rewind, log the original txg
- * of the checkpointed uberblock.
- */
- if (checkpoint_rewind) {
- spa_history_log_internal(spa, "checkpoint rewind",
- NULL, "rewound state to txg=%llu",
- (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
- }
-
- /*
- * Traverse the ZIL and claim all blocks.
- */
- spa_ld_claim_log_blocks(spa);
-
- /*
- * Kick-off the syncing thread.
- */
- spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
- mmp_thread_start(spa);
-
- /*
- * Wait for all claims to sync. We sync up to the highest
- * claimed log block birth time so that claimed log blocks
- * don't appear to be from the future. spa_claim_max_txg
- * will have been set for us by ZIL traversal operations
- * performed above.
- */
- txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
-
- /*
- * Check if we need to request an update of the config. On the
- * next sync, we would update the config stored in vdev labels
- * and the cachefile (by default /etc/zfs/zpool.cache).
- */
- spa_ld_check_for_config_update(spa, config_cache_txg,
- update_config_cache);
-
- /*
- * Check all DTLs to see if anything needs resilvering.
- */
- if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
- vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
-
- /*
- * Log the fact that we booted up (so that we can detect if
- * we rebooted in the middle of an operation).
- */
- spa_history_log_version(spa, "open");
-
- spa_restart_removal(spa);
- spa_spawn_aux_threads(spa);
-
- /*
- * Delete any inconsistent datasets.
- *
- * Note:
- * Since we may be issuing deletes for clones here,
- * we make sure to do so after we've spawned all the
- * auxiliary threads above (from which the livelist
- * deletion zthr is part of).
- */
- (void) dmu_objset_find(spa_name(spa),
- dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
-
- /*
- * Clean up any stale temporary dataset userrefs.
- */
- dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- vdev_initialize_restart(spa->spa_root_vdev);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- }
-
- spa_load_note(spa, "LOADED");
-
- return (0);
-}
-
-static int
-spa_load_retry(spa_t *spa, spa_load_state_t state)
-{
- int mode = spa->spa_mode;
-
- spa_unload(spa);
- spa_deactivate(spa);
-
- spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
-
- spa_activate(spa, mode);
- spa_async_suspend(spa);
-
- spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
- (u_longlong_t)spa->spa_load_max_txg);
-
- return (spa_load(spa, state, SPA_IMPORT_EXISTING));
-}
-
-/*
- * If spa_load() fails this function will try loading prior txg's. If
- * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
- * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
- * function will not rewind the pool and will return the same error as
- * spa_load().
- */
-static int
-spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
- int rewind_flags)
-{
- nvlist_t *loadinfo = NULL;
- nvlist_t *config = NULL;
- int load_error, rewind_error;
- uint64_t safe_rewind_txg;
- uint64_t min_txg;
-
- if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
- spa->spa_load_max_txg = spa->spa_load_txg;
- spa_set_log_state(spa, SPA_LOG_CLEAR);
- } else {
- spa->spa_load_max_txg = max_request;
- if (max_request != UINT64_MAX)
- spa->spa_extreme_rewind = B_TRUE;
- }
-
- load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
- if (load_error == 0)
- return (0);
- if (load_error == ZFS_ERR_NO_CHECKPOINT) {
- /*
- * When attempting checkpoint-rewind on a pool with no
- * checkpoint, we should not attempt to load uberblocks
- * from previous txgs when spa_load fails.
- */
- ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
- return (load_error);
- }
-
- if (spa->spa_root_vdev != NULL)
- config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-
- spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
- spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
-
- if (rewind_flags & ZPOOL_NEVER_REWIND) {
- nvlist_free(config);
- return (load_error);
- }
-
- if (state == SPA_LOAD_RECOVER) {
- /* Price of rolling back is discarding txgs, including log */
- spa_set_log_state(spa, SPA_LOG_CLEAR);
- } else {
- /*
- * If we aren't rolling back save the load info from our first
- * import attempt so that we can restore it after attempting
- * to rewind.
- */
- loadinfo = spa->spa_load_info;
- spa->spa_load_info = fnvlist_alloc();
- }
-
- spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
- safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
- min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
- TXG_INITIAL : safe_rewind_txg;
-
- /*
- * Continue as long as we're finding errors, we're still within
- * the acceptable rewind range, and we're still finding uberblocks
- */
- while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
- spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
- if (spa->spa_load_max_txg < safe_rewind_txg)
- spa->spa_extreme_rewind = B_TRUE;
- rewind_error = spa_load_retry(spa, state);
- }
-
- spa->spa_extreme_rewind = B_FALSE;
- spa->spa_load_max_txg = UINT64_MAX;
-
- if (config && (rewind_error || state != SPA_LOAD_RECOVER))
- spa_config_set(spa, config);
- else
- nvlist_free(config);
-
- if (state == SPA_LOAD_RECOVER) {
- ASSERT3P(loadinfo, ==, NULL);
- return (rewind_error);
- } else {
- /* Store the rewind info as part of the initial load info */
- fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
- spa->spa_load_info);
-
- /* Restore the initial load info */
- fnvlist_free(spa->spa_load_info);
- spa->spa_load_info = loadinfo;
-
- return (load_error);
- }
-}
-
-/*
- * Pool Open/Import
- *
- * The import case is identical to an open except that the configuration is sent
- * down from userland, instead of grabbed from the configuration cache. For the
- * case of an open, the pool configuration will exist in the
- * POOL_STATE_UNINITIALIZED state.
- *
- * The stats information (gen/count/ustats) is used to gather vdev statistics at
- * the same time open the pool, without having to keep around the spa_t in some
- * ambiguous state.
- */
-static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
- nvlist_t **config)
-{
- spa_t *spa;
- spa_load_state_t state = SPA_LOAD_OPEN;
- int error;
- int locked = B_FALSE;
- int firstopen = B_FALSE;
-
- *spapp = NULL;
-
- /*
- * As disgusting as this is, we need to support recursive calls to this
- * function because dsl_dir_open() is called during spa_load(), and ends
- * up calling spa_open() again. The real fix is to figure out how to
- * avoid dsl_dir_open() calling this in the first place.
- */
- if (mutex_owner(&spa_namespace_lock) != curthread) {
- mutex_enter(&spa_namespace_lock);
- locked = B_TRUE;
- }
-
- if ((spa = spa_lookup(pool)) == NULL) {
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(ENOENT));
- }
-
- if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
- zpool_load_policy_t policy;
-
- firstopen = B_TRUE;
-
- zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
- &policy);
- if (policy.zlp_rewind & ZPOOL_DO_REWIND)
- state = SPA_LOAD_RECOVER;
-
- spa_activate(spa, spa_mode_global);
-
- if (state != SPA_LOAD_RECOVER)
- spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
- spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
-
- zfs_dbgmsg("spa_open_common: opening %s", pool);
- error = spa_load_best(spa, state, policy.zlp_txg,
- policy.zlp_rewind);
-
- if (error == EBADF) {
- /*
- * If vdev_validate() returns failure (indicated by
- * EBADF), it indicates that one of the vdevs indicates
- * that the pool has been exported or destroyed. If
- * this is the case, the config cache is out of sync and
- * we should remove the pool from the namespace.
- */
- spa_unload(spa);
- spa_deactivate(spa);
- spa_write_cachefile(spa, B_TRUE, B_TRUE);
- spa_remove(spa);
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(ENOENT));
- }
-
- if (error) {
- /*
- * We can't open the pool, but we still have useful
- * information: the state of each vdev after the
- * attempted vdev_open(). Return this to the user.
- */
- if (config != NULL && spa->spa_config) {
- VERIFY(nvlist_dup(spa->spa_config, config,
- KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist(*config,
- ZPOOL_CONFIG_LOAD_INFO,
- spa->spa_load_info) == 0);
- }
- spa_unload(spa);
- spa_deactivate(spa);
- spa->spa_last_open_failed = error;
- if (locked)
- mutex_exit(&spa_namespace_lock);
- *spapp = NULL;
- return (error);
- }
- }
-
- spa_open_ref(spa, tag);
-
- if (config != NULL)
- *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-
- /*
- * If we've recovered the pool, pass back any information we
- * gathered while doing the load.
- */
- if (state == SPA_LOAD_RECOVER) {
- VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
- spa->spa_load_info) == 0);
- }
-
- if (locked) {
- spa->spa_last_open_failed = 0;
- spa->spa_last_ubsync_txg = 0;
- spa->spa_load_txg = 0;
- mutex_exit(&spa_namespace_lock);
-#ifdef __FreeBSD__
-#ifdef _KERNEL
- if (firstopen)
- zvol_create_minors(spa, spa->spa_name);
-#endif
-#endif
- }
-
- *spapp = spa;
-
- return (0);
-}
-
-int
-spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
- nvlist_t **config)
-{
- return (spa_open_common(name, spapp, tag, policy, config));
-}
-
-int
-spa_open(const char *name, spa_t **spapp, void *tag)
-{
- return (spa_open_common(name, spapp, tag, NULL, NULL));
-}
-
-/*
- * Lookup the given spa_t, incrementing the inject count in the process,
- * preventing it from being exported or destroyed.
- */
-spa_t *
-spa_inject_addref(char *name)
-{
- spa_t *spa;
-
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(name)) == NULL) {
- mutex_exit(&spa_namespace_lock);
- return (NULL);
- }
- spa->spa_inject_ref++;
- mutex_exit(&spa_namespace_lock);
-
- return (spa);
-}
-
-void
-spa_inject_delref(spa_t *spa)
-{
- mutex_enter(&spa_namespace_lock);
- spa->spa_inject_ref--;
- mutex_exit(&spa_namespace_lock);
-}
-
-/*
- * Add spares device information to the nvlist.
- */
-static void
-spa_add_spares(spa_t *spa, nvlist_t *config)
-{
- nvlist_t **spares;
- uint_t i, nspares;
- nvlist_t *nvroot;
- uint64_t guid;
- vdev_stat_t *vs;
- uint_t vsc;
- uint64_t pool;
-
- ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
-
- if (spa->spa_spares.sav_count == 0)
- return;
-
- VERIFY(nvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
- if (nspares != 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
-
- /*
- * Go through and find any spares which have since been
- * repurposed as an active spare. If this is the case, update
- * their status appropriately.
- */
- for (i = 0; i < nspares; i++) {
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &guid) == 0);
- if (spa_spare_exists(guid, &pool, NULL) &&
- pool != 0ULL) {
- VERIFY(nvlist_lookup_uint64_array(
- spares[i], ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &vsc) == 0);
- vs->vs_state = VDEV_STATE_CANT_OPEN;
- vs->vs_aux = VDEV_AUX_SPARED;
- }
- }
- }
-}
-
-/*
- * Add l2cache device information to the nvlist, including vdev stats.
- */
-static void
-spa_add_l2cache(spa_t *spa, nvlist_t *config)
-{
- nvlist_t **l2cache;
- uint_t i, j, nl2cache;
- nvlist_t *nvroot;
- uint64_t guid;
- vdev_t *vd;
- vdev_stat_t *vs;
- uint_t vsc;
-
- ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
-
- if (spa->spa_l2cache.sav_count == 0)
- return;
-
- VERIFY(nvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
- if (nl2cache != 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot,
- ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot,
- ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
-
- /*
- * Update level 2 cache device stats.
- */
-
- for (i = 0; i < nl2cache; i++) {
- VERIFY(nvlist_lookup_uint64(l2cache[i],
- ZPOOL_CONFIG_GUID, &guid) == 0);
-
- vd = NULL;
- for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
- if (guid ==
- spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
- vd = spa->spa_l2cache.sav_vdevs[j];
- break;
- }
- }
- ASSERT(vd != NULL);
-
- VERIFY(nvlist_lookup_uint64_array(l2cache[i],
- ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
- == 0);
- vdev_get_stats(vd, vs);
- }
- }
-}
-
-static void
-spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
-
- /* We may be unable to read features if pool is suspended. */
- if (spa_suspended(spa))
- return;
-
- if (spa->spa_feat_for_read_obj != 0) {
- for (zap_cursor_init(&zc, spa->spa_meta_objset,
- spa->spa_feat_for_read_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- ASSERT(za.za_integer_length == sizeof (uint64_t) &&
- za.za_num_integers == 1);
- VERIFY0(nvlist_add_uint64(features, za.za_name,
- za.za_first_integer));
- }
- zap_cursor_fini(&zc);
- }
-
- if (spa->spa_feat_for_write_obj != 0) {
- for (zap_cursor_init(&zc, spa->spa_meta_objset,
- spa->spa_feat_for_write_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- ASSERT(za.za_integer_length == sizeof (uint64_t) &&
- za.za_num_integers == 1);
- VERIFY0(nvlist_add_uint64(features, za.za_name,
- za.za_first_integer));
- }
- zap_cursor_fini(&zc);
- }
-}
-
-static void
-spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
-{
- int i;
-
- for (i = 0; i < SPA_FEATURES; i++) {
- zfeature_info_t feature = spa_feature_table[i];
- uint64_t refcount;
-
- if (feature_get_refcount(spa, &feature, &refcount) != 0)
- continue;
-
- VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
- }
-}
-
-/*
- * Store a list of pool features and their reference counts in the
- * config.
- *
- * The first time this is called on a spa, allocate a new nvlist, fetch
- * the pool features and reference counts from disk, then save the list
- * in the spa. In subsequent calls on the same spa use the saved nvlist
- * and refresh its values from the cached reference counts. This
- * ensures we don't block here on I/O on a suspended pool so 'zpool
- * clear' can resume the pool.
- */
-static void
-spa_add_feature_stats(spa_t *spa, nvlist_t *config)
-{
- nvlist_t *features;
-
- ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
-
- mutex_enter(&spa->spa_feat_stats_lock);
- features = spa->spa_feat_stats;
-
- if (features != NULL) {
- spa_feature_stats_from_cache(spa, features);
- } else {
- VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
- spa->spa_feat_stats = features;
- spa_feature_stats_from_disk(spa, features);
- }
-
- VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
- features));
-
- mutex_exit(&spa->spa_feat_stats_lock);
-}
-
-int
-spa_get_stats(const char *name, nvlist_t **config,
- char *altroot, size_t buflen)
-{
- int error;
- spa_t *spa;
-
- *config = NULL;
- error = spa_open_common(name, &spa, FTAG, NULL, config);
-
- if (spa != NULL) {
- /*
- * This still leaves a window of inconsistency where the spares
- * or l2cache devices could change and the config would be
- * self-inconsistent.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- if (*config != NULL) {
- uint64_t loadtimes[2];
-
- loadtimes[0] = spa->spa_loaded_ts.tv_sec;
- loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
- VERIFY(nvlist_add_uint64_array(*config,
- ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
-
- VERIFY(nvlist_add_uint64(*config,
- ZPOOL_CONFIG_ERRCOUNT,
- spa_get_errlog_size(spa)) == 0);
-
- if (spa_suspended(spa)) {
- VERIFY(nvlist_add_uint64(*config,
- ZPOOL_CONFIG_SUSPENDED,
- spa->spa_failmode) == 0);
- VERIFY(nvlist_add_uint64(*config,
- ZPOOL_CONFIG_SUSPENDED_REASON,
- spa->spa_suspended) == 0);
- }
-
- spa_add_spares(spa, *config);
- spa_add_l2cache(spa, *config);
- spa_add_feature_stats(spa, *config);
- }
- }
-
- /*
- * We want to get the alternate root even for faulted pools, so we cheat
- * and call spa_lookup() directly.
- */
- if (altroot) {
- if (spa == NULL) {
- mutex_enter(&spa_namespace_lock);
- spa = spa_lookup(name);
- if (spa)
- spa_altroot(spa, altroot, buflen);
- else
- altroot[0] = '\0';
- spa = NULL;
- mutex_exit(&spa_namespace_lock);
- } else {
- spa_altroot(spa, altroot, buflen);
- }
- }
-
- if (spa != NULL) {
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- spa_close(spa, FTAG);
- }
-
- return (error);
-}
-
-/*
- * Validate that the auxiliary device array is well formed. We must have an
- * array of nvlists, each which describes a valid leaf vdev. If this is an
- * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
- * specified, as long as they are well-formed.
- */
-static int
-spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
- spa_aux_vdev_t *sav, const char *config, uint64_t version,
- vdev_labeltype_t label)
-{
- nvlist_t **dev;
- uint_t i, ndev;
- vdev_t *vd;
- int error;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- /*
- * It's acceptable to have no devs specified.
- */
- if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
- return (0);
-
- if (ndev == 0)
- return (SET_ERROR(EINVAL));
-
- /*
- * Make sure the pool is formatted with a version that supports this
- * device type.
- */
- if (spa_version(spa) < version)
- return (SET_ERROR(ENOTSUP));
-
- /*
- * Set the pending device list so we correctly handle device in-use
- * checking.
- */
- sav->sav_pending = dev;
- sav->sav_npending = ndev;
-
- for (i = 0; i < ndev; i++) {
- if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
- mode)) != 0)
- goto out;
-
- if (!vd->vdev_ops->vdev_op_leaf) {
- vdev_free(vd);
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
- vd->vdev_top = vd;
-
- if ((error = vdev_open(vd)) == 0 &&
- (error = vdev_label_init(vd, crtxg, label)) == 0) {
- VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- }
-
- vdev_free(vd);
-
- if (error &&
- (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
- goto out;
- else
- error = 0;
- }
-
-out:
- sav->sav_pending = NULL;
- sav->sav_npending = 0;
- return (error);
-}
-
-static int
-spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
-{
- int error;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
- &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
- VDEV_LABEL_SPARE)) != 0) {
- return (error);
- }
-
- return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
- &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
- VDEV_LABEL_L2CACHE));
-}
-
-static void
-spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
- const char *config)
-{
- int i;
-
- if (sav->sav_config != NULL) {
- nvlist_t **olddevs;
- uint_t oldndevs;
- nvlist_t **newdevs;
-
- /*
- * Generate new dev list by concatentating with the
- * current dev list.
- */
- VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
- &olddevs, &oldndevs) == 0);
-
- newdevs = kmem_alloc(sizeof (void *) *
- (ndevs + oldndevs), KM_SLEEP);
- for (i = 0; i < oldndevs; i++)
- VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
- KM_SLEEP) == 0);
- for (i = 0; i < ndevs; i++)
- VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
- KM_SLEEP) == 0);
-
- VERIFY(nvlist_remove(sav->sav_config, config,
- DATA_TYPE_NVLIST_ARRAY) == 0);
-
- VERIFY(nvlist_add_nvlist_array(sav->sav_config,
- config, newdevs, ndevs + oldndevs) == 0);
- for (i = 0; i < oldndevs + ndevs; i++)
- nvlist_free(newdevs[i]);
- kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
- } else {
- /*
- * Generate a new dev list.
- */
- VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
- devs, ndevs) == 0);
- }
-}
-
-/*
- * Stop and drop level 2 ARC devices
- */
-void
-spa_l2cache_drop(spa_t *spa)
-{
- vdev_t *vd;
- int i;
- spa_aux_vdev_t *sav = &spa->spa_l2cache;
-
- for (i = 0; i < sav->sav_count; i++) {
- uint64_t pool;
-
- vd = sav->sav_vdevs[i];
- ASSERT(vd != NULL);
-
- if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
- pool != 0ULL && l2arc_vdev_present(vd))
- l2arc_remove_vdev(vd);
- }
-}
-
-/*
- * Pool Creation
- */
-int
-spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
- nvlist_t *zplprops)
-{
- spa_t *spa;
- char *altroot = NULL;
- vdev_t *rvd;
- dsl_pool_t *dp;
- dmu_tx_t *tx;
- int error = 0;
- uint64_t txg = TXG_INITIAL;
- nvlist_t **spares, **l2cache;
- uint_t nspares, nl2cache;
- uint64_t version, obj;
- boolean_t has_features;
- char *poolname;
- nvlist_t *nvl;
-
- if (props == NULL ||
- nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
- poolname = (char *)pool;
-
- /*
- * If this pool already exists, return failure.
- */
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(poolname) != NULL) {
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EEXIST));
- }
-
- /*
- * Allocate a new spa_t structure.
- */
- nvl = fnvlist_alloc();
- fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
- (void) nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(poolname, nvl, altroot);
- fnvlist_free(nvl);
- spa_activate(spa, spa_mode_global);
-
- if (props && (error = spa_prop_validate(spa, props))) {
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
-
- /*
- * Temporary pool names should never be written to disk.
- */
- if (poolname != pool)
- spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
-
- has_features = B_FALSE;
- for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
- elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
- if (zpool_prop_feature(nvpair_name(elem)))
- has_features = B_TRUE;
- }
-
- if (has_features || nvlist_lookup_uint64(props,
- zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
- version = SPA_VERSION;
- }
- ASSERT(SPA_VERSION_IS_SUPPORTED(version));
-
- spa->spa_first_txg = txg;
- spa->spa_uberblock.ub_txg = txg - 1;
- spa->spa_uberblock.ub_version = version;
- spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_load_state = SPA_LOAD_CREATE;
- spa->spa_removing_phys.sr_state = DSS_NONE;
- spa->spa_removing_phys.sr_removing_vdev = -1;
- spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
- spa->spa_indirect_vdevs_loaded = B_TRUE;
-
- /*
- * Create "The Godfather" zio to hold all async IOs
- */
- spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
- KM_SLEEP);
- for (int i = 0; i < max_ncpus; i++) {
- spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
- ZIO_FLAG_GODFATHER);
- }
-
- /*
- * Create the root vdev.
- */
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
-
- ASSERT(error != 0 || rvd != NULL);
- ASSERT(error != 0 || spa->spa_root_vdev == rvd);
-
- if (error == 0 && !zfs_allocatable_devs(nvroot))
- error = SET_ERROR(EINVAL);
-
- if (error == 0 &&
- (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
- (error = spa_validate_aux(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) == 0) {
- /*
- * instantiate the metaslab groups (this will dirty the vdevs)
- * we can no longer error exit past this point
- */
- for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
-
- vdev_ashift_optimize(vd);
- vdev_metaslab_set_size(vd);
- vdev_expand(vd, txg);
- }
- }
-
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- if (error != 0) {
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
-
- /*
- * Get the list of spares, if specified.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_load_spares(spa);
- spa_config_exit(spa, SCL_ALL, FTAG);
- spa->spa_spares.sav_sync = B_TRUE;
- }
-
- /*
- * Get the list of level 2 cache devices, if specified.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache) == 0) {
- VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_load_l2cache(spa);
- spa_config_exit(spa, SCL_ALL, FTAG);
- spa->spa_l2cache.sav_sync = B_TRUE;
- }
-
- spa->spa_is_initializing = B_TRUE;
- spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
- spa->spa_meta_objset = dp->dp_meta_objset;
- spa->spa_is_initializing = B_FALSE;
-
- /*
- * Create DDTs (dedup tables).
- */
- ddt_create(spa);
-
- spa_update_dspace(spa);
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- /*
- * Create the pool config object.
- */
- spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
- DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
-
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add pool config");
- }
-
- if (spa_version(spa) >= SPA_VERSION_FEATURES)
- spa_feature_create_zap_objects(spa, tx);
-
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
- sizeof (uint64_t), 1, &version, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add pool version");
- }
-
- /* Newly created pools with the right version are always deflated. */
- if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
- spa->spa_deflate = TRUE;
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add deflate");
- }
- }
-
- /*
- * Create the deferred-free bpobj. Turn off compression
- * because sync-to-convergence takes longer if the blocksize
- * keeps changing.
- */
- obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
- dmu_object_set_compress(spa->spa_meta_objset, obj,
- ZIO_COMPRESS_OFF, tx);
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
- sizeof (uint64_t), 1, &obj, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add bpobj");
- }
- VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
- spa->spa_meta_objset, obj));
-
- /*
- * Create the pool's history object.
- */
- if (version >= SPA_VERSION_ZPOOL_HISTORY)
- spa_history_create_obj(spa, tx);
-
- /*
- * Generate some random noise for salted checksums to operate on.
- */
- (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
- sizeof (spa->spa_cksum_salt.zcs_bytes));
-
- /*
- * Set pool properties.
- */
- spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
- spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
- spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
- spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
- spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
-
- if (props != NULL) {
- spa_configfile_set(spa, props, B_FALSE);
- spa_sync_props(props, tx);
- }
-
- dmu_tx_commit(tx);
-
- spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
- mmp_thread_start(spa);
-
- /*
- * We explicitly wait for the first transaction to complete so that our
- * bean counters are appropriately updated.
- */
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- spa_spawn_aux_threads(spa);
-
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
-
- spa_history_log_version(spa, "create");
-
- /*
- * Don't count references from objsets that are already closed
- * and are making their way through the eviction process.
- */
- spa_evicting_os_wait(spa);
- spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
- spa->spa_load_state = SPA_LOAD_NONE;
-
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-#ifdef _KERNEL
-#ifdef illumos
-/*
- * Get the root pool information from the root disk, then import the root pool
- * during the system boot up time.
- */
-extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
-
-static nvlist_t *
-spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
-{
- nvlist_t *config;
- nvlist_t *nvtop, *nvroot;
- uint64_t pgid;
-
- if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
- return (NULL);
-
- /*
- * Add this top-level vdev to the child array.
- */
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvtop) == 0);
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &pgid) == 0);
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
-
- /*
- * Put this pool's top-level vdevs into a root vdev.
- */
- VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_ROOT) == 0);
- VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
- VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &nvtop, 1) == 0);
-
- /*
- * Replace the existing vdev_tree with the new root vdev in
- * this pool's configuration (remove the old, add the new).
- */
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
- nvlist_free(nvroot);
- return (config);
-}
-
-/*
- * Walk the vdev tree and see if we can find a device with "better"
- * configuration. A configuration is "better" if the label on that
- * device has a more recent txg.
- */
-static void
-spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
-{
- for (int c = 0; c < vd->vdev_children; c++)
- spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
-
- if (vd->vdev_ops->vdev_op_leaf) {
- nvlist_t *label;
- uint64_t label_txg;
-
- if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
- &label) != 0)
- return;
-
- VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
- &label_txg) == 0);
-
- /*
- * Do we have a better boot device?
- */
- if (label_txg > *txg) {
- *txg = label_txg;
- *avd = vd;
- }
- nvlist_free(label);
- }
-}
-
-/*
- * Import a root pool.
- *
- * For x86. devpath_list will consist of devid and/or physpath name of
- * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
- * The GRUB "findroot" command will return the vdev we should boot.
- *
- * For Sparc, devpath_list consists the physpath name of the booting device
- * no matter the rootpool is a single device pool or a mirrored pool.
- * e.g.
- * "/pci@1f,0/ide@d/disk@0,0:a"
- */
-int
-spa_import_rootpool(char *devpath, char *devid)
-{
- spa_t *spa;
- vdev_t *rvd, *bvd, *avd = NULL;
- nvlist_t *config, *nvtop;
- uint64_t guid, txg;
- char *pname;
- int error;
-
- /*
- * Read the label from the boot device and generate a configuration.
- */
- config = spa_generate_rootconf(devpath, devid, &guid);
-#if defined(_OBP) && defined(_KERNEL)
- if (config == NULL) {
- if (strstr(devpath, "/iscsi/ssd") != NULL) {
- /* iscsi boot */
- get_iscsi_bootpath_phy(devpath);
- config = spa_generate_rootconf(devpath, devid, &guid);
- }
- }
-#endif
- if (config == NULL) {
- cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
- devpath);
- return (SET_ERROR(EIO));
- }
-
- VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &pname) == 0);
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
-
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(pname)) != NULL) {
- /*
- * Remove the existing root pool from the namespace so that we
- * can replace it with the correct config we just read in.
- */
- spa_remove(spa);
- }
-
- spa = spa_add(pname, config, NULL);
- spa->spa_is_root = B_TRUE;
- spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &spa->spa_ubsync.ub_version) != 0)
- spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
-
- /*
- * Build up a vdev tree based on the boot device's label config.
- */
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvtop) == 0);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
- VDEV_ALLOC_ROOTPOOL);
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (error) {
- mutex_exit(&spa_namespace_lock);
- nvlist_free(config);
- cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
- pname);
- return (error);
- }
-
- /*
- * Get the boot vdev.
- */
- if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
- cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
- (u_longlong_t)guid);
- error = SET_ERROR(ENOENT);
- goto out;
- }
-
- /*
- * Determine if there is a better boot device.
- */
- avd = bvd;
- spa_alt_rootvdev(rvd, &avd, &txg);
- if (avd != bvd) {
- cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
- "try booting from '%s'", avd->vdev_path);
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
- /*
- * If the boot device is part of a spare vdev then ensure that
- * we're booting off the active spare.
- */
- if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
- !bvd->vdev_isspare) {
- cmn_err(CE_NOTE, "The boot device is currently spared. Please "
- "try booting from '%s'",
- bvd->vdev_parent->
- vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
- error = 0;
-out:
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- vdev_free(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- mutex_exit(&spa_namespace_lock);
-
- nvlist_free(config);
- return (error);
-}
-
-#else /* !illumos */
-
-extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
- uint64_t *count);
-
-static nvlist_t *
-spa_generate_rootconf(const char *name)
-{
- nvlist_t **configs, **tops;
- nvlist_t *config;
- nvlist_t *best_cfg, *nvtop, *nvroot;
- uint64_t *holes;
- uint64_t best_txg;
- uint64_t nchildren;
- uint64_t pgid;
- uint64_t count;
- uint64_t i;
- uint_t nholes;
-
- if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
- return (NULL);
-
- ASSERT3U(count, !=, 0);
- best_txg = 0;
- for (i = 0; i < count; i++) {
- uint64_t txg;
-
- VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
- &txg) == 0);
- if (txg > best_txg) {
- best_txg = txg;
- best_cfg = configs[i];
- }
- }
-
- nchildren = 1;
- nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
- holes = NULL;
- nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
- &holes, &nholes);
-
- tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
- for (i = 0; i < nchildren; i++) {
- if (i >= count)
- break;
- if (configs[i] == NULL)
- continue;
- VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
- &nvtop) == 0);
- nvlist_dup(nvtop, &tops[i], KM_SLEEP);
- }
- for (i = 0; holes != NULL && i < nholes; i++) {
- if (i >= nchildren)
- continue;
- if (tops[holes[i]] != NULL)
- continue;
- nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
- VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_HOLE) == 0);
- VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
- holes[i]) == 0);
- VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
- 0) == 0);
- }
- for (i = 0; i < nchildren; i++) {
- if (tops[i] != NULL)
- continue;
- nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
- VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_MISSING) == 0);
- VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
- i) == 0);
- VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
- 0) == 0);
- }
-
- /*
- * Create pool config based on the best vdev config.
- */
- nvlist_dup(best_cfg, &config, KM_SLEEP);
-
- /*
- * Put this pool's top-level vdevs into a root vdev.
- */
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &pgid) == 0);
- VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_ROOT) == 0);
- VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
- VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- tops, nchildren) == 0);
-
- /*
- * Replace the existing vdev_tree with the new root vdev in
- * this pool's configuration (remove the old, add the new).
- */
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
-
- /*
- * Drop vdev config elements that should not be present at pool level.
- */
- nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
- nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
-
- for (i = 0; i < count; i++)
- nvlist_free(configs[i]);
- kmem_free(configs, count * sizeof(void *));
- for (i = 0; i < nchildren; i++)
- nvlist_free(tops[i]);
- kmem_free(tops, nchildren * sizeof(void *));
- nvlist_free(nvroot);
- return (config);
-}
-
-int
-spa_import_rootpool(const char *name, bool checkpointrewind)
-{
- spa_t *spa;
- vdev_t *rvd, *bvd, *avd = NULL;
- nvlist_t *config, *nvtop;
- uint64_t txg;
- char *pname;
- int error;
-
- /*
- * Read the label from the boot device and generate a configuration.
- */
- config = spa_generate_rootconf(name);
-
- mutex_enter(&spa_namespace_lock);
- if (config != NULL) {
- VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &pname) == 0 && strcmp(name, pname) == 0);
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
- == 0);
-
- if ((spa = spa_lookup(pname)) != NULL) {
- /*
- * The pool could already be imported,
- * e.g., after reboot -r.
- */
- if (spa->spa_state == POOL_STATE_ACTIVE) {
- mutex_exit(&spa_namespace_lock);
- nvlist_free(config);
- return (0);
- }
-
- /*
- * Remove the existing root pool from the namespace so
- * that we can replace it with the correct config
- * we just read in.
- */
- spa_remove(spa);
- }
- spa = spa_add(pname, config, NULL);
-
- /*
- * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
- * via spa_version().
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &spa->spa_ubsync.ub_version) != 0)
- spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
- } else if ((spa = spa_lookup(name)) == NULL) {
- mutex_exit(&spa_namespace_lock);
- nvlist_free(config);
- cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
- name);
- return (EIO);
- } else {
- VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
- }
- spa->spa_is_root = B_TRUE;
- spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
- if (checkpointrewind) {
- spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT;
- }
-
- /*
- * Build up a vdev tree based on the boot device's label config.
- */
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvtop) == 0);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
- VDEV_ALLOC_ROOTPOOL);
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (error) {
- mutex_exit(&spa_namespace_lock);
- nvlist_free(config);
- cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
- pname);
- return (error);
- }
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- vdev_free(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- mutex_exit(&spa_namespace_lock);
-
- nvlist_free(config);
- return (0);
-}
-
-#endif /* illumos */
-#endif /* _KERNEL */
-
-/*
- * Import a non-root pool into the system.
- */
-int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
-{
- spa_t *spa;
- char *altroot = NULL;
- spa_load_state_t state = SPA_LOAD_IMPORT;
- zpool_load_policy_t policy;
- uint64_t mode = spa_mode_global;
- uint64_t readonly = B_FALSE;
- int error;
- nvlist_t *nvroot;
- nvlist_t **spares, **l2cache;
- uint_t nspares, nl2cache;
-
- /*
- * If a pool with this name exists, return failure.
- */
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EEXIST));
- }
-
- /*
- * Create and initialize the spa structure.
- */
- (void) nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- (void) nvlist_lookup_uint64(props,
- zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
- if (readonly)
- mode = FREAD;
- spa = spa_add(pool, config, altroot);
- spa->spa_import_flags = flags;
-
- /*
- * Verbatim import - Take a pool and insert it into the namespace
- * as if it had been loaded at boot.
- */
- if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
- if (props != NULL)
- spa_configfile_set(spa, props, B_FALSE);
-
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
- zfs_dbgmsg("spa_import: verbatim import of %s", pool);
- mutex_exit(&spa_namespace_lock);
- return (0);
- }
-
- spa_activate(spa, mode);
-
- /*
- * Don't start async tasks until we know everything is healthy.
- */
- spa_async_suspend(spa);
-
- zpool_get_load_policy(config, &policy);
- if (policy.zlp_rewind & ZPOOL_DO_REWIND)
- state = SPA_LOAD_RECOVER;
-
- spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
-
- if (state != SPA_LOAD_RECOVER) {
- spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
- zfs_dbgmsg("spa_import: importing %s", pool);
- } else {
- zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
- "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
- }
- error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
-
- /*
- * Propagate anything learned while loading the pool and pass it
- * back to caller (i.e. rewind info, missing devices, etc).
- */
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
- spa->spa_load_info) == 0);
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- /*
- * Toss any existing sparelist, as it doesn't have any validity
- * anymore, and conflicts with spa_has_spare().
- */
- if (spa->spa_spares.sav_config) {
- nvlist_free(spa->spa_spares.sav_config);
- spa->spa_spares.sav_config = NULL;
- spa_load_spares(spa);
- }
- if (spa->spa_l2cache.sav_config) {
- nvlist_free(spa->spa_l2cache.sav_config);
- spa->spa_l2cache.sav_config = NULL;
- spa_load_l2cache(spa);
- }
-
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (error == 0)
- error = spa_validate_aux(spa, nvroot, -1ULL,
- VDEV_ALLOC_SPARE);
- if (error == 0)
- error = spa_validate_aux(spa, nvroot, -1ULL,
- VDEV_ALLOC_L2CACHE);
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- if (props != NULL)
- spa_configfile_set(spa, props, B_FALSE);
-
- if (error != 0 || (props && spa_writeable(spa) &&
- (error = spa_prop_set(spa, props)))) {
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
-
- spa_async_resume(spa);
-
- /*
- * Override any spares and level 2 cache devices as specified by
- * the user, as these may have correct device names/devids, etc.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- if (spa->spa_spares.sav_config)
- VERIFY(nvlist_remove(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
- else
- VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_load_spares(spa);
- spa_config_exit(spa, SCL_ALL, FTAG);
- spa->spa_spares.sav_sync = B_TRUE;
- }
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache) == 0) {
- if (spa->spa_l2cache.sav_config)
- VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
- else
- VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_load_l2cache(spa);
- spa_config_exit(spa, SCL_ALL, FTAG);
- spa->spa_l2cache.sav_sync = B_TRUE;
- }
-
- /*
- * Check for any removed devices.
- */
- if (spa->spa_autoreplace) {
- spa_aux_check_removed(&spa->spa_spares);
- spa_aux_check_removed(&spa->spa_l2cache);
- }
-
- if (spa_writeable(spa)) {
- /*
- * Update the config cache to include the newly-imported pool.
- */
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
- }
-
- /*
- * It's possible that the pool was expanded while it was exported.
- * We kick off an async task to handle this for us.
- */
- spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
-
- spa_history_log_version(spa, "import");
-
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
-
- mutex_exit(&spa_namespace_lock);
-
-#ifdef __FreeBSD__
-#ifdef _KERNEL
- zvol_create_minors(spa, pool);
-#endif
-#endif
- return (0);
-}
-
-nvlist_t *
-spa_tryimport(nvlist_t *tryconfig)
-{
- nvlist_t *config = NULL;
- char *poolname, *cachefile;
- spa_t *spa;
- uint64_t state;
- int error;
- zpool_load_policy_t policy;
-
- if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
- return (NULL);
-
- if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
- return (NULL);
-
- /*
- * Create and initialize the spa structure.
- */
- mutex_enter(&spa_namespace_lock);
- spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
- spa_activate(spa, FREAD);
-
- /*
- * Rewind pool if a max txg was provided.
- */
- zpool_get_load_policy(spa->spa_config, &policy);
- if (policy.zlp_txg != UINT64_MAX) {
- spa->spa_load_max_txg = policy.zlp_txg;
- spa->spa_extreme_rewind = B_TRUE;
- zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
- poolname, (longlong_t)policy.zlp_txg);
- } else {
- zfs_dbgmsg("spa_tryimport: importing %s", poolname);
- }
-
- if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
- == 0) {
- zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
- spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
- } else {
- spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
- }
-
- error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
-
- /*
- * If 'tryconfig' was at least parsable, return the current config.
- */
- if (spa->spa_root_vdev != NULL) {
- config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
- poolname) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- state) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
- spa->spa_uberblock.ub_timestamp) == 0);
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
- spa->spa_load_info) == 0);
-
- /*
- * If the bootfs property exists on this pool then we
- * copy it out so that external consumers can tell which
- * pools are bootable.
- */
- if ((!error || error == EEXIST) && spa->spa_bootfs) {
- char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
- /*
- * We have to play games with the name since the
- * pool was opened as TRYIMPORT_NAME.
- */
- if (dsl_dsobj_to_dsname(spa_name(spa),
- spa->spa_bootfs, tmpname) == 0) {
- char *cp;
- char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
- cp = strchr(tmpname, '/');
- if (cp == NULL) {
- (void) strlcpy(dsname, tmpname,
- MAXPATHLEN);
- } else {
- (void) snprintf(dsname, MAXPATHLEN,
- "%s/%s", poolname, ++cp);
- }
- VERIFY(nvlist_add_string(config,
- ZPOOL_CONFIG_BOOTFS, dsname) == 0);
- kmem_free(dsname, MAXPATHLEN);
- }
- kmem_free(tmpname, MAXPATHLEN);
- }
-
- /*
- * Add the list of hot spares and level 2 cache devices.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- spa_add_spares(spa, config);
- spa_add_l2cache(spa, config);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- }
-
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
-
- return (config);
-}
-
-/*
- * Pool export/destroy
- *
- * The act of destroying or exporting a pool is very simple. We make sure there
- * is no more pending I/O and any references to the pool are gone. Then, we
- * update the pool state and sync all the labels to disk, removing the
- * configuration from the cache afterwards. If the 'hardforce' flag is set, then
- * we don't sync the labels or remove the configuration cache.
- */
-static int
-spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
- boolean_t force, boolean_t hardforce)
-{
- spa_t *spa;
-
- if (oldconfig)
- *oldconfig = NULL;
-
- if (!(spa_mode_global & FWRITE))
- return (SET_ERROR(EROFS));
-
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(pool)) == NULL) {
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(ENOENT));
- }
-
- /*
- * Put a hold on the pool, drop the namespace lock, stop async tasks,
- * reacquire the namespace lock, and see if we can export.
- */
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- spa_async_suspend(spa);
- if (spa->spa_zvol_taskq) {
-#ifdef _KERNEL
- zvol_remove_minors(spa, spa_name(spa));
-#endif
- taskq_wait(spa->spa_zvol_taskq);
- }
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
-
- /*
- * The pool will be in core if it's openable,
- * in which case we can modify its state.
- */
- if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
-
- /*
- * Objsets may be open only because they're dirty, so we
- * have to force it to sync before checking spa_refcnt.
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- spa_evicting_os_wait(spa);
-
- /*
- * A pool cannot be exported or destroyed if there are active
- * references. If we are resetting a pool, allow references by
- * fault injection handlers.
- */
- if (!spa_refcount_zero(spa) ||
- (spa->spa_inject_ref != 0 &&
- new_state != POOL_STATE_UNINITIALIZED)) {
- spa_async_resume(spa);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EBUSY));
- }
-
- /*
- * A pool cannot be exported if it has an active shared spare.
- * This is to prevent other pools stealing the active spare
- * from an exported pool. At user's own will, such pool can
- * be forcedly exported.
- */
- if (!force && new_state == POOL_STATE_EXPORTED &&
- spa_has_active_shared_spare(spa)) {
- spa_async_resume(spa);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EXDEV));
- }
-
- /*
- * We're about to export or destroy this pool. Make sure
- * we stop all initializtion activity here before we
- * set the spa_final_txg. This will ensure that all
- * dirty data resulting from the initialization is
- * committed to disk before we unload the pool.
- */
- if (spa->spa_root_vdev != NULL) {
- vdev_initialize_stop_all(spa->spa_root_vdev,
- VDEV_INITIALIZE_ACTIVE);
- }
-
- /*
- * We want this to be reflected on every label,
- * so mark them all dirty. spa_unload() will do the
- * final sync that pushes these changes out.
- */
- if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa->spa_state = new_state;
- spa->spa_final_txg = spa_last_synced_txg(spa) +
- TXG_DEFER_SIZE + 1;
- vdev_config_dirty(spa->spa_root_vdev);
- spa_config_exit(spa, SCL_ALL, FTAG);
- }
- }
-
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
-
- if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
- spa_unload(spa);
- spa_deactivate(spa);
- }
-
- if (oldconfig && spa->spa_config)
- VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
-
- if (new_state != POOL_STATE_UNINITIALIZED) {
- if (!hardforce)
- spa_write_cachefile(spa, B_TRUE, B_TRUE);
- spa_remove(spa);
- }
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-/*
- * Destroy a storage pool.
- */
-int
-spa_destroy(char *pool)
-{
- return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
- B_FALSE, B_FALSE));
-}
-
-/*
- * Export a storage pool.
- */
-int
-spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
- boolean_t hardforce)
-{
- return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
- force, hardforce));
-}
-
-/*
- * Similar to spa_export(), this unloads the spa_t without actually removing it
- * from the namespace in any way.
- */
-int
-spa_reset(char *pool)
-{
- return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
- B_FALSE, B_FALSE));
-}
-
-/*
- * ==========================================================================
- * Device manipulation
- * ==========================================================================
- */
-
-/*
- * Add a device to a storage pool.
- */
-int
-spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
-{
- uint64_t txg, id;
- int error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd, *tvd;
- nvlist_t **spares, **l2cache;
- uint_t nspares, nl2cache;
-
- ASSERT(spa_writeable(spa));
-
- txg = spa_vdev_enter(spa);
-
- if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
- VDEV_ALLOC_ADD)) != 0)
- return (spa_vdev_exit(spa, NULL, txg, error));
-
- spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
- &nspares) != 0)
- nspares = 0;
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
- &nl2cache) != 0)
- nl2cache = 0;
-
- if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
- return (spa_vdev_exit(spa, vd, txg, EINVAL));
-
- if (vd->vdev_children != 0 &&
- (error = vdev_create(vd, txg, B_FALSE)) != 0)
- return (spa_vdev_exit(spa, vd, txg, error));
-
- /*
- * We must validate the spares and l2cache devices after checking the
- * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
- */
- if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
- return (spa_vdev_exit(spa, vd, txg, error));
-
- /*
- * If we are in the middle of a device removal, we can only add
- * devices which match the existing devices in the pool.
- * If we are in the middle of a removal, or have some indirect
- * vdevs, we can not add raidz toplevels.
- */
- if (spa->spa_vdev_removal != NULL ||
- spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
- for (int c = 0; c < vd->vdev_children; c++) {
- tvd = vd->vdev_child[c];
- if (spa->spa_vdev_removal != NULL &&
- tvd->vdev_ashift != spa->spa_max_ashift) {
- return (spa_vdev_exit(spa, vd, txg, EINVAL));
- }
- /* Fail if top level vdev is raidz */
- if (tvd->vdev_ops == &vdev_raidz_ops) {
- return (spa_vdev_exit(spa, vd, txg, EINVAL));
- }
- /*
- * Need the top level mirror to be
- * a mirror of leaf vdevs only
- */
- if (tvd->vdev_ops == &vdev_mirror_ops) {
- for (uint64_t cid = 0;
- cid < tvd->vdev_children; cid++) {
- vdev_t *cvd = tvd->vdev_child[cid];
- if (!cvd->vdev_ops->vdev_op_leaf) {
- return (spa_vdev_exit(spa, vd,
- txg, EINVAL));
- }
- }
- }
- }
- }
-
- for (int c = 0; c < vd->vdev_children; c++) {
-
- /*
- * Set the vdev id to the first hole, if one exists.
- */
- for (id = 0; id < rvd->vdev_children; id++) {
- if (rvd->vdev_child[id]->vdev_ishole) {
- vdev_free(rvd->vdev_child[id]);
- break;
- }
- }
- tvd = vd->vdev_child[c];
- vdev_remove_child(vd, tvd);
- tvd->vdev_id = id;
- vdev_add_child(rvd, tvd);
- vdev_config_dirty(tvd);
- }
-
- if (nspares != 0) {
- spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
- ZPOOL_CONFIG_SPARES);
- spa_load_spares(spa);
- spa->spa_spares.sav_sync = B_TRUE;
- }
-
- if (nl2cache != 0) {
- spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
- ZPOOL_CONFIG_L2CACHE);
- spa_load_l2cache(spa);
- spa->spa_l2cache.sav_sync = B_TRUE;
- }
-
- /*
- * We have to be careful when adding new vdevs to an existing pool.
- * If other threads start allocating from these vdevs before we
- * sync the config cache, and we lose power, then upon reboot we may
- * fail to open the pool because there are DVAs that the config cache
- * can't translate. Therefore, we first add the vdevs without
- * initializing metaslabs; sync the config cache (via spa_vdev_exit());
- * and then let spa_config_update() initialize the new metaslabs.
- *
- * spa_load() checks for added-but-not-initialized vdevs, so that
- * if we lose power at any point in this sequence, the remaining
- * steps will be completed the next time we load the pool.
- */
- (void) spa_vdev_exit(spa, vd, txg, 0);
-
- mutex_enter(&spa_namespace_lock);
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-/*
- * Attach a device to a mirror. The arguments are the path to any device
- * in the mirror, and the nvroot for the new device. If the path specifies
- * a device that is not mirrored, we automatically insert the mirror vdev.
- *
- * If 'replacing' is specified, the new device is intended to replace the
- * existing device; in this case the two devices are made into their own
- * mirror using the 'replacing' vdev, which is functionally identical to
- * the mirror vdev (it actually reuses all the same ops) but has a few
- * extra rules: you can't attach to it after it's been created, and upon
- * completion of resilvering, the first disk (the one being replaced)
- * is automatically detached.
- */
-int
-spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
-{
- uint64_t txg, dtl_max_txg;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
- vdev_ops_t *pvops;
- char *oldvdpath, *newvdpath;
- int newvd_isspare;
- int error;
-
- ASSERT(spa_writeable(spa));
-
- txg = spa_vdev_enter(spa);
-
- oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
- error = (spa_has_checkpoint(spa)) ?
- ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
- return (spa_vdev_exit(spa, NULL, txg, error));
- }
-
- if (spa->spa_vdev_removal != NULL)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- if (oldvd == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!oldvd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- pvd = oldvd->vdev_parent;
-
- if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
- VDEV_ALLOC_ATTACH)) != 0)
- return (spa_vdev_exit(spa, NULL, txg, EINVAL));
-
- if (newrootvd->vdev_children != 1)
- return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
-
- newvd = newrootvd->vdev_child[0];
-
- if (!newvd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
-
- if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
- return (spa_vdev_exit(spa, newrootvd, txg, error));
-
- /*
- * Spares can't replace logs
- */
- if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-
- if (!replacing) {
- /*
- * For attach, the only allowable parent is a mirror or the root
- * vdev.
- */
- if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-
- pvops = &vdev_mirror_ops;
- } else {
- /*
- * Active hot spares can only be replaced by inactive hot
- * spares.
- */
- if (pvd->vdev_ops == &vdev_spare_ops &&
- oldvd->vdev_isspare &&
- !spa_has_spare(spa, newvd->vdev_guid))
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-
- /*
- * If the source is a hot spare, and the parent isn't already a
- * spare, then we want to create a new hot spare. Otherwise, we
- * want to create a replacing vdev. The user is not allowed to
- * attach to a spared vdev child unless the 'isspare' state is
- * the same (spare replaces spare, non-spare replaces
- * non-spare).
- */
- if (pvd->vdev_ops == &vdev_replacing_ops &&
- spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- } else if (pvd->vdev_ops == &vdev_spare_ops &&
- newvd->vdev_isspare != oldvd->vdev_isspare) {
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- }
-
- if (newvd->vdev_isspare)
- pvops = &vdev_spare_ops;
- else
- pvops = &vdev_replacing_ops;
- }
-
- /*
- * Make sure the new device is big enough.
- */
- if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
- return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
-
- /*
- * The new device cannot have a higher alignment requirement
- * than the top-level vdev.
- */
- if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
- return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
-
- /*
- * If this is an in-place replacement, update oldvd's path and devid
- * to make it distinguishable from newvd, and unopenable from now on.
- */
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
- spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
- KM_SLEEP);
- (void) sprintf(oldvd->vdev_path, "%s/%s",
- newvd->vdev_path, "old");
- if (oldvd->vdev_devid != NULL) {
- spa_strfree(oldvd->vdev_devid);
- oldvd->vdev_devid = NULL;
- }
- }
-
- /* mark the device being resilvered */
- newvd->vdev_resilver_txg = txg;
-
- /*
- * If the parent is not a mirror, or if we're replacing, insert the new
- * mirror/replacing/spare vdev above oldvd.
- */
- if (pvd->vdev_ops != pvops)
- pvd = vdev_add_parent(oldvd, pvops);
-
- ASSERT(pvd->vdev_top->vdev_parent == rvd);
- ASSERT(pvd->vdev_ops == pvops);
- ASSERT(oldvd->vdev_parent == pvd);
-
- /*
- * Extract the new device from its root and add it to pvd.
- */
- vdev_remove_child(newrootvd, newvd);
- newvd->vdev_id = pvd->vdev_children;
- newvd->vdev_crtxg = oldvd->vdev_crtxg;
- vdev_add_child(pvd, newvd);
-
- tvd = newvd->vdev_top;
- ASSERT(pvd->vdev_top == tvd);
- ASSERT(tvd->vdev_parent == rvd);
-
- vdev_config_dirty(tvd);
-
- /*
- * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
- * for any dmu_sync-ed blocks. It will propagate upward when
- * spa_vdev_exit() calls vdev_dtl_reassess().
- */
- dtl_max_txg = txg + TXG_CONCURRENT_STATES;
-
- vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
- dtl_max_txg - TXG_INITIAL);
-
- if (newvd->vdev_isspare) {
- spa_spare_activate(newvd);
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
- }
-
- oldvdpath = spa_strdup(oldvd->vdev_path);
- newvdpath = spa_strdup(newvd->vdev_path);
- newvd_isspare = newvd->vdev_isspare;
-
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
-
- /*
- * Schedule the resilver to restart in the future. We do this to
- * ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
- */
- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
-
- if (spa->spa_bootfs)
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
-
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
-
- /*
- * Commit the config
- */
- (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
-
- spa_history_log_internal(spa, "vdev attach", NULL,
- "%s vdev=%s %s vdev=%s",
- replacing && newvd_isspare ? "spare in" :
- replacing ? "replace" : "attach", newvdpath,
- replacing ? "for" : "to", oldvdpath);
-
- spa_strfree(oldvdpath);
- spa_strfree(newvdpath);
-
- return (0);
-}
-
-/*
- * Detach a device from a mirror or replacing vdev.
- *
- * If 'replace_done' is specified, only detach if the parent
- * is a replacing vdev.
- */
-int
-spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
-{
- uint64_t txg;
- int error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd, *pvd, *cvd, *tvd;
- boolean_t unspare = B_FALSE;
- uint64_t unspare_guid = 0;
- char *vdpath;
-
- ASSERT(spa_writeable(spa));
-
- txg = spa_vdev_enter(spa);
-
- vd = spa_lookup_by_guid(spa, guid, B_FALSE);
-
- /*
- * Besides being called directly from the userland through the
- * ioctl interface, spa_vdev_detach() can be potentially called
- * at the end of spa_vdev_resilver_done().
- *
- * In the regular case, when we have a checkpoint this shouldn't
- * happen as we never empty the DTLs of a vdev during the scrub
- * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done()
- * should never get here when we have a checkpoint.
- *
- * That said, even in a case when we checkpoint the pool exactly
- * as spa_vdev_resilver_done() calls this function everything
- * should be fine as the resilver will return right away.
- */
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
- error = (spa_has_checkpoint(spa)) ?
- ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
- return (spa_vdev_exit(spa, NULL, txg, error));
- }
-
- if (vd == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- pvd = vd->vdev_parent;
-
- /*
- * If the parent/child relationship is not as expected, don't do it.
- * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
- * vdev that's replacing B with C. The user's intent in replacing
- * is to go from M(A,B) to M(A,C). If the user decides to cancel
- * the replace by detaching C, the expected behavior is to end up
- * M(A,B). But suppose that right after deciding to detach C,
- * the replacement of B completes. We would have M(A,C), and then
- * ask to detach C, which would leave us with just A -- not what
- * the user wanted. To prevent this, we make sure that the
- * parent/child relationship hasn't changed -- in this example,
- * that C's parent is still the replacing vdev R.
- */
- if (pvd->vdev_guid != pguid && pguid != 0)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * Only 'replacing' or 'spare' vdevs can be replaced.
- */
- if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
- pvd->vdev_ops != &vdev_spare_ops)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
- spa_version(spa) >= SPA_VERSION_SPARES);
-
- /*
- * Only mirror, replacing, and spare vdevs support detach.
- */
- if (pvd->vdev_ops != &vdev_replacing_ops &&
- pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_spare_ops)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- /*
- * If this device has the only valid copy of some data,
- * we cannot safely detach it.
- */
- if (vdev_dtl_required(vd))
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- ASSERT(pvd->vdev_children >= 2);
-
- /*
- * If we are detaching the second disk from a replacing vdev, then
- * check to see if we changed the original vdev's path to have "/old"
- * at the end in spa_vdev_attach(). If so, undo that change now.
- */
- if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
- vd->vdev_path != NULL) {
- size_t len = strlen(vd->vdev_path);
-
- for (int c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
-
- if (cvd == vd || cvd->vdev_path == NULL)
- continue;
-
- if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
- strcmp(cvd->vdev_path + len, "/old") == 0) {
- spa_strfree(cvd->vdev_path);
- cvd->vdev_path = spa_strdup(vd->vdev_path);
- break;
- }
- }
- }
-
- /*
- * If we are detaching the original disk from a spare, then it implies
- * that the spare should become a real disk, and be removed from the
- * active spare list for the pool.
- */
- if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0 &&
- pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
- unspare = B_TRUE;
-
- /*
- * Erase the disk labels so the disk can be used for other things.
- * This must be done after all other error cases are handled,
- * but before we disembowel vd (so we can still do I/O to it).
- * But if we can't do it, don't treat the error as fatal --
- * it may be that the unwritability of the disk is the reason
- * it's being detached!
- */
- error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
-
- /*
- * Remove vd from its parent and compact the parent's children.
- */
- vdev_remove_child(pvd, vd);
- vdev_compact_children(pvd);
-
- /*
- * Remember one of the remaining children so we can get tvd below.
- */
- cvd = pvd->vdev_child[pvd->vdev_children - 1];
-
- /*
- * If we need to remove the remaining child from the list of hot spares,
- * do it now, marking the vdev as no longer a spare in the process.
- * We must do this before vdev_remove_parent(), because that can
- * change the GUID if it creates a new toplevel GUID. For a similar
- * reason, we must remove the spare now, in the same txg as the detach;
- * otherwise someone could attach a new sibling, change the GUID, and
- * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
- */
- if (unspare) {
- ASSERT(cvd->vdev_isspare);
- spa_spare_remove(cvd);
- unspare_guid = cvd->vdev_guid;
- (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
- cvd->vdev_unspare = B_TRUE;
- }
-
- /*
- * If the parent mirror/replacing vdev only has one child,
- * the parent is no longer needed. Remove it from the tree.
- */
- if (pvd->vdev_children == 1) {
- if (pvd->vdev_ops == &vdev_spare_ops)
- cvd->vdev_unspare = B_FALSE;
- vdev_remove_parent(cvd);
- }
-
-
- /*
- * We don't set tvd until now because the parent we just removed
- * may have been the previous top-level vdev.
- */
- tvd = cvd->vdev_top;
- ASSERT(tvd->vdev_parent == rvd);
-
- /*
- * Reevaluate the parent vdev state.
- */
- vdev_propagate_state(cvd);
-
- /*
- * If the 'autoexpand' property is set on the pool then automatically
- * try to expand the size of the pool. For example if the device we
- * just detached was smaller than the others, it may be possible to
- * add metaslabs (i.e. grow the pool). We need to reopen the vdev
- * first so that we can obtain the updated sizes of the leaf vdevs.
- */
- if (spa->spa_autoexpand) {
- vdev_reopen(tvd);
- vdev_expand(tvd, txg);
- }
-
- vdev_config_dirty(tvd);
-
- /*
- * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
- * vd->vdev_detached is set and free vd's DTL object in syncing context.
- * But first make sure we're not on any *other* txg's DTL list, to
- * prevent vd from being accessed after it's freed.
- */
- vdpath = spa_strdup(vd->vdev_path);
- for (int t = 0; t < TXG_SIZE; t++)
- (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
- vd->vdev_detached = B_TRUE;
- vdev_dirty(tvd, VDD_DTL, vd, txg);
-
- spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
-
- /* hang on to the spa before we release the lock */
- spa_open_ref(spa, FTAG);
-
- error = spa_vdev_exit(spa, vd, txg, 0);
-
- spa_history_log_internal(spa, "detach", NULL,
- "vdev=%s", vdpath);
- spa_strfree(vdpath);
-
- /*
- * If this was the removal of the original device in a hot spare vdev,
- * then we want to go through and remove the device from the hot spare
- * list of every other pool.
- */
- if (unspare) {
- spa_t *altspa = NULL;
-
- mutex_enter(&spa_namespace_lock);
- while ((altspa = spa_next(altspa)) != NULL) {
- if (altspa->spa_state != POOL_STATE_ACTIVE ||
- altspa == spa)
- continue;
-
- spa_open_ref(altspa, FTAG);
- mutex_exit(&spa_namespace_lock);
- (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
- mutex_enter(&spa_namespace_lock);
- spa_close(altspa, FTAG);
- }
- mutex_exit(&spa_namespace_lock);
-
- /* search the rest of the vdevs for spares to remove */
- spa_vdev_resilver_done(spa);
- }
-
- /* all done with the spa; OK to release */
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
-
- return (error);
-}
-
-int
-spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
-{
- /*
- * We hold the namespace lock through the whole function
- * to prevent any changes to the pool while we're starting or
- * stopping initialization. The config and state locks are held so that
- * we can properly assess the vdev state before we commit to
- * the initializing operation.
- */
- mutex_enter(&spa_namespace_lock);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
-
- /* Look up vdev and ensure it's a leaf. */
- vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
- if (vd == NULL || vd->vdev_detached) {
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(ENODEV));
- } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EINVAL));
- } else if (!vdev_writeable(vd)) {
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EROFS));
- }
- mutex_enter(&vd->vdev_initialize_lock);
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
-
- /*
- * When we activate an initialize action we check to see
- * if the vdev_initialize_thread is NULL. We do this instead
- * of using the vdev_initialize_state since there might be
- * a previous initialization process which has completed but
- * the thread is not exited.
- */
- if (cmd_type == POOL_INITIALIZE_DO &&
- (vd->vdev_initialize_thread != NULL ||
- vd->vdev_top->vdev_removing)) {
- mutex_exit(&vd->vdev_initialize_lock);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EBUSY));
- } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
- (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
- vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
- mutex_exit(&vd->vdev_initialize_lock);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(ESRCH));
- } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
- vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
- mutex_exit(&vd->vdev_initialize_lock);
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(ESRCH));
- }
-
- switch (cmd_type) {
- case POOL_INITIALIZE_DO:
- vdev_initialize(vd);
- break;
- case POOL_INITIALIZE_CANCEL:
- vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
- break;
- case POOL_INITIALIZE_SUSPEND:
- vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
- break;
- default:
- panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
- }
- mutex_exit(&vd->vdev_initialize_lock);
-
- /* Sync out the initializing state */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-
-/*
- * Split a set of devices from their mirrors, and create a new pool from them.
- */
-int
-spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
- nvlist_t *props, boolean_t exp)
-{
- int error = 0;
- uint64_t txg, *glist;
- spa_t *newspa;
- uint_t c, children, lastlog;
- nvlist_t **child, *nvl, *tmp;
- dmu_tx_t *tx;
- char *altroot = NULL;
- vdev_t *rvd, **vml = NULL; /* vdev modify list */
- boolean_t activate_slog;
-
- ASSERT(spa_writeable(spa));
-
- txg = spa_vdev_enter(spa);
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
- error = (spa_has_checkpoint(spa)) ?
- ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
- return (spa_vdev_exit(spa, NULL, txg, error));
- }
-
- /* clear the log and flush everything up to now */
- activate_slog = spa_passivate_log(spa);
- (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
- error = spa_reset_logs(spa);
- txg = spa_vdev_config_enter(spa);
-
- if (activate_slog)
- spa_activate_log(spa);
-
- if (error != 0)
- return (spa_vdev_exit(spa, NULL, txg, error));
-
- /* check new spa name before going any further */
- if (spa_lookup(newname) != NULL)
- return (spa_vdev_exit(spa, NULL, txg, EEXIST));
-
- /*
- * scan through all the children to ensure they're all mirrors
- */
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
- nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
- &children) != 0)
- return (spa_vdev_exit(spa, NULL, txg, EINVAL));
-
- /* first, check to ensure we've got the right child count */
- rvd = spa->spa_root_vdev;
- lastlog = 0;
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
-
- /* don't count the holes & logs as children */
- if (vd->vdev_islog || !vdev_is_concrete(vd)) {
- if (lastlog == 0)
- lastlog = c;
- continue;
- }
-
- lastlog = 0;
- }
- if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
- return (spa_vdev_exit(spa, NULL, txg, EINVAL));
-
- /* next, ensure no spare or cache devices are part of the split */
- if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
- nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
- return (spa_vdev_exit(spa, NULL, txg, EINVAL));
-
- vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
- glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
-
- /* then, loop over each vdev and validate it */
- for (c = 0; c < children; c++) {
- uint64_t is_hole = 0;
-
- (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
- &is_hole);
-
- if (is_hole != 0) {
- if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
- spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
- continue;
- } else {
- error = SET_ERROR(EINVAL);
- break;
- }
- }
-
- /* which disk is going to be split? */
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
- &glist[c]) != 0) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- /* look it up in the spa */
- vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
- if (vml[c] == NULL) {
- error = SET_ERROR(ENODEV);
- break;
- }
-
- /* make sure there's nothing stopping the split */
- if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
- vml[c]->vdev_islog ||
- !vdev_is_concrete(vml[c]) ||
- vml[c]->vdev_isspare ||
- vml[c]->vdev_isl2cache ||
- !vdev_writeable(vml[c]) ||
- vml[c]->vdev_children != 0 ||
- vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
- c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- if (vdev_dtl_required(vml[c])) {
- error = SET_ERROR(EBUSY);
- break;
- }
-
- /* we need certain info from the top level */
- VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
- vml[c]->vdev_top->vdev_ms_array) == 0);
- VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
- vml[c]->vdev_top->vdev_ms_shift) == 0);
- VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
- vml[c]->vdev_top->vdev_asize) == 0);
- VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
- vml[c]->vdev_top->vdev_ashift) == 0);
-
- /* transfer per-vdev ZAPs */
- ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
- VERIFY0(nvlist_add_uint64(child[c],
- ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
-
- ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
- VERIFY0(nvlist_add_uint64(child[c],
- ZPOOL_CONFIG_VDEV_TOP_ZAP,
- vml[c]->vdev_parent->vdev_top_zap));
- }
-
- if (error != 0) {
- kmem_free(vml, children * sizeof (vdev_t *));
- kmem_free(glist, children * sizeof (uint64_t));
- return (spa_vdev_exit(spa, NULL, txg, error));
- }
-
- /* stop writers from using the disks */
- for (c = 0; c < children; c++) {
- if (vml[c] != NULL)
- vml[c]->vdev_offline = B_TRUE;
- }
- vdev_reopen(spa->spa_root_vdev);
-
- /*
- * Temporarily record the splitting vdevs in the spa config. This
- * will disappear once the config is regenerated.
- */
- VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
- glist, children) == 0);
- kmem_free(glist, children * sizeof (uint64_t));
-
- mutex_enter(&spa->spa_props_lock);
- VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
- nvl) == 0);
- mutex_exit(&spa->spa_props_lock);
- spa->spa_config_splitting = nvl;
- vdev_config_dirty(spa->spa_root_vdev);
-
- /* configure and create the new pool */
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- spa->spa_config_txg) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- spa_generate_guid(NULL)) == 0);
- VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
- (void) nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-
- /* add the new pool to the namespace */
- newspa = spa_add(newname, config, altroot);
- newspa->spa_avz_action = AVZ_ACTION_REBUILD;
- newspa->spa_config_txg = spa->spa_config_txg;
- spa_set_log_state(newspa, SPA_LOG_CLEAR);
-
- /* release the spa config lock, retaining the namespace lock */
- spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
-
- if (zio_injection_enabled)
- zio_handle_panic_injection(spa, FTAG, 1);
-
- spa_activate(newspa, spa_mode_global);
- spa_async_suspend(newspa);
-
- for (c = 0; c < children; c++) {
- if (vml[c] != NULL) {
- /*
- * Temporarily stop the initializing activity. We set
- * the state to ACTIVE so that we know to resume
- * the initializing once the split has completed.
- */
- mutex_enter(&vml[c]->vdev_initialize_lock);
- vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
- mutex_exit(&vml[c]->vdev_initialize_lock);
- }
- }
-
-#ifndef illumos
- /* mark that we are creating new spa by splitting */
- newspa->spa_splitting_newspa = B_TRUE;
-#endif
- newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
-
- /* create the new pool from the disks of the original pool */
- error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
-#ifndef illumos
- newspa->spa_splitting_newspa = B_FALSE;
-#endif
- if (error)
- goto out;
-
- /* if that worked, generate a real config for the new pool */
- if (newspa->spa_root_vdev != NULL) {
- VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
- ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
- spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
- B_TRUE));
- }
-
- /* set the props */
- if (props != NULL) {
- spa_configfile_set(newspa, props, B_FALSE);
- error = spa_prop_set(newspa, props);
- if (error)
- goto out;
- }
-
- /* flush everything */
- txg = spa_vdev_config_enter(newspa);
- vdev_config_dirty(newspa->spa_root_vdev);
- (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
-
- if (zio_injection_enabled)
- zio_handle_panic_injection(spa, FTAG, 2);
-
- spa_async_resume(newspa);
-
- /* finally, update the original pool's config */
- txg = spa_vdev_config_enter(spa);
- tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0)
- dmu_tx_abort(tx);
- for (c = 0; c < children; c++) {
- if (vml[c] != NULL) {
- vdev_split(vml[c]);
- if (error == 0)
- spa_history_log_internal(spa, "detach", tx,
- "vdev=%s", vml[c]->vdev_path);
-
- vdev_free(vml[c]);
- }
- }
- spa->spa_avz_action = AVZ_ACTION_REBUILD;
- vdev_config_dirty(spa->spa_root_vdev);
- spa->spa_config_splitting = NULL;
- nvlist_free(nvl);
- if (error == 0)
- dmu_tx_commit(tx);
- (void) spa_vdev_exit(spa, NULL, txg, 0);
-
- if (zio_injection_enabled)
- zio_handle_panic_injection(spa, FTAG, 3);
-
- /* split is complete; log a history record */
- spa_history_log_internal(newspa, "split", NULL,
- "from pool %s", spa_name(spa));
-
- kmem_free(vml, children * sizeof (vdev_t *));
-
- /* if we're not going to mount the filesystems in userland, export */
- if (exp)
- error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
- B_FALSE, B_FALSE);
-
- return (error);
-
-out:
- spa_unload(newspa);
- spa_deactivate(newspa);
- spa_remove(newspa);
-
- txg = spa_vdev_config_enter(spa);
-
- /* re-online all offlined disks */
- for (c = 0; c < children; c++) {
- if (vml[c] != NULL)
- vml[c]->vdev_offline = B_FALSE;
- }
-
- /* restart initializing disks as necessary */
- spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
-
- vdev_reopen(spa->spa_root_vdev);
-
- nvlist_free(spa->spa_config_splitting);
- spa->spa_config_splitting = NULL;
- (void) spa_vdev_exit(spa, NULL, txg, error);
-
- kmem_free(vml, children * sizeof (vdev_t *));
- return (error);
-}
-
-/*
- * Find any device that's done replacing, or a vdev marked 'unspare' that's
- * currently spared, so we can detach it.
- */
-static vdev_t *
-spa_vdev_resilver_done_hunt(vdev_t *vd)
-{
- vdev_t *newvd, *oldvd;
-
- for (int c = 0; c < vd->vdev_children; c++) {
- oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
- if (oldvd != NULL)
- return (oldvd);
- }
-
- /*
- * Check for a completed replacement. We always consider the first
- * vdev in the list to be the oldest vdev, and the last one to be
- * the newest (see spa_vdev_attach() for how that works). In
- * the case where the newest vdev is faulted, we will not automatically
- * remove it after a resilver completes. This is OK as it will require
- * user intervention to determine which disk the admin wishes to keep.
- */
- if (vd->vdev_ops == &vdev_replacing_ops) {
- ASSERT(vd->vdev_children > 1);
-
- newvd = vd->vdev_child[vd->vdev_children - 1];
- oldvd = vd->vdev_child[0];
-
- if (vdev_dtl_empty(newvd, DTL_MISSING) &&
- vdev_dtl_empty(newvd, DTL_OUTAGE) &&
- !vdev_dtl_required(oldvd))
- return (oldvd);
- }
-
- /*
- * Check for a completed resilver with the 'unspare' flag set.
- * Also potentially update faulted state.
- */
- if (vd->vdev_ops == &vdev_spare_ops) {
- vdev_t *first = vd->vdev_child[0];
- vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
-
- if (last->vdev_unspare) {
- oldvd = first;
- newvd = last;
- } else if (first->vdev_unspare) {
- oldvd = last;
- newvd = first;
- } else {
- oldvd = NULL;
- }
-
- if (oldvd != NULL &&
- vdev_dtl_empty(newvd, DTL_MISSING) &&
- vdev_dtl_empty(newvd, DTL_OUTAGE) &&
- !vdev_dtl_required(oldvd))
- return (oldvd);
-
- vdev_propagate_state(vd);
-
- /*
- * If there are more than two spares attached to a disk,
- * and those spares are not required, then we want to
- * attempt to free them up now so that they can be used
- * by other pools. Once we're back down to a single
- * disk+spare, we stop removing them.
- */
- if (vd->vdev_children > 2) {
- newvd = vd->vdev_child[1];
-
- if (newvd->vdev_isspare && last->vdev_isspare &&
- vdev_dtl_empty(last, DTL_MISSING) &&
- vdev_dtl_empty(last, DTL_OUTAGE) &&
- !vdev_dtl_required(newvd))
- return (newvd);
- }
- }
-
- return (NULL);
-}
-
-static void
-spa_vdev_resilver_done(spa_t *spa)
-{
- vdev_t *vd, *pvd, *ppvd;
- uint64_t guid, sguid, pguid, ppguid;
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
- while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
- pvd = vd->vdev_parent;
- ppvd = pvd->vdev_parent;
- guid = vd->vdev_guid;
- pguid = pvd->vdev_guid;
- ppguid = ppvd->vdev_guid;
- sguid = 0;
- /*
- * If we have just finished replacing a hot spared device, then
- * we need to detach the parent's first child (the original hot
- * spare) as well.
- */
- if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
- ppvd->vdev_children == 2) {
- ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
- sguid = ppvd->vdev_child[1]->vdev_guid;
- }
- ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
-
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
- return;
- if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
- return;
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- }
-
- spa_config_exit(spa, SCL_ALL, FTAG);
-}
-
-/*
- * Update the stored path or FRU for this vdev.
- */
-int
-spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
- boolean_t ispath)
-{
- vdev_t *vd;
- boolean_t sync = B_FALSE;
-
- ASSERT(spa_writeable(spa));
-
- spa_vdev_state_enter(spa, SCL_ALL);
-
- if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
- return (spa_vdev_state_exit(spa, NULL, ENOENT));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
-
- if (ispath) {
- if (strcmp(value, vd->vdev_path) != 0) {
- spa_strfree(vd->vdev_path);
- vd->vdev_path = spa_strdup(value);
- sync = B_TRUE;
- }
- } else {
- if (vd->vdev_fru == NULL) {
- vd->vdev_fru = spa_strdup(value);
- sync = B_TRUE;
- } else if (strcmp(value, vd->vdev_fru) != 0) {
- spa_strfree(vd->vdev_fru);
- vd->vdev_fru = spa_strdup(value);
- sync = B_TRUE;
- }
- }
-
- return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
-}
-
-int
-spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
-{
- return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
-}
-
-int
-spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
-{
- return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
-}
-
-/*
- * ==========================================================================
- * SPA Scanning
- * ==========================================================================
- */
-int
-spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
-{
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
-
- if (dsl_scan_resilvering(spa->spa_dsl_pool))
- return (SET_ERROR(EBUSY));
-
- return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
-}
-
-int
-spa_scan_stop(spa_t *spa)
-{
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
- if (dsl_scan_resilvering(spa->spa_dsl_pool))
- return (SET_ERROR(EBUSY));
- return (dsl_scan_cancel(spa->spa_dsl_pool));
-}
-
-int
-spa_scan(spa_t *spa, pool_scan_func_t func)
-{
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
-
- if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
- return (SET_ERROR(ENOTSUP));
-
- /*
- * If a resilver was requested, but there is no DTL on a
- * writeable leaf device, we have nothing to do.
- */
- if (func == POOL_SCAN_RESILVER &&
- !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
- spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
- return (0);
- }
-
- return (dsl_scan(spa->spa_dsl_pool, func));
-}
-
-/*
- * ==========================================================================
- * SPA async task processing
- * ==========================================================================
- */
-
-static void
-spa_async_remove(spa_t *spa, vdev_t *vd)
-{
- if (vd->vdev_remove_wanted) {
- vd->vdev_remove_wanted = B_FALSE;
- vd->vdev_delayed_close = B_FALSE;
- vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
-
- /*
- * We want to clear the stats, but we don't want to do a full
- * vdev_clear() as that will cause us to throw away
- * degraded/faulted state as well as attempt to reopen the
- * device, all of which is a waste.
- */
- vd->vdev_stat.vs_read_errors = 0;
- vd->vdev_stat.vs_write_errors = 0;
- vd->vdev_stat.vs_checksum_errors = 0;
-
- vdev_state_dirty(vd->vdev_top);
- /* Tell userspace that the vdev is gone. */
- zfs_post_remove(spa, vd);
- }
-
- for (int c = 0; c < vd->vdev_children; c++)
- spa_async_remove(spa, vd->vdev_child[c]);
-}
-
-static void
-spa_async_probe(spa_t *spa, vdev_t *vd)
-{
- if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = B_FALSE;
- vdev_reopen(vd); /* vdev_open() does the actual probe */
- }
-
- for (int c = 0; c < vd->vdev_children; c++)
- spa_async_probe(spa, vd->vdev_child[c]);
-}
-
-static void
-spa_async_autoexpand(spa_t *spa, vdev_t *vd)
-{
- sysevent_id_t eid;
- nvlist_t *attr;
- char *physpath;
-
- if (!spa->spa_autoexpand)
- return;
-
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- spa_async_autoexpand(spa, cvd);
- }
-
- if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
- return;
-
- physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
- (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
-
- VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
-
- (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
- ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
-
- nvlist_free(attr);
- kmem_free(physpath, MAXPATHLEN);
-}
-
-static void
-spa_async_thread(void *arg)
-{
- spa_t *spa = (spa_t *)arg;
- int tasks;
-
- ASSERT(spa->spa_sync_on);
-
- mutex_enter(&spa->spa_async_lock);
- tasks = spa->spa_async_tasks;
- spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
- mutex_exit(&spa->spa_async_lock);
-
- /*
- * See if the config needs to be updated.
- */
- if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
- uint64_t old_space, new_space;
-
- mutex_enter(&spa_namespace_lock);
- old_space = metaslab_class_get_space(spa_normal_class(spa));
- old_space += metaslab_class_get_space(spa_special_class(spa));
- old_space += metaslab_class_get_space(spa_dedup_class(spa));
-
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
-
- new_space = metaslab_class_get_space(spa_normal_class(spa));
- new_space += metaslab_class_get_space(spa_special_class(spa));
- new_space += metaslab_class_get_space(spa_dedup_class(spa));
- mutex_exit(&spa_namespace_lock);
-
- /*
- * If the pool grew as a result of the config update,
- * then log an internal history event.
- */
- if (new_space != old_space) {
- spa_history_log_internal(spa, "vdev online", NULL,
- "pool '%s' size: %llu(+%llu)",
- spa_name(spa), new_space, new_space - old_space);
- }
- }
-
- if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- spa_async_autoexpand(spa, spa->spa_root_vdev);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- }
-
- /*
- * See if any devices need to be probed.
- */
- if (tasks & SPA_ASYNC_PROBE) {
- spa_vdev_state_enter(spa, SCL_NONE);
- spa_async_probe(spa, spa->spa_root_vdev);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- }
-
- /*
- * If any devices are done replacing, detach them.
- */
- if (tasks & SPA_ASYNC_RESILVER_DONE)
- spa_vdev_resilver_done(spa);
-
- /*
- * Kick off a resilver.
- */
- if (tasks & SPA_ASYNC_RESILVER)
- dsl_resilver_restart(spa->spa_dsl_pool, 0);
-
- if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
- mutex_enter(&spa_namespace_lock);
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- vdev_initialize_restart(spa->spa_root_vdev);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- mutex_exit(&spa_namespace_lock);
- }
-
- /*
- * Let the world know that we're done.
- */
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_thread = NULL;
- cv_broadcast(&spa->spa_async_cv);
- mutex_exit(&spa->spa_async_lock);
- thread_exit();
-}
-
-static void
-spa_async_thread_vd(void *arg)
-{
- spa_t *spa = arg;
- int tasks;
-
- mutex_enter(&spa->spa_async_lock);
- tasks = spa->spa_async_tasks;
-retry:
- spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
- mutex_exit(&spa->spa_async_lock);
-
- /*
- * See if any devices need to be marked REMOVED.
- */
- if (tasks & SPA_ASYNC_REMOVE) {
- spa_vdev_state_enter(spa, SCL_NONE);
- spa_async_remove(spa, spa->spa_root_vdev);
- for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
- spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
- for (int i = 0; i < spa->spa_spares.sav_count; i++)
- spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- }
-
- /*
- * Let the world know that we're done.
- */
- mutex_enter(&spa->spa_async_lock);
- tasks = spa->spa_async_tasks;
- if ((tasks & SPA_ASYNC_REMOVE) != 0)
- goto retry;
- spa->spa_async_thread_vd = NULL;
- cv_broadcast(&spa->spa_async_cv);
- mutex_exit(&spa->spa_async_lock);
- thread_exit();
-}
-
-void
-spa_async_suspend(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_suspended++;
- while (spa->spa_async_thread != NULL ||
- spa->spa_async_thread_vd != NULL)
- cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
- mutex_exit(&spa->spa_async_lock);
-
- spa_vdev_remove_suspend(spa);
-
- zthr_t *condense_thread = spa->spa_condense_zthr;
- if (condense_thread != NULL)
- zthr_cancel(condense_thread);
-
- zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
- if (discard_thread != NULL)
- zthr_cancel(discard_thread);
-}
-
-void
-spa_async_resume(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- ASSERT(spa->spa_async_suspended != 0);
- spa->spa_async_suspended--;
- mutex_exit(&spa->spa_async_lock);
- spa_restart_removal(spa);
-
- zthr_t *condense_thread = spa->spa_condense_zthr;
- if (condense_thread != NULL)
- zthr_resume(condense_thread);
-
- zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
- if (discard_thread != NULL)
- zthr_resume(discard_thread);
-}
-
-static boolean_t
-spa_async_tasks_pending(spa_t *spa)
-{
- uint_t non_config_tasks;
- uint_t config_task;
- boolean_t config_task_suspended;
-
- non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
- SPA_ASYNC_REMOVE);
- config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
- if (spa->spa_ccw_fail_time == 0) {
- config_task_suspended = B_FALSE;
- } else {
- config_task_suspended =
- (gethrtime() - spa->spa_ccw_fail_time) <
- (zfs_ccw_retry_interval * NANOSEC);
- }
-
- return (non_config_tasks || (config_task && !config_task_suspended));
-}
-
-static void
-spa_async_dispatch(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- if (spa_async_tasks_pending(spa) &&
- !spa->spa_async_suspended &&
- spa->spa_async_thread == NULL &&
- rootdir != NULL)
- spa->spa_async_thread = thread_create(NULL, 0,
- spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
- mutex_exit(&spa->spa_async_lock);
-}
-
-static void
-spa_async_dispatch_vd(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
- !spa->spa_async_suspended &&
- spa->spa_async_thread_vd == NULL &&
- rootdir != NULL)
- spa->spa_async_thread_vd = thread_create(NULL, 0,
- spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
- mutex_exit(&spa->spa_async_lock);
-}
-
-void
-spa_async_request(spa_t *spa, int task)
-{
- zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_tasks |= task;
- mutex_exit(&spa->spa_async_lock);
- spa_async_dispatch_vd(spa);
-}
-
-/*
- * ==========================================================================
- * SPA syncing routines
- * ==========================================================================
- */
-
-static int
-bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- bpobj_t *bpo = arg;
- bpobj_enqueue(bpo, bp, tx);
- return (0);
-}
-
-static int
-spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- zio_t *zio = arg;
-
- zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
- BP_GET_PSIZE(bp), zio->io_flags));
- return (0);
-}
-
-/*
- * Note: this simple function is not inlined to make it easier to dtrace the
- * amount of time spent syncing frees.
- */
-static void
-spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
-{
- zio_t *zio = zio_root(spa, NULL, NULL, 0);
- bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
- VERIFY(zio_wait(zio) == 0);
-}
-
-/*
- * Note: this simple function is not inlined to make it easier to dtrace the
- * amount of time spent syncing deferred frees.
- */
-static void
-spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
-{
- zio_t *zio = zio_root(spa, NULL, NULL, 0);
- VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
- spa_free_sync_cb, zio, tx), ==, 0);
- VERIFY0(zio_wait(zio));
-}
-
-
-static void
-spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
-{
- char *packed = NULL;
- size_t bufsize;
- size_t nvsize = 0;
- dmu_buf_t *db;
-
- VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
-
- /*
- * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
- * information. This avoids the dmu_buf_will_dirty() path and
- * saves us a pre-read to get data we don't actually care about.
- */
- bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
- packed = kmem_alloc(bufsize, KM_SLEEP);
-
- VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
- KM_SLEEP) == 0);
- bzero(packed + nvsize, bufsize - nvsize);
-
- dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
-
- kmem_free(packed, bufsize);
-
- VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- *(uint64_t *)db->db_data = nvsize;
- dmu_buf_rele(db, FTAG);
-}
-
-static void
-spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
- const char *config, const char *entry)
-{
- nvlist_t *nvroot;
- nvlist_t **list;
- int i;
-
- if (!sav->sav_sync)
- return;
-
- /*
- * Update the MOS nvlist describing the list of available devices.
- * spa_validate_aux() will have already made sure this nvlist is
- * valid and the vdevs are labeled appropriately.
- */
- if (sav->sav_object == 0) {
- sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
- sizeof (uint64_t), tx);
- VERIFY(zap_update(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
- &sav->sav_object, tx) == 0);
- }
-
- VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (sav->sav_count == 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
- } else {
- list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
- for (i = 0; i < sav->sav_count; i++)
- list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
- B_FALSE, VDEV_CONFIG_L2CACHE);
- VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
- sav->sav_count) == 0);
- for (i = 0; i < sav->sav_count; i++)
- nvlist_free(list[i]);
- kmem_free(list, sav->sav_count * sizeof (void *));
- }
-
- spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
- nvlist_free(nvroot);
-
- sav->sav_sync = B_FALSE;
-}
-
-/*
- * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
- * The all-vdev ZAP must be empty.
- */
-static void
-spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
-{
- spa_t *spa = vd->vdev_spa;
- if (vd->vdev_top_zap != 0) {
- VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
- vd->vdev_top_zap, tx));
- }
- if (vd->vdev_leaf_zap != 0) {
- VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
- vd->vdev_leaf_zap, tx));
- }
- for (uint64_t i = 0; i < vd->vdev_children; i++) {
- spa_avz_build(vd->vdev_child[i], avz, tx);
- }
-}
-
-static void
-spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
-{
- nvlist_t *config;
-
- /*
- * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
- * its config may not be dirty but we still need to build per-vdev ZAPs.
- * Similarly, if the pool is being assembled (e.g. after a split), we
- * need to rebuild the AVZ although the config may not be dirty.
- */
- if (list_is_empty(&spa->spa_config_dirty_list) &&
- spa->spa_avz_action == AVZ_ACTION_NONE)
- return;
-
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
- spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
- spa->spa_all_vdev_zaps != 0);
-
- if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
- /* Make and build the new AVZ */
- uint64_t new_avz = zap_create(spa->spa_meta_objset,
- DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
- spa_avz_build(spa->spa_root_vdev, new_avz, tx);
-
- /* Diff old AVZ with new one */
- zap_cursor_t zc;
- zap_attribute_t za;
-
- for (zap_cursor_init(&zc, spa->spa_meta_objset,
- spa->spa_all_vdev_zaps);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- uint64_t vdzap = za.za_first_integer;
- if (zap_lookup_int(spa->spa_meta_objset, new_avz,
- vdzap) == ENOENT) {
- /*
- * ZAP is listed in old AVZ but not in new one;
- * destroy it
- */
- VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
- tx));
- }
- }
-
- zap_cursor_fini(&zc);
-
- /* Destroy the old AVZ */
- VERIFY0(zap_destroy(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, tx));
-
- /* Replace the old AVZ in the dir obj with the new one */
- VERIFY0(zap_update(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
- sizeof (new_avz), 1, &new_avz, tx));
-
- spa->spa_all_vdev_zaps = new_avz;
- } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
- zap_cursor_t zc;
- zap_attribute_t za;
-
- /* Walk through the AVZ and destroy all listed ZAPs */
- for (zap_cursor_init(&zc, spa->spa_meta_objset,
- spa->spa_all_vdev_zaps);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- uint64_t zap = za.za_first_integer;
- VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
- }
-
- zap_cursor_fini(&zc);
-
- /* Destroy and unlink the AVZ itself */
- VERIFY0(zap_destroy(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, tx));
- VERIFY0(zap_remove(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
- spa->spa_all_vdev_zaps = 0;
- }
-
- if (spa->spa_all_vdev_zaps == 0) {
- spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
- DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_VDEV_ZAP_MAP, tx);
- }
- spa->spa_avz_action = AVZ_ACTION_NONE;
-
- /* Create ZAPs for vdevs that don't have them. */
- vdev_construct_zaps(spa->spa_root_vdev, tx);
-
- config = spa_config_generate(spa, spa->spa_root_vdev,
- dmu_tx_get_txg(tx), B_FALSE);
-
- /*
- * If we're upgrading the spa version then make sure that
- * the config object gets updated with the correct version.
- */
- if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
- fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
- spa->spa_uberblock.ub_version);
-
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- nvlist_free(spa->spa_config_syncing);
- spa->spa_config_syncing = config;
-
- spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
-}
-
-static void
-spa_sync_version(void *arg, dmu_tx_t *tx)
-{
- uint64_t *versionp = arg;
- uint64_t version = *versionp;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- /*
- * Setting the version is special cased when first creating the pool.
- */
- ASSERT(tx->tx_txg != TXG_INITIAL);
-
- ASSERT(SPA_VERSION_IS_SUPPORTED(version));
- ASSERT(version >= spa_version(spa));
-
- spa->spa_uberblock.ub_version = version;
- vdev_config_dirty(spa->spa_root_vdev);
- spa_history_log_internal(spa, "set", tx, "version=%lld", version);
-}
-
-/*
- * Set zpool properties.
- */
-static void
-spa_sync_props(void *arg, dmu_tx_t *tx)
-{
- nvlist_t *nvp = arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- objset_t *mos = spa->spa_meta_objset;
- nvpair_t *elem = NULL;
-
- mutex_enter(&spa->spa_props_lock);
-
- while ((elem = nvlist_next_nvpair(nvp, elem))) {
- uint64_t intval;
- char *strval, *fname;
- zpool_prop_t prop;
- const char *propname;
- zprop_type_t proptype;
- spa_feature_t fid;
-
- switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
- case ZPOOL_PROP_INVAL:
- /*
- * We checked this earlier in spa_prop_validate().
- */
- ASSERT(zpool_prop_feature(nvpair_name(elem)));
-
- fname = strchr(nvpair_name(elem), '@') + 1;
- VERIFY0(zfeature_lookup_name(fname, &fid));
-
- spa_feature_enable(spa, fid, tx);
- spa_history_log_internal(spa, "set", tx,
- "%s=enabled", nvpair_name(elem));
- break;
-
- case ZPOOL_PROP_VERSION:
- intval = fnvpair_value_uint64(elem);
- /*
- * The version is synced seperatly before other
- * properties and should be correct by now.
- */
- ASSERT3U(spa_version(spa), >=, intval);
- break;
-
- case ZPOOL_PROP_ALTROOT:
- /*
- * 'altroot' is a non-persistent property. It should
- * have been set temporarily at creation or import time.
- */
- ASSERT(spa->spa_root != NULL);
- break;
-
- case ZPOOL_PROP_READONLY:
- case ZPOOL_PROP_CACHEFILE:
- /*
- * 'readonly' and 'cachefile' are also non-persisitent
- * properties.
- */
- break;
- case ZPOOL_PROP_COMMENT:
- strval = fnvpair_value_string(elem);
- if (spa->spa_comment != NULL)
- spa_strfree(spa->spa_comment);
- spa->spa_comment = spa_strdup(strval);
- /*
- * We need to dirty the configuration on all the vdevs
- * so that their labels get updated. It's unnecessary
- * to do this for pool creation since the vdev's
- * configuratoin has already been dirtied.
- */
- if (tx->tx_txg != TXG_INITIAL)
- vdev_config_dirty(spa->spa_root_vdev);
- spa_history_log_internal(spa, "set", tx,
- "%s=%s", nvpair_name(elem), strval);
- break;
- default:
- /*
- * Set pool property values in the poolprops mos object.
- */
- if (spa->spa_pool_props_object == 0) {
- spa->spa_pool_props_object =
- zap_create_link(mos, DMU_OT_POOL_PROPS,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
- tx);
- }
-
- /* normalize the property name */
- propname = zpool_prop_to_name(prop);
- proptype = zpool_prop_get_type(prop);
-
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- ASSERT(proptype == PROP_TYPE_STRING);
- strval = fnvpair_value_string(elem);
- VERIFY0(zap_update(mos,
- spa->spa_pool_props_object, propname,
- 1, strlen(strval) + 1, strval, tx));
- spa_history_log_internal(spa, "set", tx,
- "%s=%s", nvpair_name(elem), strval);
- } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
- intval = fnvpair_value_uint64(elem);
-
- if (proptype == PROP_TYPE_INDEX) {
- const char *unused;
- VERIFY0(zpool_prop_index_to_string(
- prop, intval, &unused));
- }
- VERIFY0(zap_update(mos,
- spa->spa_pool_props_object, propname,
- 8, 1, &intval, tx));
- spa_history_log_internal(spa, "set", tx,
- "%s=%lld", nvpair_name(elem), intval);
- } else {
- ASSERT(0); /* not allowed */
- }
-
- switch (prop) {
- case ZPOOL_PROP_DELEGATION:
- spa->spa_delegation = intval;
- break;
- case ZPOOL_PROP_BOOTFS:
- spa->spa_bootfs = intval;
- break;
- case ZPOOL_PROP_FAILUREMODE:
- spa->spa_failmode = intval;
- break;
- case ZPOOL_PROP_AUTOEXPAND:
- spa->spa_autoexpand = intval;
- if (tx->tx_txg != TXG_INITIAL)
- spa_async_request(spa,
- SPA_ASYNC_AUTOEXPAND);
- break;
- case ZPOOL_PROP_MULTIHOST:
- spa->spa_multihost = intval;
- break;
- case ZPOOL_PROP_DEDUPDITTO:
- spa->spa_dedup_ditto = intval;
- break;
- default:
- break;
- }
- }
-
- }
-
- mutex_exit(&spa->spa_props_lock);
-}
-
-/*
- * Perform one-time upgrade on-disk changes. spa_version() does not
- * reflect the new version this txg, so there must be no changes this
- * txg to anything that the upgrade code depends on after it executes.
- * Therefore this must be called after dsl_pool_sync() does the sync
- * tasks.
- */
-static void
-spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = spa->spa_dsl_pool;
-
- ASSERT(spa->spa_sync_pass == 1);
-
- rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
-
- if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
- dsl_pool_create_origin(dp, tx);
-
- /* Keeping the origin open increases spa_minref */
- spa->spa_minref += 3;
- }
-
- if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
- dsl_pool_upgrade_clones(dp, tx);
- }
-
- if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
- dsl_pool_upgrade_dir_clones(dp, tx);
-
- /* Keeping the freedir open increases spa_minref */
- spa->spa_minref += 3;
- }
-
- if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
- spa_feature_create_zap_objects(spa, tx);
- }
-
- /*
- * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
- * when possibility to use lz4 compression for metadata was added
- * Old pools that have this feature enabled must be upgraded to have
- * this feature active
- */
- if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
- boolean_t lz4_en = spa_feature_is_enabled(spa,
- SPA_FEATURE_LZ4_COMPRESS);
- boolean_t lz4_ac = spa_feature_is_active(spa,
- SPA_FEATURE_LZ4_COMPRESS);
-
- if (lz4_en && !lz4_ac)
- spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
- }
-
- /*
- * If we haven't written the salt, do so now. Note that the
- * feature may not be activated yet, but that's fine since
- * the presence of this ZAP entry is backwards compatible.
- */
- if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_CHECKSUM_SALT) == ENOENT) {
- VERIFY0(zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
- sizeof (spa->spa_cksum_salt.zcs_bytes),
- spa->spa_cksum_salt.zcs_bytes, tx));
- }
-
- rrw_exit(&dp->dp_config_rwlock, FTAG);
-}
-
-static void
-vdev_indirect_state_sync_verify(vdev_t *vd)
-{
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- vdev_indirect_births_t *vib = vd->vdev_indirect_births;
-
- if (vd->vdev_ops == &vdev_indirect_ops) {
- ASSERT(vim != NULL);
- ASSERT(vib != NULL);
- }
-
- if (vdev_obsolete_sm_object(vd) != 0) {
- ASSERT(vd->vdev_obsolete_sm != NULL);
- ASSERT(vd->vdev_removing ||
- vd->vdev_ops == &vdev_indirect_ops);
- ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
- ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
-
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
- space_map_object(vd->vdev_obsolete_sm));
- ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
- space_map_allocated(vd->vdev_obsolete_sm));
- }
- ASSERT(vd->vdev_obsolete_segments != NULL);
-
- /*
- * Since frees / remaps to an indirect vdev can only
- * happen in syncing context, the obsolete segments
- * tree must be empty when we start syncing.
- */
- ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
-}
-
-/*
- * Sync the specified transaction group. New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
- */
-void
-spa_sync(spa_t *spa, uint64_t txg)
-{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- objset_t *mos = spa->spa_meta_objset;
- bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- metaslab_class_t *normal = spa_normal_class(spa);
- metaslab_class_t *special = spa_special_class(spa);
- metaslab_class_t *dedup = spa_dedup_class(spa);
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- dmu_tx_t *tx;
- int error;
- uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
- zfs_vdev_queue_depth_pct / 100;
-
- VERIFY(spa_writeable(spa));
-
- /*
- * Wait for i/os issued in open context that need to complete
- * before this txg syncs.
- */
- (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
-
- /*
- * Lock out configuration changes.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- spa->spa_syncing_txg = txg;
- spa->spa_sync_pass = 0;
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
- }
-
- /*
- * If there are any pending vdev state changes, convert them
- * into config changes that go out with this transaction group.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- while (list_head(&spa->spa_state_dirty_list) != NULL) {
- /*
- * We need the write lock here because, for aux vdevs,
- * calling vdev_config_dirty() modifies sav_config.
- * This is ugly and will become unnecessary when we
- * eliminate the aux vdev wart by integrating all vdevs
- * into the root vdev tree.
- */
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
- while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
- vdev_state_clean(vd);
- vdev_config_dirty(vd);
- }
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
- }
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- spa->spa_sync_starttime = gethrtime();
-#ifdef illumos
- VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
- spa->spa_sync_starttime + spa->spa_deadman_synctime));
-#else /* !illumos */
-#ifdef _KERNEL
- callout_schedule(&spa->spa_deadman_cycid,
- hz * spa->spa_deadman_synctime / NANOSEC);
-#endif
-#endif /* illumos */
-
- /*
- * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
- * set spa_deflate if we have no raid-z vdevs.
- */
- if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
- int i;
-
- for (i = 0; i < rvd->vdev_children; i++) {
- vd = rvd->vdev_child[i];
- if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
- break;
- }
- if (i == rvd->vdev_children) {
- spa->spa_deflate = TRUE;
- VERIFY(0 == zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx));
- }
- }
-
- /*
- * Set the top-level vdev's max queue depth. Evaluate each
- * top-level's async write queue depth in case it changed.
- * The max queue depth will not change in the middle of syncing
- * out this txg.
- */
- uint64_t slots_per_allocator = 0;
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
- metaslab_class_t *mc;
-
- if (mg == NULL || !metaslab_group_initialized(mg))
- continue;
-
- mc = mg->mg_class;
- if (mc != normal && mc != special && mc != dedup)
- continue;
-
- /*
- * It is safe to do a lock-free check here because only async
- * allocations look at mg_max_alloc_queue_depth, and async
- * allocations all happen from spa_sync().
- */
- for (int i = 0; i < spa->spa_alloc_count; i++)
- ASSERT0(zfs_refcount_count(
- &(mg->mg_alloc_queue_depth[i])));
- mg->mg_max_alloc_queue_depth = max_queue_depth;
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mg->mg_cur_max_alloc_queue_depth[i] =
- zfs_vdev_def_queue_depth;
- }
- slots_per_allocator += zfs_vdev_def_queue_depth;
- }
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i]));
- ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i]));
- ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i]));
- normal->mc_alloc_max_slots[i] = slots_per_allocator;
- special->mc_alloc_max_slots[i] = slots_per_allocator;
- dedup->mc_alloc_max_slots[i] = slots_per_allocator;
- }
- normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
- special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
- dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- vdev_indirect_state_sync_verify(vd);
-
- if (vdev_indirect_should_condense(vd)) {
- spa_condense_indirect_start_sync(vd, tx);
- break;
- }
- }
-
- /*
- * Iterate to convergence.
- */
- do {
- int pass = ++spa->spa_sync_pass;
-
- spa_sync_config_object(spa, tx);
- spa_sync_aux_dev(spa, &spa->spa_spares, tx,
- ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
- spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
- ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
- spa_errlog_sync(spa, txg);
- dsl_pool_sync(dp, txg);
-
- if (pass < zfs_sync_pass_deferred_free) {
- spa_sync_frees(spa, free_bpl, tx);
- } else {
- /*
- * We can not defer frees in pass 1, because
- * we sync the deferred frees later in pass 1.
- */
- ASSERT3U(pass, >, 1);
- bplist_iterate(free_bpl, bpobj_enqueue_cb,
- &spa->spa_deferred_bpobj, tx);
- }
-
- ddt_sync(spa, txg);
- dsl_scan_sync(dp, tx);
-
- if (spa->spa_vdev_removal != NULL)
- svr_sync(spa, tx);
-
- while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
- != NULL)
- vdev_sync(vd, txg);
-
- if (pass == 1) {
- spa_sync_upgrades(spa, tx);
- ASSERT3U(txg, >=,
- spa->spa_uberblock.ub_rootbp.blk_birth);
- /*
- * Note: We need to check if the MOS is dirty
- * because we could have marked the MOS dirty
- * without updating the uberblock (e.g. if we
- * have sync tasks but no dirty user data). We
- * need to check the uberblock's rootbp because
- * it is updated if we have synced out dirty
- * data (though in this case the MOS will most
- * likely also be dirty due to second order
- * effects, we don't want to rely on that here).
- */
- if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
- !dmu_objset_is_dirty(mos, txg)) {
- /*
- * Nothing changed on the first pass,
- * therefore this TXG is a no-op. Avoid
- * syncing deferred frees, so that we
- * can keep this TXG as a no-op.
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
- txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
- ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
- txg));
- break;
- }
- spa_sync_deferred_frees(spa, tx);
- }
-
- } while (dmu_objset_is_dirty(mos, txg));
-
- if (!list_is_empty(&spa->spa_config_dirty_list)) {
- /*
- * Make sure that the number of ZAPs for all the vdevs matches
- * the number of ZAPs in the per-vdev ZAP list. This only gets
- * called if the config is dirty; otherwise there may be
- * outstanding AVZ operations that weren't completed in
- * spa_sync_config_object.
- */
- uint64_t all_vdev_zap_entry_count;
- ASSERT0(zap_count(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
- ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
- all_vdev_zap_entry_count);
- }
-
- if (spa->spa_vdev_removal != NULL) {
- ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
- }
-
- /*
- * Rewrite the vdev configuration (which includes the uberblock)
- * to commit the transaction group.
- *
- * If there are no dirty vdevs, we sync the uberblock to a few
- * random top-level vdevs that are known to be visible in the
- * config cache (see spa_vdev_add() for a complete description).
- * If there *are* dirty vdevs, sync the uberblock to all vdevs.
- */
- for (;;) {
- /*
- * We hold SCL_STATE to prevent vdev open/close/etc.
- * while we're attempting to write the vdev labels.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- if (list_is_empty(&spa->spa_config_dirty_list)) {
- vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
- int svdcount = 0;
- int children = rvd->vdev_children;
- int c0 = spa_get_random(children);
-
- for (int c = 0; c < children; c++) {
- vd = rvd->vdev_child[(c0 + c) % children];
-
- /* Stop when revisiting the first vdev */
- if (c > 0 && svd[0] == vd)
- break;
-
- if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
- !vdev_is_concrete(vd))
- continue;
-
- svd[svdcount++] = vd;
- if (svdcount == SPA_SYNC_MIN_VDEVS)
- break;
- }
- error = vdev_config_sync(svd, svdcount, txg);
- } else {
- error = vdev_config_sync(rvd->vdev_child,
- rvd->vdev_children, txg);
- }
-
- if (error == 0)
- spa->spa_last_synced_guid = rvd->vdev_guid;
-
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- if (error == 0)
- break;
- zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
- zio_resume_wait(spa);
- }
- dmu_tx_commit(tx);
-
-#ifdef illumos
- VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
-#else /* !illumos */
-#ifdef _KERNEL
- callout_drain(&spa->spa_deadman_cycid);
-#endif
-#endif /* illumos */
-
- /*
- * Clear the dirty config list.
- */
- while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
- vdev_config_clean(vd);
-
- /*
- * Now that the new config has synced transactionally,
- * let it become visible to the config cache.
- */
- if (spa->spa_config_syncing != NULL) {
- spa_config_set(spa, spa->spa_config_syncing);
- spa->spa_config_txg = txg;
- spa->spa_config_syncing = NULL;
- }
-
- dsl_pool_sync_done(dp, txg);
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
- }
-
- /*
- * Update usable space statistics.
- */
- while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
- != NULL)
- vdev_sync_done(vd, txg);
-
- spa_update_dspace(spa);
-
- /*
- * It had better be the case that we didn't dirty anything
- * since vdev_config_sync().
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
-
- while (zfs_pause_spa_sync)
- delay(1);
-
- spa->spa_sync_pass = 0;
-
- /*
- * Update the last synced uberblock here. We want to do this at
- * the end of spa_sync() so that consumers of spa_last_synced_txg()
- * will be guaranteed that all the processing associated with
- * that txg has been completed.
- */
- spa->spa_ubsync = spa->spa_uberblock;
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-
- spa_handle_ignored_writes(spa);
-
- /*
- * If any async tasks have been requested, kick them off.
- */
- spa_async_dispatch(spa);
- spa_async_dispatch_vd(spa);
-}
-
-/*
- * Sync all pools. We don't want to hold the namespace lock across these
- * operations, so we take a reference on the spa_t and drop the lock during the
- * sync.
- */
-void
-spa_sync_allpools(void)
-{
- spa_t *spa = NULL;
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE ||
- !spa_writeable(spa) || spa_suspended(spa))
- continue;
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- txg_wait_synced(spa_get_dsl(spa), 0);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
- }
- mutex_exit(&spa_namespace_lock);
-}
-
-/*
- * ==========================================================================
- * Miscellaneous routines
- * ==========================================================================
- */
-
-/*
- * Remove all pools in the system.
- */
-void
-spa_evict_all(void)
-{
- spa_t *spa;
-
- /*
- * Remove all cached state. All pools should be closed now,
- * so every spa in the AVL tree should be unreferenced.
- */
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(NULL)) != NULL) {
- /*
- * Stop async tasks. The async thread may need to detach
- * a device that's been replaced, which requires grabbing
- * spa_namespace_lock, so we must drop it here.
- */
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- spa_async_suspend(spa);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
-
- if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
- spa_unload(spa);
- spa_deactivate(spa);
- }
- spa_remove(spa);
- }
- mutex_exit(&spa_namespace_lock);
-}
-
-vdev_t *
-spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
-{
- vdev_t *vd;
- int i;
-
- if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
- return (vd);
-
- if (aux) {
- for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
- vd = spa->spa_l2cache.sav_vdevs[i];
- if (vd->vdev_guid == guid)
- return (vd);
- }
-
- for (i = 0; i < spa->spa_spares.sav_count; i++) {
- vd = spa->spa_spares.sav_vdevs[i];
- if (vd->vdev_guid == guid)
- return (vd);
- }
- }
-
- return (NULL);
-}
-
-void
-spa_upgrade(spa_t *spa, uint64_t version)
-{
- ASSERT(spa_writeable(spa));
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
- /*
- * This should only be called for a non-faulted pool, and since a
- * future version would result in an unopenable pool, this shouldn't be
- * possible.
- */
- ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
- ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
-
- spa->spa_uberblock.ub_version = version;
- vdev_config_dirty(spa->spa_root_vdev);
-
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-}
-
-boolean_t
-spa_has_spare(spa_t *spa, uint64_t guid)
-{
- int i;
- uint64_t spareguid;
- spa_aux_vdev_t *sav = &spa->spa_spares;
-
- for (i = 0; i < sav->sav_count; i++)
- if (sav->sav_vdevs[i]->vdev_guid == guid)
- return (B_TRUE);
-
- for (i = 0; i < sav->sav_npending; i++) {
- if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
- &spareguid) == 0 && spareguid == guid)
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * Check if a pool has an active shared spare device.
- * Note: reference count of an active spare is 2, as a spare and as a replace
- */
-static boolean_t
-spa_has_active_shared_spare(spa_t *spa)
-{
- int i, refcnt;
- uint64_t pool;
- spa_aux_vdev_t *sav = &spa->spa_spares;
-
- for (i = 0; i < sav->sav_count; i++) {
- if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
- &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
- refcnt > 2)
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-sysevent_t *
-spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
-{
- sysevent_t *ev = NULL;
-#ifdef _KERNEL
- sysevent_attr_list_t *attr = NULL;
- sysevent_value_t value;
-
- ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
- SE_SLEEP);
- ASSERT(ev != NULL);
-
- value.value_type = SE_DATA_TYPE_STRING;
- value.value.sv_string = spa_name(spa);
- if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
- goto done;
-
- value.value_type = SE_DATA_TYPE_UINT64;
- value.value.sv_uint64 = spa_guid(spa);
- if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
- goto done;
-
- if (vd) {
- value.value_type = SE_DATA_TYPE_UINT64;
- value.value.sv_uint64 = vd->vdev_guid;
- if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
- SE_SLEEP) != 0)
- goto done;
-
- if (vd->vdev_path) {
- value.value_type = SE_DATA_TYPE_STRING;
- value.value.sv_string = vd->vdev_path;
- if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
- &value, SE_SLEEP) != 0)
- goto done;
- }
- }
-
- if (hist_nvl != NULL) {
- fnvlist_merge((nvlist_t *)attr, hist_nvl);
- }
-
- if (sysevent_attach_attributes(ev, attr) != 0)
- goto done;
- attr = NULL;
-
-done:
- if (attr)
- sysevent_free_attr(attr);
-
-#endif
- return (ev);
-}
-
-void
-spa_event_post(sysevent_t *ev)
-{
-#ifdef _KERNEL
- sysevent_id_t eid;
-
- (void) log_sysevent(ev, SE_SLEEP, &eid);
- sysevent_free(ev);
-#endif
-}
-
-void
-spa_event_discard(sysevent_t *ev)
-{
-#ifdef _KERNEL
- sysevent_free(ev);
-#endif
-}
-
-/*
- * Post a sysevent corresponding to the given event. The 'name' must be one of
- * the event definitions in sys/sysevent/eventdefs.h. The payload will be
- * filled in from the spa and (optionally) the vdev and history nvl. This
- * doesn't do anything in the userland libzpool, as we don't want consumers to
- * misinterpret ztest or zdb as real changes.
- */
-void
-spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
-{
- spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
@@ -1,623 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-/*
- * Storage Pool Checkpoint
- *
- * A storage pool checkpoint can be thought of as a pool-wide snapshot or
- * a stable version of extreme rewind that guarantees no blocks from the
- * checkpointed state will have been overwritten. It remembers the entire
- * state of the storage pool (e.g. snapshots, dataset names, etc..) from the
- * point that it was taken and the user can rewind back to that point even if
- * they applied destructive operations on their datasets or even enabled new
- * zpool on-disk features. If a pool has a checkpoint that is no longer
- * needed, the user can discard it.
- *
- * == On disk data structures used ==
- *
- * - The pool has a new feature flag and a new entry in the MOS. The feature
- * flag is set to active when we create the checkpoint and remains active
- * until the checkpoint is fully discarded. The entry in the MOS config
- * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
- * references the state of the pool when we take the checkpoint. The entry
- * remains populated until we start discarding the checkpoint or we rewind
- * back to it.
- *
- * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
- * which persists until the checkpoint is fully discarded. The space map
- * contains entries that have been freed in the current state of the pool
- * but we want to keep around in case we decide to rewind to the checkpoint.
- * [see vdev_checkpoint_sm]
- *
- * - Each metaslab's ms_sm space map behaves the same as without the
- * checkpoint, with the only exception being the scenario when we free
- * blocks that belong to the checkpoint. In this case, these blocks remain
- * ALLOCATED in the metaslab's space map and they are added as FREE in the
- * vdev's checkpoint space map.
- *
- * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
- * the uberblock was checkpointed. For normal uberblocks this field is 0.
- *
- * == Overview of operations ==
- *
- * - To create a checkpoint, we first wait for the current TXG to be synced,
- * so we can use the most recently synced uberblock (spa_ubsync) as the
- * checkpointed uberblock. Then we use an early synctask to place that
- * uberblock in MOS config, increment the feature flag for the checkpoint
- * (marking it active), and setting spa_checkpoint_txg (see its use below)
- * to the TXG of the checkpointed uberblock. We use an early synctask for
- * the aforementioned operations to ensure that no blocks were dirtied
- * between the current TXG and the TXG of the checkpointed uberblock
- * (e.g the previous txg).
- *
- * - When a checkpoint exists, we need to ensure that the blocks that
- * belong to the checkpoint are freed but never reused. This means that
- * these blocks should never end up in the ms_allocatable or the ms_freeing
- * trees of a metaslab. Therefore, whenever there is a checkpoint the new
- * ms_checkpointing tree is used in addition to the aforementioned ones.
- *
- * Whenever a block is freed and we find out that it is referenced by the
- * checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
- * we place it in the ms_checkpointing tree instead of the ms_freeingtree.
- * This way, we divide the blocks that are being freed into checkpointed
- * and not-checkpointed blocks.
- *
- * In order to persist these frees, we write the extents from the
- * ms_freeingtree to the ms_sm as usual, and the extents from the
- * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
- * checkpointed extents will remain allocated in the metaslab's ms_sm space
- * map, and therefore won't be reused [see metaslab_sync()]. In addition,
- * when we discard the checkpoint, we can find the entries that have
- * actually been freed in vdev_checkpoint_sm.
- * [see spa_checkpoint_discard_thread_sync()]
- *
- * - To discard the checkpoint we use an early synctask to delete the
- * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
- * and wakeup the discarding zthr thread (an open-context async thread).
- * We use an early synctask to ensure that the operation happens before any
- * new data end up in the checkpoint's data structures.
- *
- * Once the synctask is done and the discarding zthr is awake, we discard
- * the checkpointed data over multiple TXGs by having the zthr prefetching
- * entries from vdev_checkpoint_sm and then starting a synctask that places
- * them as free blocks in to their respective ms_allocatable and ms_sm
- * structures.
- * [see spa_checkpoint_discard_thread()]
- *
- * When there are no entries left in the vdev_checkpoint_sm of all
- * top-level vdevs, a final synctask runs that decrements the feature flag.
- *
- * - To rewind to the checkpoint, we first use the current uberblock and
- * open the MOS so we can access the checkpointed uberblock from the MOS
- * config. After we retrieve the checkpointed uberblock, we use it as the
- * current uberblock for the pool by writing it to disk with an updated
- * TXG, opening its version of the MOS, and moving on as usual from there.
- * [see spa_ld_checkpoint_rewind()]
- *
- * An important note on rewinding to the checkpoint has to do with how we
- * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
- * blocks that have not been claimed by the time we took the checkpoint
- * as they should no longer be valid.
- * [see comment in zil_claim()]
- *
- * == Miscellaneous information ==
- *
- * - In the hypothetical event that we take a checkpoint, remove a vdev,
- * and attempt to rewind, the rewind would fail as the checkpointed
- * uberblock would reference data in the removed device. For this reason
- * and others of similar nature, we disallow the following operations that
- * can change the config:
- * vdev removal and attach/detach, mirror splitting, and pool reguid.
- *
- * - As most of the checkpoint logic is implemented in the SPA and doesn't
- * distinguish datasets when it comes to space accounting, having a
- * checkpoint can potentially break the boundaries set by dataset
- * reservations.
- */
-
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/metaslab_impl.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/spa_checkpoint.h>
-#include <sys/vdev_impl.h>
-#include <sys/zap.h>
-#include <sys/zfeature.h>
-
-/*
- * The following parameter limits the amount of memory to be used for the
- * prefetching of the checkpoint space map done on each vdev while
- * discarding the checkpoint.
- *
- * The reason it exists is because top-level vdevs with long checkpoint
- * space maps can potentially take up a lot of memory depending on the
- * amount of checkpointed data that has been freed within them while
- * the pool had a checkpoint.
- */
-uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
-
-int
-spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
-{
- if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
- return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
-
- bzero(pcs, sizeof (pool_checkpoint_stat_t));
-
- int error = zap_contains(spa_meta_objset(spa),
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
- ASSERT(error == 0 || error == ENOENT);
-
- if (error == ENOENT)
- pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
- else
- pcs->pcs_state = CS_CHECKPOINT_EXISTS;
-
- pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
- pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
-
- return (0);
-}
-
-static void
-spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = arg;
-
- spa->spa_checkpoint_info.sci_timestamp = 0;
-
- spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
-
- spa_history_log_internal(spa, "spa discard checkpoint", tx,
- "finished discarding checkpointed state from the pool");
-}
-
-typedef struct spa_checkpoint_discard_sync_callback_arg {
- vdev_t *sdc_vd;
- uint64_t sdc_txg;
- uint64_t sdc_entry_limit;
-} spa_checkpoint_discard_sync_callback_arg_t;
-
-static int
-spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
-{
- spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
- vdev_t *vd = sdc->sdc_vd;
- metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
- uint64_t end = sme->sme_offset + sme->sme_run;
-
- if (sdc->sdc_entry_limit == 0)
- return (EINTR);
-
- /*
- * Since the space map is not condensed, we know that
- * none of its entries is crossing the boundaries of
- * its respective metaslab.
- *
- * That said, there is no fundamental requirement that
- * the checkpoint's space map entries should not cross
- * metaslab boundaries. So if needed we could add code
- * that handles metaslab-crossing segments in the future.
- */
- VERIFY3U(sme->sme_type, ==, SM_FREE);
- VERIFY3U(sme->sme_offset, >=, ms->ms_start);
- VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
-
- /*
- * At this point we should not be processing any
- * other frees concurrently, so the lock is technically
- * unnecessary. We use the lock anyway though to
- * potentially save ourselves from future headaches.
- */
- mutex_enter(&ms->ms_lock);
- if (range_tree_is_empty(ms->ms_freeing))
- vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
- range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
- mutex_exit(&ms->ms_lock);
-
- ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
- sme->sme_run);
- ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
-
- vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
- vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
- sdc->sdc_entry_limit--;
-
- return (0);
-}
-
-static void
-spa_checkpoint_accounting_verify(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t ckpoint_sm_space_sum = 0;
- uint64_t vs_ckpoint_space_sum = 0;
-
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
-
- if (vd->vdev_checkpoint_sm != NULL) {
- ckpoint_sm_space_sum +=
- -space_map_allocated(vd->vdev_checkpoint_sm);
- vs_ckpoint_space_sum +=
- vd->vdev_stat.vs_checkpoint_space;
- ASSERT3U(ckpoint_sm_space_sum, ==,
- vs_ckpoint_space_sum);
- } else {
- ASSERT0(vd->vdev_stat.vs_checkpoint_space);
- }
- }
- ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
-}
-
-static void
-spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
-{
- vdev_t *vd = arg;
- int error;
-
- /*
- * The space map callback is applied only to non-debug entries.
- * Because the number of debug entries is less or equal to the
- * number of non-debug entries, we want to ensure that we only
- * read what we prefetched from open-context.
- *
- * Thus, we set the maximum entries that the space map callback
- * will be applied to be half the entries that could fit in the
- * imposed memory limit.
- *
- * Note that since this is a conservative estimate we also
- * assume the worst case scenario in our computation where each
- * entry is two-word.
- */
- uint64_t max_entry_limit =
- (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
-
- /*
- * Iterate from the end of the space map towards the beginning,
- * placing its entries on ms_freeing and removing them from the
- * space map. The iteration stops if one of the following
- * conditions is true:
- *
- * 1] We reached the beginning of the space map. At this point
- * the space map should be completely empty and
- * space_map_incremental_destroy should have returned 0.
- * The next step would be to free and close the space map
- * and remove its entry from its vdev's top zap. This allows
- * spa_checkpoint_discard_thread() to move on to the next vdev.
- *
- * 2] We reached the memory limit (amount of memory used to hold
- * space map entries in memory) and space_map_incremental_destroy
- * returned EINTR. This means that there are entries remaining
- * in the space map that will be cleared in a future invocation
- * of this function by spa_checkpoint_discard_thread().
- */
- spa_checkpoint_discard_sync_callback_arg_t sdc;
- sdc.sdc_vd = vd;
- sdc.sdc_txg = tx->tx_txg;
- sdc.sdc_entry_limit = max_entry_limit;
-
- uint64_t words_before =
- space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
-
- error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
- spa_checkpoint_discard_sync_callback, &sdc, tx);
-
- uint64_t words_after =
- space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
-
-#ifdef DEBUG
- spa_checkpoint_accounting_verify(vd->vdev_spa);
-#endif
-
- zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
- "deleted %llu words - %llu words are left",
- tx->tx_txg, vd->vdev_id, (words_before - words_after),
- words_after);
-
- if (error != EINTR) {
- if (error != 0) {
- zfs_panic_recover("zfs: error %d was returned "
- "while incrementally destroying the checkpoint "
- "space map of vdev %llu\n",
- error, vd->vdev_id);
- }
- ASSERT0(words_after);
- ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
- ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
-
- space_map_free(vd->vdev_checkpoint_sm, tx);
- space_map_close(vd->vdev_checkpoint_sm);
- vd->vdev_checkpoint_sm = NULL;
-
- VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
- vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
- }
-}
-
-static boolean_t
-spa_checkpoint_discard_is_done(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- ASSERT(!spa_has_checkpoint(spa));
- ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
-
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
- return (B_FALSE);
- ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
- }
-
- return (B_TRUE);
-}
-
-/* ARGSUSED */
-boolean_t
-spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
-{
- spa_t *spa = arg;
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
- return (B_FALSE);
-
- if (spa_has_checkpoint(spa))
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-void
-spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
-{
- spa_t *spa = arg;
- vdev_t *rvd = spa->spa_root_vdev;
-
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
-
- while (vd->vdev_checkpoint_sm != NULL) {
- space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
- int numbufs;
- dmu_buf_t **dbp;
-
- if (zthr_iscancelled(zthr))
- return;
-
- ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
-
- uint64_t size = MIN(space_map_length(checkpoint_sm),
- zfs_spa_discard_memory_limit);
- uint64_t offset =
- space_map_length(checkpoint_sm) - size;
-
- /*
- * Ensure that the part of the space map that will
- * be destroyed by the synctask, is prefetched in
- * memory before the synctask runs.
- */
- int error = dmu_buf_hold_array_by_bonus(
- checkpoint_sm->sm_dbuf, offset, size,
- B_TRUE, FTAG, &numbufs, &dbp);
- if (error != 0) {
- zfs_panic_recover("zfs: error %d was returned "
- "while prefetching checkpoint space map "
- "entries of vdev %llu\n",
- error, vd->vdev_id);
- }
-
- VERIFY0(dsl_sync_task(spa->spa_name, NULL,
- spa_checkpoint_discard_thread_sync, vd,
- 0, ZFS_SPACE_CHECK_NONE));
-
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- }
- }
-
- VERIFY(spa_checkpoint_discard_is_done(spa));
- VERIFY0(spa->spa_checkpoint_info.sci_dspace);
- VERIFY0(dsl_sync_task(spa->spa_name, NULL,
- spa_checkpoint_discard_complete_sync, spa,
- 0, ZFS_SPACE_CHECK_NONE));
-}
-
-
-/* ARGSUSED */
-static int
-spa_checkpoint_check(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
- return (SET_ERROR(ENOTSUP));
-
- if (!spa_top_vdevs_spacemap_addressable(spa))
- return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
-
- if (spa->spa_vdev_removal != NULL)
- return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
-
- if (spa->spa_checkpoint_txg != 0)
- return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
-
- if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
- return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dmu_tx_pool(tx);
- spa_t *spa = dp->dp_spa;
- uberblock_t checkpoint = spa->spa_ubsync;
-
- /*
- * At this point, there should not be a checkpoint in the MOS.
- */
- ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
-
- ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
- ASSERT0(spa->spa_checkpoint_info.sci_dspace);
-
- /*
- * Since the checkpointed uberblock is the one that just got synced
- * (we use spa_ubsync), its txg must be equal to the txg number of
- * the txg we are syncing, minus 1.
- */
- ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
-
- /*
- * Once the checkpoint is in place, we need to ensure that none of
- * its blocks will be marked for reuse after it has been freed.
- * When there is a checkpoint and a block is freed, we compare its
- * birth txg to the txg of the checkpointed uberblock to see if the
- * block is part of the checkpoint or not. Therefore, we have to set
- * spa_checkpoint_txg before any frees happen in this txg (which is
- * why this is done as an early_synctask as explained in the comment
- * in spa_checkpoint()).
- */
- spa->spa_checkpoint_txg = checkpoint.ub_txg;
- spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
-
- checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
- VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
- sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
- &checkpoint, tx));
-
- /*
- * Increment the feature refcount and thus activate the feature.
- * Note that the feature will be deactivated when we've
- * completely discarded all checkpointed state (both vdev
- * space maps and uberblock).
- */
- spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
-
- spa_history_log_internal(spa, "spa checkpoint", tx,
- "checkpointed uberblock txg=%llu", checkpoint.ub_txg);
-}
-
-/*
- * Create a checkpoint for the pool.
- */
-int
-spa_checkpoint(const char *pool)
-{
- int error;
- spa_t *spa;
-
- error = spa_open(pool, &spa, FTAG);
- if (error != 0)
- return (error);
-
- mutex_enter(&spa->spa_vdev_top_lock);
-
- /*
- * Wait for current syncing txg to finish so the latest synced
- * uberblock (spa_ubsync) has all the changes that we expect
- * to see if we were to revert later to the checkpoint. In other
- * words we want the checkpointed uberblock to include/reference
- * all the changes that were pending at the time that we issued
- * the checkpoint command.
- */
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- /*
- * As the checkpointed uberblock references blocks from the previous
- * txg (spa_ubsync) we want to ensure that are not freeing any of
- * these blocks in the same txg that the following synctask will
- * run. Thus, we run it as an early synctask, so the dirty changes
- * that are synced to disk afterwards during zios and other synctasks
- * do not reuse checkpointed blocks.
- */
- error = dsl_early_sync_task(pool, spa_checkpoint_check,
- spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
-
- mutex_exit(&spa->spa_vdev_top_lock);
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-/* ARGSUSED */
-static int
-spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
- return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
-
- if (spa->spa_checkpoint_txg == 0)
- return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
-
- VERIFY0(zap_contains(spa_meta_objset(spa),
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ZPOOL_CHECKPOINT, tx));
-
- spa->spa_checkpoint_txg = 0;
-
- zthr_wakeup(spa->spa_checkpoint_discard_zthr);
-
- spa_history_log_internal(spa, "spa discard checkpoint", tx,
- "started discarding checkpointed state from the pool");
-}
-
-/*
- * Discard the checkpoint from a pool.
- */
-int
-spa_checkpoint_discard(const char *pool)
-{
- /*
- * Similarly to spa_checkpoint(), we want our synctask to run
- * before any pending dirty data are written to disk so they
- * won't end up in the checkpoint's data structures (e.g.
- * ms_checkpointing and vdev_checkpoint_sm) and re-create any
- * space maps that the discarding open-context thread has
- * deleted.
- * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread]
- */
- return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
- spa_checkpoint_discard_sync, NULL, 0,
- ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -1,594 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2017 Joyent, Inc.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa_impl.h>
-#include <sys/nvpair.h>
-#include <sys/uio.h>
-#include <sys/fs/zfs.h>
-#include <sys/vdev_impl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/utsname.h>
-#include <sys/sunddi.h>
-#include <sys/zfeature.h>
-#ifdef _KERNEL
-#include <sys/kobj.h>
-#include <sys/zone.h>
-#endif
-
-/*
- * Pool configuration repository.
- *
- * Pool configuration is stored as a packed nvlist on the filesystem. By
- * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
- * (when the ZFS module is loaded). Pools can also have the 'cachefile'
- * property set that allows them to be stored in an alternate location until
- * the control of external software.
- *
- * For each cache file, we have a single nvlist which holds all the
- * configuration information. When the module loads, we read this information
- * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
- * maintained independently in spa.c. Whenever the namespace is modified, or
- * the configuration of a pool is changed, we call spa_write_cachefile(), which
- * walks through all the active pools and writes the configuration to disk.
- */
-
-static uint64_t spa_config_generation = 1;
-
-/*
- * This can be overridden in userland to preserve an alternate namespace for
- * userland pools when doing testing.
- */
-const char *spa_config_path = ZPOOL_CACHE;
-
-/*
- * Called when the module is first loaded, this routine loads the configuration
- * file into the SPA namespace. It does not actually open or load the pools; it
- * only populates the namespace.
- */
-void
-spa_config_load(void)
-{
- void *buf = NULL;
- nvlist_t *nvlist, *child;
- nvpair_t *nvpair;
- char *pathname;
- struct _buf *file;
- uint64_t fsize;
-
- /*
- * Open the configuration file.
- */
- pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
- (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
-
- file = kobj_open_file(pathname);
-
- kmem_free(pathname, MAXPATHLEN);
-
- if (file == (struct _buf *)-1)
- return;
-
- if (kobj_get_filesize(file, &fsize) != 0)
- goto out;
-
- buf = kmem_alloc(fsize, KM_SLEEP);
-
- /*
- * Read the nvlist from the file.
- */
- if (kobj_read_file(file, buf, fsize, 0) < 0)
- goto out;
-
- /*
- * Unpack the nvlist.
- */
- if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
- goto out;
-
- /*
- * Iterate over all elements in the nvlist, creating a new spa_t for
- * each one with the specified configuration.
- */
- mutex_enter(&spa_namespace_lock);
- nvpair = NULL;
- while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
- if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
- continue;
-
- child = fnvpair_value_nvlist(nvpair);
-
- if (spa_lookup(nvpair_name(nvpair)) != NULL)
- continue;
- (void) spa_add(nvpair_name(nvpair), child, NULL);
- }
- mutex_exit(&spa_namespace_lock);
-
- nvlist_free(nvlist);
-
-out:
- if (buf != NULL)
- kmem_free(buf, fsize);
-
- kobj_close_file(file);
-}
-
-static void
-spa_config_clean(nvlist_t *nvl)
-{
- nvlist_t **child;
- nvlist_t *nvroot = NULL;
- uint_t c, children;
-
- if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
- &children) == 0) {
- for (c = 0; c < children; c++)
- spa_config_clean(child[c]);
- }
-
- if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0)
- spa_config_clean(nvroot);
-
- nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY);
- nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY);
-}
-
-static int
-spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
-{
- size_t buflen;
- char *buf;
- vnode_t *vp;
- int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
- char *temp;
- int err;
-
- /*
- * If the nvlist is empty (NULL), then remove the old cachefile.
- */
- if (nvl == NULL) {
- err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
- return (err);
- }
-
- /*
- * Pack the configuration into a buffer.
- */
- buf = fnvlist_pack(nvl, &buflen);
- temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
-
- /*
- * Write the configuration to disk. We need to do the traditional
- * 'write to temporary file, sync, move over original' to make sure we
- * always have a consistent view of the data.
- */
- (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path);
-
- err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0);
- if (err == 0) {
- err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, NULL);
- if (err == 0)
- err = VOP_FSYNC(vp, FSYNC, kcred, NULL);
- if (err == 0)
- err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
- (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
- }
-
- (void) vn_remove(temp, UIO_SYSSPACE, RMFILE);
-
- fnvlist_pack_free(buf, buflen);
- kmem_free(temp, MAXPATHLEN);
- return (err);
-}
-
-/*
- * Synchronize pool configuration to disk. This must be called with the
- * namespace lock held. Synchronizing the pool cache is typically done after
- * the configuration has been synced to the MOS. This exposes a window where
- * the MOS config will have been updated but the cache file has not. If
- * the system were to crash at that instant then the cached config may not
- * contain the correct information to open the pool and an explicit import
- * would be required.
- */
-void
-spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
-{
- spa_config_dirent_t *dp, *tdp;
- nvlist_t *nvl;
- boolean_t ccw_failure;
- int error;
- char *pool_name;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- if (rootdir == NULL || !(spa_mode_global & FWRITE))
- return;
-
- /*
- * Iterate over all cachefiles for the pool, past or present. When the
- * cachefile is changed, the new one is pushed onto this list, allowing
- * us to update previous cachefiles that no longer contain this pool.
- */
- ccw_failure = B_FALSE;
- for (dp = list_head(&target->spa_config_list); dp != NULL;
- dp = list_next(&target->spa_config_list, dp)) {
- spa_t *spa = NULL;
- if (dp->scd_path == NULL)
- continue;
-
- /*
- * Iterate over all pools, adding any matching pools to 'nvl'.
- */
- nvl = NULL;
- while ((spa = spa_next(spa)) != NULL) {
- nvlist_t *nvroot = NULL;
- /*
- * Skip over our own pool if we're about to remove
- * ourselves from the spa namespace or any pool that
- * is readonly. Since we cannot guarantee that a
- * readonly pool would successfully import upon reboot,
- * we don't allow them to be written to the cache file.
- */
- if ((spa == target && removing) ||
- (spa_state(spa) == POOL_STATE_ACTIVE &&
- !spa_writeable(spa)))
- continue;
-
- mutex_enter(&spa->spa_props_lock);
- tdp = list_head(&spa->spa_config_list);
- if (spa->spa_config == NULL ||
- tdp->scd_path == NULL ||
- strcmp(tdp->scd_path, dp->scd_path) != 0) {
- mutex_exit(&spa->spa_props_lock);
- continue;
- }
-
- if (nvl == NULL)
- nvl = fnvlist_alloc();
-
- if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
- pool_name = fnvlist_lookup_string(
- spa->spa_config, ZPOOL_CONFIG_POOL_NAME);
- } else {
- pool_name = spa_name(spa);
- }
-
- fnvlist_add_nvlist(nvl, pool_name,
- spa->spa_config);
- mutex_exit(&spa->spa_props_lock);
-
- if (nvlist_lookup_nvlist(nvl, pool_name, &nvroot) == 0)
- spa_config_clean(nvroot);
- }
-
- error = spa_config_write(dp, nvl);
- if (error != 0)
- ccw_failure = B_TRUE;
- nvlist_free(nvl);
- }
-
- if (ccw_failure) {
- /*
- * Keep trying so that configuration data is
- * written if/when any temporary filesystem
- * resource issues are resolved.
- */
- if (target->spa_ccw_fail_time == 0) {
- zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
- target, NULL, NULL, 0, 0);
- }
- target->spa_ccw_fail_time = gethrtime();
- spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
- } else {
- /*
- * Do not rate limit future attempts to update
- * the config cache.
- */
- target->spa_ccw_fail_time = 0;
- }
-
- /*
- * Remove any config entries older than the current one.
- */
- dp = list_head(&target->spa_config_list);
- while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
- list_remove(&target->spa_config_list, tdp);
- if (tdp->scd_path != NULL)
- spa_strfree(tdp->scd_path);
- kmem_free(tdp, sizeof (spa_config_dirent_t));
- }
-
- spa_config_generation++;
-
- if (postsysevent)
- spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
-}
-
-/*
- * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
- * and we don't want to allow the local zone to see all the pools anyway.
- * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
- * information for all pool visible within the zone.
- */
-nvlist_t *
-spa_all_configs(uint64_t *generation)
-{
- nvlist_t *pools;
- spa_t *spa = NULL;
-
- if (*generation == spa_config_generation)
- return (NULL);
-
- pools = fnvlist_alloc();
-
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (INGLOBALZONE(curthread) ||
- zone_dataset_visible(spa_name(spa), NULL)) {
- mutex_enter(&spa->spa_props_lock);
- fnvlist_add_nvlist(pools, spa_name(spa),
- spa->spa_config);
- mutex_exit(&spa->spa_props_lock);
- }
- }
- *generation = spa_config_generation;
- mutex_exit(&spa_namespace_lock);
-
- return (pools);
-}
-
-void
-spa_config_set(spa_t *spa, nvlist_t *config)
-{
- mutex_enter(&spa->spa_props_lock);
- if (spa->spa_config != NULL && spa->spa_config != config)
- nvlist_free(spa->spa_config);
- spa->spa_config = config;
- mutex_exit(&spa->spa_props_lock);
-}
-
-/*
- * Generate the pool's configuration based on the current in-core state.
- *
- * We infer whether to generate a complete config or just one top-level config
- * based on whether vd is the root vdev.
- */
-nvlist_t *
-spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
-{
- nvlist_t *config, *nvroot;
- vdev_t *rvd = spa->spa_root_vdev;
- unsigned long hostid = 0;
- boolean_t locked = B_FALSE;
- uint64_t split_guid;
- char *pool_name;
-
- if (vd == NULL) {
- vd = rvd;
- locked = B_TRUE;
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
- }
-
- ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
- (SCL_CONFIG | SCL_STATE));
-
- /*
- * If txg is -1, report the current value of spa->spa_config_txg.
- */
- if (txg == -1ULL)
- txg = spa->spa_config_txg;
-
- /*
- * Originally, users had to handle spa namespace collisions by either
- * exporting the already imported pool or by specifying a new name for
- * the pool with a conflicting name. In the case of root pools from
- * virtual guests, neither approach to collision resolution is
- * reasonable. This is addressed by extending the new name syntax with
- * an option to specify that the new name is temporary. When specified,
- * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us
- * to use the previous name, which we do below.
- */
- if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
- pool_name = fnvlist_lookup_string(spa->spa_config,
- ZPOOL_CONFIG_POOL_NAME);
- } else {
- pool_name = spa_name(spa);
- }
-
- config = fnvlist_alloc();
-
- fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
- fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name);
- fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
- fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
- fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
- if (spa->spa_comment != NULL) {
- fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
- spa->spa_comment);
- }
-
- hostid = spa_get_hostid();
- if (hostid != 0) {
- fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
- }
- fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename);
-
- int config_gen_flags = 0;
- if (vd != rvd) {
- fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
- vd->vdev_top->vdev_guid);
- fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
- vd->vdev_guid);
- if (vd->vdev_isspare) {
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_IS_SPARE, 1ULL);
- }
- if (vd->vdev_islog) {
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_IS_LOG, 1ULL);
- }
- vd = vd->vdev_top; /* label contains top config */
- } else {
- /*
- * Only add the (potentially large) split information
- * in the mos config, and not in the vdev labels
- */
- if (spa->spa_config_splitting != NULL)
- fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
- spa->spa_config_splitting);
- fnvlist_add_boolean(config,
- ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);
-
- config_gen_flags |= VDEV_CONFIG_MOS;
- }
-
- /*
- * Add the top-level config. We even add this on pools which
- * don't support holes in the namespace.
- */
- vdev_top_config_generate(spa, config);
-
- /*
- * If we're splitting, record the original pool's guid.
- */
- if (spa->spa_config_splitting != NULL &&
- nvlist_lookup_uint64(spa->spa_config_splitting,
- ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
- fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
- split_guid);
- }
-
- nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
- fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
- nvlist_free(nvroot);
-
- /*
- * Store what's necessary for reading the MOS in the label.
- */
- fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
- spa->spa_label_features);
-
- if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
- ddt_histogram_t *ddh;
- ddt_stat_t *dds;
- ddt_object_t *ddo;
-
- ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
- ddt_get_dedup_histogram(spa, ddh);
- fnvlist_add_uint64_array(config,
- ZPOOL_CONFIG_DDT_HISTOGRAM,
- (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
- kmem_free(ddh, sizeof (ddt_histogram_t));
-
- ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
- ddt_get_dedup_object_stats(spa, ddo);
- fnvlist_add_uint64_array(config,
- ZPOOL_CONFIG_DDT_OBJ_STATS,
- (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
- kmem_free(ddo, sizeof (ddt_object_t));
-
- dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
- ddt_get_dedup_stats(spa, dds);
- fnvlist_add_uint64_array(config,
- ZPOOL_CONFIG_DDT_STATS,
- (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
- kmem_free(dds, sizeof (ddt_stat_t));
- }
-
- if (locked)
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
-
- return (config);
-}
-
-/*
- * Update all disk labels, generate a fresh config based on the current
- * in-core state, and sync the global config cache (do not sync the config
- * cache if this is a booting rootpool).
- */
-void
-spa_config_update(spa_t *spa, int what)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t txg;
- int c;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- txg = spa_last_synced_txg(spa) + 1;
- if (what == SPA_CONFIG_UPDATE_POOL) {
- vdev_config_dirty(rvd);
- } else {
- /*
- * If we have top-level vdevs that were added but have
- * not yet been prepared for allocation, do that now.
- * (It's safe now because the config cache is up to date,
- * so it will be able to translate the new DVAs.)
- * See comments in spa_vdev_add() for full details.
- */
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
-
- /*
- * Explicitly skip vdevs that are indirect or
- * log vdevs that are being removed. The reason
- * is that both of those can have vdev_ms_array
- * set to 0 and we wouldn't want to change their
- * metaslab size nor call vdev_expand() on them.
- */
- if (!vdev_is_concrete(tvd) ||
- (tvd->vdev_islog && tvd->vdev_removing))
- continue;
-
- if (tvd->vdev_ms_array == 0) {
- vdev_ashift_optimize(tvd);
- vdev_metaslab_set_size(tvd);
- }
- vdev_expand(tvd, txg);
- }
- }
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- /*
- * Wait for the mosconfig to be regenerated and synced.
- */
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- /*
- * Update the global config cache to reflect the new mosconfig.
- */
- spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
-
- if (what == SPA_CONFIG_UPDATE_POOL)
- spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
@@ -1,406 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
- */
-
-/*
- * Routines to manage the on-disk persistent error log.
- *
- * Each pool stores a log of all logical data errors seen during normal
- * operation. This is actually the union of two distinct logs: the last log,
- * and the current log. All errors seen are logged to the current log. When a
- * scrub completes, the current log becomes the last log, the last log is thrown
- * out, and the current log is reinitialized. This way, if an error is somehow
- * corrected, a new scrub will show that that it no longer exists, and will be
- * deleted from the log when the scrub completes.
- *
- * The log is stored using a ZAP object whose key is a string form of the
- * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
- * optional 'objset:object' human-readable string describing the data. When an
- * error is first logged, this string will be empty, indicating that no name is
- * known. This prevents us from having to issue a potentially large amount of
- * I/O to discover the object name during an error path. Instead, we do the
- * calculation when the data is requested, storing the result so future queries
- * will be faster.
- *
- * This log is then shipped into an nvlist where the key is the dataset name and
- * the value is the object name. Userland is then responsible for uniquifying
- * this list and displaying it to the user.
- */
-
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-
-
-/*
- * Convert a bookmark to a string.
- */
-static void
-bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
-{
- (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
- (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
- (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
-}
-
-/*
- * Convert a string to a bookmark
- */
-#ifdef _KERNEL
-static void
-name_to_bookmark(char *buf, zbookmark_phys_t *zb)
-{
- zb->zb_objset = zfs_strtonum(buf, &buf);
- ASSERT(*buf == ':');
- zb->zb_object = zfs_strtonum(buf + 1, &buf);
- ASSERT(*buf == ':');
- zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
- ASSERT(*buf == ':');
- zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
- ASSERT(*buf == '\0');
-}
-#endif
-
-/*
- * Log an uncorrectable error to the persistent error log. We add it to the
- * spa's list of pending errors. The changes are actually synced out to disk
- * during spa_errlog_sync().
- */
-void
-spa_log_error(spa_t *spa, zio_t *zio)
-{
- zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
- spa_error_entry_t search;
- spa_error_entry_t *new;
- avl_tree_t *tree;
- avl_index_t where;
-
- /*
- * If we are trying to import a pool, ignore any errors, as we won't be
- * writing to the pool any time soon.
- */
- if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
- return;
-
- mutex_enter(&spa->spa_errlist_lock);
-
- /*
- * If we have had a request to rotate the log, log it to the next list
- * instead of the current one.
- */
- if (spa->spa_scrub_active || spa->spa_scrub_finished)
- tree = &spa->spa_errlist_scrub;
- else
- tree = &spa->spa_errlist_last;
-
- search.se_bookmark = *zb;
- if (avl_find(tree, &search, &where) != NULL) {
- mutex_exit(&spa->spa_errlist_lock);
- return;
- }
-
- new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
- new->se_bookmark = *zb;
- avl_insert(tree, new, where);
-
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Return the number of errors currently in the error log. This is actually the
- * sum of both the last log and the current log, since we don't know the union
- * of these logs until we reach userland.
- */
-uint64_t
-spa_get_errlog_size(spa_t *spa)
-{
- uint64_t total = 0, count;
-
- mutex_enter(&spa->spa_errlog_lock);
- if (spa->spa_errlog_scrub != 0 &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
- &count) == 0)
- total += count;
-
- if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
- &count) == 0)
- total += count;
- mutex_exit(&spa->spa_errlog_lock);
-
- mutex_enter(&spa->spa_errlist_lock);
- total += avl_numnodes(&spa->spa_errlist_last);
- total += avl_numnodes(&spa->spa_errlist_scrub);
- mutex_exit(&spa->spa_errlist_lock);
-
- return (total);
-}
-
-#ifdef _KERNEL
-static int
-process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- zbookmark_phys_t zb;
-
- if (obj == 0)
- return (0);
-
- for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
-
- if (*count == 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(ENOMEM));
- }
-
- name_to_bookmark(za.za_name, &zb);
-
- if (copyout(&zb, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(EFAULT));
- }
-
- *count -= 1;
- }
-
- zap_cursor_fini(&zc);
-
- return (0);
-}
-
-static int
-process_error_list(avl_tree_t *list, void *addr, size_t *count)
-{
- spa_error_entry_t *se;
-
- for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
-
- if (*count == 0)
- return (SET_ERROR(ENOMEM));
-
- if (copyout(&se->se_bookmark, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0)
- return (SET_ERROR(EFAULT));
-
- *count -= 1;
- }
-
- return (0);
-}
-#endif
-
-/*
- * Copy all known errors to userland as an array of bookmarks. This is
- * actually a union of the on-disk last log and current log, as well as any
- * pending error requests.
- *
- * Because the act of reading the on-disk log could cause errors to be
- * generated, we have two separate locks: one for the error log and one for the
- * in-core error lists. We only need the error list lock to log and error, so
- * we grab the error log lock while we read the on-disk logs, and only pick up
- * the error list lock when we are finished.
- */
-int
-spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
-{
- int ret = 0;
-
-#ifdef _KERNEL
- mutex_enter(&spa->spa_errlog_lock);
-
- ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
-
- if (!ret && !spa->spa_scrub_finished)
- ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
- count);
-
- mutex_enter(&spa->spa_errlist_lock);
- if (!ret)
- ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
- count);
- if (!ret)
- ret = process_error_list(&spa->spa_errlist_last, uaddr,
- count);
- mutex_exit(&spa->spa_errlist_lock);
-
- mutex_exit(&spa->spa_errlog_lock);
-#endif
-
- return (ret);
-}
-
-/*
- * Called when a scrub completes. This simply set a bit which tells which AVL
- * tree to add new errors. spa_errlog_sync() is responsible for actually
- * syncing the changes to the underlying objects.
- */
-void
-spa_errlog_rotate(spa_t *spa)
-{
- mutex_enter(&spa->spa_errlist_lock);
- spa->spa_scrub_finished = B_TRUE;
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Discard any pending errors from the spa_t. Called when unloading a faulted
- * pool, as the errors encountered during the open cannot be synced to disk.
- */
-void
-spa_errlog_drain(spa_t *spa)
-{
- spa_error_entry_t *se;
- void *cookie;
-
- mutex_enter(&spa->spa_errlist_lock);
-
- cookie = NULL;
- while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
- &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
- cookie = NULL;
- while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
- &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
-
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Process a list of errors into the current on-disk log.
- */
-static void
-sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
-{
- spa_error_entry_t *se;
- char buf[64];
- void *cookie;
-
- if (avl_numnodes(t) != 0) {
- /* create log if necessary */
- if (*obj == 0)
- *obj = zap_create(spa->spa_meta_objset,
- DMU_OT_ERROR_LOG, DMU_OT_NONE,
- 0, tx);
-
- /* add errors to the current log */
- for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
- char *name = se->se_name ? se->se_name : "";
-
- bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
-
- (void) zap_update(spa->spa_meta_objset,
- *obj, buf, 1, strlen(name) + 1, name, tx);
- }
-
- /* purge the error list */
- cookie = NULL;
- while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
- }
-}
-
-/*
- * Sync the error log out to disk. This is a little tricky because the act of
- * writing the error log requires the spa_errlist_lock. So, we need to lock the
- * error lists, take a copy of the lists, and then reinitialize them. Then, we
- * drop the error list lock and take the error log lock, at which point we
- * do the errlog processing. Then, if we encounter an I/O error during this
- * process, we can successfully add the error to the list. Note that this will
- * result in the perpetual recycling of errors, but it is an unlikely situation
- * and not a performance critical operation.
- */
-void
-spa_errlog_sync(spa_t *spa, uint64_t txg)
-{
- dmu_tx_t *tx;
- avl_tree_t scrub, last;
- int scrub_finished;
-
- mutex_enter(&spa->spa_errlist_lock);
-
- /*
- * Bail out early under normal circumstances.
- */
- if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
- avl_numnodes(&spa->spa_errlist_last) == 0 &&
- !spa->spa_scrub_finished) {
- mutex_exit(&spa->spa_errlist_lock);
- return;
- }
-
- spa_get_errlists(spa, &last, &scrub);
- scrub_finished = spa->spa_scrub_finished;
- spa->spa_scrub_finished = B_FALSE;
-
- mutex_exit(&spa->spa_errlist_lock);
- mutex_enter(&spa->spa_errlog_lock);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- /*
- * Sync out the current list of errors.
- */
- sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
-
- /*
- * Rotate the log if necessary.
- */
- if (scrub_finished) {
- if (spa->spa_errlog_last != 0)
- VERIFY(dmu_object_free(spa->spa_meta_objset,
- spa->spa_errlog_last, tx) == 0);
- spa->spa_errlog_last = spa->spa_errlog_scrub;
- spa->spa_errlog_scrub = 0;
-
- sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
- }
-
- /*
- * Sync out any pending scrub errors.
- */
- sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
-
- /*
- * Update the MOS to reflect the new values.
- */
- (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
- &spa->spa_errlog_last, tx);
- (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
- &spa->spa_errlog_scrub, tx);
-
- dmu_tx_commit(tx);
-
- mutex_exit(&spa->spa_errlog_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -1,628 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Joyent, Inc.
- */
-
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zap.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/utsname.h>
-#include <sys/sunddi.h>
-#include <sys/cred.h>
-#include "zfs_comutil.h"
-#ifdef _KERNEL
-#include <sys/cmn_err.h>
-#include <sys/zone.h>
-#endif
-
-/*
- * Routines to manage the on-disk history log.
- *
- * The history log is stored as a dmu object containing
- * <packed record length, record nvlist> tuples.
- *
- * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
- * "packed record length" is the packed length of the "record nvlist" stored
- * as a little endian uint64_t.
- *
- * The log is implemented as a ring buffer, though the original creation
- * of the pool ('zpool create') is never overwritten.
- *
- * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
- * of 'spa_history' stores the offsets for logging/retrieving history as
- * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
- * where the 'zpool create' record is stored. This allows us to never
- * overwrite the original creation of the pool. 'sh_phys_max_off' is the
- * physical ending offset in bytes of the log. This tells you the length of
- * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
- * is added, 'sh_eof' is incremented by the the size of the record.
- * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
- * This is where the consumer should start reading from after reading in
- * the 'zpool create' portion of the log.
- *
- * 'sh_records_lost' keeps track of how many records have been overwritten
- * and permanently lost.
- */
-
-/* convert a logical offset to physical */
-static uint64_t
-spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
-{
- uint64_t phys_len;
-
- phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
- return ((log_off - shpp->sh_pool_create_len) % phys_len
- + shpp->sh_pool_create_len);
-}
-
-void
-spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
-{
- dmu_buf_t *dbp;
- spa_history_phys_t *shpp;
- objset_t *mos = spa->spa_meta_objset;
-
- ASSERT(spa->spa_history == 0);
- spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
- SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
- sizeof (spa_history_phys_t), tx);
-
- VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_HISTORY, sizeof (uint64_t), 1,
- &spa->spa_history, tx) == 0);
-
- VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
- ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
-
- shpp = dbp->db_data;
- dmu_buf_will_dirty(dbp, tx);
-
- /*
- * Figure out maximum size of history log. We set it at
- * 0.1% of pool size, with a max of 1G and min of 128KB.
- */
- shpp->sh_phys_max_off =
- metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
- shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
- shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
-
- dmu_buf_rele(dbp, FTAG);
-}
-
-/*
- * Change 'sh_bof' to the beginning of the next record.
- */
-static int
-spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
-{
- objset_t *mos = spa->spa_meta_objset;
- uint64_t firstread, reclen, phys_bof;
- char buf[sizeof (reclen)];
- int err;
-
- phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
- firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
-
- if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
- buf, DMU_READ_PREFETCH)) != 0)
- return (err);
- if (firstread != sizeof (reclen)) {
- if ((err = dmu_read(mos, spa->spa_history,
- shpp->sh_pool_create_len, sizeof (reclen) - firstread,
- buf + firstread, DMU_READ_PREFETCH)) != 0)
- return (err);
- }
-
- reclen = LE_64(*((uint64_t *)buf));
- shpp->sh_bof += reclen + sizeof (reclen);
- shpp->sh_records_lost++;
- return (0);
-}
-
-static int
-spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
- dmu_tx_t *tx)
-{
- uint64_t firstwrite, phys_eof;
- objset_t *mos = spa->spa_meta_objset;
- int err;
-
- ASSERT(MUTEX_HELD(&spa->spa_history_lock));
-
- /* see if we need to reset logical BOF */
- while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
- (shpp->sh_eof - shpp->sh_bof) <= len) {
- if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
- return (err);
- }
- }
-
- phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
- firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
- shpp->sh_eof += len;
- dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
-
- len -= firstwrite;
- if (len > 0) {
- /* write out the rest at the beginning of physical file */
- dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
- len, (char *)buf + firstwrite, tx);
- }
-
- return (0);
-}
-
-static char *
-spa_history_zone(void)
-{
-#ifdef _KERNEL
- /* XXX: pr_hostname can be changed by default from within a jail! */
- if (jailed(curthread->td_ucred))
- return (curthread->td_ucred->cr_prison->pr_hostname);
-#endif
- return (NULL);
-}
-
-/*
- * Post a history sysevent.
- *
- * The nvlist_t* passed into this function will be transformed into a new
- * nvlist where:
- *
- * 1. Nested nvlists will be flattened to a single level
- * 2. Keys will have their names normalized (to remove any problematic
- * characters, such as whitespace)
- *
- * The nvlist_t passed into this function will duplicated and should be freed
- * by caller.
- *
- */
-static void
-spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
-{
- nvlist_t *hist_nvl = fnvlist_alloc();
- uint64_t uint64;
- char *string;
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string);
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string);
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string);
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string);
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string);
-
- if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
- fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
-
- if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0)
- fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64);
-
- if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0)
- fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64);
-
- if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0)
- fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64);
-
- if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0)
- fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64);
-
- if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0)
- fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64);
-
- spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT);
-
- nvlist_free(hist_nvl);
-}
-
-/*
- * Write out a history event.
- */
-/*ARGSUSED*/
-static void
-spa_history_log_sync(void *arg, dmu_tx_t *tx)
-{
- nvlist_t *nvl = arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *dbp;
- spa_history_phys_t *shpp;
- size_t reclen;
- uint64_t le_len;
- char *record_packed = NULL;
- int ret;
-
- /*
- * If we have an older pool that doesn't have a command
- * history object, create it now.
- */
- mutex_enter(&spa->spa_history_lock);
- if (!spa->spa_history)
- spa_history_create_obj(spa, tx);
- mutex_exit(&spa->spa_history_lock);
-
- /*
- * Get the offset of where we need to write via the bonus buffer.
- * Update the offset when the write completes.
- */
- VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
- shpp = dbp->db_data;
-
- dmu_buf_will_dirty(dbp, tx);
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbp, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
- }
-#endif
-
- fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
-#ifdef _KERNEL
- fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
-#endif
- if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
- zfs_dbgmsg("command: %s",
- fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
- } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
- if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
- zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
- fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
- fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
- fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
- fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
- fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
- } else {
- zfs_dbgmsg("txg %lld %s %s",
- fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
- fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
- fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
- }
- /*
- * The history sysevent is posted only for internal history
- * messages to show what has happened, not how it happened. For
- * example, the following command:
- *
- * # zfs destroy -r tank/foo
- *
- * will result in one sysevent posted per dataset that is
- * destroyed as a result of the command - which could be more
- * than one event in total. By contrast, if the sysevent was
- * posted as a result of the ZPOOL_HIST_CMD key being present
- * it would result in only one sysevent being posted with the
- * full command line arguments, requiring the consumer to know
- * how to parse and understand zfs(1M) command invocations.
- */
- spa_history_log_notify(spa, nvl);
- } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
- zfs_dbgmsg("ioctl %s",
- fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
- }
-
- record_packed = fnvlist_pack(nvl, &reclen);
-
- mutex_enter(&spa->spa_history_lock);
-
- /* write out the packed length as little endian */
- le_len = LE_64((uint64_t)reclen);
- ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
- if (!ret)
- ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
-
- /* The first command is the create, which we keep forever */
- if (ret == 0 && shpp->sh_pool_create_len == 0 &&
- nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
- shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
- }
-
- mutex_exit(&spa->spa_history_lock);
- fnvlist_pack_free(record_packed, reclen);
- dmu_buf_rele(dbp, FTAG);
- fnvlist_free(nvl);
-}
-
-/*
- * Write out a history event.
- */
-int
-spa_history_log(spa_t *spa, const char *msg)
-{
- int err;
- nvlist_t *nvl = fnvlist_alloc();
-
- fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg);
- err = spa_history_log_nvl(spa, nvl);
- fnvlist_free(nvl);
- return (err);
-}
-
-int
-spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
-{
- int err = 0;
- dmu_tx_t *tx;
- nvlist_t *nvarg;
-
- if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY)
- return (EINVAL);
-
- if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
- return (SET_ERROR(EINVAL));
-
- tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- nvarg = fnvlist_dup(nvl);
- if (spa_history_zone() != NULL) {
- fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE,
- spa_history_zone());
- }
- fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
-
- /* Kick this off asynchronously; errors are ignored. */
- dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
- nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
- dmu_tx_commit(tx);
-
- /* spa_history_log_sync will free nvl */
- return (err);
-
-}
-
-/*
- * Read out the command history.
- */
-int
-spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
-{
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *dbp;
- uint64_t read_len, phys_read_off, phys_eof;
- uint64_t leftover = 0;
- spa_history_phys_t *shpp;
- int err;
-
- /*
- * If the command history doesn't exist (older pool),
- * that's ok, just return ENOENT.
- */
- if (!spa->spa_history)
- return (SET_ERROR(ENOENT));
-
- /*
- * The history is logged asynchronously, so when they request
- * the first chunk of history, make sure everything has been
- * synced to disk so that we get it.
- */
- if (*offp == 0 && spa_writeable(spa))
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
- return (err);
- shpp = dbp->db_data;
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbp, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
- }
-#endif
-
- mutex_enter(&spa->spa_history_lock);
- phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
-
- if (*offp < shpp->sh_pool_create_len) {
- /* read in just the zpool create history */
- phys_read_off = *offp;
- read_len = MIN(*len, shpp->sh_pool_create_len -
- phys_read_off);
- } else {
- /*
- * Need to reset passed in offset to BOF if the passed in
- * offset has since been overwritten.
- */
- *offp = MAX(*offp, shpp->sh_bof);
- phys_read_off = spa_history_log_to_phys(*offp, shpp);
-
- /*
- * Read up to the minimum of what the user passed down or
- * the EOF (physical or logical). If we hit physical EOF,
- * use 'leftover' to read from the physical BOF.
- */
- if (phys_read_off <= phys_eof) {
- read_len = MIN(*len, phys_eof - phys_read_off);
- } else {
- read_len = MIN(*len,
- shpp->sh_phys_max_off - phys_read_off);
- if (phys_read_off + *len > shpp->sh_phys_max_off) {
- leftover = MIN(*len - read_len,
- phys_eof - shpp->sh_pool_create_len);
- }
- }
- }
-
- /* offset for consumer to use next */
- *offp += read_len + leftover;
-
- /* tell the consumer how much you actually read */
- *len = read_len + leftover;
-
- if (read_len == 0) {
- mutex_exit(&spa->spa_history_lock);
- dmu_buf_rele(dbp, FTAG);
- return (0);
- }
-
- err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
- DMU_READ_PREFETCH);
- if (leftover && err == 0) {
- err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
- leftover, buf + read_len, DMU_READ_PREFETCH);
- }
- mutex_exit(&spa->spa_history_lock);
-
- dmu_buf_rele(dbp, FTAG);
- return (err);
-}
-
-/*
- * The nvlist will be consumed by this call.
- */
-static void
-log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
- dmu_tx_t *tx, const char *fmt, va_list adx)
-{
- char *msg;
- va_list adx2;
-
- /*
- * If this is part of creating a pool, not everything is
- * initialized yet, so don't bother logging the internal events.
- * Likewise if the pool is not writeable.
- */
- if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
- fnvlist_free(nvl);
- return;
- }
-
- va_copy(adx2, adx);
-
- msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
- (void) vsprintf(msg, fmt, adx2);
- fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
- strfree(msg);
-
- va_end(adx2);
-
- fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
- fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
-
- if (dmu_tx_is_syncing(tx)) {
- spa_history_log_sync(nvl, tx);
- } else {
- dsl_sync_task_nowait(spa_get_dsl(spa),
- spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
- }
- /* spa_history_log_sync() will free nvl */
-}
-
-void
-spa_history_log_internal(spa_t *spa, const char *operation,
- dmu_tx_t *tx, const char *fmt, ...)
-{
- dmu_tx_t *htx = tx;
- va_list adx;
-
- /* create a tx if we didn't get one */
- if (tx == NULL) {
- htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
- dmu_tx_abort(htx);
- return;
- }
- }
-
- va_start(adx, fmt);
- log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
- va_end(adx);
-
- /* if we didn't get a tx from the caller, commit the one we made */
- if (tx == NULL)
- dmu_tx_commit(htx);
-}
-
-void
-spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
- dmu_tx_t *tx, const char *fmt, ...)
-{
- va_list adx;
- char namebuf[ZFS_MAX_DATASET_NAME_LEN];
- nvlist_t *nvl = fnvlist_alloc();
-
- ASSERT(tx != NULL);
-
- dsl_dataset_name(ds, namebuf);
- fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
- fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object);
-
- va_start(adx, fmt);
- log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx);
- va_end(adx);
-}
-
-void
-spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
- dmu_tx_t *tx, const char *fmt, ...)
-{
- va_list adx;
- char namebuf[ZFS_MAX_DATASET_NAME_LEN];
- nvlist_t *nvl = fnvlist_alloc();
-
- ASSERT(tx != NULL);
-
- dsl_dir_name(dd, namebuf);
- fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
- fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
- dsl_dir_phys(dd)->dd_head_dataset_obj);
-
- va_start(adx, fmt);
- log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
- va_end(adx);
-}
-
-void
-spa_history_log_version(spa_t *spa, const char *operation)
-{
- spa_history_log_internal(spa, operation, NULL,
- "pool version %llu; software version %llu/%llu; uts %s %s %s %s",
- (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION,
- utsname.nodename, utsname.release, utsname.version,
- utsname.machine);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -1,2523 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2017 Datto Inc.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/spa_boot.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_file.h>
-#include <sys/vdev_initialize.h>
-#include <sys/metaslab.h>
-#include <sys/uberblock_impl.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/unique.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_scan.h>
-#include <sys/fs/zfs.h>
-#include <sys/metaslab_impl.h>
-#include <sys/arc.h>
-#include <sys/ddt.h>
-#include "zfs_prop.h"
-#include <sys/zfeature.h>
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-
-/*
- * SPA locking
- *
- * There are four basic locks for managing spa_t structures:
- *
- * spa_namespace_lock (global mutex)
- *
- * This lock must be acquired to do any of the following:
- *
- * - Lookup a spa_t by name
- * - Add or remove a spa_t from the namespace
- * - Increase spa_refcount from non-zero
- * - Check if spa_refcount is zero
- * - Rename a spa_t
- * - add/remove/attach/detach devices
- * - Held for the duration of create/destroy/import/export
- *
- * It does not need to handle recursion. A create or destroy may
- * reference objects (files or zvols) in other pools, but by
- * definition they must have an existing reference, and will never need
- * to lookup a spa_t by name.
- *
- * spa_refcount (per-spa zfs_refcount_t protected by mutex)
- *
- * This reference count keep track of any active users of the spa_t. The
- * spa_t cannot be destroyed or freed while this is non-zero. Internally,
- * the refcount is never really 'zero' - opening a pool implicitly keeps
- * some references in the DMU. Internally we check against spa_minref, but
- * present the image of a zero/non-zero value to consumers.
- *
- * spa_config_lock[] (per-spa array of rwlocks)
- *
- * This protects the spa_t from config changes, and must be held in
- * the following circumstances:
- *
- * - RW_READER to perform I/O to the spa
- * - RW_WRITER to change the vdev config
- *
- * The locking order is fairly straightforward:
- *
- * spa_namespace_lock -> spa_refcount
- *
- * The namespace lock must be acquired to increase the refcount from 0
- * or to check if it is zero.
- *
- * spa_refcount -> spa_config_lock[]
- *
- * There must be at least one valid reference on the spa_t to acquire
- * the config lock.
- *
- * spa_namespace_lock -> spa_config_lock[]
- *
- * The namespace lock must always be taken before the config lock.
- *
- *
- * The spa_namespace_lock can be acquired directly and is globally visible.
- *
- * The namespace is manipulated using the following functions, all of which
- * require the spa_namespace_lock to be held.
- *
- * spa_lookup() Lookup a spa_t by name.
- *
- * spa_add() Create a new spa_t in the namespace.
- *
- * spa_remove() Remove a spa_t from the namespace. This also
- * frees up any memory associated with the spa_t.
- *
- * spa_next() Returns the next spa_t in the system, or the
- * first if NULL is passed.
- *
- * spa_evict_all() Shutdown and remove all spa_t structures in
- * the system.
- *
- * spa_guid_exists() Determine whether a pool/device guid exists.
- *
- * The spa_refcount is manipulated using the following functions:
- *
- * spa_open_ref() Adds a reference to the given spa_t. Must be
- * called with spa_namespace_lock held if the
- * refcount is currently zero.
- *
- * spa_close() Remove a reference from the spa_t. This will
- * not free the spa_t or remove it from the
- * namespace. No locking is required.
- *
- * spa_refcount_zero() Returns true if the refcount is currently
- * zero. Must be called with spa_namespace_lock
- * held.
- *
- * The spa_config_lock[] is an array of rwlocks, ordered as follows:
- * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
- * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
- *
- * To read the configuration, it suffices to hold one of these locks as reader.
- * To modify the configuration, you must hold all locks as writer. To modify
- * vdev state without altering the vdev tree's topology (e.g. online/offline),
- * you must hold SCL_STATE and SCL_ZIO as writer.
- *
- * We use these distinct config locks to avoid recursive lock entry.
- * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
- * block allocations (SCL_ALLOC), which may require reading space maps
- * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
- *
- * The spa config locks cannot be normal rwlocks because we need the
- * ability to hand off ownership. For example, SCL_ZIO is acquired
- * by the issuing thread and later released by an interrupt thread.
- * They do, however, obey the usual write-wanted semantics to prevent
- * writer (i.e. system administrator) starvation.
- *
- * The lock acquisition rules are as follows:
- *
- * SCL_CONFIG
- * Protects changes to the vdev tree topology, such as vdev
- * add/remove/attach/detach. Protects the dirty config list
- * (spa_config_dirty_list) and the set of spares and l2arc devices.
- *
- * SCL_STATE
- * Protects changes to pool state and vdev state, such as vdev
- * online/offline/fault/degrade/clear. Protects the dirty state list
- * (spa_state_dirty_list) and global pool state (spa_state).
- *
- * SCL_ALLOC
- * Protects changes to metaslab groups and classes.
- * Held as reader by metaslab_alloc() and metaslab_claim().
- *
- * SCL_ZIO
- * Held by bp-level zios (those which have no io_vd upon entry)
- * to prevent changes to the vdev tree. The bp-level zio implicitly
- * protects all of its vdev child zios, which do not hold SCL_ZIO.
- *
- * SCL_FREE
- * Protects changes to metaslab groups and classes.
- * Held as reader by metaslab_free(). SCL_FREE is distinct from
- * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
- * blocks in zio_done() while another i/o that holds either
- * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
- *
- * SCL_VDEV
- * Held as reader to prevent changes to the vdev tree during trivial
- * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
- * other locks, and lower than all of them, to ensure that it's safe
- * to acquire regardless of caller context.
- *
- * In addition, the following rules apply:
- *
- * (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
- * The lock ordering is SCL_CONFIG > spa_props_lock.
- *
- * (b) I/O operations on leaf vdevs. For any zio operation that takes
- * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
- * or zio_write_phys() -- the caller must ensure that the config cannot
- * cannot change in the interim, and that the vdev cannot be reopened.
- * SCL_STATE as reader suffices for both.
- *
- * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
- *
- * spa_vdev_enter() Acquire the namespace lock and the config lock
- * for writing.
- *
- * spa_vdev_exit() Release the config lock, wait for all I/O
- * to complete, sync the updated configs to the
- * cache, and release the namespace lock.
- *
- * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
- * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
- * locking is, always, based on spa_namespace_lock and spa_config_lock[].
- */
-
-static avl_tree_t spa_namespace_avl;
-kmutex_t spa_namespace_lock;
-static kcondvar_t spa_namespace_cv;
-static int spa_active_count;
-int spa_max_replication_override = SPA_DVAS_PER_BP;
-
-static kmutex_t spa_spare_lock;
-static avl_tree_t spa_spare_avl;
-static kmutex_t spa_l2cache_lock;
-static avl_tree_t spa_l2cache_avl;
-
-kmem_cache_t *spa_buffer_pool;
-int spa_mode_global;
-
-#ifdef ZFS_DEBUG
-/*
- * Everything except dprintf, spa, and indirect_remap is on by default
- * in debug builds.
- */
-int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP);
-#else
-int zfs_flags = 0;
-#endif
-
-/*
- * zfs_recover can be set to nonzero to attempt to recover from
- * otherwise-fatal errors, typically caused by on-disk corruption. When
- * set, calls to zfs_panic_recover() will turn into warning messages.
- * This should only be used as a last resort, as it typically results
- * in leaked space, or worse.
- */
-boolean_t zfs_recover = B_FALSE;
-
-/*
- * If destroy encounters an EIO while reading metadata (e.g. indirect
- * blocks), space referenced by the missing metadata can not be freed.
- * Normally this causes the background destroy to become "stalled", as
- * it is unable to make forward progress. While in this stalled state,
- * all remaining space to free from the error-encountering filesystem is
- * "temporarily leaked". Set this flag to cause it to ignore the EIO,
- * permanently leak the space from indirect blocks that can not be read,
- * and continue to free everything else that it can.
- *
- * The default, "stalling" behavior is useful if the storage partially
- * fails (i.e. some but not all i/os fail), and then later recovers. In
- * this case, we will be able to continue pool operations while it is
- * partially failed, and when it recovers, we can continue to free the
- * space, with no leaks. However, note that this case is actually
- * fairly rare.
- *
- * Typically pools either (a) fail completely (but perhaps temporarily,
- * e.g. a top-level vdev going offline), or (b) have localized,
- * permanent errors (e.g. disk returns the wrong data due to bit flip or
- * firmware bug). In case (a), this setting does not matter because the
- * pool will be suspended and the sync thread will not be able to make
- * forward progress regardless. In case (b), because the error is
- * permanent, the best we can do is leak the minimum amount of space,
- * which is what setting this flag will do. Therefore, it is reasonable
- * for this flag to normally be set, but we chose the more conservative
- * approach of not setting it, so that there is no possibility of
- * leaking space in the "partial temporary" failure case.
- */
-boolean_t zfs_free_leak_on_eio = B_FALSE;
-
-/*
- * Expiration time in milliseconds. This value has two meanings. First it is
- * used to determine when the spa_deadman() logic should fire. By default the
- * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
- * Secondly, the value determines if an I/O is considered "hung". Any I/O that
- * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
- * in a system panic.
- */
-uint64_t zfs_deadman_synctime_ms = 1000000ULL;
-
-/*
- * Check time in milliseconds. This defines the frequency at which we check
- * for hung I/O.
- */
-uint64_t zfs_deadman_checktime_ms = 5000ULL;
-
-/*
- * Default value of -1 for zfs_deadman_enabled is resolved in
- * zfs_deadman_init()
- */
-int zfs_deadman_enabled = -1;
-
-/*
- * The worst case is single-sector max-parity RAID-Z blocks, in which
- * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
- * times the size; so just assume that. Add to this the fact that
- * we can have up to 3 DVAs per bp, and one more factor of 2 because
- * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
- * the worst case is:
- * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
- */
-int spa_asize_inflation = 24;
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
- "Try to recover from otherwise-fatal errors.");
-
-static int
-sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
-{
- int err, val;
-
- val = zfs_flags;
- err = sysctl_handle_int(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- /*
- * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
- * arc buffers in the system have the necessary additional
- * checksum data. However, it is safe to disable at any
- * time.
- */
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- val &= ~ZFS_DEBUG_MODIFY;
- zfs_flags = val;
-
- return (0);
-}
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
- CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
- sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RWTUN,
- &zfs_deadman_synctime_ms, 0,
- "Stalled ZFS I/O expiration time in milliseconds");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RWTUN,
- &zfs_deadman_checktime_ms, 0,
- "Period of checks for stalled ZFS I/O in milliseconds");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RWTUN,
- &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
- &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
-#endif
-
-#ifndef illumos
-#ifdef _KERNEL
-static void
-zfs_deadman_init()
-{
- /*
- * If we are not i386 or amd64 or in a virtual machine,
- * disable ZFS deadman thread by default
- */
- if (zfs_deadman_enabled == -1) {
-#if defined(__amd64__) || defined(__i386__)
- zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
-#else
- zfs_deadman_enabled = 0;
-#endif
- }
-}
-#endif /* _KERNEL */
-#endif /* !illumos */
-
-/*
- * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
- * the pool to be consumed. This ensures that we don't run the pool
- * completely out of space, due to unaccounted changes (e.g. to the MOS).
- * It also limits the worst-case time to allocate space. If we have
- * less than this amount of free space, most ZPL operations (e.g. write,
- * create) will return ENOSPC.
- *
- * Certain operations (e.g. file removal, most administrative actions) can
- * use half the slop space. They will only return ENOSPC if less than half
- * the slop space is free. Typically, once the pool has less than the slop
- * space free, the user will use these operations to free up space in the pool.
- * These are the operations that call dsl_pool_adjustedsize() with the netfree
- * argument set to TRUE.
- *
- * Operations that are almost guaranteed to free up space in the absence of
- * a pool checkpoint can use up to three quarters of the slop space
- * (e.g zfs destroy).
- *
- * A very restricted set of operations are always permitted, regardless of
- * the amount of free space. These are the operations that call
- * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
- * increase in the amount of space used, it is possible to run the pool
- * completely out of space, causing it to be permanently read-only.
- *
- * Note that on very small pools, the slop space will be larger than
- * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
- * but we never allow it to be more than half the pool size.
- *
- * See also the comments in zfs_space_check_t.
- */
-int spa_slop_shift = 5;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
- &spa_slop_shift, 0,
- "Shift value of reserved space (1/(2^spa_slop_shift)).");
-uint64_t spa_min_slop = 128 * 1024 * 1024;
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
- &spa_min_slop, 0,
- "Minimal value of reserved space");
-
-int spa_allocators = 4;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN,
- &spa_allocators, 0,
- "Number of allocators per metaslab group");
-
-/*PRINTFLIKE2*/
-void
-spa_load_failed(spa_t *spa, const char *fmt, ...)
-{
- va_list adx;
- char buf[256];
-
- va_start(adx, fmt);
- (void) vsnprintf(buf, sizeof (buf), fmt, adx);
- va_end(adx);
-
- zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
- spa->spa_trust_config ? "trusted" : "untrusted", buf);
-}
-
-/*PRINTFLIKE2*/
-void
-spa_load_note(spa_t *spa, const char *fmt, ...)
-{
- va_list adx;
- char buf[256];
-
- va_start(adx, fmt);
- (void) vsnprintf(buf, sizeof (buf), fmt, adx);
- va_end(adx);
-
- zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
- spa->spa_trust_config ? "trusted" : "untrusted", buf);
-}
-
-/*
- * By default dedup and user data indirects land in the special class
- */
-int zfs_ddt_data_is_special = B_TRUE;
-int zfs_user_indirect_is_special = B_TRUE;
-
-/*
- * The percentage of special class final space reserved for metadata only.
- * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
- * let metadata into the class.
- */
-int zfs_special_class_metadata_reserve_pct = 25;
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-SYSCTL_INT(_vfs_zfs, OID_AUTO, ddt_data_is_special, CTLFLAG_RWTUN,
- &zfs_ddt_data_is_special, 0,
- "Whether DDT data is eligible for the special class vdevs");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, user_indirect_is_special, CTLFLAG_RWTUN,
- &zfs_user_indirect_is_special, 0,
- "Whether indirect blocks are eligible for the special class vdevs");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, special_class_metadata_reserve_pct,
- CTLFLAG_RWTUN, &zfs_special_class_metadata_reserve_pct, 0,
- "Percentage of space in the special class reserved solely for metadata");
-#endif
-
-/*
- * ==========================================================================
- * SPA config locking
- * ==========================================================================
- */
-static void
-spa_config_lock_init(spa_t *spa)
-{
- for (int i = 0; i < SCL_LOCKS; i++) {
- spa_config_lock_t *scl = &spa->spa_config_lock[i];
- mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
- zfs_refcount_create_untracked(&scl->scl_count);
- scl->scl_writer = NULL;
- scl->scl_write_wanted = 0;
- }
-}
-
-static void
-spa_config_lock_destroy(spa_t *spa)
-{
- for (int i = 0; i < SCL_LOCKS; i++) {
- spa_config_lock_t *scl = &spa->spa_config_lock[i];
- mutex_destroy(&scl->scl_lock);
- cv_destroy(&scl->scl_cv);
- zfs_refcount_destroy(&scl->scl_count);
- ASSERT(scl->scl_writer == NULL);
- ASSERT(scl->scl_write_wanted == 0);
- }
-}
-
-int
-spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
-{
- for (int i = 0; i < SCL_LOCKS; i++) {
- spa_config_lock_t *scl = &spa->spa_config_lock[i];
- if (!(locks & (1 << i)))
- continue;
- mutex_enter(&scl->scl_lock);
- if (rw == RW_READER) {
- if (scl->scl_writer || scl->scl_write_wanted) {
- mutex_exit(&scl->scl_lock);
- spa_config_exit(spa, locks & ((1 << i) - 1),
- tag);
- return (0);
- }
- } else {
- ASSERT(scl->scl_writer != curthread);
- if (!zfs_refcount_is_zero(&scl->scl_count)) {
- mutex_exit(&scl->scl_lock);
- spa_config_exit(spa, locks & ((1 << i) - 1),
- tag);
- return (0);
- }
- scl->scl_writer = curthread;
- }
- (void) zfs_refcount_add(&scl->scl_count, tag);
- mutex_exit(&scl->scl_lock);
- }
- return (1);
-}
-
-void
-spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
-{
- int wlocks_held = 0;
-
- ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
-
- for (int i = 0; i < SCL_LOCKS; i++) {
- spa_config_lock_t *scl = &spa->spa_config_lock[i];
- if (scl->scl_writer == curthread)
- wlocks_held |= (1 << i);
- if (!(locks & (1 << i)))
- continue;
- mutex_enter(&scl->scl_lock);
- if (rw == RW_READER) {
- while (scl->scl_writer || scl->scl_write_wanted) {
- cv_wait(&scl->scl_cv, &scl->scl_lock);
- }
- } else {
- ASSERT(scl->scl_writer != curthread);
- while (!zfs_refcount_is_zero(&scl->scl_count)) {
- scl->scl_write_wanted++;
- cv_wait(&scl->scl_cv, &scl->scl_lock);
- scl->scl_write_wanted--;
- }
- scl->scl_writer = curthread;
- }
- (void) zfs_refcount_add(&scl->scl_count, tag);
- mutex_exit(&scl->scl_lock);
- }
- ASSERT3U(wlocks_held, <=, locks);
-}
-
-void
-spa_config_exit(spa_t *spa, int locks, void *tag)
-{
- for (int i = SCL_LOCKS - 1; i >= 0; i--) {
- spa_config_lock_t *scl = &spa->spa_config_lock[i];
- if (!(locks & (1 << i)))
- continue;
- mutex_enter(&scl->scl_lock);
- ASSERT(!zfs_refcount_is_zero(&scl->scl_count));
- if (zfs_refcount_remove(&scl->scl_count, tag) == 0) {
- ASSERT(scl->scl_writer == NULL ||
- scl->scl_writer == curthread);
- scl->scl_writer = NULL; /* OK in either case */
- cv_broadcast(&scl->scl_cv);
- }
- mutex_exit(&scl->scl_lock);
- }
-}
-
-int
-spa_config_held(spa_t *spa, int locks, krw_t rw)
-{
- int locks_held = 0;
-
- for (int i = 0; i < SCL_LOCKS; i++) {
- spa_config_lock_t *scl = &spa->spa_config_lock[i];
- if (!(locks & (1 << i)))
- continue;
- if ((rw == RW_READER &&
- !zfs_refcount_is_zero(&scl->scl_count)) ||
- (rw == RW_WRITER && scl->scl_writer == curthread))
- locks_held |= 1 << i;
- }
-
- return (locks_held);
-}
-
-/*
- * ==========================================================================
- * SPA namespace functions
- * ==========================================================================
- */
-
-/*
- * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
- * Returns NULL if no matching spa_t is found.
- */
-spa_t *
-spa_lookup(const char *name)
-{
- static spa_t search; /* spa_t is large; don't allocate on stack */
- spa_t *spa;
- avl_index_t where;
- char *cp;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
-
- /*
- * If it's a full dataset name, figure out the pool name and
- * just use that.
- */
- cp = strpbrk(search.spa_name, "/@#");
- if (cp != NULL)
- *cp = '\0';
-
- spa = avl_find(&spa_namespace_avl, &search, &where);
-
- return (spa);
-}
-
-/*
- * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
- * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
- * looking for potentially hung I/Os.
- */
-static void
-spa_deadman(void *arg, int pending)
-{
- spa_t *spa = arg;
-
- /*
- * Disable the deadman timer if the pool is suspended.
- */
- if (spa_suspended(spa)) {
-#ifdef illumos
- VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
-#else
- /* Nothing. just don't schedule any future callouts. */
-#endif
- return;
- }
-
- zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
- (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
- ++spa->spa_deadman_calls);
- if (zfs_deadman_enabled)
- vdev_deadman(spa->spa_root_vdev);
-#ifdef __FreeBSD__
-#ifdef _KERNEL
- callout_schedule(&spa->spa_deadman_cycid,
- hz * zfs_deadman_checktime_ms / MILLISEC);
-#endif
-#endif
-}
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-static void
-spa_deadman_timeout(void *arg)
-{
- spa_t *spa = arg;
-
- taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
-}
-#endif
-
-/*
- * Create an uninitialized spa_t with the given name. Requires
- * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
- * exist by calling spa_lookup() first.
- */
-spa_t *
-spa_add(const char *name, nvlist_t *config, const char *altroot)
-{
- spa_t *spa;
- spa_config_dirent_t *dp;
-#ifdef illumos
- cyc_handler_t hdlr;
- cyc_time_t when;
-#endif
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
-
- mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
-
- cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
-
- for (int t = 0; t < TXG_SIZE; t++)
- bplist_create(&spa->spa_free_bplist[t]);
-
- (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
- spa->spa_state = POOL_STATE_UNINITIALIZED;
- spa->spa_freeze_txg = UINT64_MAX;
- spa->spa_final_txg = UINT64_MAX;
- spa->spa_load_max_txg = UINT64_MAX;
- spa->spa_proc = &p0;
- spa->spa_proc_state = SPA_PROC_NONE;
- spa->spa_trust_config = B_TRUE;
-
-#ifdef illumos
- hdlr.cyh_func = spa_deadman;
- hdlr.cyh_arg = spa;
- hdlr.cyh_level = CY_LOW_LEVEL;
-#endif
-
- spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
-
-#ifdef illumos
- /*
- * This determines how often we need to check for hung I/Os after
- * the cyclic has already fired. Since checking for hung I/Os is
- * an expensive operation we don't want to check too frequently.
- * Instead wait for 5 seconds before checking again.
- */
- when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
- when.cyt_when = CY_INFINITY;
- mutex_enter(&cpu_lock);
- spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
- mutex_exit(&cpu_lock);
-#else /* !illumos */
-#ifdef _KERNEL
- /*
- * callout(9) does not provide a way to initialize a callout with
- * a function and an argument, so we use callout_reset() to schedule
- * the callout in the very distant future. Even if that event ever
- * fires, it should be okayas we won't have any active zio-s.
- * But normally spa_sync() will reschedule the callout with a proper
- * timeout.
- * callout(9) does not allow the callback function to sleep but
- * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
- * emulated using sx(9). For this reason spa_deadman_timeout()
- * will schedule spa_deadman() as task on a taskqueue that allows
- * sleeping.
- */
- TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
- callout_init(&spa->spa_deadman_cycid, 1);
- callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
- spa_deadman_timeout, spa, 0);
-#endif
-#endif
- zfs_refcount_create(&spa->spa_refcount);
- spa_config_lock_init(spa);
-
- avl_add(&spa_namespace_avl, spa);
-
- /*
- * Set the alternate root, if there is one.
- */
- if (altroot) {
- spa->spa_root = spa_strdup(altroot);
- spa_active_count++;
- }
-
- spa->spa_alloc_count = spa_allocators;
- spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
- sizeof (kmutex_t), KM_SLEEP);
- spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
- sizeof (avl_tree_t), KM_SLEEP);
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
- avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
- sizeof (zio_t), offsetof(zio_t, io_alloc_node));
- }
-
- /*
- * Every pool starts with the default cachefile
- */
- list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
- offsetof(spa_config_dirent_t, scd_link));
-
- dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
- dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
- list_insert_head(&spa->spa_config_list, dp);
-
- VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
-
- if (config != NULL) {
- nvlist_t *features;
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
- &features) == 0) {
- VERIFY(nvlist_dup(features, &spa->spa_label_features,
- 0) == 0);
- }
-
- VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
- }
-
- if (spa->spa_label_features == NULL) {
- VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- }
-
- spa->spa_min_ashift = INT_MAX;
- spa->spa_max_ashift = 0;
-
- /*
- * As a pool is being created, treat all features as disabled by
- * setting SPA_FEATURE_DISABLED for all entries in the feature
- * refcount cache.
- */
- for (int i = 0; i < SPA_FEATURES; i++) {
- spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
- }
-
- list_create(&spa->spa_leaf_list, sizeof (vdev_t),
- offsetof(vdev_t, vdev_leaf_node));
-
- return (spa);
-}
-
-/*
- * Removes a spa_t from the namespace, freeing up any memory used. Requires
- * spa_namespace_lock. This is called only after the spa_t has been closed and
- * deactivated.
- */
-void
-spa_remove(spa_t *spa)
-{
- spa_config_dirent_t *dp;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
- ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
-
- nvlist_free(spa->spa_config_splitting);
-
- avl_remove(&spa_namespace_avl, spa);
- cv_broadcast(&spa_namespace_cv);
-
- if (spa->spa_root) {
- spa_strfree(spa->spa_root);
- spa_active_count--;
- }
-
- while ((dp = list_head(&spa->spa_config_list)) != NULL) {
- list_remove(&spa->spa_config_list, dp);
- if (dp->scd_path != NULL)
- spa_strfree(dp->scd_path);
- kmem_free(dp, sizeof (spa_config_dirent_t));
- }
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- avl_destroy(&spa->spa_alloc_trees[i]);
- mutex_destroy(&spa->spa_alloc_locks[i]);
- }
- kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
- sizeof (kmutex_t));
- kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
- sizeof (avl_tree_t));
-
- list_destroy(&spa->spa_config_list);
- list_destroy(&spa->spa_leaf_list);
-
- nvlist_free(spa->spa_label_features);
- nvlist_free(spa->spa_load_info);
- nvlist_free(spa->spa_feat_stats);
- spa_config_set(spa, NULL);
-
-#ifdef illumos
- mutex_enter(&cpu_lock);
- if (spa->spa_deadman_cycid != CYCLIC_NONE)
- cyclic_remove(spa->spa_deadman_cycid);
- mutex_exit(&cpu_lock);
- spa->spa_deadman_cycid = CYCLIC_NONE;
-#else /* !illumos */
-#ifdef _KERNEL
- callout_drain(&spa->spa_deadman_cycid);
- taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
-#endif
-#endif
-
- zfs_refcount_destroy(&spa->spa_refcount);
-
- spa_config_lock_destroy(spa);
-
- for (int t = 0; t < TXG_SIZE; t++)
- bplist_destroy(&spa->spa_free_bplist[t]);
-
- zio_checksum_templates_free(spa);
-
- cv_destroy(&spa->spa_async_cv);
- cv_destroy(&spa->spa_evicting_os_cv);
- cv_destroy(&spa->spa_proc_cv);
- cv_destroy(&spa->spa_scrub_io_cv);
- cv_destroy(&spa->spa_suspend_cv);
-
- mutex_destroy(&spa->spa_async_lock);
- mutex_destroy(&spa->spa_errlist_lock);
- mutex_destroy(&spa->spa_errlog_lock);
- mutex_destroy(&spa->spa_evicting_os_lock);
- mutex_destroy(&spa->spa_history_lock);
- mutex_destroy(&spa->spa_proc_lock);
- mutex_destroy(&spa->spa_props_lock);
- mutex_destroy(&spa->spa_cksum_tmpls_lock);
- mutex_destroy(&spa->spa_scrub_lock);
- mutex_destroy(&spa->spa_suspend_lock);
- mutex_destroy(&spa->spa_vdev_top_lock);
- mutex_destroy(&spa->spa_feat_stats_lock);
-
- kmem_free(spa, sizeof (spa_t));
-}
-
-/*
- * Given a pool, return the next pool in the namespace, or NULL if there is
- * none. If 'prev' is NULL, return the first pool.
- */
-spa_t *
-spa_next(spa_t *prev)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- if (prev)
- return (AVL_NEXT(&spa_namespace_avl, prev));
- else
- return (avl_first(&spa_namespace_avl));
-}
-
-/*
- * ==========================================================================
- * SPA refcount functions
- * ==========================================================================
- */
-
-/*
- * Add a reference to the given spa_t. Must have at least one reference, or
- * have the namespace lock held.
- */
-void
-spa_open_ref(spa_t *spa, void *tag)
-{
- ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
- MUTEX_HELD(&spa_namespace_lock));
- (void) zfs_refcount_add(&spa->spa_refcount, tag);
-}
-
-/*
- * Remove a reference to the given spa_t. Must have at least one reference, or
- * have the namespace lock held.
- */
-void
-spa_close(spa_t *spa, void *tag)
-{
- ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
- MUTEX_HELD(&spa_namespace_lock));
- (void) zfs_refcount_remove(&spa->spa_refcount, tag);
-}
-
-/*
- * Remove a reference to the given spa_t held by a dsl dir that is
- * being asynchronously released. Async releases occur from a taskq
- * performing eviction of dsl datasets and dirs. The namespace lock
- * isn't held and the hold by the object being evicted may contribute to
- * spa_minref (e.g. dataset or directory released during pool export),
- * so the asserts in spa_close() do not apply.
- */
-void
-spa_async_close(spa_t *spa, void *tag)
-{
- (void) zfs_refcount_remove(&spa->spa_refcount, tag);
-}
-
-/*
- * Check to see if the spa refcount is zero. Must be called with
- * spa_namespace_lock held. We really compare against spa_minref, which is the
- * number of references acquired when opening a pool
- */
-boolean_t
-spa_refcount_zero(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
-}
-
-/*
- * ==========================================================================
- * SPA spare and l2cache tracking
- * ==========================================================================
- */
-
-/*
- * Hot spares and cache devices are tracked using the same code below,
- * for 'auxiliary' devices.
- */
-
-typedef struct spa_aux {
- uint64_t aux_guid;
- uint64_t aux_pool;
- avl_node_t aux_avl;
- int aux_count;
-} spa_aux_t;
-
-static inline int
-spa_aux_compare(const void *a, const void *b)
-{
- const spa_aux_t *sa = (const spa_aux_t *)a;
- const spa_aux_t *sb = (const spa_aux_t *)b;
-
- return (AVL_CMP(sa->aux_guid, sb->aux_guid));
-}
-
-void
-spa_aux_add(vdev_t *vd, avl_tree_t *avl)
-{
- avl_index_t where;
- spa_aux_t search;
- spa_aux_t *aux;
-
- search.aux_guid = vd->vdev_guid;
- if ((aux = avl_find(avl, &search, &where)) != NULL) {
- aux->aux_count++;
- } else {
- aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
- aux->aux_guid = vd->vdev_guid;
- aux->aux_count = 1;
- avl_insert(avl, aux, where);
- }
-}
-
-void
-spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
-{
- spa_aux_t search;
- spa_aux_t *aux;
- avl_index_t where;
-
- search.aux_guid = vd->vdev_guid;
- aux = avl_find(avl, &search, &where);
-
- ASSERT(aux != NULL);
-
- if (--aux->aux_count == 0) {
- avl_remove(avl, aux);
- kmem_free(aux, sizeof (spa_aux_t));
- } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
- aux->aux_pool = 0ULL;
- }
-}
-
-boolean_t
-spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
-{
- spa_aux_t search, *found;
-
- search.aux_guid = guid;
- found = avl_find(avl, &search, NULL);
-
- if (pool) {
- if (found)
- *pool = found->aux_pool;
- else
- *pool = 0ULL;
- }
-
- if (refcnt) {
- if (found)
- *refcnt = found->aux_count;
- else
- *refcnt = 0;
- }
-
- return (found != NULL);
-}
-
-void
-spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
-{
- spa_aux_t search, *found;
- avl_index_t where;
-
- search.aux_guid = vd->vdev_guid;
- found = avl_find(avl, &search, &where);
- ASSERT(found != NULL);
- ASSERT(found->aux_pool == 0ULL);
-
- found->aux_pool = spa_guid(vd->vdev_spa);
-}
-
-/*
- * Spares are tracked globally due to the following constraints:
- *
- * - A spare may be part of multiple pools.
- * - A spare may be added to a pool even if it's actively in use within
- * another pool.
- * - A spare in use in any pool can only be the source of a replacement if
- * the target is a spare in the same pool.
- *
- * We keep track of all spares on the system through the use of a reference
- * counted AVL tree. When a vdev is added as a spare, or used as a replacement
- * spare, then we bump the reference count in the AVL tree. In addition, we set
- * the 'vdev_isspare' member to indicate that the device is a spare (active or
- * inactive). When a spare is made active (used to replace a device in the
- * pool), we also keep track of which pool its been made a part of.
- *
- * The 'spa_spare_lock' protects the AVL tree. These functions are normally
- * called under the spa_namespace lock as part of vdev reconfiguration. The
- * separate spare lock exists for the status query path, which does not need to
- * be completely consistent with respect to other vdev configuration changes.
- */
-
-static int
-spa_spare_compare(const void *a, const void *b)
-{
- return (spa_aux_compare(a, b));
-}
-
-void
-spa_spare_add(vdev_t *vd)
-{
- mutex_enter(&spa_spare_lock);
- ASSERT(!vd->vdev_isspare);
- spa_aux_add(vd, &spa_spare_avl);
- vd->vdev_isspare = B_TRUE;
- mutex_exit(&spa_spare_lock);
-}
-
-void
-spa_spare_remove(vdev_t *vd)
-{
- mutex_enter(&spa_spare_lock);
- ASSERT(vd->vdev_isspare);
- spa_aux_remove(vd, &spa_spare_avl);
- vd->vdev_isspare = B_FALSE;
- mutex_exit(&spa_spare_lock);
-}
-
-boolean_t
-spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
-{
- boolean_t found;
-
- mutex_enter(&spa_spare_lock);
- found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
- mutex_exit(&spa_spare_lock);
-
- return (found);
-}
-
-void
-spa_spare_activate(vdev_t *vd)
-{
- mutex_enter(&spa_spare_lock);
- ASSERT(vd->vdev_isspare);
- spa_aux_activate(vd, &spa_spare_avl);
- mutex_exit(&spa_spare_lock);
-}
-
-/*
- * Level 2 ARC devices are tracked globally for the same reasons as spares.
- * Cache devices currently only support one pool per cache device, and so
- * for these devices the aux reference count is currently unused beyond 1.
- */
-
-static int
-spa_l2cache_compare(const void *a, const void *b)
-{
- return (spa_aux_compare(a, b));
-}
-
-void
-spa_l2cache_add(vdev_t *vd)
-{
- mutex_enter(&spa_l2cache_lock);
- ASSERT(!vd->vdev_isl2cache);
- spa_aux_add(vd, &spa_l2cache_avl);
- vd->vdev_isl2cache = B_TRUE;
- mutex_exit(&spa_l2cache_lock);
-}
-
-void
-spa_l2cache_remove(vdev_t *vd)
-{
- mutex_enter(&spa_l2cache_lock);
- ASSERT(vd->vdev_isl2cache);
- spa_aux_remove(vd, &spa_l2cache_avl);
- vd->vdev_isl2cache = B_FALSE;
- mutex_exit(&spa_l2cache_lock);
-}
-
-boolean_t
-spa_l2cache_exists(uint64_t guid, uint64_t *pool)
-{
- boolean_t found;
-
- mutex_enter(&spa_l2cache_lock);
- found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
- mutex_exit(&spa_l2cache_lock);
-
- return (found);
-}
-
-void
-spa_l2cache_activate(vdev_t *vd)
-{
- mutex_enter(&spa_l2cache_lock);
- ASSERT(vd->vdev_isl2cache);
- spa_aux_activate(vd, &spa_l2cache_avl);
- mutex_exit(&spa_l2cache_lock);
-}
-
-/*
- * ==========================================================================
- * SPA vdev locking
- * ==========================================================================
- */
-
-/*
- * Lock the given spa_t for the purpose of adding or removing a vdev.
- * Grabs the global spa_namespace_lock plus the spa config lock for writing.
- * It returns the next transaction group for the spa_t.
- */
-uint64_t
-spa_vdev_enter(spa_t *spa)
-{
- mutex_enter(&spa->spa_vdev_top_lock);
- mutex_enter(&spa_namespace_lock);
- return (spa_vdev_config_enter(spa));
-}
-
-/*
- * Internal implementation for spa_vdev_enter(). Used when a vdev
- * operation requires multiple syncs (i.e. removing a device) while
- * keeping the spa_namespace_lock held.
- */
-uint64_t
-spa_vdev_config_enter(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
-
- return (spa_last_synced_txg(spa) + 1);
-}
-
-/*
- * Used in combination with spa_vdev_config_enter() to allow the syncing
- * of multiple transactions without releasing the spa_namespace_lock.
- */
-void
-spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- int config_changed = B_FALSE;
-
- ASSERT(txg > spa_last_synced_txg(spa));
-
- spa->spa_pending_vdev = NULL;
-
- /*
- * Reassess the DTLs.
- */
- vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
-
- if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
- config_changed = B_TRUE;
- spa->spa_config_generation++;
- }
-
- /*
- * Verify the metaslab classes.
- */
- ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
- ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
- ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
- ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
-
- spa_config_exit(spa, SCL_ALL, spa);
-
- /*
- * Panic the system if the specified tag requires it. This
- * is useful for ensuring that configurations are updated
- * transactionally.
- */
- if (zio_injection_enabled)
- zio_handle_panic_injection(spa, tag, 0);
-
- /*
- * Note: this txg_wait_synced() is important because it ensures
- * that there won't be more than one config change per txg.
- * This allows us to use the txg as the generation number.
- */
- if (error == 0)
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- if (vd != NULL) {
- ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
- if (vd->vdev_ops->vdev_op_leaf) {
- mutex_enter(&vd->vdev_initialize_lock);
- vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
- mutex_exit(&vd->vdev_initialize_lock);
- }
-
- spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
- vdev_free(vd);
- spa_config_exit(spa, SCL_ALL, spa);
- }
-
- /*
- * If the config changed, update the config cache.
- */
- if (config_changed)
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
-}
-
-/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
- */
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
-{
- spa_vdev_config_exit(spa, vd, txg, error, FTAG);
- mutex_exit(&spa_namespace_lock);
- mutex_exit(&spa->spa_vdev_top_lock);
-
- return (error);
-}
-
-/*
- * Lock the given spa_t for the purpose of changing vdev state.
- */
-void
-spa_vdev_state_enter(spa_t *spa, int oplocks)
-{
- int locks = SCL_STATE_ALL | oplocks;
-
- /*
- * Root pools may need to read of the underlying devfs filesystem
- * when opening up a vdev. Unfortunately if we're holding the
- * SCL_ZIO lock it will result in a deadlock when we try to issue
- * the read from the root filesystem. Instead we "prefetch"
- * the associated vnodes that we need prior to opening the
- * underlying devices and cache them so that we can prevent
- * any I/O when we are doing the actual open.
- */
- if (spa_is_root(spa)) {
- int low = locks & ~(SCL_ZIO - 1);
- int high = locks & ~low;
-
- spa_config_enter(spa, high, spa, RW_WRITER);
- vdev_hold(spa->spa_root_vdev);
- spa_config_enter(spa, low, spa, RW_WRITER);
- } else {
- spa_config_enter(spa, locks, spa, RW_WRITER);
- }
- spa->spa_vdev_locks = locks;
-}
-
-int
-spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
-{
- boolean_t config_changed = B_FALSE;
-
- if (vd != NULL || error == 0)
- vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
- 0, 0, B_FALSE);
-
- if (vd != NULL) {
- vdev_state_dirty(vd->vdev_top);
- config_changed = B_TRUE;
- spa->spa_config_generation++;
- }
-
- if (spa_is_root(spa))
- vdev_rele(spa->spa_root_vdev);
-
- ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
- spa_config_exit(spa, spa->spa_vdev_locks, spa);
-
- /*
- * If anything changed, wait for it to sync. This ensures that,
- * from the system administrator's perspective, zpool(1M) commands
- * are synchronous. This is important for things like zpool offline:
- * when the command completes, you expect no further I/O from ZFS.
- */
- if (vd != NULL)
- txg_wait_synced(spa->spa_dsl_pool, 0);
-
- /*
- * If the config changed, update the config cache.
- */
- if (config_changed) {
- mutex_enter(&spa_namespace_lock);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
- mutex_exit(&spa_namespace_lock);
- }
-
- return (error);
-}
-
-/*
- * ==========================================================================
- * Miscellaneous functions
- * ==========================================================================
- */
-
-void
-spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
-{
- if (!nvlist_exists(spa->spa_label_features, feature)) {
- fnvlist_add_boolean(spa->spa_label_features, feature);
- /*
- * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
- * dirty the vdev config because lock SCL_CONFIG is not held.
- * Thankfully, in this case we don't need to dirty the config
- * because it will be written out anyway when we finish
- * creating the pool.
- */
- if (tx->tx_txg != TXG_INITIAL)
- vdev_config_dirty(spa->spa_root_vdev);
- }
-}
-
-void
-spa_deactivate_mos_feature(spa_t *spa, const char *feature)
-{
- if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
- vdev_config_dirty(spa->spa_root_vdev);
-}
-
-/*
- * Return the spa_t associated with given pool_guid, if it exists. If
- * device_guid is non-zero, determine whether the pool exists *and* contains
- * a device with the specified device_guid.
- */
-spa_t *
-spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
-{
- spa_t *spa;
- avl_tree_t *t = &spa_namespace_avl;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
- if (spa->spa_state == POOL_STATE_UNINITIALIZED)
- continue;
- if (spa->spa_root_vdev == NULL)
- continue;
- if (spa_guid(spa) == pool_guid) {
- if (device_guid == 0)
- break;
-
- if (vdev_lookup_by_guid(spa->spa_root_vdev,
- device_guid) != NULL)
- break;
-
- /*
- * Check any devices we may be in the process of adding.
- */
- if (spa->spa_pending_vdev) {
- if (vdev_lookup_by_guid(spa->spa_pending_vdev,
- device_guid) != NULL)
- break;
- }
- }
- }
-
- return (spa);
-}
-
-/*
- * Determine whether a pool with the given pool_guid exists.
- */
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
-{
- return (spa_by_guid(pool_guid, device_guid) != NULL);
-}
-
-char *
-spa_strdup(const char *s)
-{
- size_t len;
- char *new;
-
- len = strlen(s);
- new = kmem_alloc(len + 1, KM_SLEEP);
- bcopy(s, new, len);
- new[len] = '\0';
-
- return (new);
-}
-
-void
-spa_strfree(char *s)
-{
- kmem_free(s, strlen(s) + 1);
-}
-
-uint64_t
-spa_get_random(uint64_t range)
-{
- uint64_t r;
-
- ASSERT(range != 0);
-
- if (range == 1)
- return (0);
-
- (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
-
- return (r % range);
-}
-
-uint64_t
-spa_generate_guid(spa_t *spa)
-{
- uint64_t guid = spa_get_random(-1ULL);
-
- if (spa != NULL) {
- while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
- guid = spa_get_random(-1ULL);
- } else {
- while (guid == 0 || spa_guid_exists(guid, 0))
- guid = spa_get_random(-1ULL);
- }
-
- return (guid);
-}
-
-void
-snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
-{
- char type[256];
- char *checksum = NULL;
- char *compress = NULL;
-
- if (bp != NULL) {
- if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
- dmu_object_byteswap_t bswap =
- DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
- (void) snprintf(type, sizeof (type), "bswap %s %s",
- DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
- "metadata" : "data",
- dmu_ot_byteswap[bswap].ob_name);
- } else {
- (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
- sizeof (type));
- }
- if (!BP_IS_EMBEDDED(bp)) {
- checksum =
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
- }
- compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
- }
-
- SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
- compress);
-}
-
-void
-spa_freeze(spa_t *spa)
-{
- uint64_t freeze_txg = 0;
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- if (spa->spa_freeze_txg == UINT64_MAX) {
- freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
- spa->spa_freeze_txg = freeze_txg;
- }
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (freeze_txg != 0)
- txg_wait_synced(spa_get_dsl(spa), freeze_txg);
-}
-
-void
-zfs_panic_recover(const char *fmt, ...)
-{
- va_list adx;
-
- va_start(adx, fmt);
- vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
- va_end(adx);
-}
-
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexadecimal numbers that don't overflow.
- */
-uint64_t
-zfs_strtonum(const char *str, char **nptr)
-{
- uint64_t val = 0;
- char c;
- int digit;
-
- while ((c = *str) != '\0') {
- if (c >= '0' && c <= '9')
- digit = c - '0';
- else if (c >= 'a' && c <= 'f')
- digit = 10 + c - 'a';
- else
- break;
-
- val *= 16;
- val += digit;
-
- str++;
- }
-
- if (nptr)
- *nptr = (char *)str;
-
- return (val);
-}
-
-void
-spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
-{
- /*
- * We bump the feature refcount for each special vdev added to the pool
- */
- ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
- spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
-}
-
-/*
- * ==========================================================================
- * Accessor functions
- * ==========================================================================
- */
-
-boolean_t
-spa_shutting_down(spa_t *spa)
-{
- return (spa->spa_async_suspended);
-}
-
-dsl_pool_t *
-spa_get_dsl(spa_t *spa)
-{
- return (spa->spa_dsl_pool);
-}
-
-boolean_t
-spa_is_initializing(spa_t *spa)
-{
- return (spa->spa_is_initializing);
-}
-
-boolean_t
-spa_indirect_vdevs_loaded(spa_t *spa)
-{
- return (spa->spa_indirect_vdevs_loaded);
-}
-
-blkptr_t *
-spa_get_rootblkptr(spa_t *spa)
-{
- return (&spa->spa_ubsync.ub_rootbp);
-}
-
-void
-spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
-{
- spa->spa_uberblock.ub_rootbp = *bp;
-}
-
-void
-spa_altroot(spa_t *spa, char *buf, size_t buflen)
-{
- if (spa->spa_root == NULL)
- buf[0] = '\0';
- else
- (void) strncpy(buf, spa->spa_root, buflen);
-}
-
-int
-spa_sync_pass(spa_t *spa)
-{
- return (spa->spa_sync_pass);
-}
-
-char *
-spa_name(spa_t *spa)
-{
- return (spa->spa_name);
-}
-
-uint64_t
-spa_guid(spa_t *spa)
-{
- dsl_pool_t *dp = spa_get_dsl(spa);
- uint64_t guid;
-
- /*
- * If we fail to parse the config during spa_load(), we can go through
- * the error path (which posts an ereport) and end up here with no root
- * vdev. We stash the original pool guid in 'spa_config_guid' to handle
- * this case.
- */
- if (spa->spa_root_vdev == NULL)
- return (spa->spa_config_guid);
-
- guid = spa->spa_last_synced_guid != 0 ?
- spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
-
- /*
- * Return the most recently synced out guid unless we're
- * in syncing context.
- */
- if (dp && dsl_pool_sync_context(dp))
- return (spa->spa_root_vdev->vdev_guid);
- else
- return (guid);
-}
-
-uint64_t
-spa_load_guid(spa_t *spa)
-{
- /*
- * This is a GUID that exists solely as a reference for the
- * purposes of the arc. It is generated at load time, and
- * is never written to persistent storage.
- */
- return (spa->spa_load_guid);
-}
-
-uint64_t
-spa_last_synced_txg(spa_t *spa)
-{
- return (spa->spa_ubsync.ub_txg);
-}
-
-uint64_t
-spa_first_txg(spa_t *spa)
-{
- return (spa->spa_first_txg);
-}
-
-uint64_t
-spa_syncing_txg(spa_t *spa)
-{
- return (spa->spa_syncing_txg);
-}
-
-/*
- * Return the last txg where data can be dirtied. The final txgs
- * will be used to just clear out any deferred frees that remain.
- */
-uint64_t
-spa_final_dirty_txg(spa_t *spa)
-{
- return (spa->spa_final_txg - TXG_DEFER_SIZE);
-}
-
-pool_state_t
-spa_state(spa_t *spa)
-{
- return (spa->spa_state);
-}
-
-spa_load_state_t
-spa_load_state(spa_t *spa)
-{
- return (spa->spa_load_state);
-}
-
-uint64_t
-spa_freeze_txg(spa_t *spa)
-{
- return (spa->spa_freeze_txg);
-}
-
-/* ARGSUSED */
-uint64_t
-spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
-{
- return (lsize * spa_asize_inflation);
-}
-
-/*
- * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%),
- * or at least 128MB, unless that would cause it to be more than half the
- * pool size.
- *
- * See the comment above spa_slop_shift for details.
- */
-uint64_t
-spa_get_slop_space(spa_t *spa)
-{
- uint64_t space = spa_get_dspace(spa);
- return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
-}
-
-uint64_t
-spa_get_dspace(spa_t *spa)
-{
- return (spa->spa_dspace);
-}
-
-uint64_t
-spa_get_checkpoint_space(spa_t *spa)
-{
- return (spa->spa_checkpoint_info.sci_dspace);
-}
-
-void
-spa_update_dspace(spa_t *spa)
-{
- spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
- ddt_get_dedup_dspace(spa);
- if (spa->spa_vdev_removal != NULL) {
- /*
- * We can't allocate from the removing device, so
- * subtract its size. This prevents the DMU/DSL from
- * filling up the (now smaller) pool while we are in the
- * middle of removing the device.
- *
- * Note that the DMU/DSL doesn't actually know or care
- * how much space is allocated (it does its own tracking
- * of how much space has been logically used). So it
- * doesn't matter that the data we are moving may be
- * allocated twice (on the old device and the new
- * device).
- */
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- vdev_t *vd =
- vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
- spa->spa_dspace -= spa_deflate(spa) ?
- vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
- spa_config_exit(spa, SCL_VDEV, FTAG);
- }
-}
-
-/*
- * Return the failure mode that has been set to this pool. The default
- * behavior will be to block all I/Os when a complete failure occurs.
- */
-uint8_t
-spa_get_failmode(spa_t *spa)
-{
- return (spa->spa_failmode);
-}
-
-boolean_t
-spa_suspended(spa_t *spa)
-{
- return (spa->spa_suspended != ZIO_SUSPEND_NONE);
-}
-
-uint64_t
-spa_version(spa_t *spa)
-{
- return (spa->spa_ubsync.ub_version);
-}
-
-boolean_t
-spa_deflate(spa_t *spa)
-{
- return (spa->spa_deflate);
-}
-
-metaslab_class_t *
-spa_normal_class(spa_t *spa)
-{
- return (spa->spa_normal_class);
-}
-
-metaslab_class_t *
-spa_log_class(spa_t *spa)
-{
- return (spa->spa_log_class);
-}
-
-metaslab_class_t *
-spa_special_class(spa_t *spa)
-{
- return (spa->spa_special_class);
-}
-
-metaslab_class_t *
-spa_dedup_class(spa_t *spa)
-{
- return (spa->spa_dedup_class);
-}
-
-/*
- * Locate an appropriate allocation class
- */
-metaslab_class_t *
-spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
- uint_t level, uint_t special_smallblk)
-{
- if (DMU_OT_IS_ZIL(objtype)) {
- if (spa->spa_log_class->mc_groups != 0)
- return (spa_log_class(spa));
- else
- return (spa_normal_class(spa));
- }
-
- boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
-
- if (DMU_OT_IS_DDT(objtype)) {
- if (spa->spa_dedup_class->mc_groups != 0)
- return (spa_dedup_class(spa));
- else if (has_special_class && zfs_ddt_data_is_special)
- return (spa_special_class(spa));
- else
- return (spa_normal_class(spa));
- }
-
- /* Indirect blocks for user data can land in special if allowed */
- if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
- if (has_special_class && zfs_user_indirect_is_special)
- return (spa_special_class(spa));
- else
- return (spa_normal_class(spa));
- }
-
- if (DMU_OT_IS_METADATA(objtype) || level > 0) {
- if (has_special_class)
- return (spa_special_class(spa));
- else
- return (spa_normal_class(spa));
- }
-
- /*
- * Allow small file blocks in special class in some cases (like
- * for the dRAID vdev feature). But always leave a reserve of
- * zfs_special_class_metadata_reserve_pct exclusively for metadata.
- */
- if (DMU_OT_IS_FILE(objtype) &&
- has_special_class && size <= special_smallblk) {
- metaslab_class_t *special = spa_special_class(spa);
- uint64_t alloc = metaslab_class_get_alloc(special);
- uint64_t space = metaslab_class_get_space(special);
- uint64_t limit =
- (space * (100 - zfs_special_class_metadata_reserve_pct))
- / 100;
-
- if (alloc < limit)
- return (special);
- }
-
- return (spa_normal_class(spa));
-}
-
-void
-spa_evicting_os_register(spa_t *spa, objset_t *os)
-{
- mutex_enter(&spa->spa_evicting_os_lock);
- list_insert_head(&spa->spa_evicting_os_list, os);
- mutex_exit(&spa->spa_evicting_os_lock);
-}
-
-void
-spa_evicting_os_deregister(spa_t *spa, objset_t *os)
-{
- mutex_enter(&spa->spa_evicting_os_lock);
- list_remove(&spa->spa_evicting_os_list, os);
- cv_broadcast(&spa->spa_evicting_os_cv);
- mutex_exit(&spa->spa_evicting_os_lock);
-}
-
-void
-spa_evicting_os_wait(spa_t *spa)
-{
- mutex_enter(&spa->spa_evicting_os_lock);
- while (!list_is_empty(&spa->spa_evicting_os_list))
- cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
- mutex_exit(&spa->spa_evicting_os_lock);
-
- dmu_buf_user_evict_wait();
-}
-
-int
-spa_max_replication(spa_t *spa)
-{
- /*
- * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
- * handle BPs with more than one DVA allocated. Set our max
- * replication level accordingly.
- */
- if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
- return (1);
- return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
-}
-
-int
-spa_prev_software_version(spa_t *spa)
-{
- return (spa->spa_prev_software_version);
-}
-
-uint64_t
-spa_deadman_synctime(spa_t *spa)
-{
- return (spa->spa_deadman_synctime);
-}
-
-struct proc *
-spa_proc(spa_t *spa)
-{
- return (spa->spa_proc);
-}
-
-uint64_t
-dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
-{
- uint64_t asize = DVA_GET_ASIZE(dva);
- uint64_t dsize = asize;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-
- if (asize != 0 && spa->spa_deflate) {
- uint64_t vdev = DVA_GET_VDEV(dva);
- vdev_t *vd = vdev_lookup_top(spa, vdev);
- if (vd == NULL) {
- panic(
- "dva_get_dsize_sync(): bad DVA %llu:%llu",
- (u_longlong_t)vdev, (u_longlong_t)asize);
- }
- dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
- }
-
- return (dsize);
-}
-
-uint64_t
-bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
-{
- uint64_t dsize = 0;
-
- for (int d = 0; d < BP_GET_NDVAS(bp); d++)
- dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
-
- return (dsize);
-}
-
-uint64_t
-bp_get_dsize(spa_t *spa, const blkptr_t *bp)
-{
- uint64_t dsize = 0;
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
- for (int d = 0; d < BP_GET_NDVAS(bp); d++)
- dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- return (dsize);
-}
-
-uint64_t
-spa_dirty_data(spa_t *spa)
-{
- return (spa->spa_dsl_pool->dp_dirty_total);
-}
-
-/*
- * ==========================================================================
- * Initialization and Termination
- * ==========================================================================
- */
-
-static int
-spa_name_compare(const void *a1, const void *a2)
-{
- const spa_t *s1 = a1;
- const spa_t *s2 = a2;
- int s;
-
- s = strcmp(s1->spa_name, s2->spa_name);
-
- return (AVL_ISIGN(s));
-}
-
-int
-spa_busy(void)
-{
- return (spa_active_count);
-}
-
-void
-spa_boot_init()
-{
- spa_config_load();
-}
-
-#ifdef _KERNEL
-EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
-#endif
-
-void
-spa_init(int mode)
-{
- mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
-
- avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
- offsetof(spa_t, spa_avl));
-
- avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
- offsetof(spa_aux_t, aux_avl));
-
- avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
- offsetof(spa_aux_t, aux_avl));
-
- spa_mode_global = mode;
-
-#ifdef illumos
-#ifdef _KERNEL
- spa_arch_init();
-#else
- if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
- arc_procfd = open("/proc/self/ctl", O_WRONLY);
- if (arc_procfd == -1) {
- perror("could not enable watchpoints: "
- "opening /proc/self/ctl failed: ");
- } else {
- arc_watch = B_TRUE;
- }
- }
-#endif
-#endif /* illumos */
-
- zfs_refcount_init();
- unique_init();
- range_tree_init();
- metaslab_alloc_trace_init();
- zio_init();
- lz4_init();
- dmu_init();
- zil_init();
- vdev_cache_stat_init();
- vdev_file_init();
- zfs_prop_init();
- zpool_prop_init();
- zpool_feature_init();
- spa_config_load();
- l2arc_start();
- scan_init();
- dsl_scan_global_init();
-#ifndef illumos
-#ifdef _KERNEL
- zfs_deadman_init();
-#endif
-#endif /* !illumos */
-}
-
-void
-spa_fini(void)
-{
- l2arc_stop();
-
- spa_evict_all();
-
- vdev_file_fini();
- vdev_cache_stat_fini();
- zil_fini();
- dmu_fini();
- lz4_fini();
- zio_fini();
- metaslab_alloc_trace_fini();
- range_tree_fini();
- unique_fini();
- zfs_refcount_fini();
- scan_fini();
-
- avl_destroy(&spa_namespace_avl);
- avl_destroy(&spa_spare_avl);
- avl_destroy(&spa_l2cache_avl);
-
- cv_destroy(&spa_namespace_cv);
- mutex_destroy(&spa_namespace_lock);
- mutex_destroy(&spa_spare_lock);
- mutex_destroy(&spa_l2cache_lock);
-}
-
-/*
- * Return whether this pool has slogs. No locking needed.
- * It's not a problem if the wrong answer is returned as it's only for
- * performance and not correctness
- */
-boolean_t
-spa_has_slogs(spa_t *spa)
-{
- return (spa->spa_log_class->mc_rotor != NULL);
-}
-
-spa_log_state_t
-spa_get_log_state(spa_t *spa)
-{
- return (spa->spa_log_state);
-}
-
-void
-spa_set_log_state(spa_t *spa, spa_log_state_t state)
-{
- spa->spa_log_state = state;
-}
-
-boolean_t
-spa_is_root(spa_t *spa)
-{
- return (spa->spa_is_root);
-}
-
-boolean_t
-spa_writeable(spa_t *spa)
-{
- return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
-}
-
-/*
- * Returns true if there is a pending sync task in any of the current
- * syncing txg, the current quiescing txg, or the current open txg.
- */
-boolean_t
-spa_has_pending_synctask(spa_t *spa)
-{
- return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
- !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
-}
-
-int
-spa_mode(spa_t *spa)
-{
- return (spa->spa_mode);
-}
-
-uint64_t
-spa_bootfs(spa_t *spa)
-{
- return (spa->spa_bootfs);
-}
-
-uint64_t
-spa_delegation(spa_t *spa)
-{
- return (spa->spa_delegation);
-}
-
-objset_t *
-spa_meta_objset(spa_t *spa)
-{
- return (spa->spa_meta_objset);
-}
-
-enum zio_checksum
-spa_dedup_checksum(spa_t *spa)
-{
- return (spa->spa_dedup_checksum);
-}
-
-/*
- * Reset pool scan stat per scan pass (or reboot).
- */
-void
-spa_scan_stat_init(spa_t *spa)
-{
- /* data not stored on disk */
- spa->spa_scan_pass_start = gethrestime_sec();
- if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
- spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
- else
- spa->spa_scan_pass_scrub_pause = 0;
- spa->spa_scan_pass_scrub_spent_paused = 0;
- spa->spa_scan_pass_exam = 0;
- spa->spa_scan_pass_issued = 0;
- vdev_scan_stat_init(spa->spa_root_vdev);
-}
-
-/*
- * Get scan stats for zpool status reports
- */
-int
-spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
-{
- dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
-
- if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
- return (SET_ERROR(ENOENT));
- bzero(ps, sizeof (pool_scan_stat_t));
-
- /* data stored on disk */
- ps->pss_func = scn->scn_phys.scn_func;
- ps->pss_state = scn->scn_phys.scn_state;
- ps->pss_start_time = scn->scn_phys.scn_start_time;
- ps->pss_end_time = scn->scn_phys.scn_end_time;
- ps->pss_to_examine = scn->scn_phys.scn_to_examine;
- ps->pss_to_process = scn->scn_phys.scn_to_process;
- ps->pss_processed = scn->scn_phys.scn_processed;
- ps->pss_errors = scn->scn_phys.scn_errors;
- ps->pss_examined = scn->scn_phys.scn_examined;
- ps->pss_issued =
- scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
- /* data not stored on disk */
- ps->pss_pass_start = spa->spa_scan_pass_start;
- ps->pss_pass_exam = spa->spa_scan_pass_exam;
- ps->pss_pass_issued = spa->spa_scan_pass_issued;
- ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
- ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
-
- return (0);
-}
-
-int
-spa_maxblocksize(spa_t *spa)
-{
- if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
- return (SPA_MAXBLOCKSIZE);
- else
- return (SPA_OLD_MAXBLOCKSIZE);
-}
-
-int
-spa_maxdnodesize(spa_t *spa)
-{
- if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
- return (DNODE_MAX_SIZE);
- else
- return (DNODE_MIN_SIZE);
-}
-
-boolean_t
-spa_multihost(spa_t *spa)
-{
- return (spa->spa_multihost ? B_TRUE : B_FALSE);
-}
-
-unsigned long
-spa_get_hostid(void)
-{
- unsigned long myhostid;
-
-#ifdef _KERNEL
- myhostid = zone_get_hostid(NULL);
-#else /* _KERNEL */
- /*
- * We're emulating the system's hostid in userland, so
- * we can't use zone_get_hostid().
- */
- (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
-#endif /* _KERNEL */
-
- return (myhostid);
-}
-
-/*
- * Returns the txg that the last device removal completed. No indirect mappings
- * have been added since this txg.
- */
-uint64_t
-spa_get_last_removal_txg(spa_t *spa)
-{
- uint64_t vdevid;
- uint64_t ret = -1ULL;
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- /*
- * sr_prev_indirect_vdev is only modified while holding all the
- * config locks, so it is sufficient to hold SCL_VDEV as reader when
- * examining it.
- */
- vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
-
- while (vdevid != -1ULL) {
- vdev_t *vd = vdev_lookup_top(spa, vdevid);
- vdev_indirect_births_t *vib = vd->vdev_indirect_births;
-
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
-
- /*
- * If the removal did not remap any data, we don't care.
- */
- if (vdev_indirect_births_count(vib) != 0) {
- ret = vdev_indirect_births_last_entry_txg(vib);
- break;
- }
-
- vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
- }
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- IMPLY(ret != -1ULL,
- spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
-
- return (ret);
-}
-
-boolean_t
-spa_trust_config(spa_t *spa)
-{
- return (spa->spa_trust_config);
-}
-
-uint64_t
-spa_missing_tvds_allowed(spa_t *spa)
-{
- return (spa->spa_missing_tvds_allowed);
-}
-
-void
-spa_set_missing_tvds(spa_t *spa, uint64_t missing)
-{
- spa->spa_missing_tvds = missing;
-}
-
-boolean_t
-spa_top_vdevs_spacemap_addressable(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
- return (B_FALSE);
- }
- return (B_TRUE);
-}
-
-boolean_t
-spa_has_checkpoint(spa_t *spa)
-{
- return (spa->spa_checkpoint_txg != 0);
-}
-
-boolean_t
-spa_importing_readonly_checkpoint(spa_t *spa)
-{
- return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
- spa->spa_mode == FREAD);
-}
-
-uint64_t
-spa_min_claim_txg(spa_t *spa)
-{
- uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
-
- if (checkpoint_txg != 0)
- return (checkpoint_txg + 1);
-
- return (spa->spa_first_txg);
-}
-
-/*
- * If there is a checkpoint, async destroys may consume more space from
- * the pool instead of freeing it. In an attempt to save the pool from
- * getting suspended when it is about to run out of space, we stop
- * processing async destroys.
- */
-boolean_t
-spa_suspend_async_destroy(spa_t *spa)
-{
- dsl_pool_t *dp = spa_get_dsl(spa);
-
- uint64_t unreserved = dsl_pool_unreserved_space(dp,
- ZFS_SPACE_CHECK_EXTRA_RESERVED);
- uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
- uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
-
- if (spa_has_checkpoint(spa) && avail == 0)
- return (B_TRUE);
-
- return (B_FALSE);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -1,1073 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dnode.h>
-#include <sys/dsl_pool.h>
-#include <sys/zio.h>
-#include <sys/space_map.h>
-#include <sys/refcount.h>
-#include <sys/zfeature.h>
-
-SYSCTL_DECL(_vfs_zfs);
-
-/*
- * Note on space map block size:
- *
- * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer I/O operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more I/O bandwidth
- * when only a few blocks have changed since the last transaction group.
- */
-
-/*
- * Enabled whenever we want to stress test the use of double-word
- * space map entries.
- */
-boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
-
-/*
- * Override the default indirect block size of 128K, instead using 16K for
- * spacemaps (2^14 bytes). This dramatically reduces write inflation since
- * appending to a spacemap typically has to write one data block (4KB) and one
- * or two indirect blocks (16K-32K, rather than 128K).
- */
-int space_map_ibs = 14;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
- &space_map_ibs, 0, "Space map indirect block shift");
-
-boolean_t
-sm_entry_is_debug(uint64_t e)
-{
- return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
-}
-
-boolean_t
-sm_entry_is_single_word(uint64_t e)
-{
- uint8_t prefix = SM_PREFIX_DECODE(e);
- return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
-}
-
-boolean_t
-sm_entry_is_double_word(uint64_t e)
-{
- return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
-}
-
-/*
- * Iterate through the space map, invoking the callback on each (non-debug)
- * space map entry. Stop after reading 'end' bytes of the space map.
- */
-int
-space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
-{
- uint64_t blksz = sm->sm_blksz;
-
- ASSERT3U(blksz, !=, 0);
- ASSERT3U(end, <=, space_map_length(sm));
- ASSERT0(P2PHASE(end, sizeof (uint64_t)));
-
- dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
- ZIO_PRIORITY_SYNC_READ);
-
- int error = 0;
- for (uint64_t block_base = 0; block_base < end && error == 0;
- block_base += blksz) {
- dmu_buf_t *db;
- error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
- block_base, FTAG, &db, DMU_READ_PREFETCH);
- if (error != 0)
- return (error);
-
- uint64_t *block_start = db->db_data;
- uint64_t block_length = MIN(end - block_base, blksz);
- uint64_t *block_end = block_start +
- (block_length / sizeof (uint64_t));
-
- VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
- VERIFY3U(block_length, !=, 0);
- ASSERT3U(blksz, ==, db->db_size);
-
- for (uint64_t *block_cursor = block_start;
- block_cursor < block_end && error == 0; block_cursor++) {
- uint64_t e = *block_cursor;
-
- if (sm_entry_is_debug(e)) /* Skip debug entries */
- continue;
-
- uint64_t raw_offset, raw_run, vdev_id;
- maptype_t type;
- if (sm_entry_is_single_word(e)) {
- type = SM_TYPE_DECODE(e);
- vdev_id = SM_NO_VDEVID;
- raw_offset = SM_OFFSET_DECODE(e);
- raw_run = SM_RUN_DECODE(e);
- } else {
- /* it is a two-word entry */
- ASSERT(sm_entry_is_double_word(e));
- raw_run = SM2_RUN_DECODE(e);
- vdev_id = SM2_VDEV_DECODE(e);
-
- /* move on to the second word */
- block_cursor++;
- e = *block_cursor;
- VERIFY3P(block_cursor, <=, block_end);
-
- type = SM2_TYPE_DECODE(e);
- raw_offset = SM2_OFFSET_DECODE(e);
- }
-
- uint64_t entry_offset = (raw_offset << sm->sm_shift) +
- sm->sm_start;
- uint64_t entry_run = raw_run << sm->sm_shift;
-
- VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
- VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
- ASSERT3U(entry_offset, >=, sm->sm_start);
- ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
- ASSERT3U(entry_run, <=, sm->sm_size);
- ASSERT3U(entry_offset + entry_run, <=,
- sm->sm_start + sm->sm_size);
-
- space_map_entry_t sme = {
- .sme_type = type,
- .sme_vdev = vdev_id,
- .sme_offset = entry_offset,
- .sme_run = entry_run
- };
- error = callback(&sme, arg);
- }
- dmu_buf_rele(db, FTAG);
- }
- return (error);
-}
-
-/*
- * Reads the entries from the last block of the space map into
- * buf in reverse order. Populates nwords with number of words
- * in the last block.
- *
- * Refer to block comment within space_map_incremental_destroy()
- * to understand why this function is needed.
- */
-static int
-space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
- uint64_t bufsz, uint64_t *nwords)
-{
- int error = 0;
- dmu_buf_t *db;
-
- /*
- * Find the offset of the last word in the space map and use
- * that to read the last block of the space map with
- * dmu_buf_hold().
- */
- uint64_t last_word_offset =
- sm->sm_phys->smp_length - sizeof (uint64_t);
- error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
- FTAG, &db, DMU_READ_NO_PREFETCH);
- if (error != 0)
- return (error);
-
- ASSERT3U(sm->sm_object, ==, db->db_object);
- ASSERT3U(sm->sm_blksz, ==, db->db_size);
- ASSERT3U(bufsz, >=, db->db_size);
- ASSERT(nwords != NULL);
-
- uint64_t *words = db->db_data;
- *nwords =
- (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
-
- ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
-
- uint64_t n = *nwords;
- uint64_t j = n - 1;
- for (uint64_t i = 0; i < n; i++) {
- uint64_t entry = words[i];
- if (sm_entry_is_double_word(entry)) {
- /*
- * Since we are populating the buffer backwards
- * we have to be extra careful and add the two
- * words of the double-word entry in the right
- * order.
- */
- ASSERT3U(j, >, 0);
- buf[j - 1] = entry;
-
- i++;
- ASSERT3U(i, <, n);
- entry = words[i];
- buf[j] = entry;
- j -= 2;
- } else {
- ASSERT(sm_entry_is_debug(entry) ||
- sm_entry_is_single_word(entry));
- buf[j] = entry;
- j--;
- }
- }
-
- /*
- * Assert that we wrote backwards all the
- * way to the beginning of the buffer.
- */
- ASSERT3S(j, ==, -1);
-
- dmu_buf_rele(db, FTAG);
- return (error);
-}
-
-/*
- * Note: This function performs destructive actions - specifically
- * it deletes entries from the end of the space map. Thus, callers
- * should ensure that they are holding the appropriate locks for
- * the space map that they provide.
- */
-int
-space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
- dmu_tx_t *tx)
-{
- uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
- uint64_t *buf = zio_buf_alloc(bufsz);
-
- dmu_buf_will_dirty(sm->sm_dbuf, tx);
-
- /*
- * Ideally we would want to iterate from the beginning of the
- * space map to the end in incremental steps. The issue with this
- * approach is that we don't have any field on-disk that points
- * us where to start between each step. We could try zeroing out
- * entries that we've destroyed, but this doesn't work either as
- * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
- *
- * As a result, we destroy its entries incrementally starting from
- * the end after applying the callback to each of them.
- *
- * The problem with this approach is that we cannot literally
- * iterate through the words in the space map backwards as we
- * can't distinguish two-word space map entries from their second
- * word. Thus we do the following:
- *
- * 1] We get all the entries from the last block of the space map
- * and put them into a buffer in reverse order. This way the
- * last entry comes first in the buffer, the second to last is
- * second, etc.
- * 2] We iterate through the entries in the buffer and we apply
- * the callback to each one. As we move from entry to entry we
- * we decrease the size of the space map, deleting effectively
- * each entry.
- * 3] If there are no more entries in the space map or the callback
- * returns a value other than 0, we stop iterating over the
- * space map. If there are entries remaining and the callback
- * returned 0, we go back to step [1].
- */
- int error = 0;
- while (space_map_length(sm) > 0 && error == 0) {
- uint64_t nwords = 0;
- error = space_map_reversed_last_block_entries(sm, buf, bufsz,
- &nwords);
- if (error != 0)
- break;
-
- ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
-
- for (uint64_t i = 0; i < nwords; i++) {
- uint64_t e = buf[i];
-
- if (sm_entry_is_debug(e)) {
- sm->sm_phys->smp_length -= sizeof (uint64_t);
- continue;
- }
-
- int words = 1;
- uint64_t raw_offset, raw_run, vdev_id;
- maptype_t type;
- if (sm_entry_is_single_word(e)) {
- type = SM_TYPE_DECODE(e);
- vdev_id = SM_NO_VDEVID;
- raw_offset = SM_OFFSET_DECODE(e);
- raw_run = SM_RUN_DECODE(e);
- } else {
- ASSERT(sm_entry_is_double_word(e));
- words = 2;
-
- raw_run = SM2_RUN_DECODE(e);
- vdev_id = SM2_VDEV_DECODE(e);
-
- /* move to the second word */
- i++;
- e = buf[i];
-
- ASSERT3P(i, <=, nwords);
-
- type = SM2_TYPE_DECODE(e);
- raw_offset = SM2_OFFSET_DECODE(e);
- }
-
- uint64_t entry_offset =
- (raw_offset << sm->sm_shift) + sm->sm_start;
- uint64_t entry_run = raw_run << sm->sm_shift;
-
- VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
- VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
- VERIFY3U(entry_offset, >=, sm->sm_start);
- VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
- VERIFY3U(entry_run, <=, sm->sm_size);
- VERIFY3U(entry_offset + entry_run, <=,
- sm->sm_start + sm->sm_size);
-
- space_map_entry_t sme = {
- .sme_type = type,
- .sme_vdev = vdev_id,
- .sme_offset = entry_offset,
- .sme_run = entry_run
- };
- error = callback(&sme, arg);
- if (error != 0)
- break;
-
- if (type == SM_ALLOC)
- sm->sm_phys->smp_alloc -= entry_run;
- else
- sm->sm_phys->smp_alloc += entry_run;
- sm->sm_phys->smp_length -= words * sizeof (uint64_t);
- }
- }
-
- if (space_map_length(sm) == 0) {
- ASSERT0(error);
- ASSERT0(space_map_allocated(sm));
- }
-
- zio_buf_free(buf, bufsz);
- return (error);
-}
-
-typedef struct space_map_load_arg {
- space_map_t *smla_sm;
- range_tree_t *smla_rt;
- maptype_t smla_type;
-} space_map_load_arg_t;
-
-static int
-space_map_load_callback(space_map_entry_t *sme, void *arg)
-{
- space_map_load_arg_t *smla = arg;
- if (sme->sme_type == smla->smla_type) {
- VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
- smla->smla_sm->sm_size);
- range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
- } else {
- range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
- }
-
- return (0);
-}
-
-/*
- * Load the spacemap into the rangetree, like space_map_load. But only
- * read the first 'length' bytes of the spacemap.
- */
-int
-space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
- uint64_t length)
-{
- space_map_load_arg_t smla;
-
- VERIFY0(range_tree_space(rt));
-
- if (maptype == SM_FREE)
- range_tree_add(rt, sm->sm_start, sm->sm_size);
-
- smla.smla_rt = rt;
- smla.smla_sm = sm;
- smla.smla_type = maptype;
- int err = space_map_iterate(sm, length,
- space_map_load_callback, &smla);
-
- if (err != 0)
- range_tree_vacate(rt, NULL, NULL);
-
- return (err);
-}
-
-/*
- * Load the space map disk into the specified range tree. Segments of maptype
- * are added to the range tree, other segment types are removed.
- */
-int
-space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
-{
- return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
-}
-
-void
-space_map_histogram_clear(space_map_t *sm)
-{
- if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
- return;
-
- bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
-}
-
-boolean_t
-space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
-{
- /*
- * Verify that the in-core range tree does not have any
- * ranges smaller than our sm_shift size.
- */
- for (int i = 0; i < sm->sm_shift; i++) {
- if (rt->rt_histogram[i] != 0)
- return (B_FALSE);
- }
- return (B_TRUE);
-}
-
-void
-space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
-{
- int idx = 0;
-
- ASSERT(dmu_tx_is_syncing(tx));
- VERIFY3U(space_map_object(sm), !=, 0);
-
- if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
- return;
-
- dmu_buf_will_dirty(sm->sm_dbuf, tx);
-
- ASSERT(space_map_histogram_verify(sm, rt));
- /*
- * Transfer the content of the range tree histogram to the space
- * map histogram. The space map histogram contains 32 buckets ranging
- * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
- * however, can represent ranges from 2^0 to 2^63. Since the space
- * map only cares about allocatable blocks (minimum of sm_shift) we
- * can safely ignore all ranges in the range tree smaller than sm_shift.
- */
- for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
-
- /*
- * Since the largest histogram bucket in the space map is
- * 2^(32+sm_shift-1), we need to normalize the values in
- * the range tree for any bucket larger than that size. For
- * example given an sm_shift of 9, ranges larger than 2^40
- * would get normalized as if they were 1TB ranges. Assume
- * the range tree had a count of 5 in the 2^44 (16TB) bucket,
- * the calculation below would normalize this to 5 * 2^4 (16).
- */
- ASSERT3U(i, >=, idx + sm->sm_shift);
- sm->sm_phys->smp_histogram[idx] +=
- rt->rt_histogram[i] << (i - idx - sm->sm_shift);
-
- /*
- * Increment the space map's index as long as we haven't
- * reached the maximum bucket size. Accumulate all ranges
- * larger than the max bucket size into the last bucket.
- */
- if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
- ASSERT3U(idx + sm->sm_shift, ==, i);
- idx++;
- ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
- }
- }
-}
-
-static void
-space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
-{
- dmu_buf_will_dirty(sm->sm_dbuf, tx);
-
- uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
- SM_DEBUG_ACTION_ENCODE(maptype) |
- SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
- SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
-
- dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
- sizeof (dentry), &dentry, tx);
-
- sm->sm_phys->smp_length += sizeof (dentry);
-}
-
-/*
- * Writes one or more entries given a segment.
- *
- * Note: The function may release the dbuf from the pointer initially
- * passed to it, and return a different dbuf. Also, the space map's
- * dbuf must be dirty for the changes in sm_phys to take effect.
- */
-static void
-space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
- uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
-{
- ASSERT3U(words, !=, 0);
- ASSERT3U(words, <=, 2);
-
- /* ensure the vdev_id can be represented by the space map */
- ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
-
- /*
- * if this is a single word entry, ensure that no vdev was
- * specified.
- */
- IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
-
- dmu_buf_t *db = *dbp;
- ASSERT3U(db->db_size, ==, sm->sm_blksz);
-
- uint64_t *block_base = db->db_data;
- uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
- uint64_t *block_cursor = block_base +
- (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
-
- ASSERT3P(block_cursor, <=, block_end);
-
- uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
- uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
- uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
-
- ASSERT3U(rs->rs_start, >=, sm->sm_start);
- ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
- ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
- ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
-
- while (size != 0) {
- ASSERT3P(block_cursor, <=, block_end);
-
- /*
- * If we are at the end of this block, flush it and start
- * writing again from the beginning.
- */
- if (block_cursor == block_end) {
- dmu_buf_rele(db, tag);
-
- uint64_t next_word_offset = sm->sm_phys->smp_length;
- VERIFY0(dmu_buf_hold(sm->sm_os,
- space_map_object(sm), next_word_offset,
- tag, &db, DMU_READ_PREFETCH));
- dmu_buf_will_dirty(db, tx);
-
- /* update caller's dbuf */
- *dbp = db;
-
- ASSERT3U(db->db_size, ==, sm->sm_blksz);
-
- block_base = db->db_data;
- block_cursor = block_base;
- block_end = block_base +
- (db->db_size / sizeof (uint64_t));
- }
-
- /*
- * If we are writing a two-word entry and we only have one
- * word left on this block, just pad it with an empty debug
- * entry and write the two-word entry in the next block.
- */
- uint64_t *next_entry = block_cursor + 1;
- if (next_entry == block_end && words > 1) {
- ASSERT3U(words, ==, 2);
- *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
- SM_DEBUG_ACTION_ENCODE(0) |
- SM_DEBUG_SYNCPASS_ENCODE(0) |
- SM_DEBUG_TXG_ENCODE(0);
- block_cursor++;
- sm->sm_phys->smp_length += sizeof (uint64_t);
- ASSERT3P(block_cursor, ==, block_end);
- continue;
- }
-
- uint64_t run_len = MIN(size, run_max);
- switch (words) {
- case 1:
- *block_cursor = SM_OFFSET_ENCODE(start) |
- SM_TYPE_ENCODE(maptype) |
- SM_RUN_ENCODE(run_len);
- block_cursor++;
- break;
- case 2:
- /* write the first word of the entry */
- *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
- SM2_RUN_ENCODE(run_len) |
- SM2_VDEV_ENCODE(vdev_id);
- block_cursor++;
-
- /* move on to the second word of the entry */
- ASSERT3P(block_cursor, <, block_end);
- *block_cursor = SM2_TYPE_ENCODE(maptype) |
- SM2_OFFSET_ENCODE(start);
- block_cursor++;
- break;
- default:
- panic("%d-word space map entries are not supported",
- words);
- break;
- }
- sm->sm_phys->smp_length += words * sizeof (uint64_t);
-
- start += run_len;
- size -= run_len;
- }
- ASSERT0(size);
-
-}
-
-/*
- * Note: The space map's dbuf must be dirty for the changes in sm_phys to
- * take effect.
- */
-static void
-space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
- uint64_t vdev_id, dmu_tx_t *tx)
-{
- spa_t *spa = tx->tx_pool->dp_spa;
- dmu_buf_t *db;
-
- space_map_write_intro_debug(sm, maptype, tx);
-
-#ifdef DEBUG
- /*
- * We do this right after we write the intro debug entry
- * because the estimate does not take it into account.
- */
- uint64_t initial_objsize = sm->sm_phys->smp_length;
- uint64_t estimated_growth =
- space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
- uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
-#endif
-
- /*
- * Find the offset right after the last word in the space map
- * and use that to get a hold of the last block, so we can
- * start appending to it.
- */
- uint64_t next_word_offset = sm->sm_phys->smp_length;
- VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
- next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
- ASSERT3U(db->db_size, ==, sm->sm_blksz);
-
- dmu_buf_will_dirty(db, tx);
-
- avl_tree_t *t = &rt->rt_root;
- for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
- uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
- uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
- uint8_t words = 1;
-
- /*
- * We only write two-word entries when both of the following
- * are true:
- *
- * [1] The feature is enabled.
- * [2] The offset or run is too big for a single-word entry,
- * or the vdev_id is set (meaning not equal to
- * SM_NO_VDEVID).
- *
- * Note that for purposes of testing we've added the case that
- * we write two-word entries occasionally when the feature is
- * enabled and zfs_force_some_double_word_sm_entries has been
- * set.
- */
- if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
- (offset >= (1ULL << SM_OFFSET_BITS) ||
- length > SM_RUN_MAX ||
- vdev_id != SM_NO_VDEVID ||
- (zfs_force_some_double_word_sm_entries &&
- spa_get_random(100) == 0)))
- words = 2;
-
- space_map_write_seg(sm, rs, maptype, vdev_id, words,
- &db, FTAG, tx);
- }
-
- dmu_buf_rele(db, FTAG);
-
-#ifdef DEBUG
- /*
- * We expect our estimation to be based on the worst case
- * scenario [see comment in space_map_estimate_optimal_size()].
- * Therefore we expect the actual objsize to be equal or less
- * than whatever we estimated it to be.
- */
- ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
-#endif
-}
-
-/*
- * Note: This function manipulates the state of the given space map but
- * does not hold any locks implicitly. Thus the caller is responsible
- * for synchronizing writes to the space map.
- */
-void
-space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
- uint64_t vdev_id, dmu_tx_t *tx)
-{
- objset_t *os = sm->sm_os;
-
- ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
- VERIFY3U(space_map_object(sm), !=, 0);
-
- dmu_buf_will_dirty(sm->sm_dbuf, tx);
-
- /*
- * This field is no longer necessary since the in-core space map
- * now contains the object number but is maintained for backwards
- * compatibility.
- */
- sm->sm_phys->smp_object = sm->sm_object;
-
- if (range_tree_is_empty(rt)) {
- VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
- return;
- }
-
- if (maptype == SM_ALLOC)
- sm->sm_phys->smp_alloc += range_tree_space(rt);
- else
- sm->sm_phys->smp_alloc -= range_tree_space(rt);
-
- uint64_t nodes = avl_numnodes(&rt->rt_root);
- uint64_t rt_space = range_tree_space(rt);
-
- space_map_write_impl(sm, rt, maptype, vdev_id, tx);
-
- /*
- * Ensure that the space_map's accounting wasn't changed
- * while we were in the middle of writing it out.
- */
- VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
- VERIFY3U(range_tree_space(rt), ==, rt_space);
-}
-
-static int
-space_map_open_impl(space_map_t *sm)
-{
- int error;
- u_longlong_t blocks;
-
- error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
- if (error)
- return (error);
-
- dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
- sm->sm_phys = sm->sm_dbuf->db_data;
- return (0);
-}
-
-int
-space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
- uint64_t start, uint64_t size, uint8_t shift)
-{
- space_map_t *sm;
- int error;
-
- ASSERT(*smp == NULL);
- ASSERT(os != NULL);
- ASSERT(object != 0);
-
- sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
-
- sm->sm_start = start;
- sm->sm_size = size;
- sm->sm_shift = shift;
- sm->sm_os = os;
- sm->sm_object = object;
-
- error = space_map_open_impl(sm);
- if (error != 0) {
- space_map_close(sm);
- return (error);
- }
- *smp = sm;
-
- return (0);
-}
-
-void
-space_map_close(space_map_t *sm)
-{
- if (sm == NULL)
- return;
-
- if (sm->sm_dbuf != NULL)
- dmu_buf_rele(sm->sm_dbuf, sm);
- sm->sm_dbuf = NULL;
- sm->sm_phys = NULL;
-
- kmem_free(sm, sizeof (*sm));
-}
-
-void
-space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
-{
- objset_t *os = sm->sm_os;
- spa_t *spa = dmu_objset_spa(os);
- dmu_object_info_t doi;
-
- ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
- ASSERT(dmu_tx_is_syncing(tx));
- VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));
-
- dmu_object_info_from_db(sm->sm_dbuf, &doi);
-
- /*
- * If the space map has the wrong bonus size (because
- * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
- * the wrong block size (because space_map_blksz has changed),
- * free and re-allocate its object with the updated sizes.
- *
- * Otherwise, just truncate the current object.
- */
- if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
- doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
- doi.doi_data_block_size != blocksize ||
- doi.doi_metadata_block_size != 1 << space_map_ibs) {
- zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
- "object[%llu]: old bonus %u, old blocksz %u",
- dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
- doi.doi_bonus_size, doi.doi_data_block_size);
-
- space_map_free(sm, tx);
- dmu_buf_rele(sm->sm_dbuf, sm);
-
- sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
- VERIFY0(space_map_open_impl(sm));
- } else {
- VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
-
- /*
- * If the spacemap is reallocated, its histogram
- * will be reset. Do the same in the common case so that
- * bugs related to the uncommon case do not go unnoticed.
- */
- bzero(sm->sm_phys->smp_histogram,
- sizeof (sm->sm_phys->smp_histogram));
- }
-
- dmu_buf_will_dirty(sm->sm_dbuf, tx);
- sm->sm_phys->smp_length = 0;
- sm->sm_phys->smp_alloc = 0;
-}
-
-uint64_t
-space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(os);
- uint64_t object;
- int bonuslen;
-
- if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
- spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
- bonuslen = sizeof (space_map_phys_t);
- ASSERT3U(bonuslen, <=, dmu_bonus_max());
- } else {
- bonuslen = SPACE_MAP_SIZE_V0;
- }
-
- object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
- space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
-
- return (object);
-}
-
-void
-space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(os);
- if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
- dmu_object_info_t doi;
-
- VERIFY0(dmu_object_info(os, smobj, &doi));
- if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
- spa_feature_decr(spa,
- SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
- }
- }
-
- VERIFY0(dmu_object_free(os, smobj, tx));
-}
-
-void
-space_map_free(space_map_t *sm, dmu_tx_t *tx)
-{
- if (sm == NULL)
- return;
-
- space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
- sm->sm_object = 0;
-}
-
-/*
- * Given a range tree, it makes a worst-case estimate of how much
- * space would the tree's segments take if they were written to
- * the given space map.
- */
-uint64_t
-space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
- uint64_t vdev_id)
-{
- spa_t *spa = dmu_objset_spa(sm->sm_os);
- uint64_t shift = sm->sm_shift;
- uint64_t *histogram = rt->rt_histogram;
- uint64_t entries_for_seg = 0;
-
- /*
- * In order to get a quick estimate of the optimal size that this
- * range tree would have on-disk as a space map, we iterate through
- * its histogram buckets instead of iterating through its nodes.
- *
- * Note that this is a highest-bound/worst-case estimate for the
- * following reasons:
- *
- * 1] We assume that we always add a debug padding for each block
- * we write and we also assume that we start at the last word
- * of a block attempting to write a two-word entry.
- * 2] Rounding up errors due to the way segments are distributed
- * in the buckets of the range tree's histogram.
- * 3] The activation of zfs_force_some_double_word_sm_entries
- * (tunable) when testing.
- *
- * = Math and Rounding Errors =
- *
- * rt_histogram[i] bucket of a range tree represents the number
- * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
- * that, we want to divide the buckets into groups: Buckets that
- * can be represented using a single-word entry, ones that can
- * be represented with a double-word entry, and ones that can
- * only be represented with multiple two-word entries.
- *
- * [Note that if the new encoding feature is not enabled there
- * are only two groups: single-word entry buckets and multiple
- * single-word entry buckets. The information below assumes
- * two-word entries enabled, but it can easily applied when
- * the feature is not enabled]
- *
- * To find the highest bucket that can be represented with a
- * single-word entry we look at the maximum run that such entry
- * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
- * the run of a space map entry is shifted by sm_shift, thus we
- * add it to the exponent]. This way, excluding the value of the
- * maximum run that can be represented by a single-word entry,
- * all runs that are smaller exist in buckets 0 to
- * SM_RUN_BITS + shift - 1.
- *
- * To find the highest bucket that can be represented with a
- * double-word entry, we follow the same approach. Finally, any
- * bucket higher than that are represented with multiple two-word
- * entries. To be more specific, if the highest bucket whose
- * segments can be represented with a single two-word entry is X,
- * then bucket X+1 will need 2 two-word entries for each of its
- * segments, X+2 will need 4, X+3 will need 8, ...etc.
- *
- * With all of the above we make our estimation based on bucket
- * groups. There is a rounding error though. As we mentioned in
- * the example with the one-word entry, the maximum run that can
- * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
- * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
- * that length fall into the next bucket (and bucket group) where
- * we start counting two-word entries and this is one more reason
- * why the estimated size may end up being bigger than the actual
- * size written.
- */
- uint64_t size = 0;
- uint64_t idx = 0;
-
- if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
- (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
-
- /*
- * If we are trying to force some double word entries just
- * assume the worst-case of every single word entry being
- * written as a double word entry.
- */
- uint64_t entry_size =
- (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
- zfs_force_some_double_word_sm_entries) ?
- (2 * sizeof (uint64_t)) : sizeof (uint64_t);
-
- uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
- for (; idx <= single_entry_max_bucket; idx++)
- size += histogram[idx] * entry_size;
-
- if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
- for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
- ASSERT3U(idx, >=, single_entry_max_bucket);
- entries_for_seg =
- 1ULL << (idx - single_entry_max_bucket);
- size += histogram[idx] *
- entries_for_seg * entry_size;
- }
- return (size);
- }
- }
-
- ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
-
- uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
- for (; idx <= double_entry_max_bucket; idx++)
- size += histogram[idx] * 2 * sizeof (uint64_t);
-
- for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
- ASSERT3U(idx, >=, double_entry_max_bucket);
- entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
- size += histogram[idx] *
- entries_for_seg * 2 * sizeof (uint64_t);
- }
-
- /*
- * Assume the worst case where we start with the padding at the end
- * of the current block and we add an extra padding entry at the end
- * of all subsequent blocks.
- */
- size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
-
- return (size);
-}
-
-uint64_t
-space_map_object(space_map_t *sm)
-{
- return (sm != NULL ? sm->sm_object : 0);
-}
-
-int64_t
-space_map_allocated(space_map_t *sm)
-{
- return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
-}
-
-uint64_t
-space_map_length(space_map_t *sm)
-{
- return (sm != NULL ? sm->sm_phys->smp_length : 0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
@@ -1,149 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/range_tree.h>
-#include <sys/space_reftree.h>
-
-/*
- * Space reference trees.
- *
- * A range tree is a collection of integers. Every integer is either
- * in the tree, or it's not. A space reference tree generalizes
- * the idea: it allows its members to have arbitrary reference counts,
- * as opposed to the implicit reference count of 0 or 1 in a range tree.
- * This representation comes in handy when computing the union or
- * intersection of multiple space maps. For example, the union of
- * N range trees is the subset of the reference tree with refcnt >= 1.
- * The intersection of N range trees is the subset with refcnt >= N.
- *
- * [It's very much like a Fourier transform. Unions and intersections
- * are hard to perform in the 'range tree domain', so we convert the trees
- * into the 'reference count domain', where it's trivial, then invert.]
- *
- * vdev_dtl_reassess() uses computations of this form to determine
- * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
- * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
- * has an outage wherever refcnt >= vdev_children.
- */
-static int
-space_reftree_compare(const void *x1, const void *x2)
-{
- const space_ref_t *sr1 = (const space_ref_t *)x1;
- const space_ref_t *sr2 = (const space_ref_t *)x2;
-
- int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset);
- if (likely(cmp))
- return (cmp);
-
- return (AVL_PCMP(sr1, sr2));
-}
-
-void
-space_reftree_create(avl_tree_t *t)
-{
- avl_create(t, space_reftree_compare,
- sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
-}
-
-void
-space_reftree_destroy(avl_tree_t *t)
-{
- space_ref_t *sr;
- void *cookie = NULL;
-
- while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(sr, sizeof (*sr));
-
- avl_destroy(t);
-}
-
-static void
-space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
-{
- space_ref_t *sr;
-
- sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
- sr->sr_offset = offset;
- sr->sr_refcnt = refcnt;
-
- avl_add(t, sr);
-}
-
-void
-space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
- int64_t refcnt)
-{
- space_reftree_add_node(t, start, refcnt);
- space_reftree_add_node(t, end, -refcnt);
-}
-
-/*
- * Convert (or add) a range tree into a reference tree.
- */
-void
-space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
-{
- range_seg_t *rs;
-
- for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
- space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
-}
-
-/*
- * Convert a reference tree into a range tree. The range tree will contain
- * all members of the reference tree for which refcnt >= minref.
- */
-void
-space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
-{
- uint64_t start = -1ULL;
- int64_t refcnt = 0;
- space_ref_t *sr;
-
- range_tree_vacate(rt, NULL, NULL);
-
- for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
- refcnt += sr->sr_refcnt;
- if (refcnt >= minref) {
- if (start == -1ULL) {
- start = sr->sr_offset;
- }
- } else {
- if (start != -1ULL) {
- uint64_t end = sr->sr_offset;
- ASSERT(start <= end);
- if (end > start)
- range_tree_add(rt, start, end - start);
- start = -1ULL;
- }
- }
- }
- ASSERT(refcnt == 0);
- ASSERT(start == -1ULL);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h
@@ -1,154 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _ABD_H
-#define _ABD_H
-
-#include <sys/isa_defs.h>
-#ifdef illumos
-#include <sys/int_types.h>
-#else
-#include <sys/stdint.h>
-#endif
-#include <sys/debug.h>
-#include <sys/refcount.h>
-#ifdef _KERNEL
-#include <sys/uio.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum abd_flags {
- ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
- ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
- ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? */
-} abd_flags_t;
-
-typedef struct abd {
- abd_flags_t abd_flags;
- uint_t abd_size; /* excludes scattered abd_offset */
- struct abd *abd_parent;
- zfs_refcount_t abd_children;
- union {
- struct abd_scatter {
- uint_t abd_offset;
- uint_t abd_chunk_size;
- void *abd_chunks[];
- } abd_scatter;
- struct abd_linear {
- void *abd_buf;
- } abd_linear;
- } abd_u;
-} abd_t;
-
-typedef int abd_iter_func_t(void *, size_t, void *);
-typedef int abd_iter_func2_t(void *, void *, size_t, void *);
-
-extern boolean_t zfs_abd_scatter_enabled;
-
-inline boolean_t
-abd_is_linear(abd_t *abd)
-{
- return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
-}
-
-/*
- * Allocations and deallocations
- */
-
-abd_t *abd_alloc(size_t, boolean_t);
-abd_t *abd_alloc_linear(size_t, boolean_t);
-abd_t *abd_alloc_for_io(size_t, boolean_t);
-abd_t *abd_alloc_sametype(abd_t *, size_t);
-void abd_free(abd_t *);
-abd_t *abd_get_offset(abd_t *, size_t);
-abd_t *abd_get_from_buf(void *, size_t);
-void abd_put(abd_t *);
-
-/*
- * Conversion to and from a normal buffer
- */
-
-void *abd_to_buf(abd_t *);
-void *abd_borrow_buf(abd_t *, size_t);
-void *abd_borrow_buf_copy(abd_t *, size_t);
-void abd_return_buf(abd_t *, void *, size_t);
-void abd_return_buf_copy(abd_t *, void *, size_t);
-void abd_take_ownership_of_buf(abd_t *, boolean_t);
-void abd_release_ownership_of_buf(abd_t *);
-
-/*
- * ABD operations
- */
-
-int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
-int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
- abd_iter_func2_t *, void *);
-void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
-void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
-void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
-int abd_cmp(abd_t *, abd_t *, size_t);
-int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
-void abd_zero_off(abd_t *, size_t, size_t);
-
-/*
- * Wrappers for calls with offsets of 0
- */
-
-inline void
-abd_copy(abd_t *dabd, abd_t *sabd, size_t size)
-{
- abd_copy_off(dabd, sabd, 0, 0, size);
-}
-
-inline void
-abd_copy_from_buf(abd_t *abd, const void *buf, size_t size)
-{
- abd_copy_from_buf_off(abd, buf, 0, size);
-}
-
-inline void
-abd_copy_to_buf(void* buf, abd_t *abd, size_t size)
-{
- abd_copy_to_buf_off(buf, abd, 0, size);
-}
-
-inline int
-abd_cmp_buf(abd_t *abd, const void *buf, size_t size)
-{
- return (abd_cmp_buf_off(abd, buf, 0, size));
-}
-
-inline void
-abd_zero(abd_t *abd, size_t size)
-{
- abd_zero_off(abd, 0, size);
-}
-
-/*
- * Module lifecycle
- */
-
-void abd_init(void);
-void abd_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ABD_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h
@@ -1,58 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_AGGSUM_H
-#define _SYS_AGGSUM_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct aggsum_bucket {
- kmutex_t asc_lock;
- int64_t asc_delta;
- uint64_t asc_borrowed;
- uint64_t asc_pad[2]; /* pad out to cache line (64 bytes) */
-} aggsum_bucket_t __aligned(CACHE_LINE_SIZE);
-
-/*
- * Fan out over FANOUT cpus.
- */
-typedef struct aggsum {
- kmutex_t as_lock;
- int64_t as_lower_bound;
- int64_t as_upper_bound;
- uint_t as_numbuckets;
- aggsum_bucket_t *as_buckets;
-} aggsum_t;
-
-void aggsum_init(aggsum_t *, uint64_t);
-void aggsum_fini(aggsum_t *);
-int64_t aggsum_lower_bound(aggsum_t *);
-int64_t aggsum_upper_bound(aggsum_t *);
-int aggsum_compare(aggsum_t *, uint64_t);
-uint64_t aggsum_value(aggsum_t *);
-void aggsum_add(aggsum_t *, int64_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_AGGSUM_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -1,290 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- */
-
-#ifndef _SYS_ARC_H
-#define _SYS_ARC_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/zio.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-
-/*
- * Used by arc_flush() to inform arc_evict_state() that it should evict
- * all available buffers from the arc state being passed in.
- */
-#define ARC_EVICT_ALL -1ULL
-
-#define HDR_SET_LSIZE(hdr, x) do { \
- ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
- (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
-_NOTE(CONSTCOND) } while (0)
-
-#define HDR_SET_PSIZE(hdr, x) do { \
- ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
- (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
-_NOTE(CONSTCOND) } while (0)
-
-#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
-#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT)
-
-typedef struct arc_buf_hdr arc_buf_hdr_t;
-typedef struct arc_buf arc_buf_t;
-typedef struct arc_prune arc_prune_t;
-
-/*
- * Because the ARC can store encrypted data, errors (not due to bugs) may arise
- * while transforming data into its desired format - specifically, when
- * decrypting, the key may not be present, or the HMAC may not be correct
- * which signifies deliberate tampering with the on-disk state
- * (assuming that the checksum was correct). If any error occurs, the "buf"
- * parameter will be NULL.
- */
-typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
- const blkptr_t *bp, arc_buf_t *buf, void *priv);
-typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
-typedef void arc_prune_func_t(int64_t bytes, void *priv);
-
-/* Shared module parameters */
-extern uint64_t zfs_arc_average_blocksize;
-
-/* generic arc_done_func_t's which you can use */
-arc_read_done_func_t arc_bcopy_func;
-arc_read_done_func_t arc_getbuf_func;
-
-/* generic arc_prune_func_t wrapper for callbacks */
-struct arc_prune {
- arc_prune_func_t *p_pfunc;
- void *p_private;
- uint64_t p_adjust;
- list_node_t p_node;
- zfs_refcount_t p_refcnt;
-};
-
-typedef enum arc_strategy {
- ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */
- ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */
-} arc_strategy_t;
-
-typedef enum arc_flags
-{
- /*
- * Public flags that can be passed into the ARC by external consumers.
- */
- ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */
- ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */
- ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
- ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
- ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
- ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
- ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */
-
- /*
- * Private ARC flags. These flags are private ARC only flags that
- * will show up in b_flags in the arc_hdr_buf_t. These flags should
- * only be set by ARC code.
- */
- ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
- ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
- ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
- ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */
- /* Indicates that block was read with ASYNC priority. */
- ARC_FLAG_PRIO_ASYNC_READ = 1 << 11,
- ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */
- ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */
- ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */
- /* indicates that the buffer contains metadata (otherwise, data) */
- ARC_FLAG_BUFC_METADATA = 1 << 15,
-
- /* Flags specifying whether optional hdr struct fields are defined */
- ARC_FLAG_HAS_L1HDR = 1 << 16,
- ARC_FLAG_HAS_L2HDR = 1 << 17,
-
- /*
- * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
- * This allows the l2arc to use the blkptr's checksum to verify
- * the data without having to store the checksum in the hdr.
- */
- ARC_FLAG_COMPRESSED_ARC = 1 << 18,
- ARC_FLAG_SHARED_DATA = 1 << 19,
-
- /*
- * The arc buffer's compression mode is stored in the top 7 bits of the
- * flags field, so these dummy flags are included so that MDB can
- * interpret the enum properly.
- */
- ARC_FLAG_COMPRESS_0 = 1 << 24,
- ARC_FLAG_COMPRESS_1 = 1 << 25,
- ARC_FLAG_COMPRESS_2 = 1 << 26,
- ARC_FLAG_COMPRESS_3 = 1 << 27,
- ARC_FLAG_COMPRESS_4 = 1 << 28,
- ARC_FLAG_COMPRESS_5 = 1 << 29,
- ARC_FLAG_COMPRESS_6 = 1 << 30
-
-} arc_flags_t;
-
-typedef enum arc_buf_flags {
- ARC_BUF_FLAG_SHARED = 1 << 0,
- ARC_BUF_FLAG_COMPRESSED = 1 << 1
-} arc_buf_flags_t;
-
-struct arc_buf {
- arc_buf_hdr_t *b_hdr;
- arc_buf_t *b_next;
- kmutex_t b_evict_lock;
- void *b_data;
- arc_buf_flags_t b_flags;
-};
-
-typedef enum arc_buf_contents {
- ARC_BUFC_INVALID, /* invalid type */
- ARC_BUFC_DATA, /* buffer contains data */
- ARC_BUFC_METADATA, /* buffer contains metadata */
- ARC_BUFC_NUMTYPES
-} arc_buf_contents_t;
-
-/*
- * The following breakdows of arc_size exist for kstat only.
- */
-typedef enum arc_space_type {
- ARC_SPACE_DATA,
- ARC_SPACE_META,
- ARC_SPACE_HDRS,
- ARC_SPACE_L2HDRS,
- ARC_SPACE_DBUF,
- ARC_SPACE_DNODE,
- ARC_SPACE_BONUS,
- ARC_SPACE_NUMTYPES
-} arc_space_type_t;
-
-typedef enum arc_state_type {
- ARC_STATE_ANON,
- ARC_STATE_MRU,
- ARC_STATE_MRU_GHOST,
- ARC_STATE_MFU,
- ARC_STATE_MFU_GHOST,
- ARC_STATE_L2C_ONLY,
- ARC_STATE_NUMTYPES
-} arc_state_type_t;
-
-typedef struct arc_buf_info {
- arc_state_type_t abi_state_type;
- arc_buf_contents_t abi_state_contents;
- uint64_t abi_state_index;
- uint32_t abi_flags;
- uint32_t abi_bufcnt;
- uint64_t abi_size;
- uint64_t abi_spa;
- uint64_t abi_access;
- uint32_t abi_mru_hits;
- uint32_t abi_mru_ghost_hits;
- uint32_t abi_mfu_hits;
- uint32_t abi_mfu_ghost_hits;
- uint32_t abi_l2arc_hits;
- uint32_t abi_holds;
- uint64_t abi_l2arc_dattr;
- uint64_t abi_l2arc_asize;
- enum zio_compress abi_l2arc_compress;
-} arc_buf_info_t;
-
-void arc_space_consume(uint64_t space, arc_space_type_t type);
-void arc_space_return(uint64_t space, arc_space_type_t type);
-boolean_t arc_is_metadata(arc_buf_t *buf);
-enum zio_compress arc_get_compression(arc_buf_t *buf);
-int arc_decompress(arc_buf_t *buf);
-arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type,
- int32_t size);
-arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag,
- uint64_t psize, uint64_t lsize, enum zio_compress compression_type);
-arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size);
-arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
- enum zio_compress compression_type);
-void arc_return_buf(arc_buf_t *buf, void *tag);
-void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
-void arc_buf_destroy(arc_buf_t *buf, void *tag);
-void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index);
-int arc_buf_size(arc_buf_t *buf);
-int arc_buf_lsize(arc_buf_t *buf);
-void arc_buf_access(arc_buf_t *buf);
-void arc_release(arc_buf_t *buf, void *tag);
-int arc_released(arc_buf_t *buf);
-void arc_buf_freeze(arc_buf_t *buf);
-void arc_buf_thaw(arc_buf_t *buf);
-#ifdef ZFS_DEBUG
-int arc_referenced(arc_buf_t *buf);
-#endif
-
-int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- arc_read_done_func_t *done, void *priv, zio_priority_t priority,
- int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
- blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
- arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
- arc_write_done_func_t *physdone, arc_write_done_func_t *done,
- void *priv, zio_priority_t priority, int zio_flags,
- const zbookmark_phys_t *zb);
-
-arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
-void arc_remove_prune_callback(arc_prune_t *p);
-void arc_freed(spa_t *spa, const blkptr_t *bp);
-
-void arc_flush(spa_t *spa, boolean_t retry);
-void arc_tempreserve_clear(uint64_t reserve);
-int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
-
-uint64_t arc_max_bytes(void);
-void arc_init(void);
-void arc_fini(void);
-
-/*
- * Level 2 ARC
- */
-
-void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
-void l2arc_remove_vdev(vdev_t *vd);
-boolean_t l2arc_vdev_present(vdev_t *vd);
-void l2arc_init(void);
-void l2arc_fini(void);
-void l2arc_start(void);
-void l2arc_stop(void);
-
-#ifdef illumos
-#ifndef _KERNEL
-extern boolean_t arc_watch;
-extern int arc_procfd;
-#endif
-#endif /* illumos */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ARC_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h
@@ -1,39 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_BLKPTR_H
-#define _SYS_BLKPTR_H
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void encode_embedded_bp_compressed(blkptr_t *, void *,
- enum zio_compress, int, int);
-void decode_embedded_bp_compressed(const blkptr_t *, void *);
-int decode_embedded_bp(const blkptr_t *, void *, int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BLKPTR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
@@ -1,57 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _SYS_BPLIST_H
-#define _SYS_BPLIST_H
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct bplist_entry {
- blkptr_t bpe_blk;
- list_node_t bpe_node;
-} bplist_entry_t;
-
-typedef struct bplist {
- kmutex_t bpl_lock;
- list_t bpl_list;
-} bplist_t;
-
-typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
-
-void bplist_create(bplist_t *bpl);
-void bplist_destroy(bplist_t *bpl);
-void bplist_append(bplist_t *bpl, const blkptr_t *bp);
-void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
- void *arg, dmu_tx_t *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BPLIST_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
@@ -1,95 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_BPOBJ_H
-#define _SYS_BPOBJ_H
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct bpobj_phys {
- /*
- * This is the bonus buffer for the dead lists. The object's
- * contents is an array of bpo_entries blkptr_t's, representing
- * a total of bpo_bytes physical space.
- */
- uint64_t bpo_num_blkptrs;
- uint64_t bpo_bytes;
- uint64_t bpo_comp;
- uint64_t bpo_uncomp;
- uint64_t bpo_subobjs;
- uint64_t bpo_num_subobjs;
-} bpobj_phys_t;
-
-#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
-#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
-
-typedef struct bpobj {
- kmutex_t bpo_lock;
- objset_t *bpo_os;
- uint64_t bpo_object;
- int bpo_epb;
- uint8_t bpo_havecomp;
- uint8_t bpo_havesubobj;
- bpobj_phys_t *bpo_phys;
- dmu_buf_t *bpo_dbuf;
- dmu_buf_t *bpo_cached_dbuf;
-} bpobj_t;
-
-typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
-
-uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
-uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
-void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
-void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
-
-int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
-void bpobj_close(bpobj_t *bpo);
-boolean_t bpobj_is_open(const bpobj_t *bpo);
-
-int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
-int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
-
-void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
-void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
-
-int bpobj_space(bpobj_t *bpo,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-boolean_t bpobj_is_empty(bpobj_t *bpo);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BPOBJ_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
@@ -1,65 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_BPTREE_H
-#define _SYS_BPTREE_H
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct bptree_phys {
- uint64_t bt_begin;
- uint64_t bt_end;
- uint64_t bt_bytes;
- uint64_t bt_comp;
- uint64_t bt_uncomp;
-} bptree_phys_t;
-
-typedef struct bptree_entry_phys {
- blkptr_t be_bp;
- uint64_t be_birth_txg; /* only delete blocks born after this txg */
- zbookmark_phys_t be_zb; /* holds traversal resume point if needed */
-} bptree_entry_phys_t;
-
-typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
-
-uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
-int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
-boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
-
-void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
- uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
-
-int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free,
- bptree_itor_t func, void *arg, dmu_tx_t *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BPTREE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
@@ -1,54 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
- */
-
-#ifndef _BQUEUE_H
-#define _BQUEUE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/zfs_context.h>
-
-typedef struct bqueue {
- list_t bq_list;
- kmutex_t bq_lock;
- kcondvar_t bq_add_cv;
- kcondvar_t bq_pop_cv;
- uint64_t bq_size;
- uint64_t bq_maxsize;
- size_t bq_node_offset;
-} bqueue_t;
-
-typedef struct bqueue_node {
- list_node_t bqn_node;
- uint64_t bqn_size;
-} bqueue_node_t;
-
-
-int bqueue_init(bqueue_t *, uint64_t, size_t);
-void bqueue_destroy(bqueue_t *);
-void bqueue_enqueue(bqueue_t *, void *, uint64_t);
-void *bqueue_dequeue(bqueue_t *);
-boolean_t bqueue_empty(bqueue_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BQUEUE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h
@@ -1,41 +0,0 @@
-// Copyright (c) 2011 Google, Inc.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-
-/*
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_CITYHASH_H
-#define _SYS_CITYHASH_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_CITYHASH_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -1,417 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- */
-
-#ifndef _SYS_DBUF_H
-#define _SYS_DBUF_H
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/arc.h>
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-#include <sys/zrlock.h>
-#include <sys/multilist.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define IN_DMU_SYNC 2
-
-/*
- * define flags for dbuf_read
- */
-
-#define DB_RF_MUST_SUCCEED (1 << 0)
-#define DB_RF_CANFAIL (1 << 1)
-#define DB_RF_HAVESTRUCT (1 << 2)
-#define DB_RF_NOPREFETCH (1 << 3)
-#define DB_RF_NEVERWAIT (1 << 4)
-#define DB_RF_CACHED (1 << 5)
-
-/*
- * The simplified state transition diagram for dbufs looks like:
- *
- * +----> READ ----+
- * | |
- * | V
- * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
- * | ^ ^
- * | | |
- * +----> FILL ----+ |
- * | |
- * | |
- * +--------> NOFILL -------+
- *
- * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
- * to find all dbufs in a range of a dnode and must be less than any other
- * dbuf_states_t (see comment on dn_dbufs in dnode.h).
- */
-typedef enum dbuf_states {
- DB_SEARCH = -1,
- DB_UNCACHED,
- DB_FILL,
- DB_NOFILL,
- DB_READ,
- DB_CACHED,
- DB_EVICTING
-} dbuf_states_t;
-
-typedef enum dbuf_cached_state {
- DB_NO_CACHE = -1,
- DB_DBUF_CACHE,
- DB_DBUF_METADATA_CACHE,
- DB_CACHE_MAX
-} dbuf_cached_state_t;
-
-struct dnode;
-struct dmu_tx;
-
-/*
- * level = 0 means the user data
- * level = 1 means the single indirect block
- * etc.
- */
-
-struct dmu_buf_impl;
-
-typedef enum override_states {
- DR_NOT_OVERRIDDEN,
- DR_IN_DMU_SYNC,
- DR_OVERRIDDEN
-} override_states_t;
-
-typedef struct dbuf_dirty_record {
- /* link on our parents dirty list */
- list_node_t dr_dirty_node;
-
- /* transaction group this data will sync in */
- uint64_t dr_txg;
-
- /* zio of outstanding write IO */
- zio_t *dr_zio;
-
- /* pointer back to our dbuf */
- struct dmu_buf_impl *dr_dbuf;
-
- /* pointer to next dirty record */
- struct dbuf_dirty_record *dr_next;
-
- /* pointer to parent dirty record */
- struct dbuf_dirty_record *dr_parent;
-
- /* How much space was changed to dsl_pool_dirty_space() for this? */
- unsigned int dr_accounted;
-
- /* A copy of the bp that points to us */
- blkptr_t dr_bp_copy;
-
- union dirty_types {
- struct dirty_indirect {
-
- /* protect access to list */
- kmutex_t dr_mtx;
-
- /* Our list of dirty children */
- list_t dr_children;
- } di;
- struct dirty_leaf {
-
- /*
- * dr_data is set when we dirty the buffer
- * so that we can retain the pointer even if it
- * gets COW'd in a subsequent transaction group.
- */
- arc_buf_t *dr_data;
- blkptr_t dr_overridden_by;
- override_states_t dr_override_state;
- uint8_t dr_copies;
- boolean_t dr_nopwrite;
- } dl;
- } dt;
-} dbuf_dirty_record_t;
-
-typedef struct dmu_buf_impl {
- /*
- * The following members are immutable, with the exception of
- * db.db_data, which is protected by db_mtx.
- */
-
- /* the publicly visible structure */
- dmu_buf_t db;
-
- /* the objset we belong to */
- struct objset *db_objset;
-
- /*
- * handle to safely access the dnode we belong to (NULL when evicted)
- */
- struct dnode_handle *db_dnode_handle;
-
- /*
- * our parent buffer; if the dnode points to us directly,
- * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
- * only accessed by sync thread ???
- * (NULL when evicted)
- * May change from NULL to non-NULL under the protection of db_mtx
- * (see dbuf_check_blkptr())
- */
- struct dmu_buf_impl *db_parent;
-
- /*
- * link for hash table of all dmu_buf_impl_t's
- */
- struct dmu_buf_impl *db_hash_next;
-
- /*
- * Our link on the owner dnodes's dn_dbufs list.
- * Protected by its dn_dbufs_mtx. Should be on the same cache line
- * as db_level and db_blkid for the best avl_add() performance.
- */
- avl_node_t db_link;
-
- /* our block number */
- uint64_t db_blkid;
-
- /*
- * Pointer to the blkptr_t which points to us. May be NULL if we
- * don't have one yet. (NULL when evicted)
- */
- blkptr_t *db_blkptr;
-
- /*
- * Our indirection level. Data buffers have db_level==0.
- * Indirect buffers which point to data buffers have
- * db_level==1. etc. Buffers which contain dnodes have
- * db_level==0, since the dnodes are stored in a file.
- */
- uint8_t db_level;
-
- /* db_mtx protects the members below */
- kmutex_t db_mtx;
-
- /*
- * Current state of the buffer
- */
- dbuf_states_t db_state;
-
- /*
- * Refcount accessed by dmu_buf_{hold,rele}.
- * If nonzero, the buffer can't be destroyed.
- * Protected by db_mtx.
- */
- zfs_refcount_t db_holds;
-
- /* buffer holding our data */
- arc_buf_t *db_buf;
-
- kcondvar_t db_changed;
- dbuf_dirty_record_t *db_data_pending;
-
- /* pointer to most recent dirty record for this buffer */
- dbuf_dirty_record_t *db_last_dirty;
-
- /* Link in dbuf_cache or dbuf_metadata_cache */
- multilist_node_t db_cache_link;
-
- /* Tells us which dbuf cache this dbuf is in, if any */
- dbuf_cached_state_t db_caching_status;
-
- /* Data which is unique to data (leaf) blocks: */
-
- /* User callback information. */
- dmu_buf_user_t *db_user;
-
- /*
- * Evict user data as soon as the dirty and reference
- * counts are equal.
- */
- uint8_t db_user_immediate_evict;
-
- /*
- * This block was freed while a read or write was
- * active.
- */
- uint8_t db_freed_in_flight;
-
- /*
- * dnode_evict_dbufs() or dnode_evict_bonus() tried to
- * evict this dbuf, but couldn't due to outstanding
- * references. Evict once the refcount drops to 0.
- */
- uint8_t db_pending_evict;
-
- uint8_t db_dirtycnt;
-} dmu_buf_impl_t;
-
-/* Note: the dbuf hash table is exposed only for the mdb module */
-#define DBUF_MUTEXES 256
-#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
-typedef struct dbuf_hash_table {
- uint64_t hash_table_mask;
- dmu_buf_impl_t **hash_table;
- kmutex_t hash_mutexes[DBUF_MUTEXES];
-} dbuf_hash_table_t;
-
-uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
-
-dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
-void dbuf_create_bonus(struct dnode *dn);
-int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
-void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
-
-void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
-
-dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
-dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
- void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
- boolean_t fail_sparse, boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp);
-
-void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
- zio_priority_t prio, arc_flags_t aflags);
-
-void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
-boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
- uint64_t blkid, void *tag);
-uint64_t dbuf_refcount(dmu_buf_impl_t *db);
-
-void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
-
-dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
- uint64_t blkid);
-
-int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
-void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
-void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
-dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
-void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
- bp_embedded_type_t etype, enum zio_compress comp,
- int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
-
-void dbuf_destroy(dmu_buf_impl_t *db);
-
-void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
-void dbuf_release_bp(dmu_buf_impl_t *db);
-
-boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
-
-void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
- struct dmu_tx *);
-
-void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
-
-void dbuf_stats_init(dbuf_hash_table_t *hash);
-void dbuf_stats_destroy(void);
-
-#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
-#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
-#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
-#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
-#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
-
-void dbuf_init(void);
-void dbuf_fini(void);
-
-boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
-
-#define DBUF_GET_BUFC_TYPE(_db) \
- (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
-
-#define DBUF_IS_CACHEABLE(_db) \
- ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
- (dbuf_is_metadata(_db) && \
- ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
-
-#define DBUF_IS_L2CACHEABLE(_db) \
- ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
- (dbuf_is_metadata(_db) && \
- ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
-
-#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \
- ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \
- (((_level) > 0 || \
- DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \
- ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
-
-#ifdef ZFS_DEBUG
-
-/*
- * There should be a ## between the string literal and fmt, to make it
- * clear that we're joining two strings together, but gcc does not
- * support that preprocessor token.
- */
-#define dprintf_dbuf(dbuf, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char __db_buf[32]; \
- uint64_t __db_obj = (dbuf)->db.db_object; \
- if (__db_obj == DMU_META_DNODE_OBJECT) \
- (void) strcpy(__db_buf, "mdn"); \
- else \
- (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
- (u_longlong_t)__db_obj); \
- dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
- "obj=%s lvl=%u blkid=%lld " fmt, \
- __db_buf, (dbuf)->db_level, \
- (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
- dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
- kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define DBUF_VERIFY(db) dbuf_verify(db)
-
-#else
-
-#define dprintf_dbuf(db, fmt, ...)
-#define dprintf_dbuf_bp(db, bp, fmt, ...)
-#define DBUF_VERIFY(db)
-
-#endif
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DBUF_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
@@ -1,248 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DDT_H
-#define _SYS_DDT_H
-
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct abd;
-
-/*
- * On-disk DDT formats, in the desired search order (newest version first).
- */
-enum ddt_type {
- DDT_TYPE_ZAP = 0,
- DDT_TYPES
-};
-
-/*
- * DDT classes, in the desired search order (highest replication level first).
- */
-enum ddt_class {
- DDT_CLASS_DITTO = 0,
- DDT_CLASS_DUPLICATE,
- DDT_CLASS_UNIQUE,
- DDT_CLASSES
-};
-
-#define DDT_TYPE_CURRENT 0
-
-#define DDT_COMPRESS_BYTEORDER_MASK 0x80
-#define DDT_COMPRESS_FUNCTION_MASK 0x7f
-
-/*
- * On-disk ddt entry: key (name) and physical storage (value).
- */
-typedef struct ddt_key {
- zio_cksum_t ddk_cksum; /* 256-bit block checksum */
- /*
- * Encoded with logical & physical size, and compression, as follows:
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * | 0 | 0 | 0 | comp | PSIZE | LSIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- */
- uint64_t ddk_prop;
-} ddt_key_t;
-
-#define DDK_GET_LSIZE(ddk) \
- BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-#define DDK_SET_LSIZE(ddk, x) \
- BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
-
-#define DDK_GET_PSIZE(ddk) \
- BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
-#define DDK_SET_PSIZE(ddk, x) \
- BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-
-#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8)
-#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x)
-
-#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))
-
-typedef struct ddt_phys {
- dva_t ddp_dva[SPA_DVAS_PER_BP];
- uint64_t ddp_refcnt;
- uint64_t ddp_phys_birth;
-} ddt_phys_t;
-
-enum ddt_phys_type {
- DDT_PHYS_DITTO = 0,
- DDT_PHYS_SINGLE = 1,
- DDT_PHYS_DOUBLE = 2,
- DDT_PHYS_TRIPLE = 3,
- DDT_PHYS_TYPES
-};
-
-/*
- * In-core ddt entry
- */
-struct ddt_entry {
- ddt_key_t dde_key;
- ddt_phys_t dde_phys[DDT_PHYS_TYPES];
- zio_t *dde_lead_zio[DDT_PHYS_TYPES];
- struct abd *dde_repair_abd;
- enum ddt_type dde_type;
- enum ddt_class dde_class;
- uint8_t dde_loading;
- uint8_t dde_loaded;
- kcondvar_t dde_cv;
- avl_node_t dde_node;
-};
-
-/*
- * In-core ddt
- */
-struct ddt {
- kmutex_t ddt_lock;
- avl_tree_t ddt_tree;
- avl_tree_t ddt_repair_tree;
- enum zio_checksum ddt_checksum;
- spa_t *ddt_spa;
- objset_t *ddt_os;
- uint64_t ddt_stat_object;
- uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
- ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
- ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
- ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
- avl_node_t ddt_node;
-};
-
-/*
- * In-core and on-disk bookmark for DDT walks
- */
-typedef struct ddt_bookmark {
- uint64_t ddb_class;
- uint64_t ddb_type;
- uint64_t ddb_checksum;
- uint64_t ddb_cursor;
-} ddt_bookmark_t;
-
-/*
- * Ops vector to access a specific DDT object type.
- */
-typedef struct ddt_ops {
- char ddt_op_name[32];
- int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
- boolean_t prehash);
- int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
- int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
- void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
- ddt_entry_t *dde);
- int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
- dmu_tx_t *tx);
- int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
- dmu_tx_t *tx);
- int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
- uint64_t *walk);
- int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
-} ddt_ops_t;
-
-#define DDT_NAMELEN 80
-
-extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
- enum ddt_class cls, char *name);
-extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
- enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde);
-extern int ddt_object_count(ddt_t *ddt, enum ddt_type type,
- enum ddt_class cls, uint64_t *count);
-extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
- enum ddt_class cls, dmu_object_info_t *);
-extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
- enum ddt_class cls);
-
-extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
- uint64_t txg);
-extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
- const ddt_phys_t *ddp, blkptr_t *bp);
-
-extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
-
-extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
-extern void ddt_phys_clear(ddt_phys_t *ddp);
-extern void ddt_phys_addref(ddt_phys_t *ddp);
-extern void ddt_phys_decref(ddt_phys_t *ddp);
-extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
- uint64_t txg);
-extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
-extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
-
-extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
-
-extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
-extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
-extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
-extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
-extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
-extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
-
-extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
-extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
-
-extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
- ddt_phys_t *ddp_willref);
-extern int ddt_ditto_copies_present(ddt_entry_t *dde);
-
-extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
-extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
-
-extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
-extern void ddt_enter(ddt_t *ddt);
-extern void ddt_exit(ddt_t *ddt);
-extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
-extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
-extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
-
-extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
- const blkptr_t *bp);
-
-extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
-extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
-
-extern int ddt_entry_compare(const void *x1, const void *x2);
-
-extern void ddt_create(spa_t *spa);
-extern int ddt_load(spa_t *spa);
-extern void ddt_unload(spa_t *spa);
-extern void ddt_sync(spa_t *spa, uint64_t txg);
-extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
-extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
- enum ddt_class cls, ddt_entry_t *dde, dmu_tx_t *tx);
-
-extern const ddt_ops_t ddt_zap_ops;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DDT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -1,1028 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright 2013 DEY Storage Systems, Inc.
- * Copyright 2014 HybridCluster. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#ifndef _SYS_DMU_H
-#define _SYS_DMU_H
-
-/*
- * This file describes the interface that the DMU provides for its
- * consumers.
- *
- * The DMU also interacts with the SPA. That interface is described in
- * dmu_spa.h.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/cred.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio_compress.h>
-#include <sys/zio_priority.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct uio;
-struct xuio;
-struct page;
-struct vnode;
-struct spa;
-struct zilog;
-struct zio;
-struct blkptr;
-struct zap_cursor;
-struct dsl_dataset;
-struct dsl_pool;
-struct dnode;
-struct drr_begin;
-struct drr_end;
-struct zbookmark_phys;
-struct spa;
-struct nvlist;
-struct arc_buf;
-struct zio_prop;
-struct sa_handle;
-struct file;
-struct locked_range;
-
-typedef struct objset objset_t;
-typedef struct dmu_tx dmu_tx_t;
-typedef struct dsl_dir dsl_dir_t;
-typedef struct dnode dnode_t;
-
-typedef enum dmu_object_byteswap {
- DMU_BSWAP_UINT8,
- DMU_BSWAP_UINT16,
- DMU_BSWAP_UINT32,
- DMU_BSWAP_UINT64,
- DMU_BSWAP_ZAP,
- DMU_BSWAP_DNODE,
- DMU_BSWAP_OBJSET,
- DMU_BSWAP_ZNODE,
- DMU_BSWAP_OLDACL,
- DMU_BSWAP_ACL,
- /*
- * Allocating a new byteswap type number makes the on-disk format
- * incompatible with any other format that uses the same number.
- *
- * Data can usually be structured to work with one of the
- * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
- */
- DMU_BSWAP_NUMFUNCS
-} dmu_object_byteswap_t;
-
-#define DMU_OT_NEWTYPE 0x80
-#define DMU_OT_METADATA 0x40
-#define DMU_OT_BYTESWAP_MASK 0x3f
-
-/*
- * Defines a uint8_t object type. Object types specify if the data
- * in the object is metadata (boolean) and how to byteswap the data
- * (dmu_object_byteswap_t). All of the types created by this method
- * are cached in the dbuf metadata cache.
- */
-#define DMU_OT(byteswap, metadata) \
- (DMU_OT_NEWTYPE | \
- ((metadata) ? DMU_OT_METADATA : 0) | \
- ((byteswap) & DMU_OT_BYTESWAP_MASK))
-
-#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
- ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
- (ot) < DMU_OT_NUMTYPES)
-
-#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
- ((ot) & DMU_OT_METADATA) : \
- dmu_ot[(ot)].ot_metadata)
-
-#define DMU_OT_IS_DDT(ot) \
- ((ot) == DMU_OT_DDT_ZAP)
-
-#define DMU_OT_IS_ZIL(ot) \
- ((ot) == DMU_OT_INTENT_LOG)
-
-/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
-#define DMU_OT_IS_FILE(ot) \
- ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
-
-#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
- B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
-
-/*
- * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
- * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
- * is repurposed for embedded BPs.
- */
-#define DMU_OT_HAS_FILL(ot) \
- ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
-
-#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
- ((ot) & DMU_OT_BYTESWAP_MASK) : \
- dmu_ot[(ot)].ot_byteswap)
-
-typedef enum dmu_object_type {
- DMU_OT_NONE,
- /* general: */
- DMU_OT_OBJECT_DIRECTORY, /* ZAP */
- DMU_OT_OBJECT_ARRAY, /* UINT64 */
- DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
- DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
- DMU_OT_BPOBJ, /* UINT64 */
- DMU_OT_BPOBJ_HDR, /* UINT64 */
- /* spa: */
- DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
- DMU_OT_SPACE_MAP, /* UINT64 */
- /* zil: */
- DMU_OT_INTENT_LOG, /* UINT64 */
- /* dmu: */
- DMU_OT_DNODE, /* DNODE */
- DMU_OT_OBJSET, /* OBJSET */
- /* dsl: */
- DMU_OT_DSL_DIR, /* UINT64 */
- DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
- DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
- DMU_OT_DSL_PROPS, /* ZAP */
- DMU_OT_DSL_DATASET, /* UINT64 */
- /* zpl: */
- DMU_OT_ZNODE, /* ZNODE */
- DMU_OT_OLDACL, /* Old ACL */
- DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
- DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
- DMU_OT_MASTER_NODE, /* ZAP */
- DMU_OT_UNLINKED_SET, /* ZAP */
- /* zvol: */
- DMU_OT_ZVOL, /* UINT8 */
- DMU_OT_ZVOL_PROP, /* ZAP */
- /* other; for testing only! */
- DMU_OT_PLAIN_OTHER, /* UINT8 */
- DMU_OT_UINT64_OTHER, /* UINT64 */
- DMU_OT_ZAP_OTHER, /* ZAP */
- /* new object types: */
- DMU_OT_ERROR_LOG, /* ZAP */
- DMU_OT_SPA_HISTORY, /* UINT8 */
- DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
- DMU_OT_POOL_PROPS, /* ZAP */
- DMU_OT_DSL_PERMS, /* ZAP */
- DMU_OT_ACL, /* ACL */
- DMU_OT_SYSACL, /* SYSACL */
- DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
- DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
- DMU_OT_NEXT_CLONES, /* ZAP */
- DMU_OT_SCAN_QUEUE, /* ZAP */
- DMU_OT_USERGROUP_USED, /* ZAP */
- DMU_OT_USERGROUP_QUOTA, /* ZAP */
- DMU_OT_USERREFS, /* ZAP */
- DMU_OT_DDT_ZAP, /* ZAP */
- DMU_OT_DDT_STATS, /* ZAP */
- DMU_OT_SA, /* System attr */
- DMU_OT_SA_MASTER_NODE, /* ZAP */
- DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
- DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
- DMU_OT_SCAN_XLATE, /* ZAP */
- DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
- DMU_OT_DEADLIST, /* ZAP */
- DMU_OT_DEADLIST_HDR, /* UINT64 */
- DMU_OT_DSL_CLONES, /* ZAP */
- DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
- /*
- * Do not allocate new object types here. Doing so makes the on-disk
- * format incompatible with any other format that uses the same object
- * type number.
- *
- * When creating an object which does not have one of the above types
- * use the DMU_OTN_* type with the correct byteswap and metadata
- * values.
- *
- * The DMU_OTN_* types do not have entries in the dmu_ot table,
- * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
- * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
- * of indexing into dmu_ot directly (this works for both DMU_OT_* types
- * and DMU_OTN_* types).
- */
- DMU_OT_NUMTYPES,
-
- /*
- * Names for valid types declared with DMU_OT().
- */
- DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
- DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
- DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
- DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
- DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
- DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
- DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
- DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
- DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
- DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
-} dmu_object_type_t;
-
-/*
- * These flags are intended to be used to specify the "txg_how"
- * parameter when calling the dmu_tx_assign() function. See the comment
- * above dmu_tx_assign() for more details on the meaning of these flags.
- */
-#define TXG_NOWAIT (0ULL)
-#define TXG_WAIT (1ULL<<0)
-#define TXG_NOTHROTTLE (1ULL<<1)
-
-void byteswap_uint64_array(void *buf, size_t size);
-void byteswap_uint32_array(void *buf, size_t size);
-void byteswap_uint16_array(void *buf, size_t size);
-void byteswap_uint8_array(void *buf, size_t size);
-void zap_byteswap(void *buf, size_t size);
-void zfs_oldacl_byteswap(void *buf, size_t size);
-void zfs_acl_byteswap(void *buf, size_t size);
-void zfs_znode_byteswap(void *buf, size_t size);
-
-#define DS_FIND_SNAPSHOTS (1<<0)
-#define DS_FIND_CHILDREN (1<<1)
-#define DS_FIND_SERIALIZE (1<<2)
-
-/*
- * The maximum number of bytes that can be accessed as part of one
- * operation, including metadata.
- */
-#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
-#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
-
-#define DMU_USERUSED_OBJECT (-1ULL)
-#define DMU_GROUPUSED_OBJECT (-2ULL)
-
-/*
- * artificial blkids for bonus buffer and spill blocks
- */
-#define DMU_BONUS_BLKID (-1ULL)
-#define DMU_SPILL_BLKID (-2ULL)
-/*
- * Public routines to create, destroy, open, and close objsets.
- */
-int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
-int dmu_objset_own(const char *name, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp);
-void dmu_objset_rele(objset_t *os, void *tag);
-void dmu_objset_disown(objset_t *os, void *tag);
-int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
-
-void dmu_objset_evict_dbufs(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
- void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
- struct nvlist *snaps);
-int dmu_objset_clone(const char *name, const char *origin);
-int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
- struct nvlist *errlist);
-int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
-int dmu_objset_snapshot_tmp(const char *, const char *, int);
-int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
- int flags);
-void dmu_objset_byteswap(void *buf, size_t size);
-int dsl_dataset_rename_snapshot(const char *fsname,
- const char *oldsnapname, const char *newsnapname, boolean_t recursive);
-int dmu_objset_remap_indirects(const char *fsname);
-
-typedef struct dmu_buf {
- uint64_t db_object; /* object that this buffer is part of */
- uint64_t db_offset; /* byte offset in this object */
- uint64_t db_size; /* size of buffer in bytes */
- void *db_data; /* data in buffer */
-} dmu_buf_t;
-
-/*
- * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
- */
-#define DMU_POOL_DIRECTORY_OBJECT 1
-#define DMU_POOL_CONFIG "config"
-#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
-#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
-#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
-#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
-#define DMU_POOL_ROOT_DATASET "root_dataset"
-#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
-#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
-#define DMU_POOL_ERRLOG_LAST "errlog_last"
-#define DMU_POOL_SPARES "spares"
-#define DMU_POOL_DEFLATE "deflate"
-#define DMU_POOL_HISTORY "history"
-#define DMU_POOL_PROPS "pool_props"
-#define DMU_POOL_L2CACHE "l2cache"
-#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
-#define DMU_POOL_DDT "DDT-%s-%s-%s"
-#define DMU_POOL_DDT_STATS "DDT-statistics"
-#define DMU_POOL_CREATION_VERSION "creation_version"
-#define DMU_POOL_SCAN "scan"
-#define DMU_POOL_FREE_BPOBJ "free_bpobj"
-#define DMU_POOL_BPTREE_OBJ "bptree_obj"
-#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
-#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
-#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map"
-#define DMU_POOL_REMOVING "com.delphix:removing"
-#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
-#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
-#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
-
-/*
- * Allocate an object from this objset. The range of object numbers
- * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
- *
- * The transaction must be assigned to a txg. The newly allocated
- * object will be "held" in the transaction (ie. you can modify the
- * newly allocated object in this transaction).
- *
- * dmu_object_alloc() chooses an object and returns it in *objectp.
- *
- * dmu_object_claim() allocates a specific object number. If that
- * number is already allocated, it fails and returns EEXIST.
- *
- * Return 0 on success, or ENOSPC or EEXIST as specified above.
- */
-uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
-uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
- int indirect_blockshift,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len,
- int dnodesize, dmu_tx_t *tx);
-int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len,
- int dnodesize, dmu_tx_t *tx);
-int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
- dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
- int bonuslen, int dnodesize, dmu_tx_t *txp);
-int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
-int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
-
-/*
- * Free an object from this objset.
- *
- * The object's data will be freed as well (ie. you don't need to call
- * dmu_free(object, 0, -1, tx)).
- *
- * The object need not be held in the transaction.
- *
- * If there are any holds on this object's buffers (via dmu_buf_hold()),
- * or tx holds on the object (via dmu_tx_hold_object()), you can not
- * free it; it fails and returns EBUSY.
- *
- * If the object is not allocated, it fails and returns ENOENT.
- *
- * Return 0 on success, or EBUSY or ENOENT as specified above.
- */
-int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
-
-/*
- * Find the next allocated or free object.
- *
- * The objectp parameter is in-out. It will be updated to be the next
- * object which is allocated. Ignore objects which have not been
- * modified since txg.
- *
- * XXX Can only be called on a objset with no dirty data.
- *
- * Returns 0 on success, or ENOENT if there are no more objects.
- */
-int dmu_object_next(objset_t *os, uint64_t *objectp,
- boolean_t hole, uint64_t txg);
-
-/*
- * Set the data blocksize for an object.
- *
- * The object cannot have any blocks allcated beyond the first. If
- * the first block is allocated already, the new size must be greater
- * than the current block size. If these conditions are not met,
- * ENOTSUP will be returned.
- *
- * Returns 0 on success, or EBUSY if there are any holds on the object
- * contents, or ENOTSUP as described above.
- */
-int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
- int ibs, dmu_tx_t *tx);
-
-/*
- * Set the checksum property on a dnode. The new checksum algorithm will
- * apply to all newly written blocks; existing blocks will not be affected.
- */
-void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx);
-
-/*
- * Set the compress property on a dnode. The new compression algorithm will
- * apply to all newly written blocks; existing blocks will not be affected.
- */
-void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx);
-
-int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);
-
-void
-dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
- void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
- int compressed_size, int byteorder, dmu_tx_t *tx);
-
-/*
- * Decide how to write a block: checksum, compression, number of copies, etc.
- */
-#define WP_NOFILL 0x1
-#define WP_DMU_SYNC 0x2
-#define WP_SPILL 0x4
-
-void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
- struct zio_prop *zp);
-/*
- * The bonus data is accessed more or less like a regular buffer.
- * You must dmu_bonus_hold() to get the buffer, which will give you a
- * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
- * data. As with any normal buffer, you must call dmu_buf_will_dirty()
- * before modifying it, and the
- * object must be held in an assigned transaction before calling
- * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
- * buffer as well. You must release your hold with dmu_buf_rele().
- *
- * Returns ENOENT, EIO, or 0.
- */
-int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
-int dmu_bonus_max(void);
-int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
-int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
-dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
-int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
-
-/*
- * Special spill buffer support used by "SA" framework
- */
-
-int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
-int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
- void *tag, dmu_buf_t **dbp);
-int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
-
-/*
- * Obtain the DMU buffer from the specified object which contains the
- * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
- * that it will remain in memory. You must release the hold with
- * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your
- * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
- *
- * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
- * on the returned buffer before reading or writing the buffer's
- * db_data. The comments for those routines describe what particular
- * operations are valid after calling them.
- *
- * The object number must be a valid, allocated object number.
- */
-int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **, int flags);
-int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags);
-
-/*
- * Add a reference to a dmu buffer that has already been held via
- * dmu_buf_hold() in the current context.
- */
-void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
-
-/*
- * Attempt to add a reference to a dmu buffer that is in an unknown state,
- * using a pointer that may have been invalidated by eviction processing.
- * The request will succeed if the passed in dbuf still represents the
- * same os/object/blkid, is ineligible for eviction, and has at least
- * one hold by a user other than the syncer.
- */
-boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
- uint64_t blkid, void *tag);
-
-void dmu_buf_rele(dmu_buf_t *db, void *tag);
-uint64_t dmu_buf_refcount(dmu_buf_t *db);
-
-/*
- * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
- * range of an object. A pointer to an array of dmu_buf_t*'s is
- * returned (in *dbpp).
- *
- * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
- * frees the array. The hold on the array of buffers MUST be released
- * with dmu_buf_rele_array. You can NOT release the hold on each buffer
- * individually with dmu_buf_rele.
- */
-int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
- uint64_t length, boolean_t read, void *tag,
- int *numbufsp, dmu_buf_t ***dbpp);
-int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
- boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp,
- uint32_t flags);
-void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
-
-typedef void dmu_buf_evict_func_t(void *user_ptr);
-
-/*
- * A DMU buffer user object may be associated with a dbuf for the
- * duration of its lifetime. This allows the user of a dbuf (client)
- * to attach private data to a dbuf (e.g. in-core only data such as a
- * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
- * when that dbuf has been evicted. Clients typically respond to the
- * eviction notification by freeing their private data, thus ensuring
- * the same lifetime for both dbuf and private data.
- *
- * The mapping from a dmu_buf_user_t to any client private data is the
- * client's responsibility. All current consumers of the API with private
- * data embed a dmu_buf_user_t as the first member of the structure for
- * their private data. This allows conversions between the two types
- * with a simple cast. Since the DMU buf user API never needs access
- * to the private data, other strategies can be employed if necessary
- * or convenient for the client (e.g. using container_of() to do the
- * conversion for private data that cannot have the dmu_buf_user_t as
- * its first member).
- *
- * Eviction callbacks are executed without the dbuf mutex held or any
- * other type of mechanism to guarantee that the dbuf is still available.
- * For this reason, users must assume the dbuf has already been freed
- * and not reference the dbuf from the callback context.
- *
- * Users requesting "immediate eviction" are notified as soon as the dbuf
- * is only referenced by dirty records (dirties == holds). Otherwise the
- * notification occurs after eviction processing for the dbuf begins.
- */
-typedef struct dmu_buf_user {
- /*
- * Asynchronous user eviction callback state.
- */
- taskq_ent_t dbu_tqent;
-
- /*
- * This instance's eviction function pointers.
- *
- * dbu_evict_func_sync is called synchronously and then
- * dbu_evict_func_async is executed asynchronously on a taskq.
- */
- dmu_buf_evict_func_t *dbu_evict_func_sync;
- dmu_buf_evict_func_t *dbu_evict_func_async;
-#ifdef ZFS_DEBUG
- /*
- * Pointer to user's dbuf pointer. NULL for clients that do
- * not associate a dbuf with their user data.
- *
- * The dbuf pointer is cleared upon eviction so as to catch
- * use-after-evict bugs in clients.
- */
- dmu_buf_t **dbu_clear_on_evict_dbufp;
-#endif
-} dmu_buf_user_t;
-
-/*
- * Initialize the given dmu_buf_user_t instance with the eviction function
- * evict_func, to be called when the user is evicted.
- *
- * NOTE: This function should only be called once on a given dmu_buf_user_t.
- * To allow enforcement of this, dbu must already be zeroed on entry.
- */
-/*ARGSUSED*/
-static inline void
-dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
- dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
-{
- ASSERT(dbu->dbu_evict_func_sync == NULL);
- ASSERT(dbu->dbu_evict_func_async == NULL);
-
- /* must have at least one evict func */
- IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
- dbu->dbu_evict_func_sync = evict_func_sync;
- dbu->dbu_evict_func_async = evict_func_async;
-#ifdef ZFS_DEBUG
- dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
-#endif
-}
-
-/*
- * Attach user data to a dbuf and mark it for normal (when the dbuf's
- * data is cleared or its reference count goes to zero) eviction processing.
- *
- * Returns NULL on success, or the existing user if another user currently
- * owns the buffer.
- */
-void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
-
-/*
- * Attach user data to a dbuf and mark it for immediate (its dirty and
- * reference counts are equal) eviction processing.
- *
- * Returns NULL on success, or the existing user if another user currently
- * owns the buffer.
- */
-void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
-
-/*
- * Replace the current user of a dbuf.
- *
- * If given the current user of a dbuf, replaces the dbuf's user with
- * "new_user" and returns the user data pointer that was replaced.
- * Otherwise returns the current, and unmodified, dbuf user pointer.
- */
-void *dmu_buf_replace_user(dmu_buf_t *db,
- dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
-
-/*
- * Remove the specified user data for a DMU buffer.
- *
- * Returns the user that was removed on success, or the current user if
- * another user currently owns the buffer.
- */
-void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
-
-/*
- * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
- */
-void *dmu_buf_get_user(dmu_buf_t *db);
-
-objset_t *dmu_buf_get_objset(dmu_buf_t *db);
-dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
-void dmu_buf_dnode_exit(dmu_buf_t *db);
-
-/* Block until any in-progress dmu buf user evictions complete. */
-void dmu_buf_user_evict_wait(void);
-
-/*
- * Returns the blkptr associated with this dbuf, or NULL if not set.
- */
-struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
-
-/*
- * Indicate that you are going to modify the buffer's data (db_data).
- *
- * The transaction (tx) must be assigned to a txg (ie. you've called
- * dmu_tx_assign()). The buffer's object must be held in the tx
- * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
- */
-void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
-
-/*
- * You must create a transaction, then hold the objects which you will
- * (or might) modify as part of this transaction. Then you must assign
- * the transaction to a transaction group. Once the transaction has
- * been assigned, you can modify buffers which belong to held objects as
- * part of this transaction. You can't modify buffers before the
- * transaction has been assigned; you can't modify buffers which don't
- * belong to objects which this transaction holds; you can't hold
- * objects once the transaction has been assigned. You may hold an
- * object which you are going to free (with dmu_object_free()), but you
- * don't have to.
- *
- * You can abort the transaction before it has been assigned.
- *
- * Note that you may hold buffers (with dmu_buf_hold) at any time,
- * regardless of transaction state.
- */
-
-#define DMU_NEW_OBJECT (-1ULL)
-#define DMU_OBJECT_END (-1ULL)
-
-dmu_tx_t *dmu_tx_create(objset_t *os);
-void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
-void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
- int len);
-void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
- uint64_t len);
-void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
- uint64_t len);
-void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object);
-void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
-void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
- const char *name);
-void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
-void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
-void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
-void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
-void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
-void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
-void dmu_tx_wait(dmu_tx_t *tx);
-void dmu_tx_commit(dmu_tx_t *tx);
-void dmu_tx_mark_netfree(dmu_tx_t *tx);
-
-/*
- * To register a commit callback, dmu_tx_callback_register() must be called.
- *
- * dcb_data is a pointer to caller private data that is passed on as a
- * callback parameter. The caller is responsible for properly allocating and
- * freeing it.
- *
- * When registering a callback, the transaction must be already created, but
- * it cannot be committed or aborted. It can be assigned to a txg or not.
- *
- * The callback will be called after the transaction has been safely written
- * to stable storage and will also be called if the dmu_tx is aborted.
- * If there is any error which prevents the transaction from being committed to
- * disk, the callback will be called with a value of error != 0.
- */
-typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
-
-void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
- void *dcb_data);
-
-/*
- * Free up the data blocks for a defined range of a file. If size is
- * -1, the range from offset to end-of-file is freed.
- */
-int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, dmu_tx_t *tx);
-int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size);
-int dmu_free_long_object(objset_t *os, uint64_t object);
-
-/*
- * Convenience functions.
- *
- * Canfail routines will return 0 on success, or an errno if there is a
- * nonrecoverable I/O error.
- */
-#define DMU_READ_PREFETCH 0 /* prefetch */
-#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
-int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf, uint32_t flags);
-int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
- uint32_t flags);
-void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx);
-void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx);
-void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- dmu_tx_t *tx);
-int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
-int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
-int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size);
-int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
- dmu_tx_t *tx);
-int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
- dmu_tx_t *tx);
-int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
- dmu_tx_t *tx);
-#ifdef _KERNEL
-#ifdef illumos
-int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, struct page *pp, dmu_tx_t *tx);
-#else
-int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
-int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
- int *rbehind, int *rahead, int last_size);
-#endif
-#endif
-struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
-void dmu_return_arcbuf(struct arc_buf *buf);
-void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset,
- struct arc_buf *buf, dmu_tx_t *tx);
-void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
- dmu_tx_t *tx);
-int dmu_xuio_init(struct xuio *uio, int niov);
-void dmu_xuio_fini(struct xuio *uio);
-int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
- size_t n);
-int dmu_xuio_cnt(struct xuio *uio);
-struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
-void dmu_xuio_clear(struct xuio *uio, int i);
-void xuio_stat_wbuf_copied(void);
-void xuio_stat_wbuf_nocopy(void);
-
-extern boolean_t zfs_prefetch_disable;
-extern int zfs_max_recordsize;
-
-/*
- * Asynchronously try to read in the data.
- */
-void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
- uint64_t len, enum zio_priority pri);
-
-typedef struct dmu_object_info {
- /* All sizes are in bytes unless otherwise indicated. */
- uint32_t doi_data_block_size;
- uint32_t doi_metadata_block_size;
- dmu_object_type_t doi_type;
- dmu_object_type_t doi_bonus_type;
- uint64_t doi_bonus_size;
- uint8_t doi_indirection; /* 2 = dnode->indirect->data */
- uint8_t doi_checksum;
- uint8_t doi_compress;
- uint8_t doi_nblkptr;
- int8_t doi_pad[4];
- uint64_t doi_dnodesize;
- uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
- uint64_t doi_max_offset;
- uint64_t doi_fill_count; /* number of non-empty blocks */
-} dmu_object_info_t;
-
-typedef void arc_byteswap_func_t(void *buf, size_t size);
-
-typedef struct dmu_object_type_info {
- dmu_object_byteswap_t ot_byteswap;
- boolean_t ot_metadata;
- boolean_t ot_dbuf_metadata_cache;
- char *ot_name;
-} dmu_object_type_info_t;
-
-typedef struct dmu_object_byteswap_info {
- arc_byteswap_func_t *ob_func;
- char *ob_name;
-} dmu_object_byteswap_info_t;
-
-extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
-extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
-
-/*
- * Get information on a DMU object.
- *
- * Return 0 on success or ENOENT if object is not allocated.
- *
- * If doi is NULL, just indicates whether the object exists.
- */
-int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
-void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
-/* Like dmu_object_info, but faster if you have a held dnode in hand. */
-void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
-/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
-void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
-/*
- * Like dmu_object_info_from_db, but faster still when you only care about
- * the size. This is specifically optimized for zfs_getattr().
- */
-void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
- u_longlong_t *nblk512);
-
-void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize);
-
-typedef struct dmu_objset_stats {
- uint64_t dds_num_clones; /* number of clones of this */
- uint64_t dds_creation_txg;
- uint64_t dds_guid;
- dmu_objset_type_t dds_type;
- uint8_t dds_is_snapshot;
- uint8_t dds_inconsistent;
- char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
-} dmu_objset_stats_t;
-
-/*
- * Get stats on a dataset.
- */
-void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
-
-/*
- * Add entries to the nvlist for all the objset's properties. See
- * zfs_prop_table[] and zfs(1m) for details on the properties.
- */
-void dmu_objset_stats(objset_t *os, struct nvlist *nv);
-
-/*
- * Get the space usage statistics for statvfs().
- *
- * refdbytes is the amount of space "referenced" by this objset.
- * availbytes is the amount of space available to this objset, taking
- * into account quotas & reservations, assuming that no other objsets
- * use the space first. These values correspond to the 'referenced' and
- * 'available' properties, described in the zfs(1m) manpage.
- *
- * usedobjs and availobjs are the number of objects currently allocated,
- * and available.
- */
-void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-
-/*
- * The fsid_guid is a 56-bit ID that can change to avoid collisions.
- * (Contrast with the ds_guid which is a 64-bit ID that will never
- * change, so there is a small probability that it will collide.)
- */
-uint64_t dmu_objset_fsid_guid(objset_t *os);
-
-/*
- * Get the [cm]time for an objset's snapshot dir
- */
-timestruc_t dmu_objset_snap_cmtime(objset_t *os);
-
-int dmu_objset_is_snapshot(objset_t *os);
-
-extern struct spa *dmu_objset_spa(objset_t *os);
-extern struct zilog *dmu_objset_zil(objset_t *os);
-extern struct dsl_pool *dmu_objset_pool(objset_t *os);
-extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
-extern void dmu_objset_name(objset_t *os, char *buf);
-extern dmu_objset_type_t dmu_objset_type(objset_t *os);
-extern uint64_t dmu_objset_id(objset_t *os);
-extern uint64_t dmu_objset_dnodesize(objset_t *os);
-extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
-extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
-extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
- uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
-extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
- int maxlen, boolean_t *conflict);
-extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp);
-
-typedef int objset_used_cb_t(dmu_object_type_t bonustype,
- void *bonus, uint64_t *userp, uint64_t *groupp);
-extern void dmu_objset_register_type(dmu_objset_type_t ost,
- objset_used_cb_t *cb);
-extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
-extern void *dmu_objset_get_user(objset_t *os);
-
-/*
- * Return the txg number for the given assigned transaction.
- */
-uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
-
-/*
- * Synchronous write.
- * If a parent zio is provided this function initiates a write on the
- * provided buffer as a child of the parent zio.
- * In the absence of a parent zio, the write is completed synchronously.
- * At write completion, blk is filled with the bp of the written block.
- * Note that while the data covered by this function will be on stable
- * storage when the write completes this new data does not become a
- * permanent part of the file until the associated transaction commits.
- */
-
-/*
- * {zfs,zvol,ztest}_get_done() args
- */
-typedef struct zgd {
- struct lwb *zgd_lwb;
- struct blkptr *zgd_bp;
- dmu_buf_t *zgd_db;
- struct locked_range *zgd_lr;
- void *zgd_private;
-} zgd_t;
-
-typedef void dmu_sync_cb_t(zgd_t *arg, int error);
-int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
-
-/*
- * Find the next hole or data block in file starting at *off
- * Return found offset in *off. Return ESRCH for end of file.
- */
-int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
- uint64_t *off);
-
-/*
- * Check if a DMU object has any dirty blocks. If so, sync out
- * all pending transaction groups. Otherwise, this function
- * does not alter DMU state. This could be improved to only sync
- * out the necessary transaction groups for this particular
- * object.
- */
-int dmu_object_wait_synced(objset_t *os, uint64_t object);
-
-/*
- * Initial setup and final teardown.
- */
-extern void dmu_init(void);
-extern void dmu_fini(void);
-
-typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
- uint64_t object, uint64_t offset, int len);
-void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
- dmu_traverse_cb_t cb, void *arg);
-int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
- struct file *fp, offset_t *offp);
-
-/* CRC64 table */
-#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
-extern uint64_t zfs_crc64_table[256];
-
-extern int zfs_mdcomp_disable;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -1,315 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DMU_IMPL_H
-#define _SYS_DMU_IMPL_H
-
-#include <sys/txg_impl.h>
-#include <sys/zio.h>
-#include <sys/dnode.h>
-#include <sys/kstat.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_ioctl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * This is the locking strategy for the DMU. Numbers in parenthesis are
- * cases that use that lock order, referenced below:
- *
- * ARC is self-contained
- * bplist is self-contained
- * refcount is self-contained
- * txg is self-contained (hopefully!)
- * zst_lock
- * zf_rwlock
- *
- * XXX try to improve evicting path?
- *
- * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
- * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs
- *
- * dp_config_rwlock
- * must be held before: everything
- * protects dd namespace changes
- * protects property changes globally
- * held from:
- * dsl_dir_open/r:
- * dsl_dir_create_sync/w:
- * dsl_dir_sync_destroy/w:
- * dsl_dir_rename_sync/w:
- * dsl_prop_changed_notify/r:
- *
- * os_obj_lock
- * must be held before:
- * everything except dp_config_rwlock
- * protects os_obj_next
- * held from:
- * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
- *
- * dn_struct_rwlock
- * must be held before:
- * everything except dp_config_rwlock and os_obj_lock
- * protects structure of dnode (eg. nlevels)
- * db_blkptr can change when syncing out change to nlevels
- * dn_maxblkid
- * dn_nlevels
- * dn_*blksz*
- * phys nlevels, maxblkid, physical blkptr_t's (?)
- * held from:
- * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
- * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
- * dbuf_read_impl: db_mtx, dmu_zfetch()
- * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
- * dbuf_new_size: db_mtx
- * dbuf_dirty: db_mtx
- * dbuf_findbp: (callers, phys? - the real need)
- * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
- * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
- * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
- * dnode_sync/w (increase_indirection): db_mtx (phys)
- * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
- * dnode_new_blkid/w: (dn_maxblkid)
- * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
- * dnode_next_offset: (phys)
- *
- * dn_dbufs_mtx
- * must be held before:
- * db_mtx, hash_mutexes
- * protects:
- * dn_dbufs
- * dn_evicted
- * held from:
- * dmu_evict_user: db_mtx (dn_dbufs)
- * dbuf_free_range: db_mtx (dn_dbufs)
- * dbuf_remove_ref: db_mtx, callees:
- * dbuf_hash_remove: hash_mutexes, db_mtx
- * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
- * dnode_set_blksz: (dn_dbufs)
- *
- * hash_mutexes (global)
- * must be held before:
- * db_mtx
- * protects dbuf_hash_table (global) and db_hash_next
- * held from:
- * dbuf_find: db_mtx
- * dbuf_hash_insert: db_mtx
- * dbuf_hash_remove: db_mtx
- *
- * db_mtx (meta-leaf)
- * must be held before:
- * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
- * protects:
- * db_state
- * db_holds
- * db_buf
- * db_changed
- * db_data_pending
- * db_dirtied
- * db_link
- * db_dirty_node (??)
- * db_dirtycnt
- * db_d.*
- * db.*
- * held from:
- * dbuf_dirty: dn_mtx, dn_dirty_mtx
- * dbuf_dirty->dsl_dir_willuse_space: dd_lock
- * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
- * dbuf_undirty: dn_dirty_mtx (db_d)
- * dbuf_write_done: dn_dirty_mtx (db_state)
- * dbuf_*
- * dmu_buf_update_user: none (db_d)
- * dmu_evict_user: none (db_d) (maybe can eliminate)
- * dbuf_find: none (db_holds)
- * dbuf_hash_insert: none (db_holds)
- * dmu_buf_read_array_impl: none (db_state, db_changed)
- * dmu_sync: none (db_dirty_node, db_d)
- * dnode_reallocate: none (db)
- *
- * dn_mtx (leaf)
- * protects:
- * dn_dirty_dbufs
- * dn_ranges
- * phys accounting
- * dn_allocated_txg
- * dn_free_txg
- * dn_assigned_txg
- * dn_dirty_txg
- * dn_notxholds
- * dn_dirtyctx
- * dn_dirtyctx_firstset
- * (dn_phys copy fields?)
- * (dn_phys contents?)
- * held from:
- * dnode_*
- * dbuf_dirty: none
- * dbuf_sync: none (phys accounting)
- * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
- * dbuf_write_done: none (phys accounting)
- * dmu_object_info_from_dnode: none (accounting)
- * dmu_tx_commit: none
- * dmu_tx_hold_object_impl: none
- * dmu_tx_try_assign: dn_notxholds(cv)
- * dmu_tx_unassign: none
- *
- * dd_lock
- * must be held before:
- * ds_lock
- * ancestors' dd_lock
- * protects:
- * dd_prop_cbs
- * dd_sync_*
- * dd_used_bytes
- * dd_tempreserved
- * dd_space_towrite
- * dd_myname
- * dd_phys accounting?
- * held from:
- * dsl_dir_*
- * dsl_prop_changed_notify: none (dd_prop_cbs)
- * dsl_prop_register: none (dd_prop_cbs)
- * dsl_prop_unregister: none (dd_prop_cbs)
- *
- * os_lock (leaf)
- * protects:
- * os_dirty_dnodes
- * os_free_dnodes
- * os_dnodes
- * os_downgraded_dbufs
- * dn_dirtyblksz
- * dn_dirty_link
- * held from:
- * dnode_create: none (os_dnodes)
- * dnode_destroy: none (os_dnodes)
- * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
- * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
- *
- * ds_lock
- * protects:
- * ds_objset
- * ds_open_refcount
- * ds_snapname
- * ds_phys accounting
- * ds_phys userrefs zapobj
- * ds_reserved
- * held from:
- * dsl_dataset_*
- *
- * dr_mtx (leaf)
- * protects:
- * dr_children
- * held from:
- * dbuf_dirty
- * dbuf_undirty
- * dbuf_sync_indirect
- * dnode_new_blkid
- */
-
-struct objset;
-struct dmu_pool;
-
-typedef struct dmu_xuio {
- int next;
- int cnt;
- struct arc_buf **bufs;
- iovec_t *iovp;
-} dmu_xuio_t;
-
-typedef struct xuio_stats {
- /* loaned yet not returned arc_buf */
- kstat_named_t xuiostat_onloan_rbuf;
- kstat_named_t xuiostat_onloan_wbuf;
- /* whether a copy is made when loaning out a read buffer */
- kstat_named_t xuiostat_rbuf_copied;
- kstat_named_t xuiostat_rbuf_nocopy;
- /* whether a copy is made when assigning a write buffer */
- kstat_named_t xuiostat_wbuf_copied;
- kstat_named_t xuiostat_wbuf_nocopy;
-} xuio_stats_t;
-
-static xuio_stats_t xuio_stats = {
- { "onloan_read_buf", KSTAT_DATA_UINT64 },
- { "onloan_write_buf", KSTAT_DATA_UINT64 },
- { "read_buf_copied", KSTAT_DATA_UINT64 },
- { "read_buf_nocopy", KSTAT_DATA_UINT64 },
- { "write_buf_copied", KSTAT_DATA_UINT64 },
- { "write_buf_nocopy", KSTAT_DATA_UINT64 }
-};
-
-#define XUIOSTAT_INCR(stat, val) \
- atomic_add_64(&xuio_stats.stat.value.ui64, (val))
-#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
-
-/*
- * The list of data whose inclusion in a send stream can be pending from
- * one call to backup_cb to another. Multiple calls to dump_free() and
- * dump_freeobjects() can be aggregated into a single DRR_FREE or
- * DRR_FREEOBJECTS replay record.
- */
-typedef enum {
- PENDING_NONE,
- PENDING_FREE,
- PENDING_FREEOBJECTS
-} dmu_pendop_t;
-
-typedef struct dmu_sendarg {
- list_node_t dsa_link;
- dmu_replay_record_t *dsa_drr;
- kthread_t *dsa_td;
- struct file *dsa_fp;
- int dsa_outfd;
- struct proc *dsa_proc;
- offset_t *dsa_off;
- objset_t *dsa_os;
- zio_cksum_t dsa_zc;
- uint64_t dsa_toguid;
- int dsa_err;
- dmu_pendop_t dsa_pending_op;
- uint64_t dsa_featureflags;
- uint64_t dsa_last_data_object;
- uint64_t dsa_last_data_offset;
- uint64_t dsa_resume_object;
- uint64_t dsa_resume_offset;
- boolean_t dsa_sent_begin;
- boolean_t dsa_sent_end;
-} dmu_sendarg_t;
-
-void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
-void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
-int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
- void *, dmu_buf_t **);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -1,221 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#ifndef _SYS_DMU_OBJSET_H
-#define _SYS_DMU_OBJSET_H
-
-#include <sys/spa.h>
-#include <sys/arc.h>
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-#include <sys/dnode.h>
-#include <sys/zio.h>
-#include <sys/zil.h>
-#include <sys/sa.h>
-#include <sys/zfs_ioctl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern krwlock_t os_lock;
-
-struct dsl_pool;
-struct dsl_dataset;
-struct dmu_tx;
-
-#define OBJSET_PHYS_SIZE 2048
-#define OBJSET_OLD_PHYS_SIZE 1024
-
-#define OBJSET_BUF_HAS_USERUSED(buf) \
- (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
-
-#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
-
-typedef struct objset_phys {
- dnode_phys_t os_meta_dnode;
- zil_header_t os_zil_header;
- uint64_t os_type;
- uint64_t os_flags;
- char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
- sizeof (zil_header_t) - sizeof (uint64_t)*2];
- dnode_phys_t os_userused_dnode;
- dnode_phys_t os_groupused_dnode;
-} objset_phys_t;
-
-#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1)
-struct objset {
- /* Immutable: */
- struct dsl_dataset *os_dsl_dataset;
- spa_t *os_spa;
- arc_buf_t *os_phys_buf;
- objset_phys_t *os_phys;
- /*
- * The following "special" dnodes have no parent, are exempt
- * from dnode_move(), and are not recorded in os_dnodes, but they
- * root their descendents in this objset using handles anyway, so
- * that all access to dnodes from dbufs consistently uses handles.
- */
- dnode_handle_t os_meta_dnode;
- dnode_handle_t os_userused_dnode;
- dnode_handle_t os_groupused_dnode;
- zilog_t *os_zil;
-
- list_node_t os_evicting_node;
-
- /* can change, under dsl_dir's locks: */
- uint64_t os_dnodesize; /* default dnode size for new objects */
- enum zio_checksum os_checksum;
- enum zio_compress os_compress;
- uint8_t os_copies;
- enum zio_checksum os_dedup_checksum;
- boolean_t os_dedup_verify;
- zfs_logbias_op_t os_logbias;
- zfs_cache_type_t os_primary_cache;
- zfs_cache_type_t os_secondary_cache;
- zfs_sync_type_t os_sync;
- zfs_redundant_metadata_type_t os_redundant_metadata;
- int os_recordsize;
- /*
- * The next four values are used as a cache of whatever's on disk, and
- * are initialized the first time these properties are queried. Before
- * being initialized with their real values, their values are
- * OBJSET_PROP_UNINITIALIZED.
- */
- uint64_t os_version;
- uint64_t os_normalization;
- uint64_t os_utf8only;
- uint64_t os_casesensitivity;
- /*
- * The largest zpl file block allowed in special class.
- * cached here instead of zfsvfs for easier access.
- */
- int os_zpl_special_smallblock;
-
- /*
- * Pointer is constant; the blkptr it points to is protected by
- * os_dsl_dataset->ds_bp_rwlock
- */
- blkptr_t *os_rootbp;
-
- /* no lock needed: */
- struct dmu_tx *os_synctx; /* XXX sketchy */
- zil_header_t os_zil_header;
- multilist_t *os_synced_dnodes;
- uint64_t os_flags;
- uint64_t os_freed_dnodes;
- boolean_t os_rescan_dnodes;
-
- /* Protected by os_obj_lock */
- kmutex_t os_obj_lock;
- uint64_t os_obj_next_chunk;
-
- /* Per-CPU next object to allocate, protected by atomic ops. */
- uint64_t *os_obj_next_percpu;
- int os_obj_next_percpu_len;
-
- /* Protected by os_lock */
- kmutex_t os_lock;
- multilist_t *os_dirty_dnodes[TXG_SIZE];
- list_t os_dnodes;
- list_t os_downgraded_dbufs;
-
- /* Protects changes to DMU_{USER,GROUP}USED_OBJECT */
- kmutex_t os_userused_lock;
-
- /* stuff we store for the user */
- kmutex_t os_user_ptr_lock;
- void *os_user_ptr;
- sa_os_t *os_sa;
-};
-
-#define DMU_META_OBJSET 0
-#define DMU_META_DNODE_OBJECT 0
-#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
-#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
-#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
-#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
-
-#define DMU_OS_IS_L2CACHEABLE(os) \
- ((os)->os_secondary_cache == ZFS_CACHE_ALL || \
- (os)->os_secondary_cache == ZFS_CACHE_METADATA)
-
-#define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE)
-
-/* called from zpl */
-int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
-int dmu_objset_own(const char *name, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp);
-int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
- dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
-void dmu_objset_refresh_ownership(struct dsl_dataset *ds,
- struct dsl_dataset **newds, void *tag);
-void dmu_objset_rele(objset_t *os, void *tag);
-void dmu_objset_disown(objset_t *os, void *tag);
-int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
-
-void dmu_objset_stats(objset_t *os, nvlist_t *nv);
-void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
-void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
- int func(struct dsl_pool *, struct dsl_dataset *, void *),
- void *arg, int flags);
-int dmu_objset_prefetch(const char *name, void *arg);
-void dmu_objset_evict_dbufs(objset_t *os);
-timestruc_t dmu_objset_snap_cmtime(objset_t *os);
-
-/* called from dsl */
-void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
-boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
-objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
- blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
-int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
- objset_t **osp);
-void dmu_objset_evict(objset_t *os);
-void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
-void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
-boolean_t dmu_objset_userused_enabled(objset_t *os);
-int dmu_objset_userspace_upgrade(objset_t *os);
-boolean_t dmu_objset_userspace_present(objset_t *os);
-int dmu_fsname(const char *snapname, char *buf);
-
-void dmu_objset_evict_done(objset_t *os);
-void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx);
-
-void dmu_objset_init(void);
-void dmu_objset_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_OBJSET_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
@@ -1,93 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#ifndef _DMU_SEND_H
-#define _DMU_SEND_H
-
-#include <sys/spa.h>
-
-struct vnode;
-struct dsl_dataset;
-struct drr_begin;
-struct avl_tree;
-struct dmu_replay_record;
-
-extern const char *recv_clone_name;
-
-int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
- boolean_t large_block_ok, boolean_t compressok, int outfd,
- uint64_t resumeobj, uint64_t resumeoff,
-#ifdef illumos
- struct vnode *vp, offset_t *off);
-#else
- struct file *fp, offset_t *off);
-#endif
-int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
- boolean_t stream_compressed, uint64_t *sizep);
-int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
- boolean_t stream_compressed, uint64_t *sizep);
-int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
-#ifdef illumos
- int outfd, struct vnode *vp, offset_t *off);
-#else
- int outfd, struct file *fp, offset_t *off);
-#endif
-
-typedef struct dmu_recv_cookie {
- struct dsl_dataset *drc_ds;
- struct dmu_replay_record *drc_drr_begin;
- struct drr_begin *drc_drrb;
- const char *drc_tofs;
- const char *drc_tosnap;
- boolean_t drc_newfs;
- boolean_t drc_byteswap;
- boolean_t drc_force;
- boolean_t drc_resumable;
- boolean_t drc_clone;
- struct avl_tree *drc_guid_to_ds_map;
- zio_cksum_t drc_cksum;
- uint64_t drc_newsnapobj;
- void *drc_owner;
- cred_t *drc_cred;
-} dmu_recv_cookie_t;
-
-int dmu_recv_begin(char *tofs, char *tosnap,
- struct dmu_replay_record *drr_begin,
- boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
-#ifdef illumos
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
-#else
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
-#endif
- int cleanup_fd, uint64_t *action_handlep);
-int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
-boolean_t dmu_objset_is_receiving(objset_t *os);
-
-#endif /* _DMU_SEND_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -1,69 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DMU_TRAVERSE_H
-#define _SYS_DMU_TRAVERSE_H
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dnode_phys;
-struct dsl_dataset;
-struct zilog;
-struct arc_buf;
-
-typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg);
-
-#define TRAVERSE_PRE (1<<0)
-#define TRAVERSE_POST (1<<1)
-#define TRAVERSE_PREFETCH_METADATA (1<<2)
-#define TRAVERSE_PREFETCH_DATA (1<<3)
-#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
-#define TRAVERSE_HARD (1<<4)
-
-/* Special traverse error return value to indicate skipping of children */
-#define TRAVERSE_VISIT_NO_CHILDREN -1
-
-int traverse_dataset(struct dsl_dataset *ds,
- uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
-int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
- zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
-int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
- uint64_t txg_start, zbookmark_phys_t *resume, int flags,
- blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa,
- uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_TRAVERSE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
@@ -1,152 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DMU_TX_H
-#define _SYS_DMU_TX_H
-
-#include <sys/dmu.h>
-#include <sys/txg.h>
-#include <sys/refcount.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dmu_buf_impl;
-struct dmu_tx_hold;
-struct dnode_link;
-struct dsl_pool;
-struct dnode;
-struct dsl_dir;
-
-struct dmu_tx {
- /*
- * No synchronization is needed because a tx can only be handled
- * by one thread.
- */
- list_t tx_holds; /* list of dmu_tx_hold_t */
- objset_t *tx_objset;
- struct dsl_dir *tx_dir;
- struct dsl_pool *tx_pool;
- uint64_t tx_txg;
- uint64_t tx_lastsnap_txg;
- uint64_t tx_lasttried_txg;
- txg_handle_t tx_txgh;
- void *tx_tempreserve_cookie;
- struct dmu_tx_hold *tx_needassign_txh;
-
- /* list of dmu_tx_callback_t on this dmu_tx */
- list_t tx_callbacks;
-
- /* placeholder for syncing context, doesn't need specific holds */
- boolean_t tx_anyobj;
-
- /* transaction is marked as being a "net free" of space */
- boolean_t tx_netfree;
-
- /* time this transaction was created */
- hrtime_t tx_start;
-
- /* need to wait for sufficient dirty space */
- boolean_t tx_wait_dirty;
-
- /* has this transaction already been delayed? */
- boolean_t tx_dirty_delayed;
-
- int tx_err;
-};
-
-enum dmu_tx_hold_type {
- THT_NEWOBJECT,
- THT_WRITE,
- THT_BONUS,
- THT_FREE,
- THT_ZAP,
- THT_SPACE,
- THT_SPILL,
- THT_NUMTYPES
-};
-
-typedef struct dmu_tx_hold {
- dmu_tx_t *txh_tx;
- list_node_t txh_node;
- struct dnode *txh_dnode;
- zfs_refcount_t txh_space_towrite;
- zfs_refcount_t txh_memory_tohold;
- enum dmu_tx_hold_type txh_type;
- uint64_t txh_arg1;
- uint64_t txh_arg2;
-} dmu_tx_hold_t;
-
-typedef struct dmu_tx_callback {
- list_node_t dcb_node; /* linked to tx_callbacks list */
- dmu_tx_callback_func_t *dcb_func; /* caller function pointer */
- void *dcb_data; /* caller private data */
-} dmu_tx_callback_t;
-
-/*
- * These routines are defined in dmu.h, and are called by the user.
- */
-dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
-void dmu_tx_commit(dmu_tx_t *tx);
-void dmu_tx_abort(dmu_tx_t *tx);
-uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
-struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx);
-void dmu_tx_wait(dmu_tx_t *tx);
-
-void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
- void *dcb_data);
-void dmu_tx_do_callbacks(list_t *cb_list, int error);
-
-/*
- * These routines are defined in dmu_spa.h, and are called by the SPA.
- */
-extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * These routines are only called by the DMU.
- */
-dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
-int dmu_tx_is_syncing(dmu_tx_t *tx);
-int dmu_tx_private_ok(dmu_tx_t *tx);
-void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn);
-void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
-void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
-
-#ifdef ZFS_DEBUG
-#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
-#else
-#define DMU_TX_DIRTY_BUF(tx, db)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_TX_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -1,76 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
- */
-
-#ifndef _DMU_ZFETCH_H
-#define _DMU_ZFETCH_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern uint64_t zfetch_array_rd_sz;
-
-struct dnode; /* so we can reference dnode */
-
-typedef struct zstream {
- uint64_t zs_blkid; /* expect next access at this blkid */
- uint64_t zs_pf_blkid; /* next block to prefetch */
-
- /*
- * We will next prefetch the L1 indirect block of this level-0
- * block id.
- */
- uint64_t zs_ipf_blkid;
-
- kmutex_t zs_lock; /* protects stream */
- hrtime_t zs_atime; /* time last prefetch issued */
- list_node_t zs_node; /* link for zf_stream */
-} zstream_t;
-
-typedef struct zfetch {
- krwlock_t zf_rwlock; /* protects zfetch structure */
- list_t zf_stream; /* list of zstream_t's */
- struct dnode *zf_dnode; /* dnode that owns this zfetch */
-} zfetch_t;
-
-void zfetch_init(void);
-void zfetch_fini(void);
-
-void dmu_zfetch_init(zfetch_t *, struct dnode *);
-void dmu_zfetch_fini(zfetch_t *);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _DMU_ZFETCH_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -1,599 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- */
-
-#ifndef _SYS_DNODE_H
-#define _SYS_DNODE_H
-
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/refcount.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/zrlock.h>
-#include <sys/multilist.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * dnode_hold() flags.
- */
-#define DNODE_MUST_BE_ALLOCATED 1
-#define DNODE_MUST_BE_FREE 2
-
-/*
- * dnode_next_offset() flags.
- */
-#define DNODE_FIND_HOLE 1
-#define DNODE_FIND_BACKWARDS 2
-#define DNODE_FIND_HAVELOCK 4
-
-/*
- * Fixed constants.
- */
-#define DNODE_SHIFT 9 /* 512 bytes */
-#define DN_MIN_INDBLKSHIFT 12 /* 4k */
-/*
- * If we ever increase this value beyond 20, we need to revisit all logic that
- * does x << level * ebps to handle overflow. With a 1M indirect block size,
- * 4 levels of indirect blocks would not be able to guarantee addressing an
- * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65.
- */
-#define DN_MAX_INDBLKSHIFT 17 /* 128k */
-#define DNODE_BLOCK_SHIFT 14 /* 16k */
-#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
-#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
-#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
-
-/*
- * dnode id flags
- *
- * Note: a file will never ever have its
- * ids moved from bonus->spill
- * and only in a crypto environment would it be on spill
- */
-#define DN_ID_CHKED_BONUS 0x1
-#define DN_ID_CHKED_SPILL 0x2
-#define DN_ID_OLD_EXIST 0x4
-#define DN_ID_NEW_EXIST 0x8
-
-/*
- * Derived constants.
- */
-#define DNODE_MIN_SIZE (1 << DNODE_SHIFT)
-#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT)
-#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT)
-#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT)
-#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT)
-#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \
- (1 << SPA_BLKPTRSHIFT))
-#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
-#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
-#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
-#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
-#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
-#define DN_KILL_SPILLBLK (1)
-
-#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */
-#define DN_SLOT_FREE ((void *)1UL) /* Free slot */
-#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */
-#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */
-#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR)
-#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL)
-
-#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
-#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
-
-/*
- * This is inaccurate if the indblkshift of the particular object is not the
- * max. But it's only used by userland to calculate the zvol reservation.
- */
-#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
-#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)
-
-/* The +2 here is a cheesy way to round up */
-#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
- (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
-
-#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
- (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
-#define DN_MAX_BONUS_LEN(dnp) \
- ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \
- (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \
- (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp))
-
-#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
- (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
-
-#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
-
-struct dmu_buf_impl;
-struct objset;
-struct zio;
-
-enum dnode_dirtycontext {
- DN_UNDIRTIED,
- DN_DIRTY_OPEN,
- DN_DIRTY_SYNC
-};
-
-/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
-#define DNODE_FLAG_USED_BYTES (1<<0)
-#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
-
-/* Does dnode have a SA spill blkptr in bonus? */
-#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
-
-/*
- * VARIABLE-LENGTH (LARGE) DNODES
- *
- * The motivation for variable-length dnodes is to eliminate the overhead
- * associated with using spill blocks. Spill blocks are used to store
- * system attribute data (i.e. file metadata) that does not fit in the
- * dnode's bonus buffer. By allowing a larger bonus buffer area the use of
- * a spill block can be avoided. Spill blocks potentially incur an
- * additional read I/O for every dnode in a dnode block. As a worst case
- * example, reading 32 dnodes from a 16k dnode block and all of the spill
- * blocks could issue 33 separate reads. Now suppose those dnodes have size
- * 1024 and therefore don't need spill blocks. Then the worst case number
- * of blocks read is reduced to from 33 to two--one per dnode block.
- *
- * ZFS-on-Linux systems that make heavy use of extended attributes benefit
- * from this feature. In particular, ZFS-on-Linux supports the xattr=sa
- * dataset property which allows file extended attribute data to be stored
- * in the dnode bonus buffer as an alternative to the traditional
- * directory-based format. Workloads such as SELinux and the Lustre
- * distributed filesystem often store enough xattr data to force spill
- * blocks when xattr=sa is in effect. Large dnodes may therefore provide a
- * performance benefit to such systems. Other use cases that benefit from
- * this feature include files with large ACLs and symbolic links with long
- * target names.
- *
- * The size of a dnode may be a multiple of 512 bytes up to the size of a
- * dnode block (currently 16384 bytes). The dn_extra_slots field of the
- * on-disk dnode_phys_t structure describes the size of the physical dnode
- * on disk. The field represents how many "extra" dnode_phys_t slots a
- * dnode consumes in its dnode block. This convention results in a value of
- * 0 for 512 byte dnodes which preserves on-disk format compatibility with
- * older software which doesn't support large dnodes.
- *
- * Similarly, the in-memory dnode_t structure has a dn_num_slots field
- * to represent the total number of dnode_phys_t slots consumed on disk.
- * Thus dn->dn_num_slots is 1 greater than the corresponding
- * dnp->dn_extra_slots. This difference in convention was adopted
- * because, unlike on-disk structures, backward compatibility is not a
- * concern for in-memory objects, so we used a more natural way to
- * represent size for a dnode_t.
- *
- * The default size for newly created dnodes is determined by the value of
- * the "dnodesize" dataset property. By default the property is set to
- * "legacy" which is compatible with older software. Setting the property
- * to "auto" will allow the filesystem to choose the most suitable dnode
- * size. Currently this just sets the default dnode size to 1k, but future
- * code improvements could dynamically choose a size based on observed
- * workload patterns. Dnodes of varying sizes can coexist within the same
- * dataset and even within the same dnode block.
- */
-
-typedef struct dnode_phys {
- uint8_t dn_type; /* dmu_object_type_t */
- uint8_t dn_indblkshift; /* ln2(indirect block size) */
- uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
- uint8_t dn_nblkptr; /* length of dn_blkptr */
- uint8_t dn_bonustype; /* type of data in bonus buffer */
- uint8_t dn_checksum; /* ZIO_CHECKSUM type */
- uint8_t dn_compress; /* ZIO_COMPRESS type */
- uint8_t dn_flags; /* DNODE_FLAG_* */
- uint16_t dn_datablkszsec; /* data block size in 512b sectors */
- uint16_t dn_bonuslen; /* length of dn_bonus */
- uint8_t dn_extra_slots; /* # of subsequent slots consumed */
- uint8_t dn_pad2[3];
-
- /* accounting is protected by dn_dirty_mtx */
- uint64_t dn_maxblkid; /* largest allocated block ID */
- uint64_t dn_used; /* bytes (or sectors) of disk space */
-
- /*
- * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This
- * allows us to protect any fields that might be added here in the
- * future. In either case, developers will want to check
- * zio_crypt_init_uios_dnode() to ensure the new field is being
- * protected properly.
- */
- uint64_t dn_pad3[4];
- union {
- blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
- struct {
- blkptr_t __dn_ignore1;
- uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN];
- };
- struct {
- blkptr_t __dn_ignore2;
- uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
- sizeof (blkptr_t)];
- blkptr_t dn_spill;
- };
- };
-} dnode_phys_t;
-
-#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
- (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
-
-struct dnode {
- /*
- * Protects the structure of the dnode, including the number of levels
- * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
- */
- krwlock_t dn_struct_rwlock;
-
- /* Our link on dn_objset->os_dnodes list; protected by os_lock. */
- list_node_t dn_link;
-
- /* immutable: */
- struct objset *dn_objset;
- uint64_t dn_object;
- struct dmu_buf_impl *dn_dbuf;
- struct dnode_handle *dn_handle;
- dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
-
- /*
- * Copies of stuff in dn_phys. They're valid in the open
- * context (eg. even before the dnode is first synced).
- * Where necessary, these are protected by dn_struct_rwlock.
- */
- dmu_object_type_t dn_type; /* object type */
- uint16_t dn_bonuslen; /* bonus length */
- uint8_t dn_bonustype; /* bonus type */
- uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
- uint8_t dn_checksum; /* ZIO_CHECKSUM type */
- uint8_t dn_compress; /* ZIO_COMPRESS type */
- uint8_t dn_nlevels;
- uint8_t dn_indblkshift;
- uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
- uint8_t dn_moved; /* Has this dnode been moved? */
- uint16_t dn_datablkszsec; /* in 512b sectors */
- uint32_t dn_datablksz; /* in bytes */
- uint64_t dn_maxblkid;
- uint8_t dn_next_type[TXG_SIZE];
- uint8_t dn_num_slots; /* metadnode slots consumed on disk */
- uint8_t dn_next_nblkptr[TXG_SIZE];
- uint8_t dn_next_nlevels[TXG_SIZE];
- uint8_t dn_next_indblkshift[TXG_SIZE];
- uint8_t dn_next_bonustype[TXG_SIZE];
- uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
- uint16_t dn_next_bonuslen[TXG_SIZE];
- uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
-
- /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
- uint32_t dn_dbufs_count; /* count of dn_dbufs */
-
- /* protected by os_lock: */
- multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
-
- /* protected by dn_mtx: */
- kmutex_t dn_mtx;
- list_t dn_dirty_records[TXG_SIZE];
- struct range_tree *dn_free_ranges[TXG_SIZE];
- uint64_t dn_allocated_txg;
- uint64_t dn_free_txg;
- uint64_t dn_assigned_txg;
- uint64_t dn_dirty_txg; /* txg dnode was last dirtied */
- kcondvar_t dn_notxholds;
- enum dnode_dirtycontext dn_dirtyctx;
- uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
-
- /* protected by own devices */
- zfs_refcount_t dn_tx_holds;
- zfs_refcount_t dn_holds;
-
- kmutex_t dn_dbufs_mtx;
- /*
- * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
- * can contain multiple dbufs of the same (level, blkid) when a
- * dbuf is marked DB_EVICTING without being removed from
- * dn_dbufs. To maintain the avl invariant that there cannot be
- * duplicate entries, we order the dbufs by an arbitrary value -
- * their address in memory. This means that dn_dbufs cannot be used to
- * directly look up a dbuf. Instead, callers must use avl_walk, have
- * a reference to the dbuf, or look up a non-existant node with
- * db_state = DB_SEARCH (see dbuf_free_range for an example).
- */
- avl_tree_t dn_dbufs;
-
- /* protected by dn_struct_rwlock */
- struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
-
- boolean_t dn_have_spill; /* have spill or are spilling */
-
- /* parent IO for current sync write */
- zio_t *dn_zio;
-
- /* used in syncing context */
- uint64_t dn_oldused; /* old phys used bytes */
- uint64_t dn_oldflags; /* old phys dn_flags */
- uint64_t dn_olduid, dn_oldgid;
- uint64_t dn_newuid, dn_newgid;
- int dn_id_flags;
-
- /* holds prefetch structure */
- struct zfetch dn_zfetch;
-};
-
-/*
- * Since AVL already has embedded element counter, use dn_dbufs_count
- * only for dbufs not counted there (bonus buffers) and just add them.
- */
-#define DN_DBUFS_COUNT(dn) ((dn)->dn_dbufs_count + \
- avl_numnodes(&(dn)->dn_dbufs))
-
-/*
- * Adds a level of indirection between the dbuf and the dnode to avoid
- * iterating descendent dbufs in dnode_move(). Handles are not allocated
- * individually, but as an array of child dnodes in dnode_hold_impl().
- */
-typedef struct dnode_handle {
- /* Protects dnh_dnode from modification by dnode_move(). */
- zrlock_t dnh_zrlock;
- dnode_t *dnh_dnode;
-} dnode_handle_t;
-
-typedef struct dnode_children {
- dmu_buf_user_t dnc_dbu; /* User evict data */
- size_t dnc_count; /* number of children */
- dnode_handle_t dnc_children[]; /* sized dynamically */
-} dnode_children_t;
-
-typedef struct free_range {
- avl_node_t fr_node;
- uint64_t fr_blkid;
- uint64_t fr_nblks;
-} free_range_t;
-
-void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
- uint64_t object, dnode_handle_t *dnh);
-void dnode_special_close(dnode_handle_t *dnh);
-
-void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
-void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
-void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
-
-int dnode_hold(struct objset *dd, uint64_t object,
- void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
- void *ref, dnode_t **dnp);
-boolean_t dnode_add_ref(dnode_t *dn, void *ref);
-void dnode_rele(dnode_t *dn, void *ref);
-void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
-void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
-void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
-void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
-void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
-void dnode_free(dnode_t *dn, dmu_tx_t *tx);
-void dnode_byteswap(dnode_phys_t *dnp);
-void dnode_buf_byteswap(void *buf, size_t size);
-void dnode_verify(dnode_t *dn);
-int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
-void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
-void dnode_diduse_space(dnode_t *dn, int64_t space);
-void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
-uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
-void dnode_init(void);
-void dnode_fini(void);
-int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
- int minlvl, uint64_t blkfill, uint64_t txg);
-void dnode_evict_dbufs(dnode_t *dn);
-void dnode_evict_bonus(dnode_t *dn);
-void dnode_free_interior_slots(dnode_t *dn);
-boolean_t dnode_needs_remap(const dnode_t *dn);
-
-#define DNODE_IS_DIRTY(_dn) \
- ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
-
-#define DNODE_IS_CACHEABLE(_dn) \
- ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
- (DMU_OT_IS_METADATA((_dn)->dn_type) && \
- (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
-
-#define DNODE_META_IS_CACHEABLE(_dn) \
- ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
- (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
-
-/*
- * Used for dnodestats kstat.
- */
-typedef struct dnode_stats {
- /*
- * Number of failed attempts to hold a meta dnode dbuf.
- */
- kstat_named_t dnode_hold_dbuf_hold;
- /*
- * Number of failed attempts to read a meta dnode dbuf.
- */
- kstat_named_t dnode_hold_dbuf_read;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
- * to hold the requested object number which was allocated. This is
- * the common case when looking up any allocated object number.
- */
- kstat_named_t dnode_hold_alloc_hits;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
- * able to hold the request object number because it was not allocated.
- */
- kstat_named_t dnode_hold_alloc_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
- * able to hold the request object number because the object number
- * refers to an interior large dnode slot.
- */
- kstat_named_t dnode_hold_alloc_interior;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
- * to retry acquiring slot zrl locks due to contention.
- */
- kstat_named_t dnode_hold_alloc_lock_retry;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
- * need to create the dnode because another thread did so after
- * dropping the read lock but before acquiring the write lock.
- */
- kstat_named_t dnode_hold_alloc_lock_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
- * a free dnode instantiated by dnode_create() but not yet allocated
- * by dnode_allocate().
- */
- kstat_named_t dnode_hold_alloc_type_none;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
- * to hold the requested range of free dnode slots.
- */
- kstat_named_t dnode_hold_free_hits;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
- * able to hold the requested range of free dnode slots because
- * at least one slot was allocated.
- */
- kstat_named_t dnode_hold_free_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
- * able to hold the requested range of free dnode slots because
- * after acquiring the zrl lock at least one slot was allocated.
- */
- kstat_named_t dnode_hold_free_lock_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
- * to retry acquiring slot zrl locks due to contention.
- */
- kstat_named_t dnode_hold_free_lock_retry;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
- * a range of dnode slots which were held by another thread.
- */
- kstat_named_t dnode_hold_free_refcount;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
- * a range of dnode slots which would overflow the dnode_phys_t.
- */
- kstat_named_t dnode_hold_free_overflow;
- /*
- * Number of times a dnode_hold(...) was attempted on a dnode
- * which had already been unlinked in an earlier txg.
- */
- kstat_named_t dnode_hold_free_txg;
- /*
- * Number of times dnode_free_interior_slots() needed to retry
- * acquiring a slot zrl lock due to contention.
- */
- kstat_named_t dnode_free_interior_lock_retry;
- /*
- * Number of new dnodes allocated by dnode_allocate().
- */
- kstat_named_t dnode_allocate;
- /*
- * Number of dnodes re-allocated by dnode_reallocate().
- */
- kstat_named_t dnode_reallocate;
- /*
- * Number of meta dnode dbufs evicted.
- */
- kstat_named_t dnode_buf_evict;
- /*
- * Number of times dmu_object_alloc*() reached the end of the existing
- * object ID chunk and advanced to a new one.
- */
- kstat_named_t dnode_alloc_next_chunk;
- /*
- * Number of times multiple threads attempted to allocate a dnode
- * from the same block of free dnodes.
- */
- kstat_named_t dnode_alloc_race;
- /*
- * Number of times dmu_object_alloc*() was forced to advance to the
- * next meta dnode dbuf due to an error from dmu_object_next().
- */
- kstat_named_t dnode_alloc_next_block;
- /*
- * Statistics for tracking dnodes which have been moved.
- */
- kstat_named_t dnode_move_invalid;
- kstat_named_t dnode_move_recheck1;
- kstat_named_t dnode_move_recheck2;
- kstat_named_t dnode_move_special;
- kstat_named_t dnode_move_handle;
- kstat_named_t dnode_move_rwlock;
- kstat_named_t dnode_move_active;
-} dnode_stats_t;
-
-extern dnode_stats_t dnode_stats;
-
-#define DNODE_STAT_INCR(stat, val) \
- atomic_add_64(&dnode_stats.stat.value.ui64, (val));
-#define DNODE_STAT_BUMP(stat) \
- DNODE_STAT_INCR(stat, 1);
-
-#ifdef ZFS_DEBUG
-
-/*
- * There should be a ## between the string literal and fmt, to make it
- * clear that we're joining two strings together, but that piece of shit
- * gcc doesn't support that preprocessor token.
- */
-#define dprintf_dnode(dn, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char __db_buf[32]; \
- uint64_t __db_obj = (dn)->dn_object; \
- if (__db_obj == DMU_META_DNODE_OBJECT) \
- (void) strcpy(__db_buf, "mdn"); \
- else \
- (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
- (u_longlong_t)__db_obj);\
- dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
- __db_buf, __VA_ARGS__); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define DNODE_VERIFY(dn) dnode_verify(dn)
-#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
-
-#else
-
-#define dprintf_dnode(db, fmt, ...)
-#define DNODE_VERIFY(dn)
-#define FREE_VERIFY(db, start, end, tx)
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DNODE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h
@@ -1,52 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DSL_BOOKMARK_H
-#define _SYS_DSL_BOOKMARK_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_pool;
-struct dsl_dataset;
-
-/*
- * On disk zap object.
- */
-typedef struct zfs_bookmark_phys {
- uint64_t zbm_guid; /* guid of bookmarked dataset */
- uint64_t zbm_creation_txg; /* birth transaction group */
- uint64_t zbm_creation_time; /* bookmark creation time */
-} zfs_bookmark_phys_t;
-
-int dsl_bookmark_create(nvlist_t *, nvlist_t *);
-int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *);
-int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *);
-int dsl_bookmark_destroy(nvlist_t *, nvlist_t *);
-int dsl_bookmark_rename(const char *fs, const char *from, const char *to);
-int dsl_bookmark_lookup(struct dsl_pool *, const char *,
- struct dsl_dataset *, zfs_bookmark_phys_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_BOOKMARK_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -1,457 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#ifndef _SYS_DSL_DATASET_H
-#define _SYS_DSL_DATASET_H
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/bplist.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_context.h>
-#include <sys/dsl_deadlist.h>
-#include <sys/refcount.h>
-#include <sys/rrwlock.h>
-#include <zfeature_common.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-struct dsl_dir;
-struct dsl_pool;
-
-#define DS_FLAG_INCONSISTENT (1ULL<<0)
-#define DS_IS_INCONSISTENT(ds) \
- (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
-
-/*
- * Do not allow this dataset to be promoted.
- */
-#define DS_FLAG_NOPROMOTE (1ULL<<1)
-
-/*
- * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
- * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
- * refquota/refreservations).
- */
-#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
-
-/*
- * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
- * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
- */
-#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
-#define DS_IS_DEFER_DESTROY(ds) \
- (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
-
-/*
- * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
- * They should be of the format <reverse-dns>:<field>.
- */
-
-/*
- * This field's value is the object ID of a zap object which contains the
- * bookmarks of this dataset. If it is present, then this dataset is counted
- * in the refcount of the SPA_FEATURES_BOOKMARKS feature.
- */
-#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
-
-/*
- * This field is present (with value=0) if this dataset may contain large
- * dnodes (>512B). If it is present, then this dataset is counted in the
- * refcount of the SPA_FEATURE_LARGE_DNODE feature.
- */
-#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode"
-
-/*
- * These fields are set on datasets that are in the middle of a resumable
- * receive, and allow the sender to resume the send if it is interrupted.
- */
-#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
-#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
-#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
-#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
-#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
-#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
-#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok"
-#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
-#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok"
-
-/*
- * This field is set to the object number of the remap deadlist if one exists.
- */
-#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist"
-
-/*
- * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
- * name lookups should be performed case-insensitively.
- */
-#define DS_FLAG_CI_DATASET (1ULL<<16)
-
-#define DS_CREATE_FLAG_NODIRTY (1ULL<<24)
-
-typedef struct dsl_dataset_phys {
- uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */
- uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */
- uint64_t ds_prev_snap_txg;
- uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */
- uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
- uint64_t ds_num_children; /* clone/snap children; ==0 for head */
- uint64_t ds_creation_time; /* seconds since 1970 */
- uint64_t ds_creation_txg;
- uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
- /*
- * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
- * include all blocks referenced by this dataset, including those
- * shared with any other datasets.
- */
- uint64_t ds_referenced_bytes;
- uint64_t ds_compressed_bytes;
- uint64_t ds_uncompressed_bytes;
- uint64_t ds_unique_bytes; /* only relevant to snapshots */
- /*
- * The ds_fsid_guid is a 56-bit ID that can change to avoid
- * collisions. The ds_guid is a 64-bit ID that will never
- * change, so there is a small probability that it will collide.
- */
- uint64_t ds_fsid_guid;
- uint64_t ds_guid;
- uint64_t ds_flags; /* DS_FLAG_* */
- blkptr_t ds_bp;
- uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
- uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
- uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
- uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
-} dsl_dataset_phys_t;
-
-typedef struct dsl_dataset {
- dmu_buf_user_t ds_dbu;
- rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */
-
- /* Immutable: */
- struct dsl_dir *ds_dir;
- dmu_buf_t *ds_dbuf;
- uint64_t ds_object;
- uint64_t ds_fsid_guid;
- boolean_t ds_is_snapshot;
-
- /* only used in syncing context, only valid for non-snapshots: */
- struct dsl_dataset *ds_prev;
- uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
-
- /* has internal locking: */
- dsl_deadlist_t ds_deadlist;
- bplist_t ds_pending_deadlist;
-
- /*
- * The remap deadlist contains blocks (DVA's, really) that are
- * referenced by the previous snapshot and point to indirect vdevs,
- * but in this dataset they have been remapped to point to concrete
- * (or at least, less-indirect) vdevs. In other words, the
- * physical DVA is referenced by the previous snapshot but not by
- * this dataset. Logically, the DVA continues to be referenced,
- * but we are using a different (less indirect) physical DVA.
- * This deadlist is used to determine when physical DVAs that
- * point to indirect vdevs are no longer referenced anywhere,
- * and thus should be marked obsolete.
- *
- * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled.
- */
- dsl_deadlist_t ds_remap_deadlist;
- /* protects creation of the ds_remap_deadlist */
- kmutex_t ds_remap_deadlist_lock;
-
- /* protected by lock on pool's dp_dirty_datasets list */
- txg_node_t ds_dirty_link;
- list_node_t ds_synced_link;
-
- /*
- * ds_phys->ds_<accounting> is also protected by ds_lock.
- * Protected by ds_lock:
- */
- kmutex_t ds_lock;
- objset_t *ds_objset;
- uint64_t ds_userrefs;
- void *ds_owner;
-
- /*
- * Long holds prevent the ds from being destroyed; they allow the
- * ds to remain held even after dropping the dp_config_rwlock.
- * Owning counts as a long hold. See the comments above
- * dsl_pool_hold() for details.
- */
- zfs_refcount_t ds_longholds;
-
- /* no locking; only for making guesses */
- uint64_t ds_trysnap_txg;
-
- /* for objset_open() */
- kmutex_t ds_opening_lock;
-
- uint64_t ds_reserved; /* cached refreservation */
- uint64_t ds_quota; /* cached refquota */
-
- kmutex_t ds_sendstream_lock;
- list_t ds_sendstreams;
-
- /*
- * When in the middle of a resumable receive, tracks how much
- * progress we have made.
- */
- uint64_t ds_resume_object[TXG_SIZE];
- uint64_t ds_resume_offset[TXG_SIZE];
- uint64_t ds_resume_bytes[TXG_SIZE];
-
- /* Protected by our dsl_dir's dd_lock */
- list_t ds_prop_cbs;
-
- /*
- * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
- * uses this feature.
- */
- uint8_t ds_feature_inuse[SPA_FEATURES];
-
- /*
- * Set if we need to activate the feature on this dataset this txg
- * (used only in syncing context).
- */
- uint8_t ds_feature_activation_needed[SPA_FEATURES];
-
- /* Protected by ds_lock; keep at end of struct for better locality */
- char ds_snapname[ZFS_MAX_DATASET_NAME_LEN];
-} dsl_dataset_t;
-
-inline dsl_dataset_phys_t *
-dsl_dataset_phys(dsl_dataset_t *ds)
-{
- return (ds->ds_dbuf->db_data);
-}
-
-typedef struct dsl_dataset_promote_arg {
- const char *ddpa_clonename;
- dsl_dataset_t *ddpa_clone;
- list_t shared_snaps, origin_snaps, clone_snaps;
- dsl_dataset_t *origin_origin; /* origin of the origin */
- uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
- nvlist_t *err_ds;
- cred_t *cr;
-} dsl_dataset_promote_arg_t;
-
-typedef struct dsl_dataset_rollback_arg {
- const char *ddra_fsname;
- const char *ddra_tosnap;
- void *ddra_owner;
- nvlist_t *ddra_result;
-} dsl_dataset_rollback_arg_t;
-
-typedef struct dsl_dataset_snapshot_arg {
- nvlist_t *ddsa_snaps;
- nvlist_t *ddsa_props;
- nvlist_t *ddsa_errors;
- cred_t *ddsa_cr;
-} dsl_dataset_snapshot_arg_t;
-
-/*
- * The max length of a temporary tag prefix is the number of hex digits
- * required to express UINT64_MAX plus one for the hyphen.
- */
-#define MAX_TAG_PREFIX_LEN 17
-
-#define dsl_dataset_is_snapshot(ds) \
- (dsl_dataset_phys(ds)->ds_num_children != 0)
-
-#define DS_UNIQUE_IS_ACCURATE(ds) \
- ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
-
-int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
- dsl_dataset_t **dsp);
-boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
- void *tag);
-int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
- dsl_dataset_t **);
-void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
-int dsl_dataset_own(struct dsl_pool *dp, const char *name,
- void *tag, dsl_dataset_t **dsp);
-int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
- void *tag, dsl_dataset_t **dsp);
-void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_name(dsl_dataset_t *ds, char *name);
-boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
-int dsl_dataset_namelen(dsl_dataset_t *ds);
-boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
-uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
- dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
-uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
- uint64_t flags, dmu_tx_t *tx);
-void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx);
-int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx);
-int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
-void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx);
-int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx);
-int dsl_dataset_promote(const char *name, char *conflsnap);
-int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
- boolean_t force);
-int dsl_dataset_rename_snapshot(const char *fsname,
- const char *oldsnapname, const char *newsnapname, boolean_t recursive);
-int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
- minor_t cleanup_minor, const char *htag);
-
-blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
-
-spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
-
-boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
- dsl_dataset_t *snap);
-
-void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
-void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx);
-
-void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
- dmu_tx_t *tx);
-int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
- dmu_tx_t *tx, boolean_t async);
-void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev,
- uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx);
-
-void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
-
-int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val);
-char *get_receive_resume_stats_impl(dsl_dataset_t *ds);
-char *get_child_receive_stats(dsl_dataset_t *ds);
-uint64_t dsl_get_refratio(dsl_dataset_t *ds);
-uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds);
-uint64_t dsl_get_compressratio(dsl_dataset_t *ds);
-uint64_t dsl_get_used(dsl_dataset_t *ds);
-uint64_t dsl_get_creation(dsl_dataset_t *ds);
-uint64_t dsl_get_creationtxg(dsl_dataset_t *ds);
-uint64_t dsl_get_refquota(dsl_dataset_t *ds);
-uint64_t dsl_get_refreservation(dsl_dataset_t *ds);
-uint64_t dsl_get_guid(dsl_dataset_t *ds);
-uint64_t dsl_get_unique(dsl_dataset_t *ds);
-uint64_t dsl_get_objsetid(dsl_dataset_t *ds);
-uint64_t dsl_get_userrefs(dsl_dataset_t *ds);
-uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds);
-uint64_t dsl_get_referenced(dsl_dataset_t *ds);
-uint64_t dsl_get_numclones(dsl_dataset_t *ds);
-uint64_t dsl_get_inconsistent(dsl_dataset_t *ds);
-uint64_t dsl_get_available(dsl_dataset_t *ds);
-int dsl_get_written(dsl_dataset_t *ds, uint64_t *written);
-int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap);
-int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
- char *source);
-
-void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv);
-
-void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
-
-void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
-void dsl_dataset_space(dsl_dataset_t *ds,
- uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
-int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
-
-int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
-
-int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
- uint64_t asize, uint64_t inflight, uint64_t *used,
- uint64_t *ref_rsrv);
-int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
- uint64_t quota);
-int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
- uint64_t reservation);
-
-boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
- uint64_t earlier_txg);
-void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
-boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
-
-int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
- dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
-void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
- dsl_dataset_t *origin_head, dmu_tx_t *tx);
-int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
-void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx);
-
-void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
- dmu_tx_t *tx);
-void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
-int dsl_dataset_get_snapname(dsl_dataset_t *ds);
-int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
- uint64_t *value);
-int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
- boolean_t adj_cnt);
-void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
- zprop_source_t source, uint64_t value, dmu_tx_t *tx);
-void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
-boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
-boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
-
-int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx);
-void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx);
-int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
- nvlist_t *result);
-
-uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds);
-void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
-boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds);
-void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
-
-void dsl_dataset_deactivate_feature(uint64_t dsobj,
- spa_feature_t f, dmu_tx_t *tx);
-
-#ifdef ZFS_DEBUG
-#define dprintf_ds(ds, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
- dsl_dataset_name(ds, __ds_name); \
- dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_ds(dd, fmt, ...)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DATASET_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DSL_DEADLIST_H
-#define _SYS_DSL_DEADLIST_H
-
-#include <sys/bpobj.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dmu_buf;
-struct dsl_dataset;
-
-typedef struct dsl_deadlist_phys {
- uint64_t dl_used;
- uint64_t dl_comp;
- uint64_t dl_uncomp;
- uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
-} dsl_deadlist_phys_t;
-
-typedef struct dsl_deadlist {
- objset_t *dl_os;
- uint64_t dl_object;
- avl_tree_t dl_tree;
- boolean_t dl_havetree;
- struct dmu_buf *dl_dbuf;
- dsl_deadlist_phys_t *dl_phys;
- kmutex_t dl_lock;
-
- /* if it's the old on-disk format: */
- bpobj_t dl_bpobj;
- boolean_t dl_oldfmt;
-} dsl_deadlist_t;
-
-typedef struct dsl_deadlist_entry {
- avl_node_t dle_node;
- uint64_t dle_mintxg;
- bpobj_t dle_bpobj;
-} dsl_deadlist_entry_t;
-
-void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
-void dsl_deadlist_close(dsl_deadlist_t *dl);
-uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
-void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
-void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
-void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
-void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
-uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
- uint64_t mrs_obj, dmu_tx_t *tx);
-void dsl_deadlist_space(dsl_deadlist_t *dl,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-void dsl_deadlist_space_range(dsl_deadlist_t *dl,
- uint64_t mintxg, uint64_t maxtxg,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
-void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
- dmu_tx_t *tx);
-boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DEADLIST_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -1,81 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DSL_DELEG_H
-#define _SYS_DSL_DELEG_H
-
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZFS_DELEG_PERM_NONE ""
-#define ZFS_DELEG_PERM_CREATE "create"
-#define ZFS_DELEG_PERM_DESTROY "destroy"
-#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
-#define ZFS_DELEG_PERM_ROLLBACK "rollback"
-#define ZFS_DELEG_PERM_CLONE "clone"
-#define ZFS_DELEG_PERM_PROMOTE "promote"
-#define ZFS_DELEG_PERM_RENAME "rename"
-#define ZFS_DELEG_PERM_MOUNT "mount"
-#define ZFS_DELEG_PERM_SHARE "share"
-#define ZFS_DELEG_PERM_SEND "send"
-#define ZFS_DELEG_PERM_RECEIVE "receive"
-#define ZFS_DELEG_PERM_ALLOW "allow"
-#define ZFS_DELEG_PERM_USERPROP "userprop"
-#define ZFS_DELEG_PERM_VSCAN "vscan"
-#define ZFS_DELEG_PERM_USERQUOTA "userquota"
-#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
-#define ZFS_DELEG_PERM_USERUSED "userused"
-#define ZFS_DELEG_PERM_GROUPUSED "groupused"
-#define ZFS_DELEG_PERM_HOLD "hold"
-#define ZFS_DELEG_PERM_RELEASE "release"
-#define ZFS_DELEG_PERM_DIFF "diff"
-#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
-#define ZFS_DELEG_PERM_REMAP "remap"
-
-/*
- * Note: the names of properties that are marked delegatable are also
- * valid delegated permissions
- */
-
-int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
-int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
-int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
-int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
-void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
-int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
-int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
-int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
-boolean_t dsl_delegation_on(objset_t *os);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DELEG_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
@@ -1,68 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- */
-
-#ifndef _SYS_DSL_DESTROY_H
-#define _SYS_DSL_DESTROY_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct nvlist;
-struct dsl_dataset;
-struct dmu_tx;
-
-int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
- struct nvlist *);
-int dsl_destroy_snapshot(const char *, boolean_t);
-int dsl_destroy_head(const char *);
-int dsl_destroy_head_check_impl(struct dsl_dataset *, int);
-void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *);
-int dsl_destroy_inconsistent(const char *, void *);
-int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t);
-void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *,
- boolean_t, struct dmu_tx *);
-
-typedef struct dsl_destroy_snapshot_arg {
- const char *ddsa_name;
- boolean_t ddsa_defer;
-} dsl_destroy_snapshot_arg_t;
-
-int dsl_destroy_snapshot_check(void *, dmu_tx_t *);
-void dsl_destroy_snapshot_sync(void *, dmu_tx_t *);
-
-typedef struct dsl_destroy_head_arg {
- const char *ddha_name;
-} dsl_destroy_head_arg_t;
-
-int dsl_destroy_head_check(void *, dmu_tx_t *);
-void dsl_destroy_head_sync(void *, dmu_tx_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DESTROY_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -1,209 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- */
-
-#ifndef _SYS_DSL_DIR_H
-#define _SYS_DSL_DIR_H
-
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/refcount.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-
-/*
- * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
- * They should be of the format <reverse-dns>:<field>.
- */
-
-#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
-#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
-#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg"
-
-typedef enum dd_used {
- DD_USED_HEAD,
- DD_USED_SNAP,
- DD_USED_CHILD,
- DD_USED_CHILD_RSRV,
- DD_USED_REFRSRV,
- DD_USED_NUM
-} dd_used_t;
-
-#define DD_FLAG_USED_BREAKDOWN (1<<0)
-
-typedef struct dsl_dir_phys {
- uint64_t dd_creation_time; /* not actually used */
- uint64_t dd_head_dataset_obj;
- uint64_t dd_parent_obj;
- uint64_t dd_origin_obj;
- uint64_t dd_child_dir_zapobj;
- /*
- * how much space our children are accounting for; for leaf
- * datasets, == physical space used by fs + snaps
- */
- uint64_t dd_used_bytes;
- uint64_t dd_compressed_bytes;
- uint64_t dd_uncompressed_bytes;
- /* Administrative quota setting */
- uint64_t dd_quota;
- /* Administrative reservation setting */
- uint64_t dd_reserved;
- uint64_t dd_props_zapobj;
- uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
- uint64_t dd_flags;
- uint64_t dd_used_breakdown[DD_USED_NUM];
- uint64_t dd_clones; /* dsl_dir objects */
- uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
-} dsl_dir_phys_t;
-
-struct dsl_dir {
- dmu_buf_user_t dd_dbu;
-
- /* These are immutable; no lock needed: */
- uint64_t dd_object;
- dsl_pool_t *dd_pool;
-
- /* Stable until user eviction; no lock needed: */
- dmu_buf_t *dd_dbuf;
-
- /* protected by lock on pool's dp_dirty_dirs list */
- txg_node_t dd_dirty_link;
-
- /* protected by dp_config_rwlock */
- dsl_dir_t *dd_parent;
-
- /* Protected by dd_lock */
- kmutex_t dd_lock;
- list_t dd_props; /* list of dsl_prop_record_t's */
- timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
- uint64_t dd_origin_txg;
-
- /* gross estimate of space used by in-flight tx's */
- uint64_t dd_tempreserved[TXG_SIZE];
- /* amount of space we expect to write; == amount of dirty data */
- int64_t dd_space_towrite[TXG_SIZE];
-
- /* protected by dd_lock; keep at end of struct for better locality */
- char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
-};
-
-inline dsl_dir_phys_t *
-dsl_dir_phys(dsl_dir_t *dd)
-{
- return (dd->dd_dbuf->db_data);
-}
-
-void dsl_dir_rele(dsl_dir_t *dd, void *tag);
-void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
-int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
- dsl_dir_t **, const char **tail);
-int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag, dsl_dir_t **);
-void dsl_dir_name(dsl_dir_t *dd, char *buf);
-int dsl_dir_namelen(dsl_dir_t *dd);
-uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
- const char *name, dmu_tx_t *tx);
-
-uint64_t dsl_dir_get_used(dsl_dir_t *dd);
-uint64_t dsl_dir_get_compressed(dsl_dir_t *dd);
-uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
-uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
-uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);
-uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd);
-uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd);
-uint64_t dsl_dir_get_usedds(dsl_dir_t *dd);
-uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd);
-uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd);
-void dsl_dir_get_origin(dsl_dir_t *dd, char *buf);
-int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count);
-int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count);
-int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count);
-
-void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
-uint64_t dsl_dir_space_available(dsl_dir_t *dd,
- dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
-void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
-void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
-int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
- uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx);
-void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
-void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
-void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
- int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
-void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
- dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
-int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
- uint64_t quota);
-int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
- uint64_t reservation);
-int dsl_dir_activate_fs_ss_limit(const char *);
-int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
- cred_t *);
-void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
-int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t);
-int dsl_dir_rename(const char *oldname, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
- uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
-boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
-void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
- uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
-void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
-timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
-void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
- dmu_tx_t *tx);
-void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
-boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
-
-/* internal reserved dir name */
-#define MOS_DIR_NAME "$MOS"
-#define ORIGIN_DIR_NAME "$ORIGIN"
-#define FREE_DIR_NAME "$FREE"
-#define LEAK_DIR_NAME "$LEAK"
-
-#ifdef ZFS_DEBUG
-#define dprintf_dd(dd, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
- dsl_dir_name(dd, __ds_name); \
- dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_dd(dd, fmt, ...)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DIR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -1,191 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- */
-
-#ifndef _SYS_DSL_POOL_H
-#define _SYS_DSL_POOL_H
-
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/txg_impl.h>
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/dnode.h>
-#include <sys/ddt.h>
-#include <sys/arc.h>
-#include <sys/bpobj.h>
-#include <sys/bptree.h>
-#include <sys/rrwlock.h>
-#include <sys/dsl_synctask.h>
-#include <sys/mmp.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct objset;
-struct dsl_dir;
-struct dsl_dataset;
-struct dsl_pool;
-struct dmu_tx;
-struct dsl_scan;
-
-extern uint64_t zfs_dirty_data_max;
-extern uint64_t zfs_dirty_data_max_max;
-extern uint64_t zfs_dirty_data_sync_pct;
-extern int zfs_dirty_data_max_percent;
-extern int zfs_delay_min_dirty_percent;
-extern uint64_t zfs_delay_scale;
-
-/* These macros are for indexing into the zfs_all_blkstats_t. */
-#define DMU_OT_DEFERRED DMU_OT_NONE
-#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
-#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1)
-
-typedef struct zfs_blkstat {
- uint64_t zb_count;
- uint64_t zb_asize;
- uint64_t zb_lsize;
- uint64_t zb_psize;
- uint64_t zb_gangs;
- uint64_t zb_ditto_2_of_2_samevdev;
- uint64_t zb_ditto_2_of_3_samevdev;
- uint64_t zb_ditto_3_of_3_samevdev;
-} zfs_blkstat_t;
-
-typedef struct zfs_all_blkstats {
- zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
- kmutex_t zab_lock;
-} zfs_all_blkstats_t;
-
-
-typedef struct dsl_pool {
- /* Immutable */
- spa_t *dp_spa;
- struct objset *dp_meta_objset;
- struct dsl_dir *dp_root_dir;
- struct dsl_dir *dp_mos_dir;
- struct dsl_dir *dp_free_dir;
- struct dsl_dir *dp_leak_dir;
- struct dsl_dataset *dp_origin_snap;
- uint64_t dp_root_dir_obj;
- struct taskq *dp_vnrele_taskq;
-
- /* No lock needed - sync context only */
- blkptr_t dp_meta_rootbp;
- uint64_t dp_tmp_userrefs_obj;
- bpobj_t dp_free_bpobj;
- uint64_t dp_bptree_obj;
- uint64_t dp_empty_bpobj;
- bpobj_t dp_obsolete_bpobj;
-
- struct dsl_scan *dp_scan;
-
- /* Uses dp_lock */
- kmutex_t dp_lock;
- kcondvar_t dp_spaceavail_cv;
- uint64_t dp_dirty_pertxg[TXG_SIZE];
- uint64_t dp_dirty_total;
- uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
- uint64_t dp_mos_used_delta;
- uint64_t dp_mos_compressed_delta;
- uint64_t dp_mos_uncompressed_delta;
-
- /*
- * Time of most recently scheduled (furthest in the future)
- * wakeup for delayed transactions.
- */
- hrtime_t dp_last_wakeup;
-
- /* Has its own locking */
- tx_state_t dp_tx;
- txg_list_t dp_dirty_datasets;
- txg_list_t dp_dirty_zilogs;
- txg_list_t dp_dirty_dirs;
- txg_list_t dp_sync_tasks;
- txg_list_t dp_early_sync_tasks;
- taskq_t *dp_sync_taskq;
- taskq_t *dp_zil_clean_taskq;
-
- /*
- * Protects administrative changes (properties, namespace)
- *
- * It is only held for write in syncing context. Therefore
- * syncing context does not need to ever have it for read, since
- * nobody else could possibly have it for write.
- */
- rrwlock_t dp_config_rwlock;
-
- zfs_all_blkstats_t *dp_blkstats;
-} dsl_pool_t;
-
-int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
-int dsl_pool_open(dsl_pool_t *dp);
-void dsl_pool_close(dsl_pool_t *dp);
-dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
-void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
-int dsl_pool_sync_context(dsl_pool_t *dp);
-uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
-uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
- zfs_space_check_t slop_policy);
-void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
-void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
-void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
- const blkptr_t *bpp);
-void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
- int64_t used, int64_t comp, int64_t uncomp);
-void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp,
- int64_t used, int64_t comp, int64_t uncomp);
-void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
-void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
-void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
-boolean_t dsl_pool_config_held(dsl_pool_t *dp);
-boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
-boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
-
-taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
-
-int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
- const char *tag, uint64_t now, dmu_tx_t *tx);
-int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
- const char *tag, dmu_tx_t *tx);
-void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
-int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
-int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
-void dsl_pool_rele(dsl_pool_t *dp, void *tag);
-
-void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_POOL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
@@ -1,115 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DSL_PROP_H
-#define _SYS_DSL_PROP_H
-
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/zfs_context.h>
-#include <sys/dsl_synctask.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-struct dsl_dir;
-
-/* The callback func may not call into the DMU or DSL! */
-typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
-
-typedef struct dsl_prop_record {
- list_node_t pr_node; /* link on dd_props */
- const char *pr_propname;
- list_t pr_cbs;
-} dsl_prop_record_t;
-
-typedef struct dsl_prop_cb_record {
- list_node_t cbr_pr_node; /* link on pr_cbs */
- list_node_t cbr_ds_node; /* link on ds_prop_cbs */
- dsl_prop_record_t *cbr_pr;
- struct dsl_dataset *cbr_ds;
- dsl_prop_changed_cb_t *cbr_func;
- void *cbr_arg;
-} dsl_prop_cb_record_t;
-
-typedef struct dsl_props_arg {
- nvlist_t *pa_props;
- zprop_source_t pa_source;
-} dsl_props_arg_t;
-
-void dsl_prop_init(dsl_dir_t *dd);
-void dsl_prop_fini(dsl_dir_t *dd);
-int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg);
-void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg);
-void dsl_prop_notify_all(struct dsl_dir *dd);
-boolean_t dsl_prop_hascb(struct dsl_dataset *ds);
-
-int dsl_prop_get(const char *ddname, const char *propname,
- int intsz, int numints, void *buf, char *setpoint);
-int dsl_prop_get_integer(const char *ddname, const char *propname,
- uint64_t *valuep, char *setpoint);
-int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
-int dsl_prop_get_received(const char *dsname, nvlist_t **nvp);
-int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
- int intsz, int numints, void *buf, char *setpoint);
-int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname,
- uint64_t *valuep);
-int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
- int intsz, int numints, void *buf, char *setpoint,
- boolean_t snapshot);
-
-void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source,
- nvlist_t *props, dmu_tx_t *tx);
-void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname,
- zprop_source_t source, int intsz, int numints, const void *value,
- dmu_tx_t *tx);
-int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
-int dsl_prop_set_int(const char *dsname, const char *propname,
- zprop_source_t source, uint64_t value);
-int dsl_prop_set_string(const char *dsname, const char *propname,
- zprop_source_t source, const char *value);
-int dsl_prop_inherit(const char *dsname, const char *propname,
- zprop_source_t source);
-
-int dsl_prop_predict(dsl_dir_t *dd, const char *propname,
- zprop_source_t source, uint64_t value, uint64_t *newvalp);
-
-/* flag first receive on or after SPA_VERSION_RECVD_PROPS */
-boolean_t dsl_prop_get_hasrecvd(const char *dsname);
-int dsl_prop_set_hasrecvd(const char *dsname);
-void dsl_prop_unset_hasrecvd(const char *dsname);
-
-void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
-void dsl_prop_nvlist_add_string(nvlist_t *nv,
- zfs_prop_t prop, const char *value);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_PROP_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
@@ -1,188 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2017 Datto Inc.
- */
-
-#ifndef _SYS_DSL_SCAN_H
-#define _SYS_DSL_SCAN_H
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/ddt.h>
-#include <sys/bplist.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct objset;
-struct dsl_dir;
-struct dsl_dataset;
-struct dsl_pool;
-struct dmu_tx;
-
-/*
- * All members of this structure must be uint64_t, for byteswap
- * purposes.
- */
-typedef struct dsl_scan_phys {
- uint64_t scn_func; /* pool_scan_func_t */
- uint64_t scn_state; /* dsl_scan_state_t */
- uint64_t scn_queue_obj;
- uint64_t scn_min_txg;
- uint64_t scn_max_txg;
- uint64_t scn_cur_min_txg;
- uint64_t scn_cur_max_txg;
- uint64_t scn_start_time;
- uint64_t scn_end_time;
- uint64_t scn_to_examine; /* total bytes to be scanned */
- uint64_t scn_examined; /* bytes scanned so far */
- uint64_t scn_to_process;
- uint64_t scn_processed;
- uint64_t scn_errors; /* scan I/O error count */
- uint64_t scn_ddt_class_max;
- ddt_bookmark_t scn_ddt_bookmark;
- zbookmark_phys_t scn_bookmark;
- uint64_t scn_flags; /* dsl_scan_flags_t */
-} dsl_scan_phys_t;
-
-#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
-
-typedef enum dsl_scan_flags {
- DSF_VISIT_DS_AGAIN = 1<<0,
- DSF_SCRUB_PAUSED = 1<<1,
-} dsl_scan_flags_t;
-
-/*
- * Every pool will have one dsl_scan_t and this structure will contain
- * in-memory information about the scan and a pointer to the on-disk
- * representation (i.e. dsl_scan_phys_t). Most of the state of the scan
- * is contained on-disk to allow the scan to resume in the event of a reboot
- * or panic. This structure maintains information about the behavior of a
- * running scan, some caching information, and how it should traverse the pool.
- *
- * The following members of this structure direct the behavior of the scan:
- *
- * scn_suspending - a scan that cannot be completed in a single txg or
- * has exceeded its allotted time will need to suspend.
- * When this flag is set the scanner will stop traversing
- * the pool and write out the current state to disk.
- *
- * scn_restart_txg - directs the scanner to either restart or start a
- * a scan at the specified txg value.
- *
- * scn_done_txg - when a scan completes its traversal it will set
- * the completion txg to the next txg. This is necessary
- * to ensure that any blocks that were freed during
- * the scan but have not yet been processed (i.e deferred
- * frees) are accounted for.
- *
- * This structure also maintains information about deferred frees which are
- * a special kind of traversal. Deferred free can exist in either a bptree or
- * a bpobj structure. The scn_is_bptree flag will indicate the type of
- * deferred free that is in progress. If the deferred free is part of an
- * asynchronous destroy then the scn_async_destroying flag will be set.
- */
-typedef struct dsl_scan {
- struct dsl_pool *scn_dp;
-
- uint64_t scn_restart_txg;
- uint64_t scn_done_txg;
- uint64_t scn_sync_start_time;
- uint64_t scn_issued_before_pass;
-
- /* for freeing blocks */
- boolean_t scn_is_bptree;
- boolean_t scn_async_destroying;
- boolean_t scn_async_stalled;
- uint64_t scn_async_block_min_time_ms;
- /* flags and stats for controlling scan state */
- boolean_t scn_is_sorted; /* doing sequential scan */
- boolean_t scn_clearing; /* scan is issuing sequential extents */
- boolean_t scn_checkpointing; /* scan is issuing all queued extents */
- boolean_t scn_suspending; /* scan is suspending until next txg */
- uint64_t scn_last_checkpoint; /* time of last checkpoint */
-
- /* members for thread synchronization */
- zio_t *scn_zio_root; /* root zio for waiting on IO */
- taskq_t *scn_taskq; /* task queue for issuing extents */
-
- /* for controlling scan prefetch, protected by spa_scrub_lock */
- boolean_t scn_prefetch_stop; /* prefetch should stop */
- zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */
- avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */
- uint64_t scn_maxinflight_bytes; /* max bytes in flight for poool */
-
- /* per txg statistics */
- uint64_t scn_visited_this_txg; /* total bps visited this txg */
- uint64_t scn_holes_this_txg;
- uint64_t scn_lt_min_this_txg;
- uint64_t scn_gt_max_this_txg;
- uint64_t scn_ddt_contained_this_txg;
- uint64_t scn_objsets_visited_this_txg;
- uint64_t scn_avg_seg_size_this_txg;
- uint64_t scn_segs_this_txg;
- uint64_t scn_avg_zio_size_this_txg;
- uint64_t scn_zios_this_txg;
-
- /* members needed for syncing scan status to disk */
- dsl_scan_phys_t scn_phys; /* on disk representation of scan */
- dsl_scan_phys_t scn_phys_cached;
- avl_tree_t scn_queue; /* queue of datasets to scan */
- uint64_t scn_bytes_pending; /* outstanding data to issue */
-} dsl_scan_t;
-
-typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
-
-void dsl_scan_global_init(void);
-
-void scan_init(void);
-void scan_fini(void);
-int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
-void dsl_scan_fini(struct dsl_pool *dp);
-void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
-int dsl_scan_cancel(struct dsl_pool *);
-int dsl_scan(struct dsl_pool *, pool_scan_func_t);
-boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
-int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
-void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
-boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
-boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
-void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
- ddt_entry_t *dde, dmu_tx_t *tx);
-void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
- struct dmu_tx *tx);
-boolean_t dsl_scan_active(dsl_scan_t *scn);
-boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
-void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
-void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
-void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_SCAN_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -1,127 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_DSL_SYNCTASK_H
-#define _SYS_DSL_SYNCTASK_H
-
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_pool;
-
-typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
-typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *);
-
-typedef enum zfs_space_check {
- /*
- * Normal space check: if there is less than 3.2% free space,
- * the operation will fail. Operations which are logically
- * creating things should use this (e.g. "zfs create", "zfs snapshot").
- * User writes (via the ZPL / ZVOL) also fail at this point.
- */
- ZFS_SPACE_CHECK_NORMAL,
-
- /*
- * Space check allows use of half the slop space. If there
- * is less than 1.6% free space, the operation will fail. Most
- * operations should use this (e.g. "zfs set", "zfs rename"),
- * because we want them to succeed even after user writes are failing,
- * so that they can be used as part of the space recovery process.
- */
- ZFS_SPACE_CHECK_RESERVED,
-
- /*
- * Space check allows use of three quarters of the slop space.
- * If there is less than 0.8% free space, the operation will
- * fail.
- */
- ZFS_SPACE_CHECK_EXTRA_RESERVED,
-
- /*
- * In all cases "zfs destroy" is expected to result in an net
- * reduction of space, except one. When the pool has a
- * checkpoint, space freed by "zfs destroy" will not actually
- * free anything internally. Thus, it starts failing after
- * three quarters of the slop space is exceeded.
- */
- ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED,
-
- /*
- * A channel program can run a "zfs destroy" as part of its
- * script and therefore has the same space_check policy when
- * being evaluated.
- */
- ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY,
-
- /*
- * No space check is performed. This level of space check should
- * be used cautiously as operations that use it can even run when
- * 0.8% capacity is left for use. In this scenario, if there is a
- * checkpoint, async destroys are suspended and any kind of freeing
- * can potentially add space instead of freeing it.
- *
- * See also the comments above spa_slop_shift.
- */
- ZFS_SPACE_CHECK_NONE,
-
- ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE,
-
-} zfs_space_check_t;
-
-typedef struct dsl_sync_task {
- txg_node_t dst_node;
- struct dsl_pool *dst_pool;
- uint64_t dst_txg;
- int dst_space;
- zfs_space_check_t dst_space_check;
- dsl_checkfunc_t *dst_checkfunc;
- dsl_syncfunc_t *dst_syncfunc;
- void *dst_arg;
- int dst_error;
- boolean_t dst_nowaiter;
-} dsl_sync_task_t;
-
-void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *);
-int dsl_sync_task(const char *, dsl_checkfunc_t *,
- dsl_syncfunc_t *, void *, int, zfs_space_check_t);
-void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
- void *, int, zfs_space_check_t, dmu_tx_t *);
-int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
- dsl_syncfunc_t *, void *, int, zfs_space_check_t);
-void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
- void *, int, zfs_space_check_t, dmu_tx_t *);
-int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *,
- dsl_sigfunc_t *, void *, int, zfs_space_check_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_SYNCTASK_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
@@ -1,57 +0,0 @@
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- */
-
-#ifndef _SYS_DSL_USERHOLD_H
-#define _SYS_DSL_USERHOLD_H
-
-#include <sys/nvpair.h>
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_pool;
-struct dsl_dataset;
-struct dmu_tx;
-
-int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor,
- nvlist_t *errlist);
-int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist);
-int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl);
-void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds);
-int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag,
- boolean_t temphold, struct dmu_tx *tx);
-void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag,
- minor_t minor, uint64_t now, struct dmu_tx *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_USERHOLD_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -1,127 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#ifndef _SYS_METASLAB_H
-#define _SYS_METASLAB_H
-
-#include <sys/spa.h>
-#include <sys/space_map.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-typedef struct metaslab_ops {
- uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
-} metaslab_ops_t;
-
-
-extern metaslab_ops_t *zfs_metaslab_ops;
-
-int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
- metaslab_t **);
-void metaslab_fini(metaslab_t *);
-
-int metaslab_load(metaslab_t *);
-void metaslab_unload(metaslab_t *);
-
-uint64_t metaslab_allocated_space(metaslab_t *);
-
-void metaslab_sync(metaslab_t *, uint64_t);
-void metaslab_sync_done(metaslab_t *, uint64_t);
-void metaslab_sync_reassess(metaslab_group_t *);
-uint64_t metaslab_block_maxsize(metaslab_t *);
-
-/*
- * metaslab alloc flags
- */
-#define METASLAB_HINTBP_FAVOR 0x0
-#define METASLAB_HINTBP_AVOID 0x1
-#define METASLAB_GANG_HEADER 0x2
-#define METASLAB_GANG_CHILD 0x4
-#define METASLAB_ASYNC_ALLOC 0x8
-#define METASLAB_DONT_THROTTLE 0x10
-#define METASLAB_MUST_RESERVE 0x20
-#define METASLAB_FASTWRITE 0x40
-
-int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
- blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
- int);
-int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
- dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
-void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
-void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
-void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
-void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
-void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
-int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
-int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
-void metaslab_check_free(spa_t *, const blkptr_t *);
-
-void metaslab_alloc_trace_init(void);
-void metaslab_alloc_trace_fini(void);
-void metaslab_trace_init(zio_alloc_list_t *);
-void metaslab_trace_fini(zio_alloc_list_t *);
-
-metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
-void metaslab_class_destroy(metaslab_class_t *);
-int metaslab_class_validate(metaslab_class_t *);
-void metaslab_class_histogram_verify(metaslab_class_t *);
-uint64_t metaslab_class_fragmentation(metaslab_class_t *);
-uint64_t metaslab_class_expandable_space(metaslab_class_t *);
-boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
- zio_t *, int);
-void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
-
-uint64_t metaslab_class_get_alloc(metaslab_class_t *);
-uint64_t metaslab_class_get_space(metaslab_class_t *);
-uint64_t metaslab_class_get_dspace(metaslab_class_t *);
-uint64_t metaslab_class_get_deferred(metaslab_class_t *);
-uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
-
-metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
-void metaslab_group_destroy(metaslab_group_t *);
-void metaslab_group_activate(metaslab_group_t *);
-void metaslab_group_passivate(metaslab_group_t *);
-boolean_t metaslab_group_initialized(metaslab_group_t *);
-uint64_t metaslab_group_get_space(metaslab_group_t *);
-void metaslab_group_histogram_verify(metaslab_group_t *);
-uint64_t metaslab_group_fragmentation(metaslab_group_t *);
-void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
-void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
- boolean_t);
-void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
-void metaslab_recalculate_weight_and_sort(metaslab_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_METASLAB_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -1,501 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_METASLAB_IMPL_H
-#define _SYS_METASLAB_IMPL_H
-
-#include <sys/metaslab.h>
-#include <sys/space_map.h>
-#include <sys/range_tree.h>
-#include <sys/vdev.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Metaslab allocation tracing record.
- */
-typedef struct metaslab_alloc_trace {
- list_node_t mat_list_node;
- metaslab_group_t *mat_mg;
- metaslab_t *mat_msp;
- uint64_t mat_size;
- uint64_t mat_weight;
- uint32_t mat_dva_id;
- uint64_t mat_offset;
- int mat_allocator;
-} metaslab_alloc_trace_t;
-
-/*
- * Used by the metaslab allocation tracing facility to indicate
- * error conditions. These errors are stored to the offset member
- * of the metaslab_alloc_trace_t record and displayed by mdb.
- */
-typedef enum trace_alloc_type {
- TRACE_ALLOC_FAILURE = -1ULL,
- TRACE_TOO_SMALL = -2ULL,
- TRACE_FORCE_GANG = -3ULL,
- TRACE_NOT_ALLOCATABLE = -4ULL,
- TRACE_GROUP_FAILURE = -5ULL,
- TRACE_ENOSPC = -6ULL,
- TRACE_CONDENSING = -7ULL,
- TRACE_VDEV_ERROR = -8ULL,
- TRACE_INITIALIZING = -9ULL
-} trace_alloc_type_t;
-
-#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
-#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
-#define METASLAB_WEIGHT_CLAIM (1ULL << 61)
-#define METASLAB_WEIGHT_TYPE (1ULL << 60)
-#define METASLAB_ACTIVE_MASK \
- (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
- METASLAB_WEIGHT_CLAIM)
-
-/*
- * The metaslab weight is used to encode the amount of free space in a
- * metaslab, such that the "best" metaslab appears first when sorting the
- * metaslabs by weight. The weight (and therefore the "best" metaslab) can
- * be determined in two different ways: by computing a weighted sum of all
- * the free space in the metaslab (a space based weight) or by counting only
- * the free segments of the largest size (a segment based weight). We prefer
- * the segment based weight because it reflects how the free space is
- * comprised, but we cannot always use it -- legacy pools do not have the
- * space map histogram information necessary to determine the largest
- * contiguous regions. Pools that have the space map histogram determine
- * the segment weight by looking at each bucket in the histogram and
- * determining the free space whose size in bytes is in the range:
- * [2^i, 2^(i+1))
- * We then encode the largest index, i, that contains regions into the
- * segment-weighted value.
- *
- * Space-based weight:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * |PSC1| weighted-free space |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * PS - indicates primary and secondary activation
- * C - indicates activation for claimed block zio
- * space - the fragmentation-weighted space
- *
- * Segment-based weight:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * |PSC0| idx| count of segments in region |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * PS - indicates primary and secondary activation
- * C - indicates activation for claimed block zio
- * idx - index for the highest bucket in the histogram
- * count - number of segments in the specified bucket
- */
-#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3)
-#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x)
-
-#define WEIGHT_IS_SPACEBASED(weight) \
- ((weight) == 0 || BF64_GET((weight), 60, 1))
-#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1)
-
-/*
- * These macros are only applicable to segment-based weighting.
- */
-#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6)
-#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x)
-#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54)
-#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x)
-
-/*
- * A metaslab class encompasses a category of allocatable top-level vdevs.
- * Each top-level vdev is associated with a metaslab group which defines
- * the allocatable region for that vdev. Examples of these categories include
- * "normal" for data block allocations (i.e. main pool allocations) or "log"
- * for allocations designated for intent log devices (i.e. slog devices).
- * When a block allocation is requested from the SPA it is associated with a
- * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
- * to the class can be used to satisfy that request. Allocations are done
- * by traversing the metaslab groups that are linked off of the mc_rotor field.
- * This rotor points to the next metaslab group where allocations will be
- * attempted. Allocating a block is a 3 step process -- select the metaslab
- * group, select the metaslab, and then allocate the block. The metaslab
- * class defines the low-level block allocator that will be used as the
- * final step in allocation. These allocators are pluggable allowing each class
- * to use a block allocator that best suits that class.
- */
-struct metaslab_class {
- kmutex_t mc_lock;
- spa_t *mc_spa;
- metaslab_group_t *mc_rotor;
- metaslab_ops_t *mc_ops;
- uint64_t mc_aliquot;
-
- /*
- * Track the number of metaslab groups that have been initialized
- * and can accept allocations. An initialized metaslab group is
- * one has been completely added to the config (i.e. we have
- * updated the MOS config and the space has been added to the pool).
- */
- uint64_t mc_groups;
-
- /*
- * Toggle to enable/disable the allocation throttle.
- */
- boolean_t mc_alloc_throttle_enabled;
-
- /*
- * The allocation throttle works on a reservation system. Whenever
- * an asynchronous zio wants to perform an allocation it must
- * first reserve the number of blocks that it wants to allocate.
- * If there aren't sufficient slots available for the pending zio
- * then that I/O is throttled until more slots free up. The current
- * number of reserved allocations is maintained by the mc_alloc_slots
- * refcount. The mc_alloc_max_slots value determines the maximum
- * number of allocations that the system allows. Gang blocks are
- * allowed to reserve slots even if we've reached the maximum
- * number of allocations allowed.
- */
- uint64_t *mc_alloc_max_slots;
- zfs_refcount_t *mc_alloc_slots;
-
- uint64_t mc_alloc_groups; /* # of allocatable groups */
-
- uint64_t mc_alloc; /* total allocated space */
- uint64_t mc_deferred; /* total deferred frees */
- uint64_t mc_space; /* total space (alloc + free) */
- uint64_t mc_dspace; /* total deflated space */
- uint64_t mc_minblocksize;
- uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
-};
-
-/*
- * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
- * of a top-level vdev. They are linked togther to form a circular linked
- * list and can belong to only one metaslab class. Metaslab groups may become
- * ineligible for allocations for a number of reasons such as limited free
- * space, fragmentation, or going offline. When this happens the allocator will
- * simply find the next metaslab group in the linked list and attempt
- * to allocate from that group instead.
- */
-struct metaslab_group {
- kmutex_t mg_lock;
- metaslab_t **mg_primaries;
- metaslab_t **mg_secondaries;
- avl_tree_t mg_metaslab_tree;
- uint64_t mg_aliquot;
- boolean_t mg_allocatable; /* can we allocate? */
- uint64_t mg_ms_ready;
-
- /*
- * A metaslab group is considered to be initialized only after
- * we have updated the MOS config and added the space to the pool.
- * We only allow allocation attempts to a metaslab group if it
- * has been initialized.
- */
- boolean_t mg_initialized;
-
- uint64_t mg_free_capacity; /* percentage free */
- int64_t mg_bias;
- int64_t mg_activation_count;
- metaslab_class_t *mg_class;
- vdev_t *mg_vd;
- taskq_t *mg_taskq;
- metaslab_group_t *mg_prev;
- metaslab_group_t *mg_next;
-
- /*
- * In order for the allocation throttle to function properly, we cannot
- * have too many IOs going to each disk by default; the throttle
- * operates by allocating more work to disks that finish quickly, so
- * allocating larger chunks to each disk reduces its effectiveness.
- * However, if the number of IOs going to each allocator is too small,
- * we will not perform proper aggregation at the vdev_queue layer,
- * also resulting in decreased performance. Therefore, we will use a
- * ramp-up strategy.
- *
- * Each allocator in each metaslab group has a current queue depth
- * (mg_alloc_queue_depth[allocator]) and a current max queue depth
- * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
- * has an absolute max queue depth (mg_max_alloc_queue_depth). We
- * add IOs to an allocator until the mg_alloc_queue_depth for that
- * allocator hits the cur_max. Every time an IO completes for a given
- * allocator on a given metaslab group, we increment its cur_max until
- * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
- * help protect against disks that decrease in performance over time.
- *
- * It's possible for an allocator to handle more allocations than
- * its max. This can occur when gang blocks are required or when other
- * groups are unable to handle their share of allocations.
- */
- uint64_t mg_max_alloc_queue_depth;
- uint64_t *mg_cur_max_alloc_queue_depth;
- zfs_refcount_t *mg_alloc_queue_depth;
- int mg_allocators;
- /*
- * A metalab group that can no longer allocate the minimum block
- * size will set mg_no_free_space. Once a metaslab group is out
- * of space then its share of work must be distributed to other
- * groups.
- */
- boolean_t mg_no_free_space;
-
- uint64_t mg_allocations;
- uint64_t mg_failed_allocations;
- uint64_t mg_fragmentation;
- uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
-
- int mg_ms_initializing;
- boolean_t mg_initialize_updating;
- kmutex_t mg_ms_initialize_lock;
- kcondvar_t mg_ms_initialize_cv;
-};
-
-/*
- * This value defines the number of elements in the ms_lbas array. The value
- * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
- * This is the equivalent of highbit(UINT64_MAX).
- */
-#define MAX_LBAS 64
-
-/*
- * Each metaslab maintains a set of in-core trees to track metaslab
- * operations. The in-core free tree (ms_allocatable) contains the list of
- * free segments which are eligible for allocation. As blocks are
- * allocated, the allocated segment are removed from the ms_allocatable and
- * added to a per txg allocation tree (ms_allocating). As blocks are
- * freed, they are added to the free tree (ms_freeing). These trees
- * allow us to process all allocations and frees in syncing context
- * where it is safe to update the on-disk space maps. An additional set
- * of in-core trees is maintained to track deferred frees
- * (ms_defer). Once a block is freed it will move from the
- * ms_freed to the ms_defer tree. A deferred free means that a block
- * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
- * transactions groups later. For example, a block that is freed in txg
- * 50 will not be available for reallocation until txg 52 (50 +
- * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
- * A pool could be safely rolled back TXG_DEFERS_SIZE transactions
- * groups and ensure that no block has been reallocated.
- *
- * The simplified transition diagram looks like this:
- *
- *
- * ALLOCATE
- * |
- * V
- * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
- * ^
- * | ms_freeing <--- FREE
- * | |
- * | v
- * | ms_freed
- * | |
- * +-------- ms_defer[2] <-------+-------> (write to space map)
- *
- *
- * Each metaslab's space is tracked in a single space map in the MOS,
- * which is only updated in syncing context. Each time we sync a txg,
- * we append the allocs and frees from that txg to the space map. The
- * pool space is only updated once all metaslabs have finished syncing.
- *
- * To load the in-core free tree we read the space map from disk. This
- * object contains a series of alloc and free records that are combined
- * to make up the list of all free segments in this metaslab. These
- * segments are represented in-core by the ms_allocatable and are stored
- * in an AVL tree.
- *
- * As the space map grows (as a result of the appends) it will
- * eventually become space-inefficient. When the metaslab's in-core
- * free tree is zfs_condense_pct/100 times the size of the minimal
- * on-disk representation, we rewrite it in its minimized form. If a
- * metaslab needs to condense then we must set the ms_condensing flag to
- * ensure that allocations are not performed on the metaslab that is
- * being written.
- */
-struct metaslab {
- /*
- * This is the main lock of the metaslab and its purpose is to
- * coordinate our allocations and frees [e.g metaslab_block_alloc(),
- * metaslab_free_concrete(), ..etc] with our various syncing
- * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
- *
- * The lock is also used during some miscellaneous operations like
- * using the metaslab's histogram for the metaslab group's histogram
- * aggregation, or marking the metaslab for initialization.
- */
- kmutex_t ms_lock;
-
- /*
- * Acquired together with the ms_lock whenever we expect to
- * write to metaslab data on-disk (i.e flushing entries to
- * the metaslab's space map). It helps coordinate readers of
- * the metaslab's space map [see spa_vdev_remove_thread()]
- * with writers [see metaslab_sync()].
- *
- * Note that metaslab_load(), even though a reader, uses
- * a completely different mechanism to deal with the reading
- * of the metaslab's space map based on ms_synced_length. That
- * said, the function still uses the ms_sync_lock after it
- * has read the ms_sm [see relevant comment in metaslab_load()
- * as to why].
- */
- kmutex_t ms_sync_lock;
-
- kcondvar_t ms_load_cv;
- space_map_t *ms_sm;
- uint64_t ms_id;
- uint64_t ms_start;
- uint64_t ms_size;
- uint64_t ms_fragmentation;
-
- range_tree_t *ms_allocating[TXG_SIZE];
- range_tree_t *ms_allocatable;
- uint64_t ms_allocated_this_txg;
-
- /*
- * The following range trees are accessed only from syncing context.
- * ms_free*tree only have entries while syncing, and are empty
- * between syncs.
- */
- range_tree_t *ms_freeing; /* to free this syncing txg */
- range_tree_t *ms_freed; /* already freed this syncing txg */
- range_tree_t *ms_defer[TXG_DEFER_SIZE];
- range_tree_t *ms_checkpointing; /* to add to the checkpoint */
-
- boolean_t ms_condensing; /* condensing? */
- boolean_t ms_condense_wanted;
- uint64_t ms_condense_checked_txg;
-
- uint64_t ms_initializing; /* leaves initializing this ms */
-
- /*
- * We must always hold the ms_lock when modifying ms_loaded
- * and ms_loading.
- */
- boolean_t ms_loaded;
- boolean_t ms_loading;
-
- /*
- * The following histograms count entries that are in the
- * metaslab's space map (and its histogram) but are not in
- * ms_allocatable yet, because they are in ms_freed, ms_freeing,
- * or ms_defer[].
- *
- * When the metaslab is not loaded, its ms_weight needs to
- * reflect what is allocatable (i.e. what will be part of
- * ms_allocatable if it is loaded). The weight is computed from
- * the spacemap histogram, but that includes ranges that are
- * not yet allocatable (because they are in ms_freed,
- * ms_freeing, or ms_defer[]). Therefore, when calculating the
- * weight, we need to remove those ranges.
- *
- * The ranges in the ms_freed and ms_defer[] range trees are all
- * present in the spacemap. However, the spacemap may have
- * multiple entries to represent a contiguous range, because it
- * is written across multiple sync passes, but the changes of
- * all sync passes are consolidated into the range trees.
- * Adjacent ranges that are freed in different sync passes of
- * one txg will be represented separately (as 2 or more entries)
- * in the space map (and its histogram), but these adjacent
- * ranges will be consolidated (represented as one entry) in the
- * ms_freed/ms_defer[] range trees (and their histograms).
- *
- * When calculating the weight, we can not simply subtract the
- * range trees' histograms from the spacemap's histogram,
- * because the range trees' histograms may have entries in
- * higher buckets than the spacemap, due to consolidation.
- * Instead we must subtract the exact entries that were added to
- * the spacemap's histogram. ms_synchist and ms_deferhist[]
- * represent these exact entries, so we can subtract them from
- * the spacemap's histogram when calculating ms_weight.
- *
- * ms_synchist represents the same ranges as ms_freeing +
- * ms_freed, but without consolidation across sync passes.
- *
- * ms_deferhist[i] represents the same ranges as ms_defer[i],
- * but without consolidation across sync passes.
- */
- uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
- uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
-
- /*
- * Tracks the exact amount of allocated space of this metaslab
- * (and specifically the metaslab's space map) up to the most
- * recently completed sync pass [see usage in metaslab_sync()].
- */
- uint64_t ms_allocated_space;
- int64_t ms_deferspace; /* sum of ms_defermap[] space */
- uint64_t ms_weight; /* weight vs. others in group */
- uint64_t ms_activation_weight; /* activation weight */
-
- /*
- * Track of whenever a metaslab is selected for loading or allocation.
- * We use this value to determine how long the metaslab should
- * stay cached.
- */
- uint64_t ms_selected_txg;
-
- uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
- uint64_t ms_max_size; /* maximum allocatable size */
-
- /*
- * -1 if it's not active in an allocator, otherwise set to the allocator
- * this metaslab is active for.
- */
- int ms_allocator;
- boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */
-
- /*
- * The metaslab block allocators can optionally use a size-ordered
- * range tree and/or an array of LBAs. Not all allocators use
- * this functionality. The ms_allocatable_by_size should always
- * contain the same number of segments as the ms_allocatable. The
- * only difference is that the ms_allocatable_by_size is ordered by
- * segment sizes.
- */
- avl_tree_t ms_allocatable_by_size;
- uint64_t ms_lbas[MAX_LBAS];
-
- metaslab_group_t *ms_group; /* metaslab group */
- avl_node_t ms_group_node; /* node in metaslab group tree */
- txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
-
- /* updated every time we are done syncing the metaslab's space map */
- uint64_t ms_synced_length;
-
- boolean_t ms_new;
-};
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_METASLAB_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h
@@ -1,74 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (C) 2017 by Lawrence Livermore National Security, LLC.
- */
-
-#ifndef _SYS_MMP_H
-#define _SYS_MMP_H
-
-#include <sys/spa.h>
-#include <sys/zfs_context.h>
-#include <sys/uberblock_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define MMP_MIN_INTERVAL 100 /* ms */
-#define MMP_DEFAULT_INTERVAL 1000 /* ms */
-#define MMP_DEFAULT_IMPORT_INTERVALS 20
-#define MMP_DEFAULT_FAIL_INTERVALS 10
-#define MMP_MIN_FAIL_INTERVALS 2 /* min if != 0 */
-#define MMP_IMPORT_SAFETY_FACTOR 200 /* pct */
-#define MMP_INTERVAL_OK(interval) MAX(interval, MMP_MIN_INTERVAL)
-#define MMP_FAIL_INTVS_OK(fails) (fails == 0 ? 0 : MAX(fails, \
- MMP_MIN_FAIL_INTERVALS))
-
-typedef struct mmp_thread {
- kmutex_t mmp_thread_lock; /* protect thread mgmt fields */
- kcondvar_t mmp_thread_cv;
- kthread_t *mmp_thread;
- uint8_t mmp_thread_exiting;
- kmutex_t mmp_io_lock; /* protect below */
- hrtime_t mmp_last_write; /* last successful MMP write */
- uint64_t mmp_delay; /* decaying avg ns between MMP writes */
- uberblock_t mmp_ub; /* last ub written by sync */
- zio_t *mmp_zio_root; /* root of mmp write zios */
- uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */
- int mmp_skip_error; /* reason for last skipped write */
- vdev_t *mmp_last_leaf; /* last mmp write sent here */
- uint64_t mmp_leaf_last_gen; /* last mmp write sent here */
- uint32_t mmp_seq; /* intra-second update counter */
-} mmp_thread_t;
-
-
-extern void mmp_init(struct spa *spa);
-extern void mmp_fini(struct spa *spa);
-extern void mmp_thread_start(struct spa *spa);
-extern void mmp_thread_stop(struct spa *spa);
-extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub);
-extern void mmp_signal_all_threads(void);
-
-/* Global tuning */
-extern ulong_t zfs_multihost_interval;
-extern uint_t zfs_multihost_fail_intervals;
-extern uint_t zfs_multihost_import_intervals;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_MMP_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
@@ -1,107 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_MULTILIST_H
-#define _SYS_MULTILIST_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef list_node_t multilist_node_t;
-typedef struct multilist multilist_t;
-typedef struct multilist_sublist multilist_sublist_t;
-typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
-
-struct multilist_sublist {
- /*
- * The mutex used internally to implement thread safe insertions
- * and removals to this individual sublist. It can also be locked
- * by a consumer using multilist_sublist_{lock,unlock}, which is
- * useful if a consumer needs to traverse the list in a thread
- * safe manner.
- */
- kmutex_t mls_lock;
- /*
- * The actual list object containing all objects in this sublist.
- */
- list_t mls_list;
- /*
- * Pad to cache line, in an effort to try and prevent cache line
- * contention.
- */
-} __aligned(CACHE_LINE_SIZE);
-
-struct multilist {
- /*
- * This is used to get to the multilist_node_t structure given
- * the void *object contained on the list.
- */
- size_t ml_offset;
- /*
- * The number of sublists used internally by this multilist.
- */
- uint64_t ml_num_sublists;
- /*
- * The array of pointers to the actual sublists.
- */
- multilist_sublist_t *ml_sublists;
- /*
- * Pointer to function which determines the sublist to use
- * when inserting and removing objects from this multilist.
- * Please see the comment above multilist_create for details.
- */
- multilist_sublist_index_func_t *ml_index_func;
-};
-
-void multilist_destroy(multilist_t *);
-multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *);
-
-void multilist_insert(multilist_t *, void *);
-void multilist_remove(multilist_t *, void *);
-int multilist_is_empty(multilist_t *);
-
-unsigned int multilist_get_num_sublists(multilist_t *);
-unsigned int multilist_get_random_index(multilist_t *);
-
-multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
-multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
-void multilist_sublist_unlock(multilist_sublist_t *);
-
-void multilist_sublist_insert_head(multilist_sublist_t *, void *);
-void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
-void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
-void multilist_sublist_remove(multilist_sublist_t *, void *);
-int multilist_sublist_is_empty(multilist_sublist_t *);
-int multilist_sublist_is_empty_idx(multilist_t *, unsigned int);
-
-void *multilist_sublist_head(multilist_sublist_t *);
-void *multilist_sublist_tail(multilist_sublist_t *);
-void *multilist_sublist_next(multilist_sublist_t *, void *);
-void *multilist_sublist_prev(multilist_sublist_t *, void *);
-
-void multilist_link_init(multilist_node_t *);
-int multilist_link_active(multilist_node_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_MULTILIST_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
@@ -1,124 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_RANGE_TREE_H
-#define _SYS_RANGE_TREE_H
-
-#include <sys/avl.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define RANGE_TREE_HISTOGRAM_SIZE 64
-
-typedef struct range_tree_ops range_tree_ops_t;
-
-/*
- * Note: the range_tree may not be accessed concurrently; consumers
- * must provide external locking if required.
- */
-typedef struct range_tree {
- avl_tree_t rt_root; /* offset-ordered segment AVL tree */
- uint64_t rt_space; /* sum of all segments in the map */
- range_tree_ops_t *rt_ops;
- void *rt_arg;
-
- /* rt_avl_compare should only be set it rt_arg is an AVL tree */
- uint64_t rt_gap; /* allowable inter-segment gap */
- int (*rt_avl_compare)(const void *, const void *);
- /*
- * The rt_histogram maintains a histogram of ranges. Each bucket,
- * rt_histogram[i], contains the number of ranges whose size is:
- * 2^i <= size of range in bytes < 2^(i+1)
- */
- uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
-} range_tree_t;
-
-typedef struct range_seg {
- avl_node_t rs_node; /* AVL node */
- avl_node_t rs_pp_node; /* AVL picker-private node */
- uint64_t rs_start; /* starting offset of this segment */
- uint64_t rs_end; /* ending offset (non-inclusive) */
- uint64_t rs_fill; /* actual fill if gap mode is on */
-} range_seg_t;
-
-struct range_tree_ops {
- void (*rtop_create)(range_tree_t *rt, void *arg);
- void (*rtop_destroy)(range_tree_t *rt, void *arg);
- void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
- void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
- void (*rtop_vacate)(range_tree_t *rt, void *arg);
-};
-
-typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
-
-void range_tree_init(void);
-void range_tree_fini(void);
-range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
- int (*avl_compare)(const void*, const void*), uint64_t gap);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
-void range_tree_destroy(range_tree_t *rt);
-boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
-void range_tree_verify_not_present(range_tree_t *rt,
- uint64_t start, uint64_t size);
-range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
-void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
- uint64_t newstart, uint64_t newsize);
-uint64_t range_tree_space(range_tree_t *rt);
-boolean_t range_tree_is_empty(range_tree_t *rt);
-void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
-void range_tree_stat_verify(range_tree_t *rt);
-uint64_t range_tree_min(range_tree_t *rt);
-uint64_t range_tree_max(range_tree_t *rt);
-uint64_t range_tree_span(range_tree_t *rt);
-
-void range_tree_add(void *arg, uint64_t start, uint64_t size);
-void range_tree_remove(void *arg, uint64_t start, uint64_t size);
-void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
-void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
-void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
-
-void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
-void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
-range_seg_t *range_tree_first(range_tree_t *rt);
-
-void rt_avl_create(range_tree_t *rt, void *arg);
-void rt_avl_destroy(range_tree_t *rt, void *arg);
-void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_vacate(range_tree_t *rt, void *arg);
-extern struct range_tree_ops rt_avl_ops;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_RANGE_TREE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
@@ -1,125 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_REFCOUNT_H
-#define _SYS_REFCOUNT_H
-
-#include <sys/cdefs.h>
-#include <sys/types.h>
-/* For FreeBSD refcount(9). */
-#include_next <sys/refcount.h>
-#include <sys/list.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * If the reference is held only by the calling function and not any
- * particular object, use FTAG (which is a string) for the holder_tag.
- * Otherwise, use the object that holds the reference.
- */
-#define FTAG ((char *)(uintptr_t)__func__)
-
-#ifdef ZFS_DEBUG
-typedef struct reference {
- list_node_t ref_link;
- void *ref_holder;
- uint64_t ref_number;
- uint8_t *ref_removed;
-} reference_t;
-
-typedef struct refcount {
- kmutex_t rc_mtx;
- boolean_t rc_tracked;
- list_t rc_list;
- list_t rc_removed;
- uint64_t rc_count;
- uint64_t rc_removed_count;
-} zfs_refcount_t;
-
-/*
- * Note: zfs_refcount_t must be initialized with
- * refcount_create[_untracked]()
- */
-
-void zfs_refcount_create(zfs_refcount_t *);
-void zfs_refcount_create_untracked(zfs_refcount_t *);
-void zfs_refcount_create_tracked(zfs_refcount_t *);
-void zfs_refcount_destroy(zfs_refcount_t *);
-void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t);
-int zfs_refcount_is_zero(zfs_refcount_t *);
-int64_t zfs_refcount_count(zfs_refcount_t *);
-int64_t zfs_refcount_add(zfs_refcount_t *, void *);
-int64_t zfs_refcount_remove(zfs_refcount_t *, void *);
-int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, void *);
-int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, void *);
-void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *);
-void zfs_refcount_transfer_ownership(zfs_refcount_t *, void *, void *);
-boolean_t zfs_refcount_held(zfs_refcount_t *, void *);
-boolean_t zfs_refcount_not_held(zfs_refcount_t *, void *);
-
-void zfs_refcount_init(void);
-void zfs_refcount_fini(void);
-
-#else /* ZFS_DEBUG */
-
-typedef struct refcount {
- uint64_t rc_count;
-} zfs_refcount_t;
-
-#define zfs_refcount_create(rc) ((rc)->rc_count = 0)
-#define zfs_refcount_create_untracked(rc) ((rc)->rc_count = 0)
-#define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0)
-#define zfs_refcount_destroy(rc) ((rc)->rc_count = 0)
-#define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
-#define zfs_refcount_is_zero(rc) ((rc)->rc_count == 0)
-#define zfs_refcount_count(rc) ((rc)->rc_count)
-#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
-#define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
-#define zfs_refcount_add_many(rc, number, holder) \
- atomic_add_64_nv(&(rc)->rc_count, number)
-#define zfs_refcount_remove_many(rc, number, holder) \
- atomic_add_64_nv(&(rc)->rc_count, -number)
-#define zfs_refcount_transfer(dst, src) { \
- uint64_t __tmp = (src)->rc_count; \
- atomic_add_64(&(src)->rc_count, -__tmp); \
- atomic_add_64(&(dst)->rc_count, __tmp); \
-}
-#define zfs_refcount_transfer_ownership(rc, current_holder, new_holder) (void)0
-#define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0)
-#define zfs_refcount_not_held(rc, holder) (B_TRUE)
-
-#define zfs_refcount_init()
-#define zfs_refcount_fini()
-
-#endif /* ZFS_DEBUG */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_REFCOUNT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
@@ -1,112 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_RR_RW_LOCK_H
-#define _SYS_RR_RW_LOCK_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-/*
- * A reader-writer lock implementation that allows re-entrant reads, but
- * still gives writers priority on "new" reads.
- *
- * See rrwlock.c for more details about the implementation.
- *
- * Fields of the rrwlock_t structure:
- * - rr_lock: protects modification and reading of rrwlock_t fields
- * - rr_cv: cv for waking up readers or waiting writers
- * - rr_writer: thread id of the current writer
- * - rr_anon_rount: number of active anonymous readers
- * - rr_linked_rcount: total number of non-anonymous active readers
- * - rr_writer_wanted: a writer wants the lock
- */
-typedef struct rrwlock {
- kmutex_t rr_lock;
- kcondvar_t rr_cv;
- kthread_t *rr_writer;
- zfs_refcount_t rr_anon_rcount;
- zfs_refcount_t rr_linked_rcount;
- boolean_t rr_writer_wanted;
- boolean_t rr_track_all;
-} rrwlock_t;
-
-/*
- * 'tag' is used in reference counting tracking. The
- * 'tag' must be the same in a rrw_enter() as in its
- * corresponding rrw_exit().
- */
-void rrw_init(rrwlock_t *rrl, boolean_t track_all);
-void rrw_destroy(rrwlock_t *rrl);
-void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
-void rrw_enter_read(rrwlock_t *rrl, void *tag);
-void rrw_enter_read_prio(rrwlock_t *rrl, void *tag);
-void rrw_enter_write(rrwlock_t *rrl);
-void rrw_exit(rrwlock_t *rrl, void *tag);
-boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
-void rrw_tsd_destroy(void *arg);
-
-#define RRW_READ_HELD(x) rrw_held(x, RW_READER)
-#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER)
-#define RRW_LOCK_HELD(x) \
- (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
-
-/*
- * A reader-mostly lock implementation, tuning above reader-writer locks
- * for hightly parallel read acquisitions, pessimizing write acquisitions.
- *
- * This should be a prime number. See comment in rrwlock.c near
- * RRM_TD_LOCK() for details.
- */
-#define RRM_NUM_LOCKS 17
-typedef struct rrmlock {
- rrwlock_t locks[RRM_NUM_LOCKS];
-} rrmlock_t;
-
-void rrm_init(rrmlock_t *rrl, boolean_t track_all);
-void rrm_destroy(rrmlock_t *rrl);
-void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
-void rrm_enter_read(rrmlock_t *rrl, void *tag);
-void rrm_enter_write(rrmlock_t *rrl);
-void rrm_exit(rrmlock_t *rrl, void *tag);
-boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
-
-#define RRM_READ_HELD(x) rrm_held(x, RW_READER)
-#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER)
-#define RRM_LOCK_HELD(x) \
- (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_RR_RW_LOCK_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
@@ -1,170 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _SYS_SA_H
-#define _SYS_SA_H
-
-#include <sys/dmu.h>
-#include <sys/uio.h>
-
-/*
- * Currently available byteswap functions.
- * If it all possible new attributes should used
- * one of the already defined byteswap functions.
- * If a new byteswap function is added then the
- * ZPL/Pool version will need to be bumped.
- */
-
-typedef enum sa_bswap_type {
- SA_UINT64_ARRAY,
- SA_UINT32_ARRAY,
- SA_UINT16_ARRAY,
- SA_UINT8_ARRAY,
- SA_ACL,
-} sa_bswap_type_t;
-
-typedef uint16_t sa_attr_type_t;
-
-/*
- * Attribute to register support for.
- */
-typedef struct sa_attr_reg {
- char *sa_name; /* attribute name */
- uint16_t sa_length;
- sa_bswap_type_t sa_byteswap; /* bswap functon enum */
- sa_attr_type_t sa_attr; /* filled in during registration */
-} sa_attr_reg_t;
-
-
-typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
- boolean_t, void *userptr);
-
-/*
- * array of attributes to store.
- *
- * This array should be treated as opaque/private data.
- * The SA_BULK_ADD_ATTR() macro should be used for manipulating
- * the array.
- *
- * When sa_replace_all_by_template() is used the attributes
- * will be stored in the order defined in the array, except that
- * the attributes may be split between the bonus and the spill buffer
- *
- */
-typedef struct sa_bulk_attr {
- void *sa_data;
- sa_data_locator_t *sa_data_func;
- uint16_t sa_length;
- sa_attr_type_t sa_attr;
- /* the following are private to the sa framework */
- void *sa_addr;
- uint16_t sa_buftype;
- uint16_t sa_size;
-} sa_bulk_attr_t;
-
-
-/*
- * special macro for adding entries for bulk attr support
- * bulk - sa_bulk_attr_t
- * count - integer that will be incremented during each add
- * attr - attribute to manipulate
- * func - function for accessing data.
- * data - pointer to data.
- * len - length of data
- */
-
-#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
-{ \
- b[idx].sa_attr = attr;\
- b[idx].sa_data_func = func; \
- b[idx].sa_data = data; \
- b[idx++].sa_length = len; \
-}
-
-typedef struct sa_os sa_os_t;
-
-typedef enum sa_handle_type {
- SA_HDL_SHARED,
- SA_HDL_PRIVATE
-} sa_handle_type_t;
-
-struct sa_handle;
-typedef void *sa_lookup_tab_t;
-typedef struct sa_handle sa_handle_t;
-
-typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
-
-int sa_handle_get(objset_t *, uint64_t, void *userp,
- sa_handle_type_t, sa_handle_t **);
-int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
- sa_handle_type_t, sa_handle_t **);
-void sa_handle_destroy(sa_handle_t *);
-int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
-void sa_buf_rele(dmu_buf_t *, void *);
-int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
-int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
- uint32_t buflen, dmu_tx_t *);
-int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
-int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
-int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
-int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
-int sa_size(sa_handle_t *, sa_attr_type_t, int *);
-int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
- uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
-void sa_object_info(sa_handle_t *, dmu_object_info_t *);
-void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
-void *sa_get_userdata(sa_handle_t *);
-void sa_set_userp(sa_handle_t *, void *);
-dmu_buf_t *sa_get_db(sa_handle_t *);
-uint64_t sa_handle_object(sa_handle_t *);
-boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
-void sa_register_update_callback(objset_t *, sa_update_cb_t *);
-int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
-void sa_tear_down(objset_t *);
-int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
- int, dmu_tx_t *);
-int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
- int, dmu_tx_t *);
-boolean_t sa_enabled(objset_t *);
-void sa_cache_init(void);
-void sa_cache_fini(void);
-int sa_set_sa_object(objset_t *, uint64_t);
-int sa_hdrsize(void *);
-void sa_handle_lock(sa_handle_t *);
-void sa_handle_unlock(sa_handle_t *);
-
-#ifdef _KERNEL
-int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SA_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
@@ -1,291 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- */
-
-#ifndef _SYS_SA_IMPL_H
-#define _SYS_SA_IMPL_H
-
-#include <sys/dmu.h>
-#include <sys/refcount.h>
-#include <sys/list.h>
-
-/*
- * Array of known attributes and their
- * various characteristics.
- */
-typedef struct sa_attr_table {
- sa_attr_type_t sa_attr;
- uint8_t sa_registered;
- uint16_t sa_length;
- sa_bswap_type_t sa_byteswap;
- char *sa_name;
-} sa_attr_table_t;
-
-/*
- * Zap attribute format for attribute registration
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * | unused | len | bswap | attr num |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * Zap attribute format for layout information.
- *
- * layout information is stored as an array of attribute numbers
- * The name of the attribute is the layout number (0, 1, 2, ...)
- *
- * 16 0
- * +---- ---+
- * | attr # |
- * +--------+
- * | attr # |
- * +--- ----+
- * ......
- *
- */
-
-#define ATTR_BSWAP(x) BF32_GET(x, 16, 8)
-#define ATTR_LENGTH(x) BF32_GET(x, 24, 16)
-#define ATTR_NUM(x) BF32_GET(x, 0, 16)
-#define ATTR_ENCODE(x, attr, length, bswap) \
-{ \
- BF64_SET(x, 24, 16, length); \
- BF64_SET(x, 16, 8, bswap); \
- BF64_SET(x, 0, 16, attr); \
-}
-
-#define TOC_OFF(x) BF32_GET(x, 0, 23)
-#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1)
-#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4)
-#define TOC_ATTR_ENCODE(x, len_idx, offset) \
-{ \
- BF32_SET(x, 31, 1, 1); \
- BF32_SET(x, 24, 7, len_idx); \
- BF32_SET(x, 0, 24, offset); \
-}
-
-#define SA_LAYOUTS "LAYOUTS"
-#define SA_REGISTRY "REGISTRY"
-
-/*
- * Each unique layout will have their own table
- * sa_lot (layout_table)
- */
-typedef struct sa_lot {
- avl_node_t lot_num_node;
- avl_node_t lot_hash_node;
- uint64_t lot_num;
- uint64_t lot_hash;
- sa_attr_type_t *lot_attrs; /* array of attr #'s */
- uint32_t lot_var_sizes; /* how many aren't fixed size */
- uint32_t lot_attr_count; /* total attr count */
- list_t lot_idx_tab; /* should be only a couple of entries */
- int lot_instance; /* used with lot_hash to identify entry */
-} sa_lot_t;
-
-/* index table of offsets */
-typedef struct sa_idx_tab {
- list_node_t sa_next;
- sa_lot_t *sa_layout;
- uint16_t *sa_variable_lengths;
- zfs_refcount_t sa_refcount;
- uint32_t *sa_idx_tab; /* array of offsets */
-} sa_idx_tab_t;
-
-/*
- * Since the offset/index information into the actual data
- * will usually be identical we can share that information with
- * all handles that have the exact same offsets.
- *
- * You would typically only have a large number of different table of
- * contents if you had a several variable sized attributes.
- *
- * Two AVL trees are used to track the attribute layout numbers.
- * one is keyed by number and will be consulted when a DMU_OT_SA
- * object is first read. The second tree is keyed by the hash signature
- * of the attributes and will be consulted when an attribute is added
- * to determine if we already have an instance of that layout. Both
- * of these tree's are interconnected. The only difference is that
- * when an entry is found in the "hash" tree the list of attributes will
- * need to be compared against the list of attributes you have in hand.
- * The assumption is that typically attributes will just be updated and
- * adding a completely new attribute is a very rare operation.
- */
-struct sa_os {
- kmutex_t sa_lock;
- boolean_t sa_need_attr_registration;
- boolean_t sa_force_spill;
- uint64_t sa_master_obj;
- uint64_t sa_reg_attr_obj;
- uint64_t sa_layout_attr_obj;
- int sa_num_attrs;
- sa_attr_table_t *sa_attr_table; /* private attr table */
- sa_update_cb_t *sa_update_cb;
- avl_tree_t sa_layout_num_tree; /* keyed by layout number */
- avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */
- int sa_user_table_sz;
- sa_attr_type_t *sa_user_table; /* user name->attr mapping table */
-};
-
-/*
- * header for all bonus and spill buffers.
- *
- * The header has a fixed portion with a variable number
- * of "lengths" depending on the number of variable sized
- * attributes which are determined by the "layout number"
- */
-
-#define SA_MAGIC 0x2F505A /* ZFS SA */
-typedef struct sa_hdr_phys {
- uint32_t sa_magic;
- /* BEGIN CSTYLED */
- /*
- * Encoded with hdrsize and layout number as follows:
- * 16 10 0
- * +--------+-------+
- * | hdrsz |layout |
- * +--------+-------+
- *
- * Bits 0-10 are the layout number
- * Bits 11-16 are the size of the header.
- * The hdrsize is the number * 8
- *
- * For example.
- * hdrsz of 1 ==> 8 byte header
- * 2 ==> 16 byte header
- *
- */
- /* END CSTYLED */
- uint16_t sa_layout_info;
- uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
- /* ... Data follows the lengths. */
-} sa_hdr_phys_t;
-
-#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
-#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0)
-#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
-{ \
- BF32_SET_SB(x, 10, 6, 3, 0, size); \
- BF32_SET(x, 0, 10, num); \
-}
-
-typedef enum sa_buf_type {
- SA_BONUS = 1,
- SA_SPILL = 2
-} sa_buf_type_t;
-
-typedef enum sa_data_op {
- SA_LOOKUP,
- SA_UPDATE,
- SA_ADD,
- SA_REPLACE,
- SA_REMOVE
-} sa_data_op_t;
-
-/*
- * Opaque handle used for most sa functions
- *
- * This needs to be kept as small as possible.
- */
-
-struct sa_handle {
- dmu_buf_user_t sa_dbu;
- kmutex_t sa_lock;
- dmu_buf_t *sa_bonus;
- dmu_buf_t *sa_spill;
- objset_t *sa_os;
- void *sa_userp;
- sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
- sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
-};
-
-#define SA_GET_DB(hdl, type) \
- (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
-
-#define SA_GET_HDR(hdl, type) \
- ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
- type))->db.db_data))
-
-#define SA_IDX_TAB_GET(hdl, type) \
- (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
-
-#define IS_SA_BONUSTYPE(a) \
- ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
-
-#define SA_BONUSTYPE_FROM_DB(db) \
- (dmu_get_bonustype((dmu_buf_t *)db))
-
-#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t))
-
-#define SA_LAYOUT_NUM(x, type) \
- ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
- ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
-
-
-#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
-
-#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
- hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
- SA_REGISTERED_LEN(sa, attr))
-
-#define SA_SET_HDR(hdr, num, size) \
- { \
- hdr->sa_magic = SA_MAGIC; \
- SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
- }
-
-#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
- { \
- bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
- bulk.sa_buftype = type; \
- bulk.sa_addr = \
- (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
- (uintptr_t)hdr); \
-}
-
-#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
- (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
- (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
- sizeof (uint16_t), 8) : 0)))
-
-int sa_add_impl(sa_handle_t *, sa_attr_type_t,
- uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
-
-void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
-int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
-
-void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
-int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
- uint16_t *, sa_hdr_phys_t *);
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SA_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -1,969 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Joyent, Inc.
- * Copyright (c) 2017 Datto Inc.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#ifndef _SYS_SPA_H
-#define _SYS_SPA_H
-
-#include <sys/avl.h>
-#include <sys/zfs_context.h>
-#include <sys/nvpair.h>
-#include <sys/sysevent.h>
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Forward references that lots of things need.
- */
-typedef struct spa spa_t;
-typedef struct vdev vdev_t;
-typedef struct metaslab metaslab_t;
-typedef struct metaslab_group metaslab_group_t;
-typedef struct metaslab_class metaslab_class_t;
-typedef struct zio zio_t;
-typedef struct zilog zilog_t;
-typedef struct spa_aux_vdev spa_aux_vdev_t;
-typedef struct ddt ddt_t;
-typedef struct ddt_entry ddt_entry_t;
-struct dsl_pool;
-struct dsl_dataset;
-
-/*
- * General-purpose 32-bit and 64-bit bitfield encodings.
- */
-#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
-#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
-#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
-#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
-
-#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
-#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
-
-#define BF32_SET(x, low, len, val) do { \
- ASSERT3U(val, <, 1U << (len)); \
- ASSERT3U(low + len, <=, 32); \
- (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BF64_SET(x, low, len, val) do { \
- ASSERT3U(val, <, 1ULL << (len)); \
- ASSERT3U(low + len, <=, 64); \
- ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BF32_GET_SB(x, low, len, shift, bias) \
- ((BF32_GET(x, low, len) + (bias)) << (shift))
-#define BF64_GET_SB(x, low, len, shift, bias) \
- ((BF64_GET(x, low, len) + (bias)) << (shift))
-
-#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
- ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
- ASSERT3S((val) >> (shift), >=, bias); \
- BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
- ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
- ASSERT3S((val) >> (shift), >=, bias); \
- BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-
-/*
- * We currently support block sizes from 512 bytes to 16MB.
- * The benefits of larger blocks, and thus larger IO, need to be weighed
- * against the cost of COWing a giant block to modify one byte, and the
- * large latency of reading or writing a large block.
- *
- * Note that although blocks up to 16MB are supported, the recordsize
- * property can not be set larger than zfs_max_recordsize (default 1MB).
- * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
- *
- * Note that although the LSIZE field of the blkptr_t can store sizes up
- * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
- * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
- */
-#define SPA_MINBLOCKSHIFT 9
-#define SPA_OLD_MAXBLOCKSHIFT 17
-#define SPA_MAXBLOCKSHIFT 24
-#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
-#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
-#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-
-/*
- * Default maximum supported logical ashift.
- *
- * The current 8k allocation block size limit is due to the 8k
- * aligned/sized operations performed by vdev_probe() on
- * vdev_label->vl_pad2. Using another "safe region" for these tests
- * would allow the limit to be raised to 16k, at the expense of
- * only having 8 available uberblocks in the label area.
- */
-#define SPA_MAXASHIFT 13
-
-/*
- * Default minimum supported logical ashift.
- */
-#define SPA_MINASHIFT SPA_MINBLOCKSHIFT
-
-/*
- * Size of block to hold the configuration data (a packed nvlist)
- */
-#define SPA_CONFIG_BLOCKSIZE (1ULL << 14)
-
-/*
- * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
- * The ASIZE encoding should be at least 64 times larger (6 more bits)
- * to support up to 4-way RAID-Z mirror mode with worst-case gang block
- * overhead, three DVAs per bp, plus one more bit in case we do anything
- * else that expands the ASIZE.
- */
-#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
-#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
-#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
-
-#define SPA_COMPRESSBITS 7
-#define SPA_VDEVBITS 24
-
-/*
- * All SPA data is represented by 128-bit data virtual addresses (DVAs).
- * The members of the dva_t should be considered opaque outside the SPA.
- */
-typedef struct dva {
- uint64_t dva_word[2];
-} dva_t;
-
-/*
- * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
- */
-typedef struct zio_cksum {
- uint64_t zc_word[4];
-} zio_cksum_t;
-
-/*
- * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
- * secret and is suitable for use in MAC algorithms as the key.
- */
-typedef struct zio_cksum_salt {
- uint8_t zcs_bytes[32];
-} zio_cksum_salt_t;
-
-/*
- * Each block is described by its DVAs, time of birth, checksum, etc.
- * The word-by-word, bit-by-bit layout of the blkptr is as follows:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 0 | pad | vdev1 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 1 |G| offset1 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 2 | pad | vdev2 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 3 |G| offset2 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 4 | pad | vdev3 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 5 |G| offset3 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 7 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 8 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | physical birth txg |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | logical birth txg |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * b | fill count |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * c | checksum[0] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * d | checksum[1] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * e | checksum[2] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * f | checksum[3] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * Legend:
- *
- * vdev virtual device ID
- * offset offset into virtual device
- * LSIZE logical size
- * PSIZE physical size (after compression)
- * ASIZE allocated size (including RAID-Z parity and gang block headers)
- * GRID RAID-Z layout information (reserved for future use)
- * cksum checksum function
- * comp compression function
- * G gang block indicator
- * B byteorder (endianness)
- * D dedup
- * X encryption (on version 30, which is not supported)
- * E blkptr_t contains embedded data (see below)
- * lvl level of indirection
- * type DMU object type
- * phys birth txg when dva[0] was written; zero if same as logical birth txg
- * note that typically all the dva's would be written in this
- * txg, but they could be different if they were moved by
- * device removal.
- * log. birth transaction group in which the block was logically born
- * fill count number of non-zero blocks under this bp
- * checksum[4] 256-bit checksum of the data this bp describes
- */
-
-/*
- * "Embedded" blkptr_t's don't actually point to a block, instead they
- * have a data payload embedded in the blkptr_t itself. See the comment
- * in blkptr.c for more details.
- *
- * The blkptr_t is laid out as follows:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 0 | payload |
- * 1 | payload |
- * 2 | payload |
- * 3 | payload |
- * 4 | payload |
- * 5 | payload |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 7 | payload |
- * 8 | payload |
- * 9 | payload |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | logical birth txg |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * b | payload |
- * c | payload |
- * d | payload |
- * e | payload |
- * f | payload |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * Legend:
- *
- * payload contains the embedded data
- * B (byteorder) byteorder (endianness)
- * D (dedup) padding (set to zero)
- * X encryption (set to zero; see above)
- * E (embedded) set to one
- * lvl indirection level
- * type DMU object type
- * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
- * comp compression function of payload
- * PSIZE size of payload after compression, in bytes
- * LSIZE logical size of payload, in bytes
- * note that 25 bits is enough to store the largest
- * "normal" BP's LSIZE (2^16 * 2^9) in bytes
- * log. birth transaction group in which the block was logically born
- *
- * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
- * bp's they are stored in units of SPA_MINBLOCKSHIFT.
- * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
- * The B, D, X, lvl, type, and comp fields are stored the same as with normal
- * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
- * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
- * other macros, as they assert that they are only used on BP's of the correct
- * "embedded-ness".
- */
-
-#define BPE_GET_ETYPE(bp) \
- (ASSERT(BP_IS_EMBEDDED(bp)), \
- BF64_GET((bp)->blk_prop, 40, 8))
-#define BPE_SET_ETYPE(bp, t) do { \
- ASSERT(BP_IS_EMBEDDED(bp)); \
- BF64_SET((bp)->blk_prop, 40, 8, t); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BPE_GET_LSIZE(bp) \
- (ASSERT(BP_IS_EMBEDDED(bp)), \
- BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
-#define BPE_SET_LSIZE(bp, x) do { \
- ASSERT(BP_IS_EMBEDDED(bp)); \
- BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BPE_GET_PSIZE(bp) \
- (ASSERT(BP_IS_EMBEDDED(bp)), \
- BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
-#define BPE_SET_PSIZE(bp, x) do { \
- ASSERT(BP_IS_EMBEDDED(bp)); \
- BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
-_NOTE(CONSTCOND) } while (0)
-
-typedef enum bp_embedded_type {
- BP_EMBEDDED_TYPE_DATA,
- BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
- NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
-} bp_embedded_type_t;
-
-#define BPE_NUM_WORDS 14
-#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
-#define BPE_IS_PAYLOADWORD(bp, wp) \
- ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
-
-#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
-#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
-#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */
-
-/*
- * A block is a hole when it has either 1) never been written to, or
- * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
- * without physically allocating disk space. Holes are represented in the
- * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
- * done through the BP_IS_HOLE macro. For holes, the logical size, level,
- * DMU object type, and birth times are all also stored for holes that
- * were written to at some point (i.e. were punched after having been filled).
- */
-typedef struct blkptr {
- dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[2]; /* Extra space for the future */
- uint64_t blk_phys_birth; /* txg when block was allocated */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
-/*
- * Macros to get and set fields in a bp or DVA.
- */
-#define DVA_GET_ASIZE(dva) \
- BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
-#define DVA_SET_ASIZE(dva, x) \
- BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
- SPA_MINBLOCKSHIFT, 0, x)
-
-#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
-#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
-
-#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
-#define DVA_SET_VDEV(dva, x) \
- BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
-
-#define DVA_GET_OFFSET(dva) \
- BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
-#define DVA_SET_OFFSET(dva, x) \
- BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
-
-#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
-#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
-
-#define BP_GET_LSIZE(bp) \
- (BP_IS_EMBEDDED(bp) ? \
- (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
- BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
-#define BP_SET_LSIZE(bp, x) do { \
- ASSERT(!BP_IS_EMBEDDED(bp)); \
- BF64_SET_SB((bp)->blk_prop, \
- 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BP_GET_PSIZE(bp) \
- (BP_IS_EMBEDDED(bp) ? 0 : \
- BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
-#define BP_SET_PSIZE(bp, x) do { \
- ASSERT(!BP_IS_EMBEDDED(bp)); \
- BF64_SET_SB((bp)->blk_prop, \
- 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BP_GET_COMPRESS(bp) \
- BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
-#define BP_SET_COMPRESS(bp, x) \
- BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
-
-#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
-#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)
-
-#define BP_GET_CHECKSUM(bp) \
- (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
- BF64_GET((bp)->blk_prop, 40, 8))
-#define BP_SET_CHECKSUM(bp, x) do { \
- ASSERT(!BP_IS_EMBEDDED(bp)); \
- BF64_SET((bp)->blk_prop, 40, 8, x); \
-_NOTE(CONSTCOND) } while (0)
-
-#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
-#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
-
-#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
-#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
-
-#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
-#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
-
-#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
-#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
-
-#define BP_PHYSICAL_BIRTH(bp) \
- (BP_IS_EMBEDDED(bp) ? 0 : \
- (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
-
-#define BP_SET_BIRTH(bp, logical, physical) \
-{ \
- ASSERT(!BP_IS_EMBEDDED(bp)); \
- (bp)->blk_birth = (logical); \
- (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
-}
-
-#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
-
-#define BP_IS_METADATA(bp) \
- (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
-
-#define BP_GET_ASIZE(bp) \
- (BP_IS_EMBEDDED(bp) ? 0 : \
- DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
- DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-
-#define BP_GET_UCSIZE(bp) \
- (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
-
-#define BP_GET_NDVAS(bp) \
- (BP_IS_EMBEDDED(bp) ? 0 : \
- !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
- !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-
-#define BP_COUNT_GANG(bp) \
- (BP_IS_EMBEDDED(bp) ? 0 : \
- (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
- DVA_GET_GANG(&(bp)->blk_dva[1]) + \
- DVA_GET_GANG(&(bp)->blk_dva[2])))
-
-#define DVA_EQUAL(dva1, dva2) \
- ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
- (dva1)->dva_word[0] == (dva2)->dva_word[0])
-
-#define BP_EQUAL(bp1, bp2) \
- (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
- (bp1)->blk_birth == (bp2)->blk_birth && \
- DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
- DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
- DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
-
-#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
- (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
- ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
- ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
- ((zc1).zc_word[3] - (zc2).zc_word[3])))
-
-#define ZIO_CHECKSUM_IS_ZERO(zc) \
- (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
- (zc)->zc_word[2] | (zc)->zc_word[3]))
-
-#define ZIO_CHECKSUM_BSWAP(zcp) \
-{ \
- (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \
- (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \
- (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \
- (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \
-}
-
-
-#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
-
-#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
-{ \
- (zcp)->zc_word[0] = w0; \
- (zcp)->zc_word[1] = w1; \
- (zcp)->zc_word[2] = w2; \
- (zcp)->zc_word[3] = w3; \
-}
-
-#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
-#define BP_IS_GANG(bp) \
- (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
-#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
- (dva)->dva_word[1] == 0ULL)
-#define BP_IS_HOLE(bp) \
- (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
-
-/* BP_IS_RAIDZ(bp) assumes no block compression */
-#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
- BP_GET_PSIZE(bp))
-
-#define BP_ZERO(bp) \
-{ \
- (bp)->blk_dva[0].dva_word[0] = 0; \
- (bp)->blk_dva[0].dva_word[1] = 0; \
- (bp)->blk_dva[1].dva_word[0] = 0; \
- (bp)->blk_dva[1].dva_word[1] = 0; \
- (bp)->blk_dva[2].dva_word[0] = 0; \
- (bp)->blk_dva[2].dva_word[1] = 0; \
- (bp)->blk_prop = 0; \
- (bp)->blk_pad[0] = 0; \
- (bp)->blk_pad[1] = 0; \
- (bp)->blk_phys_birth = 0; \
- (bp)->blk_birth = 0; \
- (bp)->blk_fill = 0; \
- ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
-}
-
-#if BYTE_ORDER == _BIG_ENDIAN
-#define ZFS_HOST_BYTEORDER (0ULL)
-#else
-#define ZFS_HOST_BYTEORDER (1ULL)
-#endif
-
-#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
-
-#define BP_SPRINTF_LEN 320
-
-/*
- * This macro allows code sharing between zfs, libzpool, and mdb.
- * 'func' is either snprintf() or mdb_snprintf().
- * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
- */
-#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
-{ \
- static const char *copyname[] = \
- { "zero", "single", "double", "triple" }; \
- int len = 0; \
- int copies = 0; \
- \
- if (bp == NULL) { \
- len += func(buf + len, size - len, "<NULL>"); \
- } else if (BP_IS_HOLE(bp)) { \
- len += func(buf + len, size - len, \
- "HOLE [L%llu %s] " \
- "size=%llxL birth=%lluL", \
- (u_longlong_t)BP_GET_LEVEL(bp), \
- type, \
- (u_longlong_t)BP_GET_LSIZE(bp), \
- (u_longlong_t)bp->blk_birth); \
- } else if (BP_IS_EMBEDDED(bp)) { \
- len = func(buf + len, size - len, \
- "EMBEDDED [L%llu %s] et=%u %s " \
- "size=%llxL/%llxP birth=%lluL", \
- (u_longlong_t)BP_GET_LEVEL(bp), \
- type, \
- (int)BPE_GET_ETYPE(bp), \
- compress, \
- (u_longlong_t)BPE_GET_LSIZE(bp), \
- (u_longlong_t)BPE_GET_PSIZE(bp), \
- (u_longlong_t)bp->blk_birth); \
- } else { \
- for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
- const dva_t *dva = &bp->blk_dva[d]; \
- if (DVA_IS_VALID(dva)) \
- copies++; \
- len += func(buf + len, size - len, \
- "DVA[%d]=<%llu:%llx:%llx>%c", d, \
- (u_longlong_t)DVA_GET_VDEV(dva), \
- (u_longlong_t)DVA_GET_OFFSET(dva), \
- (u_longlong_t)DVA_GET_ASIZE(dva), \
- ws); \
- } \
- if (BP_IS_GANG(bp) && \
- DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
- DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
- copies--; \
- len += func(buf + len, size - len, \
- "[L%llu %s] %s %s %s %s %s %s%c" \
- "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
- "cksum=%llx:%llx:%llx:%llx", \
- (u_longlong_t)BP_GET_LEVEL(bp), \
- type, \
- checksum, \
- compress, \
- BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
- BP_IS_GANG(bp) ? "gang" : "contiguous", \
- BP_GET_DEDUP(bp) ? "dedup" : "unique", \
- copyname[copies], \
- ws, \
- (u_longlong_t)BP_GET_LSIZE(bp), \
- (u_longlong_t)BP_GET_PSIZE(bp), \
- (u_longlong_t)bp->blk_birth, \
- (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
- (u_longlong_t)BP_GET_FILL(bp), \
- ws, \
- (u_longlong_t)bp->blk_cksum.zc_word[0], \
- (u_longlong_t)bp->blk_cksum.zc_word[1], \
- (u_longlong_t)bp->blk_cksum.zc_word[2], \
- (u_longlong_t)bp->blk_cksum.zc_word[3]); \
- } \
- ASSERT(len < size); \
-}
-
-#define BP_GET_BUFC_TYPE(bp) \
- (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
-
-typedef enum spa_import_type {
- SPA_IMPORT_EXISTING,
- SPA_IMPORT_ASSEMBLE
-} spa_import_type_t;
-
-/* state manipulation functions */
-extern int spa_open(const char *pool, spa_t **, void *tag);
-extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
- nvlist_t *policy, nvlist_t **config);
-extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
- size_t buflen);
-extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
- nvlist_t *zplprops);
-#ifdef illumos
-extern int spa_import_rootpool(char *devpath, char *devid);
-#else
-extern int spa_import_rootpool(const char *name, bool checkpointrewind);
-#endif
-extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
- uint64_t flags);
-extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
-extern int spa_destroy(char *pool);
-extern int spa_checkpoint(const char *pool);
-extern int spa_checkpoint_discard(const char *pool);
-extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
- boolean_t hardforce);
-extern int spa_reset(char *pool);
-extern void spa_async_request(spa_t *spa, int flag);
-extern void spa_async_unrequest(spa_t *spa, int flag);
-extern void spa_async_suspend(spa_t *spa);
-extern void spa_async_resume(spa_t *spa);
-extern spa_t *spa_inject_addref(char *pool);
-extern void spa_inject_delref(spa_t *spa);
-extern void spa_scan_stat_init(spa_t *spa);
-extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
-
-#define SPA_ASYNC_CONFIG_UPDATE 0x01
-#define SPA_ASYNC_REMOVE 0x02
-#define SPA_ASYNC_PROBE 0x04
-#define SPA_ASYNC_RESILVER_DONE 0x08
-#define SPA_ASYNC_RESILVER 0x10
-#define SPA_ASYNC_AUTOEXPAND 0x20
-#define SPA_ASYNC_REMOVE_DONE 0x40
-#define SPA_ASYNC_REMOVE_STOP 0x80
-#define SPA_ASYNC_INITIALIZE_RESTART 0x100
-
-/*
- * Controls the behavior of spa_vdev_remove().
- */
-#define SPA_REMOVE_UNSPARE 0x01
-#define SPA_REMOVE_DONE 0x02
-
-/* device manipulation */
-extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
-extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
- int replacing);
-extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
- int replace_done);
-extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
-extern boolean_t spa_vdev_remove_active(spa_t *spa);
-extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
-extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
-extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
-extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
- nvlist_t *props, boolean_t exp);
-
-/* spare state (which is global across all pools) */
-extern void spa_spare_add(vdev_t *vd);
-extern void spa_spare_remove(vdev_t *vd);
-extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
-extern void spa_spare_activate(vdev_t *vd);
-
-/* L2ARC state (which is global across all pools) */
-extern void spa_l2cache_add(vdev_t *vd);
-extern void spa_l2cache_remove(vdev_t *vd);
-extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
-extern void spa_l2cache_activate(vdev_t *vd);
-extern void spa_l2cache_drop(spa_t *spa);
-
-/* scanning */
-extern int spa_scan(spa_t *spa, pool_scan_func_t func);
-extern int spa_scan_stop(spa_t *spa);
-extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
-
-/* spa syncing */
-extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
-extern void spa_sync_allpools(void);
-
-/* spa namespace global mutex */
-extern kmutex_t spa_namespace_lock;
-
-/*
- * SPA configuration functions in spa_config.c
- */
-
-#define SPA_CONFIG_UPDATE_POOL 0
-#define SPA_CONFIG_UPDATE_VDEVS 1
-
-extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
-extern void spa_config_load(void);
-extern nvlist_t *spa_all_configs(uint64_t *);
-extern void spa_config_set(spa_t *spa, nvlist_t *config);
-extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
- int getstats);
-extern void spa_config_update(spa_t *spa, int what);
-
-/*
- * Miscellaneous SPA routines in spa_misc.c
- */
-
-/* Namespace manipulation */
-extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
-extern void spa_remove(spa_t *spa);
-extern spa_t *spa_next(spa_t *prev);
-
-/* Refcount functions */
-extern void spa_open_ref(spa_t *spa, void *tag);
-extern void spa_close(spa_t *spa, void *tag);
-extern void spa_async_close(spa_t *spa, void *tag);
-extern boolean_t spa_refcount_zero(spa_t *spa);
-
-#define SCL_NONE 0x00
-#define SCL_CONFIG 0x01
-#define SCL_STATE 0x02
-#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
-#define SCL_ALLOC 0x08
-#define SCL_ZIO 0x10
-#define SCL_FREE 0x20
-#define SCL_VDEV 0x40
-#define SCL_LOCKS 7
-#define SCL_ALL ((1 << SCL_LOCKS) - 1)
-#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO)
-
-/* Pool configuration locks */
-extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
-extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
-extern void spa_config_exit(spa_t *spa, int locks, void *tag);
-extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
-
-/* Pool vdev add/remove lock */
-extern uint64_t spa_vdev_enter(spa_t *spa);
-extern uint64_t spa_vdev_config_enter(spa_t *spa);
-extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
- int error, char *tag);
-extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
-
-/* Pool vdev state change lock */
-extern void spa_vdev_state_enter(spa_t *spa, int oplock);
-extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
-
-/* Log state */
-typedef enum spa_log_state {
- SPA_LOG_UNKNOWN = 0, /* unknown log state */
- SPA_LOG_MISSING, /* missing log(s) */
- SPA_LOG_CLEAR, /* clear the log(s) */
- SPA_LOG_GOOD, /* log(s) are good */
-} spa_log_state_t;
-
-extern spa_log_state_t spa_get_log_state(spa_t *spa);
-extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
-extern int spa_reset_logs(spa_t *spa);
-
-/* Log claim callback */
-extern void spa_claim_notify(zio_t *zio);
-
-/* Accessor functions */
-extern boolean_t spa_shutting_down(spa_t *spa);
-extern struct dsl_pool *spa_get_dsl(spa_t *spa);
-extern boolean_t spa_is_initializing(spa_t *spa);
-extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
-extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
-extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
-extern void spa_altroot(spa_t *, char *, size_t);
-extern int spa_sync_pass(spa_t *spa);
-extern char *spa_name(spa_t *spa);
-extern uint64_t spa_guid(spa_t *spa);
-extern uint64_t spa_load_guid(spa_t *spa);
-extern uint64_t spa_last_synced_txg(spa_t *spa);
-extern uint64_t spa_first_txg(spa_t *spa);
-extern uint64_t spa_syncing_txg(spa_t *spa);
-extern uint64_t spa_final_dirty_txg(spa_t *spa);
-extern uint64_t spa_version(spa_t *spa);
-extern pool_state_t spa_state(spa_t *spa);
-extern spa_load_state_t spa_load_state(spa_t *spa);
-extern uint64_t spa_freeze_txg(spa_t *spa);
-extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
-extern uint64_t spa_get_dspace(spa_t *spa);
-extern uint64_t spa_get_checkpoint_space(spa_t *spa);
-extern uint64_t spa_get_slop_space(spa_t *spa);
-extern void spa_update_dspace(spa_t *spa);
-extern uint64_t spa_version(spa_t *spa);
-extern boolean_t spa_deflate(spa_t *spa);
-extern metaslab_class_t *spa_normal_class(spa_t *spa);
-extern metaslab_class_t *spa_log_class(spa_t *spa);
-extern metaslab_class_t *spa_special_class(spa_t *spa);
-extern metaslab_class_t *spa_dedup_class(spa_t *spa);
-extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
- dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
-
-extern void spa_evicting_os_register(spa_t *, objset_t *os);
-extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
-extern void spa_evicting_os_wait(spa_t *spa);
-extern int spa_max_replication(spa_t *spa);
-extern int spa_prev_software_version(spa_t *spa);
-extern int spa_busy(void);
-extern uint8_t spa_get_failmode(spa_t *spa);
-extern boolean_t spa_suspended(spa_t *spa);
-extern uint64_t spa_bootfs(spa_t *spa);
-extern uint64_t spa_delegation(spa_t *spa);
-extern objset_t *spa_meta_objset(spa_t *spa);
-extern uint64_t spa_deadman_synctime(spa_t *spa);
-extern struct proc *spa_proc(spa_t *spa);
-extern uint64_t spa_dirty_data(spa_t *spa);
-
-/* Miscellaneous support routines */
-extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
-extern void spa_load_note(spa_t *spa, const char *fmt, ...);
-extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
- dmu_tx_t *tx);
-extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
-extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
-extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
-extern char *spa_strdup(const char *);
-extern void spa_strfree(char *);
-extern uint64_t spa_get_random(uint64_t range);
-extern uint64_t spa_generate_guid(spa_t *spa);
-extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
-extern void spa_freeze(spa_t *spa);
-extern int spa_change_guid(spa_t *spa);
-extern void spa_upgrade(spa_t *spa, uint64_t version);
-extern void spa_evict_all(void);
-extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
- boolean_t l2cache);
-extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
-extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
-extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
-extern boolean_t spa_has_slogs(spa_t *spa);
-extern boolean_t spa_is_root(spa_t *spa);
-extern boolean_t spa_writeable(spa_t *spa);
-extern boolean_t spa_has_pending_synctask(spa_t *spa);
-extern int spa_maxblocksize(spa_t *spa);
-extern int spa_maxdnodesize(spa_t *spa);
-extern boolean_t spa_multihost(spa_t *spa);
-extern unsigned long spa_get_hostid(void);
-extern boolean_t spa_has_checkpoint(spa_t *spa);
-extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
-extern boolean_t spa_suspend_async_destroy(spa_t *spa);
-extern uint64_t spa_min_claim_txg(spa_t *spa);
-extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
-extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
- const blkptr_t *bp);
-typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
- void *arg);
-extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
- spa_remap_cb_t callback, void *arg);
-extern uint64_t spa_get_last_removal_txg(spa_t *spa);
-extern boolean_t spa_trust_config(spa_t *spa);
-extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
-extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
-extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
-extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
-
-extern int spa_mode(spa_t *spa);
-extern uint64_t zfs_strtonum(const char *str, char **nptr);
-
-extern char *spa_his_ievent_table[];
-
-extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
-extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
- char *his_buf);
-extern int spa_history_log(spa_t *spa, const char *his_buf);
-extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
-extern void spa_history_log_version(spa_t *spa, const char *operation);
-extern void spa_history_log_internal(spa_t *spa, const char *operation,
- dmu_tx_t *tx, const char *fmt, ...);
-extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
- dmu_tx_t *tx, const char *fmt, ...);
-extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
- dmu_tx_t *tx, const char *fmt, ...);
-
-/* error handling */
-struct zbookmark_phys;
-extern void spa_log_error(spa_t *spa, zio_t *zio);
-extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd,
- zio_t *zio, uint64_t stateoroffset, uint64_t length);
-extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
-extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
-extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
-extern uint64_t spa_get_errlog_size(spa_t *spa);
-extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
-extern void spa_errlog_rotate(spa_t *spa);
-extern void spa_errlog_drain(spa_t *spa);
-extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
-extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
-
-/* vdev cache */
-extern void vdev_cache_stat_init(void);
-extern void vdev_cache_stat_fini(void);
-
-/* Initialization and termination */
-extern void spa_init(int flags);
-extern void spa_fini(void);
-extern void spa_boot_init(void);
-
-/* properties */
-extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
-extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
-extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
-extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
-
-/* asynchronous event notification */
-extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
- const char *name);
-extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
- const char *name);
-extern void spa_event_post(sysevent_t *ev);
-extern void spa_event_discard(sysevent_t *ev);
-
-#ifdef ZFS_DEBUG
-#define dprintf_bp(bp, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
- dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
- kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_bp(bp, fmt, ...)
-#endif
-
-extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPA_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
@@ -1,48 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_SPA_BOOT_H
-#define _SYS_SPA_BOOT_H
-
-#include <sys/nvpair.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern char *spa_get_bootprop(char *prop);
-extern void spa_free_bootprop(char *prop);
-
-extern void spa_arch_init(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPA_BOOT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h
@@ -1,44 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_SPA_CHECKPOINT_H
-#define _SYS_SPA_CHECKPOINT_H
-
-#include <sys/zthr.h>
-
-typedef struct spa_checkpoint_info {
- uint64_t sci_timestamp; /* when checkpointed uberblock was synced */
- uint64_t sci_dspace; /* disk space used by checkpoint in bytes */
-} spa_checkpoint_info_t;
-
-int spa_checkpoint(const char *);
-int spa_checkpoint_discard(const char *);
-
-boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *);
-void spa_checkpoint_discard_thread(void *, zthr_t *);
-
-int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *);
-
-#endif /* _SYS_SPA_CHECKPOINT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -1,435 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2017 Datto Inc.
- * Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- */
-
-#ifndef _SYS_SPA_IMPL_H
-#define _SYS_SPA_IMPL_H
-
-#include <sys/spa.h>
-#include <sys/spa_checkpoint.h>
-#include <sys/vdev.h>
-#include <sys/vdev_removal.h>
-#include <sys/metaslab.h>
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/uberblock_impl.h>
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/refcount.h>
-#include <sys/bplist.h>
-#include <sys/bpobj.h>
-#include <sys/zfeature.h>
-#include <sys/zthr.h>
-#include <zfeature_common.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct spa_error_entry {
- zbookmark_phys_t se_bookmark;
- char *se_name;
- avl_node_t se_avl;
-} spa_error_entry_t;
-
-typedef struct spa_history_phys {
- uint64_t sh_pool_create_len; /* ending offset of zpool create */
- uint64_t sh_phys_max_off; /* physical EOF */
- uint64_t sh_bof; /* logical BOF */
- uint64_t sh_eof; /* logical EOF */
- uint64_t sh_records_lost; /* num of records overwritten */
-} spa_history_phys_t;
-
-/*
- * All members must be uint64_t, for byteswap purposes.
- */
-typedef struct spa_removing_phys {
- uint64_t sr_state; /* dsl_scan_state_t */
-
- /*
- * The vdev ID that we most recently attempted to remove,
- * or -1 if no removal has been attempted.
- */
- uint64_t sr_removing_vdev;
-
- /*
- * The vdev ID that we most recently successfully removed,
- * or -1 if no devices have been removed.
- */
- uint64_t sr_prev_indirect_vdev;
-
- uint64_t sr_start_time;
- uint64_t sr_end_time;
-
- /*
- * Note that we can not use the space map's or indirect mapping's
- * accounting as a substitute for these values, because we need to
- * count frees of not-yet-copied data as though it did the copy.
- * Otherwise, we could get into a situation where copied > to_copy,
- * or we complete before copied == to_copy.
- */
- uint64_t sr_to_copy; /* bytes that need to be copied */
- uint64_t sr_copied; /* bytes that have been copied or freed */
-} spa_removing_phys_t;
-
-/*
- * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT
- * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense
- * of an indirect vdev's mapping object is in progress.
- */
-typedef struct spa_condensing_indirect_phys {
- /*
- * The vdev ID of the indirect vdev whose indirect mapping is
- * being condensed.
- */
- uint64_t scip_vdev;
-
- /*
- * The vdev's old obsolete spacemap. This spacemap's contents are
- * being integrated into the new mapping.
- */
- uint64_t scip_prev_obsolete_sm_object;
-
- /*
- * The new mapping object that is being created.
- */
- uint64_t scip_next_mapping_object;
-} spa_condensing_indirect_phys_t;
-
-struct spa_aux_vdev {
- uint64_t sav_object; /* MOS object for device list */
- nvlist_t *sav_config; /* cached device config */
- vdev_t **sav_vdevs; /* devices */
- int sav_count; /* number devices */
- boolean_t sav_sync; /* sync the device list */
- nvlist_t **sav_pending; /* pending device additions */
- uint_t sav_npending; /* # pending devices */
-};
-
-typedef struct spa_config_lock {
- kmutex_t scl_lock;
- kthread_t *scl_writer;
- int scl_write_wanted;
- kcondvar_t scl_cv;
- zfs_refcount_t scl_count;
-} spa_config_lock_t;
-
-typedef struct spa_config_dirent {
- list_node_t scd_link;
- char *scd_path;
-} spa_config_dirent_t;
-
-typedef enum zio_taskq_type {
- ZIO_TASKQ_ISSUE = 0,
- ZIO_TASKQ_ISSUE_HIGH,
- ZIO_TASKQ_INTERRUPT,
- ZIO_TASKQ_INTERRUPT_HIGH,
- ZIO_TASKQ_TYPES
-} zio_taskq_type_t;
-
-/*
- * State machine for the zpool-poolname process. The states transitions
- * are done as follows:
- *
- * From To Routine
- * PROC_NONE -> PROC_CREATED spa_activate()
- * PROC_CREATED -> PROC_ACTIVE spa_thread()
- * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
- * PROC_DEACTIVATE -> PROC_GONE spa_thread()
- * PROC_GONE -> PROC_NONE spa_deactivate()
- */
-typedef enum spa_proc_state {
- SPA_PROC_NONE, /* spa_proc = &p0, no process created */
- SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */
- SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */
- SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */
- SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
-} spa_proc_state_t;
-
-typedef struct spa_taskqs {
- uint_t stqs_count;
- taskq_t **stqs_taskq;
-} spa_taskqs_t;
-
-typedef enum spa_all_vdev_zap_action {
- AVZ_ACTION_NONE = 0,
- AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */
- AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */
- AVZ_ACTION_INITIALIZE
-} spa_avz_action_t;
-
-typedef enum spa_config_source {
- SPA_CONFIG_SRC_NONE = 0,
- SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */
- SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */
- SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */
- SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */
- SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */
-} spa_config_source_t;
-
-struct spa {
- /*
- * Fields protected by spa_namespace_lock.
- */
- char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
- char *spa_comment; /* comment */
- avl_node_t spa_avl; /* node in spa_namespace_avl */
- nvlist_t *spa_config; /* last synced config */
- nvlist_t *spa_config_syncing; /* currently syncing config */
- nvlist_t *spa_config_splitting; /* config for splitting */
- nvlist_t *spa_load_info; /* info and errors from load */
- uint64_t spa_config_txg; /* txg of last config change */
- int spa_sync_pass; /* iterate-to-convergence */
- pool_state_t spa_state; /* pool state */
- int spa_inject_ref; /* injection references */
- uint8_t spa_sync_on; /* sync threads are running */
- spa_load_state_t spa_load_state; /* current load operation */
- boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
- boolean_t spa_trust_config; /* do we trust vdev tree? */
- spa_config_source_t spa_config_source; /* where config comes from? */
- uint64_t spa_import_flags; /* import specific flags */
- spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
- dsl_pool_t *spa_dsl_pool;
- boolean_t spa_is_initializing; /* true while opening pool */
- metaslab_class_t *spa_normal_class; /* normal data class */
- metaslab_class_t *spa_log_class; /* intent log data class */
- metaslab_class_t *spa_special_class; /* special allocation class */
- metaslab_class_t *spa_dedup_class; /* dedup allocation class */
- uint64_t spa_first_txg; /* first txg after spa_open() */
- uint64_t spa_final_txg; /* txg of export/destroy */
- uint64_t spa_freeze_txg; /* freeze pool at this txg */
- uint64_t spa_load_max_txg; /* best initial ub_txg */
- uint64_t spa_claim_max_txg; /* highest claimed birth txg */
- timespec_t spa_loaded_ts; /* 1st successful open time */
- objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
- kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
- list_t spa_evicting_os_list; /* Objsets being evicted. */
- kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
- txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
- vdev_t *spa_root_vdev; /* top-level vdev container */
- int spa_min_ashift; /* of vdevs in normal class */
- int spa_max_ashift; /* of vdevs in normal class */
- uint64_t spa_config_guid; /* config pool guid */
- uint64_t spa_load_guid; /* spa_load initialized guid */
- uint64_t spa_last_synced_guid; /* last synced guid */
- list_t spa_config_dirty_list; /* vdevs with dirty config */
- list_t spa_state_dirty_list; /* vdevs with dirty state */
- /*
- * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
- * stored in spa_alloc_count. There is one tree and one lock for each
- * allocator, to help improve allocation performance in write-heavy
- * workloads.
- */
- kmutex_t *spa_alloc_locks;
- avl_tree_t *spa_alloc_trees;
- int spa_alloc_count;
-
- spa_aux_vdev_t spa_spares; /* hot spares */
- spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
- nvlist_t *spa_label_features; /* Features for reading MOS */
- uint64_t spa_config_object; /* MOS object for pool config */
- uint64_t spa_config_generation; /* config generation number */
- uint64_t spa_syncing_txg; /* txg currently syncing */
- bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
- bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
- zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
- /* checksum context templates */
- kmutex_t spa_cksum_tmpls_lock;
- void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
- uberblock_t spa_ubsync; /* last synced uberblock */
- uberblock_t spa_uberblock; /* current uberblock */
- boolean_t spa_extreme_rewind; /* rewind past deferred frees */
- uint64_t spa_last_io; /* lbolt of last non-scan I/O */
- kmutex_t spa_scrub_lock; /* resilver/scrub lock */
- uint64_t spa_scrub_inflight; /* in-flight scrub bytes */
- uint64_t spa_load_verify_ios; /* in-flight verifications IOs */
- kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
- uint8_t spa_scrub_active; /* active or suspended? */
- uint8_t spa_scrub_type; /* type of scrub we're doing */
- uint8_t spa_scrub_finished; /* indicator to rotate logs */
- uint8_t spa_scrub_started; /* started since last boot */
- uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
- uint64_t spa_scan_pass_start; /* start time per pass/reboot */
- uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
- uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
- uint64_t spa_scan_pass_exam; /* examined bytes per pass */
- uint64_t spa_scan_pass_issued; /* issued bytes per pass */
- kmutex_t spa_async_lock; /* protect async state */
- kthread_t *spa_async_thread; /* thread doing async task */
- kthread_t *spa_async_thread_vd; /* thread doing vd async task */
- int spa_async_suspended; /* async tasks suspended */
- kcondvar_t spa_async_cv; /* wait for thread_exit() */
- uint16_t spa_async_tasks; /* async task mask */
- uint64_t spa_missing_tvds; /* unopenable tvds on load */
- uint64_t spa_missing_tvds_allowed; /* allow loading spa? */
-
- spa_removing_phys_t spa_removing_phys;
- spa_vdev_removal_t *spa_vdev_removal;
-
- spa_condensing_indirect_phys_t spa_condensing_indirect_phys;
- spa_condensing_indirect_t *spa_condensing_indirect;
- zthr_t *spa_condense_zthr; /* zthr doing condense. */
-
- uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */
- spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
- zthr_t *spa_checkpoint_discard_zthr;
-
- char *spa_root; /* alternate root directory */
- uint64_t spa_ena; /* spa-wide ereport ENA */
- int spa_last_open_failed; /* error if last open failed */
- uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
- uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
- uint64_t spa_load_txg; /* ub txg that loaded */
- uint64_t spa_load_txg_ts; /* timestamp from that ub */
- uint64_t spa_load_meta_errors; /* verify metadata err count */
- uint64_t spa_load_data_errors; /* verify data err count */
- uint64_t spa_verify_min_txg; /* start txg of verify scrub */
- kmutex_t spa_errlog_lock; /* error log lock */
- uint64_t spa_errlog_last; /* last error log object */
- uint64_t spa_errlog_scrub; /* scrub error log object */
- kmutex_t spa_errlist_lock; /* error list/ereport lock */
- avl_tree_t spa_errlist_last; /* last error list */
- avl_tree_t spa_errlist_scrub; /* scrub error list */
- uint64_t spa_deflate; /* should we deflate? */
- uint64_t spa_history; /* history object */
- kmutex_t spa_history_lock; /* history lock */
- vdev_t *spa_pending_vdev; /* pending vdev additions */
- kmutex_t spa_props_lock; /* property lock */
- uint64_t spa_pool_props_object; /* object for properties */
- uint64_t spa_bootfs; /* default boot filesystem */
- uint64_t spa_failmode; /* failure mode for the pool */
- uint64_t spa_delegation; /* delegation on/off */
- list_t spa_config_list; /* previous cache file(s) */
- /* per-CPU array of root of async I/O: */
- zio_t **spa_async_zio_root;
- zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
- zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */
- kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
- kcondvar_t spa_suspend_cv; /* notification of resume */
- zio_suspend_reason_t spa_suspended; /* pool is suspended */
- uint8_t spa_claiming; /* pool is doing zil_claim() */
- boolean_t spa_is_root; /* pool is root */
- int spa_minref; /* num refs when first opened */
- int spa_mode; /* FREAD | FWRITE */
- spa_log_state_t spa_log_state; /* log state */
- uint64_t spa_autoexpand; /* lun expansion on/off */
- uint64_t spa_bootsize; /* efi system partition size */
- ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
- uint64_t spa_ddt_stat_object; /* DDT statistics */
- uint64_t spa_dedup_ditto; /* dedup ditto threshold */
- uint64_t spa_dedup_checksum; /* default dedup checksum */
- uint64_t spa_dspace; /* dspace in normal class */
- kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
- kmutex_t spa_proc_lock; /* protects spa_proc* */
- kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
- spa_proc_state_t spa_proc_state; /* see definition */
- struct proc *spa_proc; /* "zpool-poolname" process */
- uint64_t spa_did; /* if procp != p0, did of t1 */
- kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */
- kmutex_t spa_trim_lock; /* protects spa_trim_cv */
- kcondvar_t spa_trim_cv; /* used to notify TRIM thread */
- boolean_t spa_autoreplace; /* autoreplace set in open */
- int spa_vdev_locks; /* locks grabbed */
- uint64_t spa_creation_version; /* version at pool creation */
- uint64_t spa_prev_software_version; /* See ub_software_version */
- uint64_t spa_feat_for_write_obj; /* required to write to pool */
- uint64_t spa_feat_for_read_obj; /* required to read from pool */
- uint64_t spa_feat_desc_obj; /* Feature descriptions */
- uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
- kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */
- nvlist_t *spa_feat_stats; /* Cache of enabled features */
- /* cache feature refcounts */
- uint64_t spa_feat_refcount_cache[SPA_FEATURES];
-#ifdef illumos
- cyclic_id_t spa_deadman_cycid; /* cyclic id */
-#else /* !illumos */
-#ifdef _KERNEL
- struct callout spa_deadman_cycid; /* callout id */
- struct task spa_deadman_task;
-#endif
-#endif /* illumos */
- uint64_t spa_deadman_calls; /* number of deadman calls */
- hrtime_t spa_sync_starttime; /* starting time fo spa_sync */
- uint64_t spa_deadman_synctime; /* deadman expiration timer */
- uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
- spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
-
-#ifdef illumos
- /*
- * spa_iokstat_lock protects spa_iokstat and
- * spa_queue_stats[].
- */
- kmutex_t spa_iokstat_lock;
- struct kstat *spa_iokstat; /* kstat of io to this pool */
- struct {
- int spa_active;
- int spa_queued;
- } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
-#endif
- /* arc_memory_throttle() parameters during low memory condition */
- uint64_t spa_lowmem_page_load; /* memory load during txg */
- uint64_t spa_lowmem_last_txg; /* txg window start */
-
- hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
-
- taskq_t *spa_zvol_taskq; /* Taskq for minor management */
-
- uint64_t spa_multihost; /* multihost aware (mmp) */
- mmp_thread_t spa_mmp; /* multihost mmp thread */
- list_t spa_leaf_list; /* list of leaf vdevs */
- uint64_t spa_leaf_list_gen; /* track leaf_list changes */
-
- /*
- * spa_refcount & spa_config_lock must be the last elements
- * because refcount_t changes size based on compilation options.
- * because zfs_refcount_t changes size based on compilation options.
- * In order for the MDB module to function correctly, the other
- * fields must remain in the same location.
- */
- spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
- zfs_refcount_t spa_refcount; /* number of opens */
-#ifndef illumos
- boolean_t spa_splitting_newspa; /* creating new spa in split */
-#endif
-};
-
-extern const char *spa_config_path;
-
-extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
-extern void spa_load_spares(spa_t *spa);
-extern void spa_load_l2cache(spa_t *spa);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPA_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -1,230 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_SPACE_MAP_H
-#define _SYS_SPACE_MAP_H
-
-#include <sys/avl.h>
-#include <sys/range_tree.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The size of the space map object has increased to include a histogram.
- * The SPACE_MAP_SIZE_V0 designates the original size and is used to
- * maintain backward compatibility.
- */
-#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
-#define SPACE_MAP_HISTOGRAM_SIZE 32
-
-/*
- * The space_map_phys is the on-disk representation of the space map.
- * Consumers of space maps should never reference any of the members of this
- * structure directly. These members may only be updated in syncing context.
- *
- * Note the smp_object is no longer used but remains in the structure
- * for backward compatibility.
- */
-typedef struct space_map_phys {
- /* object number: not needed but kept for backwards compatibility */
- uint64_t smp_object;
-
- /* length of the object in bytes */
- uint64_t smp_length;
-
- /* space allocated from the map */
- int64_t smp_alloc;
-
- /* reserved */
- uint64_t smp_pad[5];
-
- /*
- * The smp_histogram maintains a histogram of free regions. Each
- * bucket, smp_histogram[i], contains the number of free regions
- * whose size is:
- * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
- */
- uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
-} space_map_phys_t;
-
-/*
- * The space map object defines a region of space, its size, how much is
- * allocated, and the on-disk object that stores this information.
- * Consumers of space maps may only access the members of this structure.
- *
- * Note: the space_map may not be accessed concurrently; consumers
- * must provide external locking if required.
- */
-typedef struct space_map {
- uint64_t sm_start; /* start of map */
- uint64_t sm_size; /* size of map */
- uint8_t sm_shift; /* unit shift */
- objset_t *sm_os; /* objset for this map */
- uint64_t sm_object; /* object id for this map */
- uint32_t sm_blksz; /* block size for space map */
- dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */
- space_map_phys_t *sm_phys; /* on-disk space map */
-} space_map_t;
-
-/*
- * debug entry
- *
- * 2 2 10 50
- * +-----+-----+------------+----------------------------------+
- * | 1 0 | act | syncpass | txg (lower bits) |
- * +-----+-----+------------+----------------------------------+
- * 63 62 61 60 59 50 49 0
- *
- *
- * one-word entry
- *
- * 1 47 1 15
- * +-----------------------------------------------------------+
- * | 0 | offset (sm_shift units) | type | run |
- * +-----------------------------------------------------------+
- * 63 62 16 15 14 0
- *
- *
- * two-word entry
- *
- * 2 2 36 24
- * +-----+-----+---------------------------+-------------------+
- * | 1 1 | pad | run | vdev |
- * +-----+-----+---------------------------+-------------------+
- * 63 62 61 60 59 24 23 0
- *
- * 1 63
- * +------+----------------------------------------------------+
- * | type | offset |
- * +------+----------------------------------------------------+
- * 63 62 0
- *
- * Note that a two-word entry will not strandle a block boundary.
- * If necessary, the last word of a block will be padded with a
- * debug entry (with act = syncpass = txg = 0).
- */
-
-typedef enum {
- SM_ALLOC,
- SM_FREE
-} maptype_t;
-
-typedef struct space_map_entry {
- maptype_t sme_type;
- uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */
- uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */
- uint64_t sme_run; /* max is 2^36; units of sm_shift */
-} space_map_entry_t;
-
-#define SM_NO_VDEVID (1 << SPA_VDEVBITS)
-
-/* one-word entry constants */
-#define SM_DEBUG_PREFIX 2
-#define SM_OFFSET_BITS 47
-#define SM_RUN_BITS 15
-
-/* two-word entry constants */
-#define SM2_PREFIX 3
-#define SM2_OFFSET_BITS 63
-#define SM2_RUN_BITS 36
-
-#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2)
-#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2)
-
-#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2)
-#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2)
-#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
-#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
-#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
-#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
-
-#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS)
-#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS)
-#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
-#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
-#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1)
-#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS)
-#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
-#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL)
-
-#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1)
-#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS)
-#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS)
-#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS)
-#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1)
-#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1)
-#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS)
-#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS)
-#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL)
-#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL)
-
-boolean_t sm_entry_is_debug(uint64_t e);
-boolean_t sm_entry_is_single_word(uint64_t e);
-boolean_t sm_entry_is_double_word(uint64_t e);
-
-typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
-
-int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
-int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
- uint64_t length);
-int space_map_iterate(space_map_t *sm, uint64_t length,
- sm_cb_t callback, void *arg);
-int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
- dmu_tx_t *tx);
-
-boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
-void space_map_histogram_clear(space_map_t *sm);
-void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
- dmu_tx_t *tx);
-
-uint64_t space_map_object(space_map_t *sm);
-int64_t space_map_allocated(space_map_t *sm);
-uint64_t space_map_length(space_map_t *sm);
-
-void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
- uint64_t vdev_id, dmu_tx_t *tx);
-uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
- uint64_t vdev_id);
-void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
-uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
-void space_map_free(space_map_t *sm, dmu_tx_t *tx);
-void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);
-
-int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
- uint64_t start, uint64_t size, uint8_t shift);
-void space_map_close(space_map_t *sm);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPACE_MAP_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h
@@ -1,57 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_SPACE_REFTREE_H
-#define _SYS_SPACE_REFTREE_H
-
-#include <sys/range_tree.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct space_ref {
- avl_node_t sr_node; /* AVL node */
- uint64_t sr_offset; /* range offset (start or end) */
- int64_t sr_refcnt; /* associated reference count */
-} space_ref_t;
-
-void space_reftree_create(avl_tree_t *t);
-void space_reftree_destroy(avl_tree_t *t);
-void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
- int64_t refcnt);
-void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt);
-void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt,
- int64_t minref);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPACE_REFTREE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
@@ -1,51 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
- */
-
-#ifndef _SYS_TRIM_MAP_H
-#define _SYS_TRIM_MAP_H
-
-#include <sys/avl.h>
-#include <sys/list.h>
-#include <sys/spa.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern void trim_map_create(vdev_t *vd);
-extern void trim_map_destroy(vdev_t *vd);
-extern void trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg);
-extern boolean_t trim_map_write_start(zio_t *zio);
-extern void trim_map_write_done(zio_t *zio);
-
-extern void trim_thread_create(spa_t *spa);
-extern void trim_thread_destroy(spa_t *spa);
-extern void trim_thread_wakeup(spa_t *spa);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_TRIM_MAP_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
@@ -1,136 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_TXG_H
-#define _SYS_TXG_H
-
-#include <sys/spa.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
-#define TXG_SIZE 4 /* next power of 2 */
-#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
-#define TXG_INITIAL TXG_SIZE /* initial txg */
-#define TXG_IDX (txg & TXG_MASK)
-
-/* Number of txgs worth of frees we defer adding to in-core spacemaps */
-#define TXG_DEFER_SIZE 2
-
-typedef struct tx_cpu tx_cpu_t;
-
-typedef struct txg_handle {
- tx_cpu_t *th_cpu;
- uint64_t th_txg;
-} txg_handle_t;
-
-typedef struct txg_node {
- struct txg_node *tn_next[TXG_SIZE];
- uint8_t tn_member[TXG_SIZE];
-} txg_node_t;
-
-typedef struct txg_list {
- kmutex_t tl_lock;
- size_t tl_offset;
- spa_t *tl_spa;
- txg_node_t *tl_head[TXG_SIZE];
-} txg_list_t;
-
-struct dsl_pool;
-
-extern void txg_init(struct dsl_pool *dp, uint64_t txg);
-extern void txg_fini(struct dsl_pool *dp);
-extern void txg_sync_start(struct dsl_pool *dp);
-extern void txg_sync_stop(struct dsl_pool *dp);
-extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
-extern void txg_rele_to_quiesce(txg_handle_t *txghp);
-extern void txg_rele_to_sync(txg_handle_t *txghp);
-extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
-
-extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
- hrtime_t resolution);
-extern void txg_kick(struct dsl_pool *dp);
-
-/*
- * Wait until the given transaction group has finished syncing.
- * Try to make this happen as soon as possible (eg. kick off any
- * necessary syncs immediately). If txg==0, wait for the currently open
- * txg to finish syncing.
- */
-extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * Wait as above. Returns true if the thread was signaled while waiting.
- */
-extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * Wait until the given transaction group, or one after it, is
- * the open transaction group. Try to make this happen as soon
- * as possible (eg. kick off any necessary syncs immediately).
- * If txg == 0, wait for the next open txg.
- */
-extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * Returns TRUE if we are "backed up" waiting for the syncing
- * transaction to complete; otherwise returns FALSE.
- */
-extern boolean_t txg_stalled(struct dsl_pool *dp);
-
-/* returns TRUE if someone is waiting for the next txg to sync */
-extern boolean_t txg_sync_waiting(struct dsl_pool *dp);
-
-extern void txg_verify(spa_t *spa, uint64_t txg);
-
-/*
- * Per-txg object lists.
- */
-
-#define TXG_CLEAN(txg) ((txg) - 1)
-
-extern void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset);
-extern void txg_list_destroy(txg_list_t *tl);
-extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg);
-extern boolean_t txg_all_lists_empty(txg_list_t *tl);
-extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
-extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
-extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
-extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
-extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
-extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
-extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_TXG_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -1,125 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_TXG_IMPL_H
-#define _SYS_TXG_IMPL_H
-
-#include <sys/spa.h>
-#include <sys/txg.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The tx_cpu structure is a per-cpu structure that is used to track
- * the number of active transaction holds (tc_count). As transactions
- * are assigned into a transaction group the appropriate tc_count is
- * incremented to indicate that there are pending changes that have yet
- * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement
- * the tc_count. A transaction group is not considered quiesced until all
- * tx_cpu structures have reached a tc_count of zero.
- *
- * This structure is a per-cpu structure by design. Updates to this structure
- * are frequent and concurrent. Having a single structure would result in
- * heavy lock contention so a per-cpu design was implemented. With the fanned
- * out mutex design, consumers only need to lock the mutex associated with
- * thread's cpu.
- *
- * The tx_cpu contains two locks, the tc_lock and tc_open_lock.
- * The tc_lock is used to protect all members of the tx_cpu structure with
- * the exception of the tc_open_lock. This lock should only be held for a
- * short period of time, typically when updating the value of tc_count.
- *
- * The tc_open_lock protects the tx_open_txg member of the tx_state structure.
- * This lock is used to ensure that transactions are only assigned into
- * the current open transaction group. In order to move the current open
- * transaction group to the quiesce phase, the txg_quiesce thread must
- * grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
- * The tc_open_lock is held until the transaction is assigned into the
- * transaction group. Typically, this is a short operation but if throttling
- * is occuring it may be held for longer periods of time.
- */
-struct tx_cpu {
- kmutex_t tc_open_lock; /* protects tx_open_txg */
- kmutex_t tc_lock; /* protects the rest of this struct */
- kcondvar_t tc_cv[TXG_SIZE];
- uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
- list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
- char tc_pad[8]; /* pad to fill 3 cache lines */
-};
-
-/*
- * The tx_state structure maintains the state information about the different
- * stages of the pool's transcation groups. A per pool tx_state structure
- * is used to track this information. The tx_state structure also points to
- * an array of tx_cpu structures (described above). Although the tx_sync_lock
- * is used to protect the members of this structure, it is not used to
- * protect the tx_open_txg. Instead a special lock in the tx_cpu structure
- * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
- * Any thread wishing to update tx_open_txg must grab the tc_open_lock on
- * every cpu (see txg_quiesce()).
- */
-typedef struct tx_state {
- tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
- kmutex_t tx_sync_lock; /* protects the rest of this struct */
-
- uint64_t tx_open_txg; /* currently open txg id */
- uint64_t tx_quiescing_txg; /* currently quiescing txg id */
- uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
- uint64_t tx_syncing_txg; /* currently syncing txg id */
- uint64_t tx_synced_txg; /* last synced txg id */
-
- hrtime_t tx_open_time; /* start time of tx_open_txg */
-
- uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
- uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
-
- kcondvar_t tx_sync_more_cv;
- kcondvar_t tx_sync_done_cv;
- kcondvar_t tx_quiesce_more_cv;
- kcondvar_t tx_quiesce_done_cv;
- kcondvar_t tx_timeout_cv;
- kcondvar_t tx_exit_cv; /* wait for all threads to exit */
-
- uint8_t tx_threads; /* number of threads */
- uint8_t tx_exiting; /* set when we're exiting */
-
- kthread_t *tx_sync_thread;
- kthread_t *tx_quiesce_thread;
-
- taskq_t *tx_commit_cb_taskq; /* commit callback taskq */
-} tx_state_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_TXG_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
@@ -1,50 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_UBERBLOCK_H
-#define _SYS_UBERBLOCK_H
-
-#include <sys/spa.h>
-#include <sys/vdev.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct uberblock uberblock_t;
-
-extern int uberblock_verify(uberblock_t *);
-extern boolean_t uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg,
- uint64_t mmp_delay);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UBERBLOCK_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -1,145 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_UBERBLOCK_IMPL_H
-#define _SYS_UBERBLOCK_IMPL_H
-
-#include <sys/uberblock.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The uberblock version is incremented whenever an incompatible on-disk
- * format change is made to the SPA, DMU, or ZAP.
- *
- * Note: the first two fields should never be moved. When a storage pool
- * is opened, the uberblock must be read off the disk before the version
- * can be checked. If the ub_version field is moved, we may not detect
- * version mismatch. If the ub_magic field is moved, applications that
- * expect the magic number in the first word won't work.
- */
-#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
-#define UBERBLOCK_SHIFT 10 /* up to 1K */
-#define MMP_MAGIC 0xa11cea11 /* all-see-all */
-
-#define MMP_INTERVAL_VALID_BIT 0x01
-#define MMP_SEQ_VALID_BIT 0x02
-#define MMP_FAIL_INT_VALID_BIT 0x04
-
-#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
- ubp->ub_mmp_magic == MMP_MAGIC)
-#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
- MMP_INTERVAL_VALID_BIT))
-#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
- MMP_SEQ_VALID_BIT))
-#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
- MMP_FAIL_INT_VALID_BIT))
-
-#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
- >> 8)
-#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
- >> 32)
-#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
- >> 48)
-
-#define MMP_INTERVAL_SET(write) \
- (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT)
-
-#define MMP_SEQ_SET(seq) \
- (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT)
-
-#define MMP_FAIL_INT_SET(fail) \
- (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT)
-
-struct uberblock {
- uint64_t ub_magic; /* UBERBLOCK_MAGIC */
- uint64_t ub_version; /* SPA_VERSION */
- uint64_t ub_txg; /* txg of last sync */
- uint64_t ub_guid_sum; /* sum of all vdev guids */
- uint64_t ub_timestamp; /* UTC time of last sync */
- blkptr_t ub_rootbp; /* MOS objset_phys_t */
-
- /* highest SPA_VERSION supported by software that wrote this txg */
- uint64_t ub_software_version;
-
- /* Maybe missing in uberblocks we read, but always written */
- uint64_t ub_mmp_magic;
- /*
- * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
- * Otherwise, nanosec since last MMP write.
- */
- uint64_t ub_mmp_delay;
-
- /*
- * The ub_mmp_config contains the multihost write interval, multihost
- * fail intervals, sequence number for sub-second granularity, and
- * valid bit mask. This layout is as follows:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * This allows a write_interval of (2^24/1000)s, over 4.5 hours
- *
- * VALID Bits:
- * - 0x01 - Write Interval (ms)
- * - 0x02 - Sequence number exists
- * - 0x04 - Fail Intervals
- * - 0xf8 - Reserved
- */
- uint64_t ub_mmp_config;
-
- /*
- * ub_checkpoint_txg indicates two things about the current uberblock:
- *
- * 1] If it is not zero then this uberblock is a checkpoint. If it is
- * zero, then this uberblock is not a checkpoint.
- *
- * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
- * the ub_txg that the uberblock had at the time we moved it to
- * the MOS config.
- *
- * The field is set when we checkpoint the uberblock and continues to
- * hold that value even after we've rewound (unlike the ub_txg that
- * is reset to a higher value).
- *
- * Besides checks used to determine whether we are reopening the
- * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
- * the value of the field is used to determine which ZIL blocks have
- * been allocated according to the ms_sm when we are rewinding to a
- * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
- * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
- */
- uint64_t ub_checkpoint_txg;
-};
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UBERBLOCK_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
@@ -1,57 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_UNIQUE_H
-#define _SYS_UNIQUE_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* The number of significant bits in each unique value. */
-#define UNIQUE_BITS 56
-
-void unique_init(void);
-void unique_fini(void);
-
-/*
- * Return a new unique value (which will not be uniquified against until
- * it is unique_insert()-ed).
- */
-uint64_t unique_create(void);
-
-/* Return a unique value, which equals the one passed in if possible. */
-uint64_t unique_insert(uint64_t value);
-
-/* Indicate that this value no longer needs to be uniquified against. */
-void unique_remove(uint64_t value);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UNIQUE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -1,196 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#ifndef _SYS_VDEV_H
-#define _SYS_VDEV_H
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-#include <sys/space_map.h>
-#include <sys/fs/zfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum vdev_dtl_type {
- DTL_MISSING, /* 0% replication: no copies of the data */
- DTL_PARTIAL, /* less than 100% replication: some copies missing */
- DTL_SCRUB, /* unable to fully repair during scrub/resilver */
- DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
- DTL_TYPES
-} vdev_dtl_type_t;
-
-extern boolean_t zfs_nocacheflush;
-extern boolean_t zfs_trim_enabled;
-
-extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
-extern void vdev_dbgmsg_print_tree(vdev_t *, int);
-extern int vdev_open(vdev_t *);
-extern void vdev_open_children(vdev_t *);
-extern boolean_t vdev_uses_zvols(vdev_t *);
-extern int vdev_validate(vdev_t *);
-extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
-extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
-extern void vdev_close(vdev_t *);
-extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
-extern void vdev_reopen(vdev_t *);
-extern int vdev_validate_aux(vdev_t *vd);
-extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
-extern boolean_t vdev_is_concrete(vdev_t *vd);
-extern boolean_t vdev_is_bootable(vdev_t *vd);
-extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
-extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
-extern int vdev_count_leaves(spa_t *spa);
-extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
- uint64_t txg, uint64_t size);
-extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
- uint64_t txg, uint64_t size);
-extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
-extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
-extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
- int scrub_done);
-extern boolean_t vdev_dtl_required(vdev_t *vd);
-extern boolean_t vdev_resilver_needed(vdev_t *vd,
- uint64_t *minp, uint64_t *maxp);
-extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
- dmu_tx_t *tx);
-extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
-extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
-extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
-extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
- uint64_t size);
-extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
- uint64_t offset, uint64_t size, dmu_tx_t *tx);
-
-extern void vdev_hold(vdev_t *);
-extern void vdev_rele(vdev_t *);
-
-extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
-extern void vdev_metaslab_fini(vdev_t *vd);
-extern void vdev_metaslab_set_size(vdev_t *);
-extern void vdev_ashift_optimize(vdev_t *);
-extern void vdev_expand(vdev_t *vd, uint64_t txg);
-extern void vdev_split(vdev_t *vd);
-extern void vdev_deadman(vdev_t *vd);
-
-extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
-extern void vdev_clear_stats(vdev_t *vd);
-extern void vdev_stat_update(zio_t *zio, uint64_t psize);
-extern void vdev_scan_stat_init(vdev_t *vd);
-extern void vdev_propagate_state(vdev_t *vd);
-extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
- vdev_aux_t aux);
-extern boolean_t vdev_children_are_offline(vdev_t *vd);
-
-extern void vdev_space_update(vdev_t *vd,
- int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
-
-extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
-
-extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
-
-extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
-extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
-extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
- vdev_state_t *);
-extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
-extern void vdev_clear(spa_t *spa, vdev_t *vd);
-
-extern boolean_t vdev_is_dead(vdev_t *vd);
-extern boolean_t vdev_readable(vdev_t *vd);
-extern boolean_t vdev_writeable(vdev_t *vd);
-extern boolean_t vdev_allocatable(vdev_t *vd);
-extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
-extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
-
-extern void vdev_cache_init(vdev_t *vd);
-extern void vdev_cache_fini(vdev_t *vd);
-extern boolean_t vdev_cache_read(zio_t *zio);
-extern void vdev_cache_write(zio_t *zio);
-extern void vdev_cache_purge(vdev_t *vd);
-
-extern void vdev_queue_init(vdev_t *vd);
-extern void vdev_queue_fini(vdev_t *vd);
-extern zio_t *vdev_queue_io(zio_t *zio);
-extern void vdev_queue_io_done(zio_t *zio);
-extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
-extern int vdev_queue_length(vdev_t *vd);
-extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
-extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
-
-extern void vdev_config_dirty(vdev_t *vd);
-extern void vdev_config_clean(vdev_t *vd);
-extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
-
-extern void vdev_state_dirty(vdev_t *vd);
-extern void vdev_state_clean(vdev_t *vd);
-
-typedef enum vdev_config_flag {
- VDEV_CONFIG_SPARE = 1 << 0,
- VDEV_CONFIG_L2CACHE = 1 << 1,
- VDEV_CONFIG_REMOVING = 1 << 2,
- VDEV_CONFIG_MOS = 1 << 3,
- VDEV_CONFIG_MISSING = 1 << 4
-} vdev_config_flag_t;
-
-extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
-extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
- boolean_t getstats, vdev_config_flag_t flags);
-
-/*
- * Label routines
- */
-struct uberblock;
-extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
-extern int vdev_label_number(uint64_t psise, uint64_t offset);
-extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
-extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
-extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
- offset, uint64_t size, zio_done_func_t *done, void *priv, int flags);
-extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
-extern int vdev_label_write_bootenv(vdev_t *, char *);
-
-typedef enum {
- VDEV_LABEL_CREATE, /* create/add a new device */
- VDEV_LABEL_REPLACE, /* replace an existing device */
- VDEV_LABEL_SPARE, /* add a new hot spare */
- VDEV_LABEL_REMOVE, /* remove an existing device */
- VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
- VDEV_LABEL_SPLIT /* generating new label for split-off dev */
-} vdev_labeltype_t;
-
-extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
-
-extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
@@ -1,67 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- * Copyright (c) 2013 Joyent, Inc. All rights reserved.
- * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
- */
-
-#ifndef _SYS_VDEV_DISK_H
-#define _SYS_VDEV_DISK_H
-
-#include <sys/vdev.h>
-#ifdef _KERNEL
-#include <sys/buf.h>
-#include <sys/ddi.h>
-#include <sys/sunldi.h>
-#include <sys/sunddi.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-typedef struct vdev_disk {
- ddi_devid_t vd_devid;
- char *vd_minor;
- ldi_handle_t vd_lh;
- list_t vd_ldi_cbs;
- boolean_t vd_ldi_offline;
-} vdev_disk_t;
-#endif
-
-extern int vdev_disk_physio(vdev_t *,
- caddr_t, size_t, uint64_t, int, boolean_t);
-
-/*
- * Since vdev_disk.c is not compiled into libzpool, this function should only be
- * defined in the zfs kernel module.
- */
-#ifdef _KERNEL
-extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
-#endif
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_DISK_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
@@ -1,49 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_FILE_H
-#define _SYS_VDEV_FILE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/vdev.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct vdev_file {
- vnode_t *vf_vnode;
-} vdev_file_t;
-
-extern void vdev_file_init(void);
-extern void vdev_file_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_FILE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -1,571 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#ifndef _SYS_VDEV_IMPL_H
-#define _SYS_VDEV_IMPL_H
-
-#include <sys/avl.h>
-#include <sys/bpobj.h>
-#include <sys/dmu.h>
-#include <sys/metaslab.h>
-#include <sys/nvpair.h>
-#include <sys/space_map.h>
-#include <sys/vdev.h>
-#include <sys/dkio.h>
-#include <sys/uberblock_impl.h>
-#include <sys/vdev_indirect_mapping.h>
-#include <sys/vdev_indirect_births.h>
-#include <sys/vdev_removal.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Virtual device descriptors.
- *
- * All storage pool operations go through the virtual device framework,
- * which provides data replication and I/O scheduling.
- */
-
-/*
- * Forward declarations that lots of things need.
- */
-typedef struct vdev_queue vdev_queue_t;
-typedef struct vdev_cache vdev_cache_t;
-typedef struct vdev_cache_entry vdev_cache_entry_t;
-struct abd;
-
-extern int zfs_vdev_queue_depth_pct;
-extern int zfs_vdev_def_queue_depth;
-extern uint32_t zfs_vdev_async_write_max_active;
-
-/*
- * Virtual device operations
- */
-typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
- uint64_t *logical_ashift, uint64_t *physical_ashift);
-typedef void vdev_close_func_t(vdev_t *vd);
-typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef void vdev_io_start_func_t(zio_t *zio);
-typedef void vdev_io_done_func_t(zio_t *zio);
-typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
-typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
-typedef void vdev_hold_func_t(vdev_t *vd);
-typedef void vdev_rele_func_t(vdev_t *vd);
-
-typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
- uint64_t offset, uint64_t size, void *arg);
-typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
- vdev_remap_cb_t callback, void *arg);
-/*
- * Given a target vdev, translates the logical range "in" to the physical
- * range "res"
- */
-typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
- range_seg_t *res);
-
-typedef struct vdev_ops {
- vdev_open_func_t *vdev_op_open;
- vdev_close_func_t *vdev_op_close;
- vdev_asize_func_t *vdev_op_asize;
- vdev_io_start_func_t *vdev_op_io_start;
- vdev_io_done_func_t *vdev_op_io_done;
- vdev_state_change_func_t *vdev_op_state_change;
- vdev_need_resilver_func_t *vdev_op_need_resilver;
- vdev_hold_func_t *vdev_op_hold;
- vdev_rele_func_t *vdev_op_rele;
- vdev_remap_func_t *vdev_op_remap;
- /*
- * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
- * Used when initializing vdevs. Isn't used by leaf ops.
- */
- vdev_xlation_func_t *vdev_op_xlate;
- char vdev_op_type[16];
- boolean_t vdev_op_leaf;
-} vdev_ops_t;
-
-/*
- * Virtual device properties
- */
-struct vdev_cache_entry {
- struct abd *ve_abd;
- uint64_t ve_offset;
- uint64_t ve_lastused;
- avl_node_t ve_offset_node;
- avl_node_t ve_lastused_node;
- uint32_t ve_hits;
- uint16_t ve_missed_update;
- zio_t *ve_fill_io;
-};
-
-struct vdev_cache {
- avl_tree_t vc_offset_tree;
- avl_tree_t vc_lastused_tree;
- kmutex_t vc_lock;
-};
-
-typedef struct vdev_queue_class {
- uint32_t vqc_active;
-
- /*
- * Sorted by offset or timestamp, depending on if the queue is
- * LBA-ordered vs FIFO.
- */
- avl_tree_t vqc_queued_tree;
-} vdev_queue_class_t;
-
-struct vdev_queue {
- vdev_t *vq_vdev;
- vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
- avl_tree_t vq_active_tree;
- avl_tree_t vq_read_offset_tree;
- avl_tree_t vq_write_offset_tree;
- uint64_t vq_last_offset;
- hrtime_t vq_io_complete_ts; /* time last i/o completed */
- kmutex_t vq_lock;
- uint64_t vq_lastoffset;
-};
-
-typedef enum vdev_alloc_bias {
- VDEV_BIAS_NONE,
- VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */
- VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */
- VDEV_BIAS_DEDUP /* dedicated to dedup metadata */
-} vdev_alloc_bias_t;
-
-
-/*
- * On-disk indirect vdev state.
- *
- * An indirect vdev is described exclusively in the MOS config of a pool.
- * The config for an indirect vdev includes several fields, which are
- * accessed in memory by a vdev_indirect_config_t.
- */
-typedef struct vdev_indirect_config {
- /*
- * Object (in MOS) which contains the indirect mapping. This object
- * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
- * vimep_src. The bonus buffer for this object is a
- * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
- * removal is initiated.
- *
- * Note that this object can be empty if none of the data on the vdev
- * has been copied yet.
- */
- uint64_t vic_mapping_object;
-
- /*
- * Object (in MOS) which contains the birth times for the mapping
- * entries. This object contains an array of
- * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
- * buffer for this object is a vdev_indirect_birth_phys_t. This object
- * is allocated when a vdev removal is initiated.
- *
- * Note that this object can be empty if none of the vdev has yet been
- * copied.
- */
- uint64_t vic_births_object;
-
- /*
- * This is the vdev ID which was removed previous to this vdev, or
- * UINT64_MAX if there are no previously removed vdevs.
- */
- uint64_t vic_prev_indirect_vdev;
-} vdev_indirect_config_t;
-
-/*
- * Virtual device descriptor
- */
-struct vdev {
- /*
- * Common to all vdev types.
- */
- uint64_t vdev_id; /* child number in vdev parent */
- uint64_t vdev_guid; /* unique ID for this vdev */
- uint64_t vdev_guid_sum; /* self guid + all child guids */
- uint64_t vdev_orig_guid; /* orig. guid prior to remove */
- uint64_t vdev_asize; /* allocatable device capacity */
- uint64_t vdev_min_asize; /* min acceptable asize */
- uint64_t vdev_max_asize; /* max acceptable asize */
- uint64_t vdev_ashift; /* block alignment shift */
- /*
- * Logical block alignment shift
- *
- * The smallest sized/aligned I/O supported by the device.
- */
- uint64_t vdev_logical_ashift;
- /*
- * Physical block alignment shift
- *
- * The device supports logical I/Os with vdev_logical_ashift
- * size/alignment, but optimum performance will be achieved by
- * aligning/sizing requests to vdev_physical_ashift. Smaller
- * requests may be inflated or incur device level read-modify-write
- * operations.
- *
- * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
- */
- uint64_t vdev_physical_ashift;
- uint64_t vdev_state; /* see VDEV_STATE_* #defines */
- uint64_t vdev_prevstate; /* used when reopening a vdev */
- vdev_ops_t *vdev_ops; /* vdev operations */
- spa_t *vdev_spa; /* spa for this vdev */
- void *vdev_tsd; /* type-specific data */
- vnode_t *vdev_name_vp; /* vnode for pathname */
- vnode_t *vdev_devid_vp; /* vnode for devid */
- vdev_t *vdev_top; /* top-level vdev */
- vdev_t *vdev_parent; /* parent vdev */
- vdev_t **vdev_child; /* array of children */
- uint64_t vdev_children; /* number of children */
- vdev_stat_t vdev_stat; /* virtual device statistics */
- boolean_t vdev_expanding; /* expand the vdev? */
- boolean_t vdev_reopening; /* reopen in progress? */
- boolean_t vdev_nonrot; /* true if solid state */
- int vdev_open_error; /* error on last open */
- kthread_t *vdev_open_thread; /* thread opening children */
- uint64_t vdev_crtxg; /* txg when top-level was added */
-
- /*
- * Top-level vdev state.
- */
- uint64_t vdev_ms_array; /* metaslab array object */
- uint64_t vdev_ms_shift; /* metaslab size shift */
- uint64_t vdev_ms_count; /* number of metaslabs */
- metaslab_group_t *vdev_mg; /* metaslab group */
- metaslab_t **vdev_ms; /* metaslab array */
- txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
- txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
- txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
- boolean_t vdev_remove_wanted; /* async remove wanted? */
- boolean_t vdev_probe_wanted; /* async probe wanted? */
- list_node_t vdev_config_dirty_node; /* config dirty list */
- list_node_t vdev_state_dirty_node; /* state dirty list */
- uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
- uint64_t vdev_islog; /* is an intent log device */
- uint64_t vdev_removing; /* device is being removed? */
- boolean_t vdev_ishole; /* is a hole in the namespace */
- uint64_t vdev_top_zap;
- vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
-
- /* pool checkpoint related */
- space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
-
- boolean_t vdev_initialize_exit_wanted;
- vdev_initializing_state_t vdev_initialize_state;
- kthread_t *vdev_initialize_thread;
- /* Protects vdev_initialize_thread and vdev_initialize_state. */
- kmutex_t vdev_initialize_lock;
- kcondvar_t vdev_initialize_cv;
- uint64_t vdev_initialize_offset[TXG_SIZE];
- uint64_t vdev_initialize_last_offset;
- range_tree_t *vdev_initialize_tree; /* valid while initializing */
- uint64_t vdev_initialize_bytes_est;
- uint64_t vdev_initialize_bytes_done;
- time_t vdev_initialize_action_time; /* start and end time */
-
- /* for limiting outstanding I/Os */
- kmutex_t vdev_initialize_io_lock;
- kcondvar_t vdev_initialize_io_cv;
- uint64_t vdev_initialize_inflight;
-
- /*
- * Values stored in the config for an indirect or removing vdev.
- */
- vdev_indirect_config_t vdev_indirect_config;
-
- /*
- * The vdev_indirect_rwlock protects the vdev_indirect_mapping
- * pointer from changing on indirect vdevs (when it is condensed).
- * Note that removing (not yet indirect) vdevs have different
- * access patterns (the mapping is not accessed from open context,
- * e.g. from zio_read) and locking strategy (e.g. svr_lock).
- */
- krwlock_t vdev_indirect_rwlock;
- vdev_indirect_mapping_t *vdev_indirect_mapping;
- vdev_indirect_births_t *vdev_indirect_births;
-
- /*
- * In memory data structures used to manage the obsolete sm, for
- * indirect or removing vdevs.
- *
- * The vdev_obsolete_segments is the in-core record of the segments
- * that are no longer referenced anywhere in the pool (due to
- * being freed or remapped and not referenced by any snapshots).
- * During a sync, segments are added to vdev_obsolete_segments
- * via vdev_indirect_mark_obsolete(); at the end of each sync
- * pass, this is appended to vdev_obsolete_sm via
- * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock
- * protects against concurrent modifications of vdev_obsolete_segments
- * from multiple zio threads.
- */
- kmutex_t vdev_obsolete_lock;
- range_tree_t *vdev_obsolete_segments;
- space_map_t *vdev_obsolete_sm;
-
- /*
- * Protects the vdev_scan_io_queue field itself as well as the
- * structure's contents (when present).
- */
- kmutex_t vdev_scan_io_queue_lock;
- struct dsl_scan_io_queue *vdev_scan_io_queue;
-
- /*
- * Leaf vdev state.
- */
- range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
- space_map_t *vdev_dtl_sm; /* dirty time log space map */
- txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
- uint64_t vdev_dtl_object; /* DTL object */
- uint64_t vdev_psize; /* physical device capacity */
- uint64_t vdev_wholedisk; /* true if this is a whole disk */
- uint64_t vdev_offline; /* persistent offline state */
- uint64_t vdev_faulted; /* persistent faulted state */
- uint64_t vdev_degraded; /* persistent degraded state */
- uint64_t vdev_removed; /* persistent removed state */
- uint64_t vdev_resilver_txg; /* persistent resilvering state */
- uint64_t vdev_nparity; /* number of parity devices for raidz */
- char *vdev_path; /* vdev path (if any) */
- char *vdev_devid; /* vdev devid (if any) */
- char *vdev_physpath; /* vdev device path (if any) */
- char *vdev_fru; /* physical FRU location */
- uint64_t vdev_not_present; /* not present during import */
- uint64_t vdev_unspare; /* unspare when resilvering done */
- boolean_t vdev_nowritecache; /* true if flushwritecache failed */
- boolean_t vdev_notrim; /* true if trim failed */
- boolean_t vdev_checkremove; /* temporary online test */
- boolean_t vdev_forcefault; /* force online fault */
- boolean_t vdev_splitting; /* split or repair in progress */
- boolean_t vdev_delayed_close; /* delayed device close? */
- boolean_t vdev_tmpoffline; /* device taken offline temporarily? */
- boolean_t vdev_detached; /* device detached? */
- boolean_t vdev_cant_read; /* vdev is failing all reads */
- boolean_t vdev_cant_write; /* vdev is failing all writes */
- boolean_t vdev_isspare; /* was a hot spare */
- boolean_t vdev_isl2cache; /* was a l2cache device */
- vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
- vdev_cache_t vdev_cache; /* physical block cache */
- spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
- zio_t *vdev_probe_zio; /* root of current probe */
- vdev_aux_t vdev_label_aux; /* on-disk aux state */
- struct trim_map *vdev_trimmap; /* map on outstanding trims */
- uint64_t vdev_leaf_zap;
- hrtime_t vdev_mmp_pending; /* 0 if write finished */
- uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
- list_node_t vdev_leaf_node; /* leaf vdev list */
-
- /*
- * For DTrace to work in userland (libzpool) context, these fields must
- * remain at the end of the structure. DTrace will use the kernel's
- * CTF definition for 'struct vdev', and since the size of a kmutex_t is
- * larger in userland, the offsets for the rest of the fields would be
- * incorrect.
- */
- kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
- kmutex_t vdev_stat_lock; /* vdev_stat */
- kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
-};
-
-#define VDEV_RAIDZ_MAXPARITY 3
-
-#define VDEV_PAD_SIZE (8 << 10)
-/* 2 padding areas (vl_pad1 and vl_be) to skip */
-#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
-#define VDEV_PHYS_SIZE (112 << 10)
-#define VDEV_UBERBLOCK_RING (128 << 10)
-
-/*
- * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
- * ring when MMP is enabled.
- */
-#define MMP_BLOCKS_PER_LABEL 1
-
-/* The largest uberblock we support is 8k. */
-#define MAX_UBERBLOCK_SHIFT (13)
-#define VDEV_UBERBLOCK_SHIFT(vd) \
- MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
- MAX_UBERBLOCK_SHIFT)
-#define VDEV_UBERBLOCK_COUNT(vd) \
- (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
-#define VDEV_UBERBLOCK_OFFSET(vd, n) \
- offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
-#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
-
-typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
- zio_eck_t vp_zbt;
-} vdev_phys_t;
-
-typedef enum vbe_vers {
- /* The bootenv file is stored as ascii text in the envblock */
- VB_RAW = 0,
-
- /*
- * The bootenv file is converted to an nvlist and then packed into the
- * envblock.
- */
- VB_NVLIST = 1
-} vbe_vers_t;
-
-typedef struct vdev_boot_envblock {
- uint64_t vbe_version;
- char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
- sizeof (zio_eck_t)];
- zio_eck_t vbe_zbt;
-} vdev_boot_envblock_t;
-
-CTASSERT(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE);
-
-typedef struct vdev_label {
- char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
- vdev_boot_envblock_t vl_be; /* 8K */
- vdev_phys_t vl_vdev_phys; /* 112K */
- char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
-} vdev_label_t; /* 256K total */
-
-/*
- * vdev_dirty() flags
- */
-#define VDD_METASLAB 0x01
-#define VDD_DTL 0x02
-
-/* Offset of embedded boot loader region on each label */
-#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
-/*
- * Size of embedded boot loader region on each label.
- * The total size of the first two labels plus the boot area is 4MB.
- */
-#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
-
-/*
- * Size of label regions at the start and end of each leaf device.
- */
-#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
-#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
-#define VDEV_LABELS 4
-#define VDEV_BEST_LABEL VDEV_LABELS
-
-#define VDEV_ALLOC_LOAD 0
-#define VDEV_ALLOC_ADD 1
-#define VDEV_ALLOC_SPARE 2
-#define VDEV_ALLOC_L2CACHE 3
-#define VDEV_ALLOC_ROOTPOOL 4
-#define VDEV_ALLOC_SPLIT 5
-#define VDEV_ALLOC_ATTACH 6
-
-/*
- * Allocate or free a vdev
- */
-extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
- vdev_ops_t *ops);
-extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
- vdev_t *parent, uint_t id, int alloctype);
-extern void vdev_free(vdev_t *vd);
-
-/*
- * Add or remove children and parents
- */
-extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
-extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
-extern void vdev_compact_children(vdev_t *pvd);
-extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
-extern void vdev_remove_parent(vdev_t *cvd);
-
-/*
- * vdev sync load and sync
- */
-extern boolean_t vdev_log_state_valid(vdev_t *vd);
-extern int vdev_load(vdev_t *vd);
-extern int vdev_dtl_load(vdev_t *vd);
-extern void vdev_sync(vdev_t *vd, uint64_t txg);
-extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
-extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
-extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
-
-/*
- * Available vdev types.
- */
-extern vdev_ops_t vdev_root_ops;
-extern vdev_ops_t vdev_mirror_ops;
-extern vdev_ops_t vdev_replacing_ops;
-extern vdev_ops_t vdev_raidz_ops;
-#ifdef _KERNEL
-extern vdev_ops_t vdev_geom_ops;
-#else
-extern vdev_ops_t vdev_disk_ops;
-#endif
-extern vdev_ops_t vdev_file_ops;
-extern vdev_ops_t vdev_missing_ops;
-extern vdev_ops_t vdev_hole_ops;
-extern vdev_ops_t vdev_spare_ops;
-extern vdev_ops_t vdev_indirect_ops;
-
-/*
- * Common size functions
- */
-extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
- range_seg_t *out);
-extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
-extern uint64_t vdev_get_min_asize(vdev_t *vd);
-extern void vdev_set_min_asize(vdev_t *vd);
-
-/*
- * Global variables
- */
-extern int vdev_standard_sm_blksz;
-/* zdb uses this tunable, so it must be declared here to make lint happy. */
-extern int zfs_vdev_cache_size;
-extern uint_t zfs_geom_probe_vdev_key;
-
-/*
- * Functions from vdev_indirect.c
- */
-extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx);
-extern boolean_t vdev_indirect_should_condense(vdev_t *vd);
-extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
-extern int vdev_obsolete_sm_object(vdev_t *vd);
-extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
-
-#ifdef illumos
-/*
- * Other miscellaneous functions
- */
-int vdev_checkpoint_sm_object(vdev_t *vd);
-
-/*
- * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
- */
-typedef struct vdev_buf {
- buf_t vb_buf; /* buffer that describes the io */
- zio_t *vb_io; /* pointer back to the original zio_t */
-} vdev_buf_t;
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
@@ -1,80 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H
-#define _SYS_VDEV_INDIRECT_BIRTHS_H
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct vdev_indirect_birth_entry_phys {
- uint64_t vibe_offset;
- uint64_t vibe_phys_birth_txg;
-} vdev_indirect_birth_entry_phys_t;
-
-typedef struct vdev_indirect_birth_phys {
- uint64_t vib_count; /* count of v_i_b_entry_phys_t's */
-} vdev_indirect_birth_phys_t;
-
-typedef struct vdev_indirect_births {
- uint64_t vib_object;
-
- /*
- * Each entry indicates that everything up to but not including
- * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted
- * by increasing phys_birth, and also by increasing offset. See
- * vdev_indirect_births_physbirth for usage.
- */
- vdev_indirect_birth_entry_phys_t *vib_entries;
-
- objset_t *vib_objset;
-
- dmu_buf_t *vib_dbuf;
- vdev_indirect_birth_phys_t *vib_phys;
-} vdev_indirect_births_t;
-
-extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os,
- uint64_t object);
-extern void vdev_indirect_births_close(vdev_indirect_births_t *vib);
-extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib);
-extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx);
-extern void vdev_indirect_births_free(objset_t *os, uint64_t object,
- dmu_tx_t *tx);
-
-extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib);
-extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib);
-
-extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
- uint64_t offset, uint64_t txg, dmu_tx_t *tx);
-
-extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib,
- uint64_t offset, uint64_t asize);
-
-extern uint64_t vdev_indirect_births_last_entry_txg(
- vdev_indirect_births_t *vib);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
@@ -1,141 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_VDEV_INDIRECT_MAPPING_H
-#define _SYS_VDEV_INDIRECT_MAPPING_H
-
-#include <sys/dmu.h>
-#include <sys/list.h>
-#include <sys/spa.h>
-#include <sys/space_map.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct vdev_indirect_mapping_entry_phys {
- /*
- * Decode with DVA_MAPPING_* macros.
- * Contains:
- * the source offset (low 63 bits)
- * the one-bit "mark", used for garbage collection (by zdb)
- */
- uint64_t vimep_src;
-
- /*
- * Note: the DVA's asize is 24 bits, and can thus store ranges
- * up to 8GB.
- */
- dva_t vimep_dst;
-} vdev_indirect_mapping_entry_phys_t;
-
-#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
- BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
-#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
- BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
-
-typedef struct vdev_indirect_mapping_entry {
- vdev_indirect_mapping_entry_phys_t vime_mapping;
- uint32_t vime_obsolete_count;
- list_node_t vime_node;
-} vdev_indirect_mapping_entry_t;
-
-/*
- * This is stored in the bonus buffer of the mapping object, see comment of
- * vdev_indirect_config for more details.
- */
-typedef struct vdev_indirect_mapping_phys {
- uint64_t vimp_max_offset;
- uint64_t vimp_bytes_mapped;
- uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */
-
- /*
- * For each entry in the mapping object, this object contains an
- * entry representing the number of bytes of that mapping entry
- * that were no longer in use by the pool at the time this indirect
- * vdev was last condensed.
- */
- uint64_t vimp_counts_object;
-} vdev_indirect_mapping_phys_t;
-
-#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
-
-typedef struct vdev_indirect_mapping {
- uint64_t vim_object;
- boolean_t vim_havecounts;
-
- /*
- * An ordered array of all mapping entries, sorted by source offset.
- * Note that vim_entries is needed during a removal (and contains
- * mappings that have been synced to disk so far) to handle frees
- * from the removing device.
- */
- vdev_indirect_mapping_entry_phys_t *vim_entries;
-
- objset_t *vim_objset;
-
- dmu_buf_t *vim_dbuf;
- vdev_indirect_mapping_phys_t *vim_phys;
-} vdev_indirect_mapping_t;
-
-extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os,
- uint64_t object);
-extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim);
-extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx);
-extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj,
- dmu_tx_t *tx);
-
-extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim);
-extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim);
-extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim);
-extern uint64_t vdev_indirect_mapping_bytes_mapped(
- vdev_indirect_mapping_t *vim);
-extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim);
-
-/*
- * Writes the given list of vdev_indirect_mapping_entry_t to the mapping
- * then updates internal state.
- */
-extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
- list_t *vime_list, dmu_tx_t *tx);
-
-extern vdev_indirect_mapping_entry_phys_t *
- vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
- uint64_t offset);
-
-extern vdev_indirect_mapping_entry_phys_t *
- vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
- uint64_t offset);
-
-extern uint32_t *vdev_indirect_mapping_load_obsolete_counts(
- vdev_indirect_mapping_t *vim);
-extern void vdev_indirect_mapping_load_obsolete_spacemap(
- vdev_indirect_mapping_t *vim,
- uint32_t *counts, space_map_t *obsolete_space_sm);
-extern void vdev_indirect_mapping_increment_obsolete_count(
- vdev_indirect_mapping_t *vim,
- uint64_t offset, uint64_t asize, uint32_t *counts);
-extern void vdev_indirect_mapping_free_obsolete_counts(
- vdev_indirect_mapping_t *vim, uint32_t *counts);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h
@@ -1,46 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_VDEV_INITIALIZE_H
-#define _SYS_VDEV_INITIALIZE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern void vdev_initialize(vdev_t *vd);
-extern void vdev_initialize_stop(vdev_t *vd,
- vdev_initializing_state_t tgt_state);
-extern void vdev_initialize_stop_all(vdev_t *vd,
- vdev_initializing_state_t tgt_state);
-extern void vdev_initialize_restart(vdev_t *vd);
-extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
- range_seg_t *physical_rs);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_INITIALIZE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -1,50 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- */
-
-#ifndef _SYS_VDEV_RAIDZ_H
-#define _SYS_VDEV_RAIDZ_H
-
-#include <sys/vdev.h>
-#ifdef illumos
-#include <sys/semaphore.h>
-#ifdef _KERNEL
-#include <sys/ddi.h>
-#include <sys/sunldi.h>
-#include <sys/sunddi.h>
-#endif
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-extern int vdev_raidz_physio(vdev_t *,
- caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
-#endif
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_RAIDZ_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
@@ -1,96 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_VDEV_REMOVAL_H
-#define _SYS_VDEV_REMOVAL_H
-
-#include <sys/spa.h>
-#include <sys/bpobj.h>
-#include <sys/vdev_indirect_mapping.h>
-#include <sys/vdev_indirect_births.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct spa_vdev_removal {
- uint64_t svr_vdev_id;
- uint64_t svr_max_offset_to_sync[TXG_SIZE];
- /* Thread performing a vdev removal. */
- kthread_t *svr_thread;
- /* Segments left to copy from the current metaslab. */
- range_tree_t *svr_allocd_segs;
- kmutex_t svr_lock;
- kcondvar_t svr_cv;
- boolean_t svr_thread_exit;
-
- /*
- * New mappings to write out each txg.
- */
- list_t svr_new_segments[TXG_SIZE];
-
- /*
- * Ranges that were freed while a mapping was in flight. This is
- * a subset of the ranges covered by vdev_im_new_segments.
- */
- range_tree_t *svr_frees[TXG_SIZE];
-
- /*
- * Number of bytes which we have finished our work for
- * in each txg. This could be data copied (which will be part of
- * the mappings in vdev_im_new_segments), or data freed before
- * we got around to copying it.
- */
- uint64_t svr_bytes_done[TXG_SIZE];
-
- /* List of leaf zap objects to be unlinked */
- nvlist_t *svr_zaplist;
-} spa_vdev_removal_t;
-
-typedef struct spa_condensing_indirect {
- /*
- * New mappings to write out each txg.
- */
- list_t sci_new_mapping_entries[TXG_SIZE];
-
- vdev_indirect_mapping_t *sci_new_mapping;
-} spa_condensing_indirect_t;
-
-extern int spa_remove_init(spa_t *);
-extern void spa_restart_removal(spa_t *);
-extern int spa_condense_init(spa_t *);
-extern void spa_condense_fini(spa_t *);
-extern void spa_start_indirect_condensing_thread(spa_t *);
-extern void spa_vdev_condense_suspend(spa_t *);
-extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
-extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
-extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
-extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
-extern void spa_vdev_remove_suspend(spa_t *);
-extern int spa_vdev_remove_cancel(spa_t *);
-extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
-
-extern int vdev_removal_max_span;
-extern int zfs_remove_max_segment;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_REMOVAL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -1,514 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-#ifndef _SYS_ZAP_H
-#define _SYS_ZAP_H
-
-/*
- * ZAP - ZFS Attribute Processor
- *
- * The ZAP is a module which sits on top of the DMU (Data Management
- * Unit) and implements a higher-level storage primitive using DMU
- * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
- *
- * A "zapobj" is a DMU object which the ZAP uses to stores attributes.
- * Users should use only zap routines to access a zapobj - they should
- * not access the DMU object directly using DMU routines.
- *
- * The attributes stored in a zapobj are name-value pairs. The name is
- * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
- * terminating NULL). The value is an array of integers, which may be
- * 1, 2, 4, or 8 bytes long. The total space used by the array (number
- * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
- * Note that an 8-byte integer value can be used to store the location
- * (object number) of another dmu object (which may be itself a zapobj).
- * Note that you can use a zero-length attribute to store a single bit
- * of information - the attribute is present or not.
- *
- * The ZAP routines are thread-safe. However, you must observe the
- * DMU's restriction that a transaction may not be operated on
- * concurrently.
- *
- * Any of the routines that return an int may return an I/O error (EIO
- * or ECHECKSUM).
- *
- *
- * Implementation / Performance Notes:
- *
- * The ZAP is intended to operate most efficiently on attributes with
- * short (49 bytes or less) names and single 8-byte values, for which
- * the microzap will be used. The ZAP should be efficient enough so
- * that the user does not need to cache these attributes.
- *
- * The ZAP's locking scheme makes its routines thread-safe. Operations
- * on different zapobjs will be processed concurrently. Operations on
- * the same zapobj which only read data will be processed concurrently.
- * Operations on the same zapobj which modify data will be processed
- * concurrently when there are many attributes in the zapobj (because
- * the ZAP uses per-block locking - more than 128 * (number of cpus)
- * small attributes will suffice).
- */
-
-/*
- * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
- * strings) for the names of attributes, rather than a byte string
- * bounded by an explicit length. If some day we want to support names
- * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
- * we'll have to add routines for using length-bounded strings.
- */
-
-#include <sys/dmu.h>
-#include <sys/refcount.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Specifies matching criteria for ZAP lookups.
- * MT_NORMALIZE Use ZAP normalization flags, which can include both
- * unicode normalization and case-insensitivity.
- * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is
- * specified and ZAP normalization flags include
- * U8_TEXTPREP_TOUPPER.
- */
-typedef enum matchtype {
- MT_NORMALIZE = 1 << 0,
- MT_MATCH_CASE = 1 << 1,
-} matchtype_t;
-
-typedef enum zap_flags {
- /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
- ZAP_FLAG_HASH64 = 1 << 0,
- /* Key is binary, not string (zap_add_uint64() can be used) */
- ZAP_FLAG_UINT64_KEY = 1 << 1,
- /*
- * First word of key (which must be an array of uint64) is
- * already randomly distributed.
- */
- ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
-} zap_flags_t;
-
-/*
- * Create a new zapobj with no attributes and return its object number.
- *
- * dnodesize specifies the on-disk size of the dnode for the new zapobj.
- * Valid values are multiples of 512 up to DNODE_MAX_SIZE.
- */
-uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
-uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
- dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
- int dnodesize, dmu_tx_t *tx);
-uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
- dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
- zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
- int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
- int dnodesize, dmu_tx_t *tx);
-uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
- uint64_t parent_obj, const char *name, dmu_tx_t *tx);
-uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
- uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
-uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
- uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
-
-/*
- * Initialize an already-allocated object.
- */
-void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
- zap_flags_t flags, dmu_tx_t *tx);
-
-/*
- * Create a new zapobj with no attributes from the given (unallocated)
- * object number.
- */
-int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
-int zap_create_claim_norm(objset_t *ds, uint64_t obj,
- int normflags, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
- int normflags, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
-
-/*
- * The zapobj passed in must be a valid ZAP object for all of the
- * following routines.
- */
-
-/*
- * Destroy this zapobj and all its attributes.
- *
- * Frees the object number using dmu_object_free.
- */
-int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
-
-/*
- * Manipulate attributes.
- *
- * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
- */
-
-/*
- * Retrieve the contents of the attribute with the given name.
- *
- * If the requested attribute does not exist, the call will fail and
- * return ENOENT.
- *
- * If 'integer_size' is smaller than the attribute's integer size, the
- * call will fail and return EINVAL.
- *
- * If 'integer_size' is equal to or larger than the attribute's integer
- * size, the call will succeed and return 0.
- *
- * When converting to a larger integer size, the integers will be treated as
- * unsigned (ie. no sign-extension will be performed).
- *
- * 'num_integers' is the length (in integers) of 'buf'.
- *
- * If the attribute is longer than the buffer, as many integers as will
- * fit will be transferred to 'buf'. If the entire attribute was not
- * transferred, the call will return EOVERFLOW.
- */
-int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf);
-
-/*
- * If rn_len is nonzero, realname will be set to the name of the found
- * entry (which may be different from the requested name if matchtype is
- * not MT_EXACT).
- *
- * If normalization_conflictp is not NULL, it will be set if there is
- * another name with the same case/unicode normalized form.
- */
-int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf,
- matchtype_t mt, char *realname, int rn_len,
- boolean_t *normalization_conflictp);
-int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
-int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
-int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints);
-int zap_lookup_by_dnode(dnode_t *dn, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf);
-int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf,
- matchtype_t mt, char *realname, int rn_len,
- boolean_t *ncp);
-
-int zap_count_write_by_dnode(dnode_t *dn, const char *name,
- int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite);
-
-/*
- * Create an attribute with the given name and value.
- *
- * If an attribute with the given name already exists, the call will
- * fail and return EEXIST.
- */
-int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
-int zap_add_by_dnode(dnode_t *dn, const char *key,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
-int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
- int key_numints, int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
-
-/*
- * Set the attribute with the given name to the given value. If an
- * attribute with the given name does not exist, it will be created. If
- * an attribute with the given name already exists, the previous value
- * will be overwritten. The integer_size may be different from the
- * existing attribute's integer size, in which case the attribute's
- * integer size will be updated to the new value.
- */
-int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
-int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
-
-/*
- * Get the length (in integers) and the integer size of the specified
- * attribute.
- *
- * If the requested attribute does not exist, the call will fail and
- * return ENOENT.
- */
-int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t *integer_size, uint64_t *num_integers);
-int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints, uint64_t *integer_size, uint64_t *num_integers);
-
-/*
- * Remove the specified attribute.
- *
- * If the specified attribute does not exist, the call will fail and
- * return ENOENT.
- */
-int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
-int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
- matchtype_t mt, dmu_tx_t *tx);
-int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
-int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints, dmu_tx_t *tx);
-
-/*
- * Returns (in *count) the number of attributes in the specified zap
- * object.
- */
-int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
-
-/*
- * Returns (in name) the name of the entry whose (value & mask)
- * (za_first_integer) is value, or ENOENT if not found. The string
- * pointed to by name must be at least 256 bytes long. If mask==0, the
- * match must be exact (ie, same as mask=-1ULL).
- */
-int zap_value_search(objset_t *os, uint64_t zapobj,
- uint64_t value, uint64_t mask, char *name);
-
-/*
- * Transfer all the entries from fromobj into intoobj. Only works on
- * int_size=8 num_integers=1 values. Fails if there are any duplicated
- * entries.
- */
-int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
-
-/* Same as zap_join, but set the values to 'value'. */
-int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
- uint64_t value, dmu_tx_t *tx);
-
-/* Same as zap_join, but add together any duplicated entries. */
-int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
- dmu_tx_t *tx);
-
-/*
- * Manipulate entries where the name + value are the "same" (the name is
- * a stringified version of the value).
- */
-int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
-int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
-int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
-int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
- dmu_tx_t *tx);
-
-/* Here the key is an int and the value is a different int. */
-int zap_add_int_key(objset_t *os, uint64_t obj,
- uint64_t key, uint64_t value, dmu_tx_t *tx);
-int zap_update_int_key(objset_t *os, uint64_t obj,
- uint64_t key, uint64_t value, dmu_tx_t *tx);
-int zap_lookup_int_key(objset_t *os, uint64_t obj,
- uint64_t key, uint64_t *valuep);
-
-int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
- dmu_tx_t *tx);
-
-struct zap;
-struct zap_leaf;
-typedef struct zap_cursor {
- /* This structure is opaque! */
- objset_t *zc_objset;
- struct zap *zc_zap;
- struct zap_leaf *zc_leaf;
- uint64_t zc_zapobj;
- uint64_t zc_serialized;
- uint64_t zc_hash;
- uint32_t zc_cd;
- boolean_t zc_prefetch;
-} zap_cursor_t;
-
-typedef struct {
- int za_integer_length;
- /*
- * za_normalization_conflict will be set if there are additional
- * entries with this normalized form (eg, "foo" and "Foo").
- */
- boolean_t za_normalization_conflict;
- uint64_t za_num_integers;
- uint64_t za_first_integer; /* no sign extension for <8byte ints */
- char za_name[ZAP_MAXNAMELEN];
-} zap_attribute_t;
-
-/*
- * The interface for listing all the attributes of a zapobj can be
- * thought of as cursor moving down a list of the attributes one by
- * one. The cookie returned by the zap_cursor_serialize routine is
- * persistent across system calls (and across reboot, even).
- */
-
-/*
- * Initialize a zap cursor, pointing to the "first" attribute of the
- * zapobj. You must _fini the cursor when you are done with it.
- */
-void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
-void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
- uint64_t zapobj);
-void zap_cursor_fini(zap_cursor_t *zc);
-
-/*
- * Get the attribute currently pointed to by the cursor. Returns
- * ENOENT if at the end of the attributes.
- */
-int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
-
-/*
- * Advance the cursor to the next attribute.
- */
-void zap_cursor_advance(zap_cursor_t *zc);
-
-/*
- * Get a persistent cookie pointing to the current position of the zap
- * cursor. The low 4 bits in the cookie are always zero, and thus can
- * be used as to differentiate a serialized cookie from a different type
- * of value. The cookie will be less than 2^32 as long as there are
- * fewer than 2^22 (4.2 million) entries in the zap object.
- */
-uint64_t zap_cursor_serialize(zap_cursor_t *zc);
-
-/*
- * Advance the cursor to the attribute having the given key.
- */
-int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
-
-/*
- * Initialize a zap cursor pointing to the position recorded by
- * zap_cursor_serialize (in the "serialized" argument). You can also
- * use a "serialized" argument of 0 to start at the beginning of the
- * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
- * zap_cursor_init(...).)
- */
-void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
- uint64_t zapobj, uint64_t serialized);
-
-
-#define ZAP_HISTOGRAM_SIZE 10
-
-typedef struct zap_stats {
- /*
- * Size of the pointer table (in number of entries).
- * This is always a power of 2, or zero if it's a microzap.
- * In general, it should be considerably greater than zs_num_leafs.
- */
- uint64_t zs_ptrtbl_len;
-
- uint64_t zs_blocksize; /* size of zap blocks */
-
- /*
- * The number of blocks used. Note that some blocks may be
- * wasted because old ptrtbl's and large name/value blocks are
- * not reused. (Although their space is reclaimed, we don't
- * reuse those offsets in the object.)
- */
- uint64_t zs_num_blocks;
-
- /*
- * Pointer table values from zap_ptrtbl in the zap_phys_t
- */
- uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
- uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */
- uint64_t zs_ptrtbl_zt_blk; /* starting block number */
- uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
- uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
-
- /*
- * Values of the other members of the zap_phys_t
- */
- uint64_t zs_block_type; /* ZBT_HEADER */
- uint64_t zs_magic; /* ZAP_MAGIC */
- uint64_t zs_num_leafs; /* The number of leaf blocks */
- uint64_t zs_num_entries; /* The number of zap entries */
- uint64_t zs_salt; /* salt to stir into hash function */
-
- /*
- * Histograms. For all histograms, the last index
- * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
- * than what can be represented. For example
- * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
- * of leafs with more than 45 entries.
- */
-
- /*
- * zs_leafs_with_n_pointers[n] is the number of leafs with
- * 2^n pointers to it.
- */
- uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_leafs_with_n_entries[n] is the number of leafs with
- * [n*5, (n+1)*5) entries. In the current implementation, there
- * can be at most 55 entries in any block, but there may be
- * fewer if the name or value is large, or the block is not
- * completely full.
- */
- uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_leafs_n_tenths_full[n] is the number of leafs whose
- * fullness is in the range [n/10, (n+1)/10).
- */
- uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_entries_using_n_chunks[n] is the number of entries which
- * consume n 24-byte chunks. (Note, large names/values only use
- * one chunk, but contribute to zs_num_blocks_large.)
- */
- uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_buckets_with_n_entries[n] is the number of buckets (each
- * leaf has 64 buckets) with n entries.
- * zs_buckets_with_n_entries[1] should be very close to
- * zs_num_entries.
- */
- uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
-} zap_stats_t;
-
-/*
- * Get statistics about a ZAP object. Note: you need to be aware of the
- * internal implementation of the ZAP to correctly interpret some of the
- * statistics. This interface shouldn't be relied on unless you really
- * know what you're doing.
- */
-int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -1,242 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-#ifndef _SYS_ZAP_IMPL_H
-#define _SYS_ZAP_IMPL_H
-
-#include <sys/zap.h>
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern int fzap_default_block_shift;
-
-#define ZAP_MAGIC 0x2F52AB2ABULL
-
-#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
-
-#define MZAP_ENT_LEN 64
-#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
-#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
-
-#define ZAP_NEED_CD (-1U)
-
-typedef struct mzap_ent_phys {
- uint64_t mze_value;
- uint32_t mze_cd;
- uint16_t mze_pad; /* in case we want to chain them someday */
- char mze_name[MZAP_NAME_LEN];
-} mzap_ent_phys_t;
-
-typedef struct mzap_phys {
- uint64_t mz_block_type; /* ZBT_MICRO */
- uint64_t mz_salt;
- uint64_t mz_normflags;
- uint64_t mz_pad[5];
- mzap_ent_phys_t mz_chunk[1];
- /* actually variable size depending on block size */
-} mzap_phys_t;
-
-typedef struct mzap_ent {
- avl_node_t mze_node;
- int mze_chunkid;
- uint64_t mze_hash;
- uint32_t mze_cd; /* copy from mze_phys->mze_cd */
-} mzap_ent_t;
-
-#define MZE_PHYS(zap, mze) \
- (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
-
-/*
- * The (fat) zap is stored in one object. It is an array of
- * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
- *
- * ptrtbl fits in first block:
- * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
- *
- * ptrtbl too big for first block:
- * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
- *
- */
-
-struct dmu_buf;
-struct zap_leaf;
-
-#define ZBT_LEAF ((1ULL << 63) + 0)
-#define ZBT_HEADER ((1ULL << 63) + 1)
-#define ZBT_MICRO ((1ULL << 63) + 3)
-/* any other values are ptrtbl blocks */
-
-/*
- * the embedded pointer table takes up half a block:
- * block size / entry size (2^3) / 2
- */
-#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
-
-/*
- * The embedded pointer table starts half-way through the block. Since
- * the pointer table itself is half the block, it starts at (64-bit)
- * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
- */
-#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
- ((uint64_t *)zap_f_phys(zap)) \
- [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
-
-/*
- * TAKE NOTE:
- * If zap_phys_t is modified, zap_byteswap() must be modified.
- */
-typedef struct zap_phys {
- uint64_t zap_block_type; /* ZBT_HEADER */
- uint64_t zap_magic; /* ZAP_MAGIC */
-
- struct zap_table_phys {
- uint64_t zt_blk; /* starting block number */
- uint64_t zt_numblks; /* number of blocks */
- uint64_t zt_shift; /* bits to index it */
- uint64_t zt_nextblk; /* next (larger) copy start block */
- uint64_t zt_blks_copied; /* number source blocks copied */
- } zap_ptrtbl;
-
- uint64_t zap_freeblk; /* the next free block */
- uint64_t zap_num_leafs; /* number of leafs */
- uint64_t zap_num_entries; /* number of entries */
- uint64_t zap_salt; /* salt to stir into hash function */
- uint64_t zap_normflags; /* flags for u8_textprep_str() */
- uint64_t zap_flags; /* zap_flags_t */
- /*
- * This structure is followed by padding, and then the embedded
- * pointer table. The embedded pointer table takes up second
- * half of the block. It is accessed using the
- * ZAP_EMBEDDED_PTRTBL_ENT() macro.
- */
-} zap_phys_t;
-
-typedef struct zap_table_phys zap_table_phys_t;
-
-typedef struct zap {
- dmu_buf_user_t zap_dbu;
- objset_t *zap_objset;
- uint64_t zap_object;
- struct dmu_buf *zap_dbuf;
- krwlock_t zap_rwlock;
- boolean_t zap_ismicro;
- int zap_normflags;
- uint64_t zap_salt;
- union {
- struct {
- /*
- * zap_num_entries_mtx protects
- * zap_num_entries
- */
- kmutex_t zap_num_entries_mtx;
- int zap_block_shift;
- } zap_fat;
- struct {
- int16_t zap_num_entries;
- int16_t zap_num_chunks;
- int16_t zap_alloc_next;
- avl_tree_t zap_avl;
- } zap_micro;
- } zap_u;
-} zap_t;
-
-inline zap_phys_t *
-zap_f_phys(zap_t *zap)
-{
- return (zap->zap_dbuf->db_data);
-}
-
-inline mzap_phys_t *
-zap_m_phys(zap_t *zap)
-{
- return (zap->zap_dbuf->db_data);
-}
-
-typedef struct zap_name {
- zap_t *zn_zap;
- int zn_key_intlen;
- const void *zn_key_orig;
- int zn_key_orig_numints;
- const void *zn_key_norm;
- int zn_key_norm_numints;
- uint64_t zn_hash;
- matchtype_t zn_matchtype;
- int zn_normflags;
- char zn_normbuf[ZAP_MAXNAMELEN];
-} zap_name_t;
-
-#define zap_f zap_u.zap_fat
-#define zap_m zap_u.zap_micro
-
-boolean_t zap_match(zap_name_t *zn, const char *matchname);
-int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
- krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
-void zap_unlockdir(zap_t *zap, void *tag);
-void zap_evict_sync(void *dbu);
-zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
-void zap_name_free(zap_name_t *zn);
-int zap_hashbits(zap_t *zap);
-uint32_t zap_maxcd(zap_t *zap);
-uint64_t zap_getflags(zap_t *zap);
-
-#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
-
-void fzap_byteswap(void *buf, size_t size);
-int fzap_count(zap_t *zap, uint64_t *count);
-int fzap_lookup(zap_name_t *zn,
- uint64_t integer_size, uint64_t num_integers, void *buf,
- char *realname, int rn_len, boolean_t *normalization_conflictp);
-void fzap_prefetch(zap_name_t *zn);
-int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
- const void *val, void *tag, dmu_tx_t *tx);
-int fzap_update(zap_name_t *zn,
- int integer_size, uint64_t num_integers, const void *val,
- void *tag, dmu_tx_t *tx);
-int fzap_length(zap_name_t *zn,
- uint64_t *integer_size, uint64_t *num_integers);
-int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
-int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
-void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
-void zap_put_leaf(struct zap_leaf *l);
-
-int fzap_add_cd(zap_name_t *zn,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, void *tag, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
-int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
@@ -1,248 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- */
-
-#ifndef _SYS_ZAP_LEAF_H
-#define _SYS_ZAP_LEAF_H
-
-#include <sys/zap.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct zap;
-struct zap_name;
-struct zap_stats;
-
-#define ZAP_LEAF_MAGIC 0x2AB1EAF
-
-/* chunk size = 24 bytes */
-#define ZAP_LEAF_CHUNKSIZE 24
-
-/*
- * The amount of space available for chunks is:
- * block size (1<<l->l_bs) - hash entry size (2) * number of hash
- * entries - header space (2*chunksize)
- */
-#define ZAP_LEAF_NUMCHUNKS(l) \
- (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
- ZAP_LEAF_CHUNKSIZE - 2)
-
-/*
- * The amount of space within the chunk available for the array is:
- * chunk size - space for type (1) - space for next pointer (2)
- */
-#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
-
-#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
- (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
-
-/*
- * Low water mark: when there are only this many chunks free, start
- * growing the ptrtbl. Ideally, this should be larger than a
- * "reasonably-sized" entry. 20 chunks is more than enough for the
- * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
- * while still being only around 3% for 16k blocks.
- */
-#define ZAP_LEAF_LOW_WATER (20)
-
-/*
- * The leaf hash table has block size / 2^5 (32) number of entries,
- * which should be more than enough for the maximum number of entries,
- * which is less than block size / CHUNKSIZE (24) / minimum number of
- * chunks per entry (3).
- */
-#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
-#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
-
-/*
- * The chunks start immediately after the hash table. The end of the
- * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
- * chunk_t.
- */
-#define ZAP_LEAF_CHUNK(l, idx) \
- ((zap_leaf_chunk_t *) \
- (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
-#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
-
-typedef enum zap_chunk_type {
- ZAP_CHUNK_FREE = 253,
- ZAP_CHUNK_ENTRY = 252,
- ZAP_CHUNK_ARRAY = 251,
- ZAP_CHUNK_TYPE_MAX = 250
-} zap_chunk_type_t;
-
-#define ZLF_ENTRIES_CDSORTED (1<<0)
-
-/*
- * TAKE NOTE:
- * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
- */
-typedef struct zap_leaf_phys {
- struct zap_leaf_header {
- /* Public to ZAP */
- uint64_t lh_block_type; /* ZBT_LEAF */
- uint64_t lh_pad1;
- uint64_t lh_prefix; /* hash prefix of this leaf */
- uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
- uint16_t lh_nfree; /* number free chunks */
- uint16_t lh_nentries; /* number of entries */
- uint16_t lh_prefix_len; /* num bits used to id this */
-
- /* Private to zap_leaf */
- uint16_t lh_freelist; /* chunk head of free list */
- uint8_t lh_flags; /* ZLF_* flags */
- uint8_t lh_pad2[11];
- } l_hdr; /* 2 24-byte chunks */
-
- /*
- * The header is followed by a hash table with
- * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
- * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
- * zap_leaf_chunk structures. These structures are accessed
- * with the ZAP_LEAF_CHUNK() macro.
- */
-
- uint16_t l_hash[1];
-} zap_leaf_phys_t;
-
-typedef union zap_leaf_chunk {
- struct zap_leaf_entry {
- uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
- uint8_t le_value_intlen; /* size of value's ints */
- uint16_t le_next; /* next entry in hash chain */
- uint16_t le_name_chunk; /* first chunk of the name */
- uint16_t le_name_numints; /* ints in name (incl null) */
- uint16_t le_value_chunk; /* first chunk of the value */
- uint16_t le_value_numints; /* value length in ints */
- uint32_t le_cd; /* collision differentiator */
- uint64_t le_hash; /* hash value of the name */
- } l_entry;
- struct zap_leaf_array {
- uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
- uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
- uint16_t la_next; /* next blk or CHAIN_END */
- } l_array;
- struct zap_leaf_free {
- uint8_t lf_type; /* always ZAP_CHUNK_FREE */
- uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
- uint16_t lf_next; /* next in free list, or CHAIN_END */
- } l_free;
-} zap_leaf_chunk_t;
-
-typedef struct zap_leaf {
- dmu_buf_user_t l_dbu;
- krwlock_t l_rwlock;
- uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
- int l_bs; /* block size shift */
- dmu_buf_t *l_dbuf;
-} zap_leaf_t;
-
-inline zap_leaf_phys_t *
-zap_leaf_phys(zap_leaf_t *l)
-{
- return (l->l_dbuf->db_data);
-}
-
-typedef struct zap_entry_handle {
- /* Set by zap_leaf and public to ZAP */
- uint64_t zeh_num_integers;
- uint64_t zeh_hash;
- uint32_t zeh_cd;
- uint8_t zeh_integer_size;
-
- /* Private to zap_leaf */
- uint16_t zeh_fakechunk;
- uint16_t *zeh_chunkp;
- zap_leaf_t *zeh_leaf;
-} zap_entry_handle_t;
-
-/*
- * Return a handle to the named entry, or ENOENT if not found. The hash
- * value must equal zap_hash(name).
- */
-extern int zap_leaf_lookup(zap_leaf_t *l,
- struct zap_name *zn, zap_entry_handle_t *zeh);
-
-/*
- * Return a handle to the entry with this hash+cd, or the entry with the
- * next closest hash+cd.
- */
-extern int zap_leaf_lookup_closest(zap_leaf_t *l,
- uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
-
-/*
- * Read the first num_integers in the attribute. Integer size
- * conversion will be done without sign extension. Return EINVAL if
- * integer_size is too small. Return EOVERFLOW if there are more than
- * num_integers in the attribute.
- */
-extern int zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf);
-
-extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
- uint16_t buflen, char *buf);
-
-/*
- * Replace the value of an existing entry.
- *
- * May fail if it runs out of space (ENOSPC).
- */
-extern int zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf);
-
-/*
- * Remove an entry.
- */
-extern void zap_entry_remove(zap_entry_handle_t *zeh);
-
-/*
- * Create an entry. An equal entry must not exist, and this entry must
- * belong in this leaf (according to its hash value). Fills in the
- * entry handle on success. Returns 0 on success or ENOSPC on failure.
- */
-extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh);
-
-/* Determine whether there is another entry with the same normalized form. */
-extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
- struct zap_name *zn, const char *name, struct zap *zap);
-
-/*
- * Other stuff.
- */
-
-extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
-extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
-extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
-extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
- struct zap_stats *zs);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_LEAF_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h
@@ -1,185 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZCP_H
-#define _SYS_ZCP_H
-
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-
-#include "lua.h"
-#include "lualib.h"
-#include "lauxlib.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZCP_RUN_INFO_KEY "runinfo"
-
-extern uint64_t zfs_lua_max_instrlimit;
-extern uint64_t zfs_lua_max_memlimit;
-
-int zcp_argerror(lua_State *, int, const char *, ...);
-
-int zcp_eval(const char *, const char *, boolean_t, uint64_t, uint64_t,
- nvpair_t *, nvlist_t *);
-
-int zcp_load_list_lib(lua_State *);
-
-int zcp_load_synctask_lib(lua_State *, boolean_t);
-
-typedef void (zcp_cleanup_t)(void *);
-typedef struct zcp_cleanup_handler {
- zcp_cleanup_t *zch_cleanup_func;
- void *zch_cleanup_arg;
- list_node_t zch_node;
-} zcp_cleanup_handler_t;
-
-typedef struct zcp_alloc_arg {
- boolean_t aa_must_succeed;
- int64_t aa_alloc_remaining;
- int64_t aa_alloc_limit;
-} zcp_alloc_arg_t;
-
-typedef struct zcp_run_info {
- dsl_pool_t *zri_pool;
-
- /*
- * An estimate of the total amount of space consumed by all
- * synctasks we have successfully performed so far in this
- * channel program. Used to generate ENOSPC errors for syncfuncs.
- */
- int zri_space_used;
-
- /*
- * The credentials of the thread which originally invoked the channel
- * program. Since channel programs are always invoked from the synctask
- * thread they should always do permissions checks against this cred
- * rather than the 'current' thread's.
- */
- cred_t *zri_cred;
-
- /*
- * The tx in which this channel program is running.
- */
- dmu_tx_t *zri_tx;
-
- /*
- * The maximum number of Lua instructions the channel program is allowed
- * to execute. If it takes longer than this it will time out. A value
- * of 0 indicates no instruction limit.
- */
- uint64_t zri_maxinstrs;
-
- /*
- * The number of Lua instructions the channel program has executed.
- */
- uint64_t zri_curinstrs;
-
- /*
- * Boolean indicating whether or not the channel program exited
- * because it timed out.
- */
- boolean_t zri_timed_out;
-
- /*
- * Channel program was canceled by user
- */
- boolean_t zri_canceled;
-
- /*
- * Boolean indicating whether or not we are running in syncing
- * context.
- */
- boolean_t zri_sync;
-
- /*
- * List of currently registered cleanup handlers, which will be
- * triggered in the event of a fatal error.
- */
- list_t zri_cleanup_handlers;
-
- /*
- * The Lua state context of our channel program.
- */
- lua_State *zri_state;
-
- /*
- * Lua memory allocator arguments.
- */
- zcp_alloc_arg_t *zri_allocargs;
-
- /*
- * Contains output values from zcp script or error string.
- */
- nvlist_t *zri_outnvl;
-
- /*
- * The errno number returned to caller of zcp_eval().
- */
- int zri_result;
-} zcp_run_info_t;
-
-zcp_run_info_t *zcp_run_info(lua_State *);
-zcp_cleanup_handler_t *zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *);
-void zcp_deregister_cleanup(lua_State *, zcp_cleanup_handler_t *);
-void zcp_cleanup(lua_State *);
-
-/*
- * Argument parsing routines for channel program callback functions.
- */
-typedef struct zcp_arg {
- /*
- * The name of this argument. For keyword arguments this is the name
- * functions will use to set the argument. For positional arguments
- * the name has no programatic meaning, but will appear in error
- * messages and help output.
- */
- const char *za_name;
-
- /*
- * The Lua type this argument should have (e.g. LUA_TSTRING,
- * LUA_TBOOLEAN) see the lua_type() function documentation for a
- * complete list. Calling a function with an argument that does
- * not match the expected type will result in the program terminating.
- */
- const int za_lua_type;
-} zcp_arg_t;
-
-void zcp_parse_args(lua_State *, const char *, const zcp_arg_t *,
- const zcp_arg_t *);
-int zcp_nvlist_to_lua(lua_State *, nvlist_t *, char *, int);
-int zcp_dataset_hold_error(lua_State *, dsl_pool_t *, const char *, int);
-struct dsl_dataset *zcp_dataset_hold(lua_State *, dsl_pool_t *,
- const char *, void *);
-
-typedef int (zcp_lib_func_t)(lua_State *);
-typedef struct zcp_lib_info {
- const char *name;
- zcp_lib_func_t *func;
- const zcp_arg_t pargs[4];
- const zcp_arg_t kwargs[2];
-} zcp_lib_info_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZCP_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h
@@ -1,35 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZCP_GLOBALS_H
-#define _SYS_ZCP_GLOBALS_H
-
-#include "lua.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void zcp_load_globals(lua_State *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZCP_GLOBALS_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h
@@ -1,41 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZCP_LIST_H
-#define _SYS_ZCP_LIST_H
-
-#include "lua.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void zcp_load_list_funcs(lua_State *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZCP_LIST_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h
@@ -1,34 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZCP_PROP_H
-#define _SYS_ZCP_PROP_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int zcp_load_get_lib(lua_State *state);
-boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZCP_PROP_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
@@ -1,73 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZFEATURE_H
-#define _SYS_ZFEATURE_H
-
-#include <sys/nvpair.h>
-#include <sys/txg.h>
-#include "zfeature_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES)
-#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \
- VALID_FEATURE_FID(fid))
-
-struct spa;
-struct dmu_tx;
-struct objset;
-
-extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *);
-extern void spa_feature_enable(struct spa *, spa_feature_t,
- struct dmu_tx *);
-extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *);
-extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *);
-extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t);
-extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t);
-extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid,
- uint64_t *txg);
-extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t);
-extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *);
-
-/*
- * These functions are only exported for zhack and zdb; normal callers should
- * use the above interfaces.
- */
-extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *);
-extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
- uint64_t *res);
-extern void feature_enable_sync(struct spa *, zfeature_info_t *,
- struct dmu_tx *);
-extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t,
- struct dmu_tx *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFEATURE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
@@ -1,248 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _SYS_FS_ZFS_ACL_H
-#define _SYS_FS_ZFS_ACL_H
-
-#ifdef _KERNEL
-#include <sys/cred.h>
-#endif
-#include <sys/acl.h>
-#include <sys/dmu.h>
-#include <sys/zfs_fuid.h>
-#include <sys/sa.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct znode_phys;
-
-#define ACE_SLOT_CNT 6
-#define ZFS_ACL_VERSION_INITIAL 0ULL
-#define ZFS_ACL_VERSION_FUID 1ULL
-#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID
-
-/*
- * ZFS ACLs (Access Control Lists) are stored in various forms.
- *
- * Files created with ACL version ZFS_ACL_VERSION_INITIAL
- * will all be created with fixed length ACEs of type
- * zfs_oldace_t.
- *
- * Files with ACL version ZFS_ACL_VERSION_FUID will be created
- * with various sized ACEs. The abstraction entries will utilize
- * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t
- * and some specialized CIFS ACEs will use zfs_object_ace_t.
- */
-
-/*
- * All ACEs have a common hdr. For
- * owner@, group@, and everyone@ this is all
- * thats needed.
- */
-typedef struct zfs_ace_hdr {
- uint16_t z_type;
- uint16_t z_flags;
- uint32_t z_access_mask;
-} zfs_ace_hdr_t;
-
-typedef zfs_ace_hdr_t zfs_ace_abstract_t;
-
-/*
- * Standard ACE
- */
-typedef struct zfs_ace {
- zfs_ace_hdr_t z_hdr;
- uint64_t z_fuid;
-} zfs_ace_t;
-
-/*
- * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE
- * and will only be set/retrieved in a CIFS context.
- */
-
-typedef struct zfs_object_ace {
- zfs_ace_t z_ace;
- uint8_t z_object_type[16]; /* object type */
- uint8_t z_inherit_type[16]; /* inherited object type */
-} zfs_object_ace_t;
-
-typedef struct zfs_oldace {
- uint32_t z_fuid; /* "who" */
- uint32_t z_access_mask; /* access mask */
- uint16_t z_flags; /* flags, i.e inheritance */
- uint16_t z_type; /* type of entry allow/deny */
-} zfs_oldace_t;
-
-typedef struct zfs_acl_phys_v0 {
- uint64_t z_acl_extern_obj; /* ext acl pieces */
- uint32_t z_acl_count; /* Number of ACEs */
- uint16_t z_acl_version; /* acl version */
- uint16_t z_acl_pad; /* pad */
- zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
-} zfs_acl_phys_v0_t;
-
-#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
-
-/*
- * Size of ACL count is always 2 bytes.
- * Necessary to for dealing with both V0 ACL and V1 ACL layout
- */
-#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t))
-
-typedef struct zfs_acl_phys {
- uint64_t z_acl_extern_obj; /* ext acl pieces */
- uint32_t z_acl_size; /* Number of bytes in ACL */
- uint16_t z_acl_version; /* acl version */
- uint16_t z_acl_count; /* ace count */
- uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
-} zfs_acl_phys_t;
-
-typedef struct acl_ops {
- uint32_t (*ace_mask_get) (void *acep); /* get access mask */
- void (*ace_mask_set) (void *acep,
- uint32_t mask); /* set access mask */
- uint16_t (*ace_flags_get) (void *acep); /* get flags */
- void (*ace_flags_set) (void *acep,
- uint16_t flags); /* set flags */
- uint16_t (*ace_type_get)(void *acep); /* get type */
- void (*ace_type_set)(void *acep,
- uint16_t type); /* set type */
- uint64_t (*ace_who_get)(void *acep); /* get who/fuid */
- void (*ace_who_set)(void *acep,
- uint64_t who); /* set who/fuid */
- size_t (*ace_size)(void *acep); /* how big is this ace */
- size_t (*ace_abstract_size)(void); /* sizeof abstract entry */
- int (*ace_mask_off)(void); /* off of access mask in ace */
- /* ptr to data if any */
- int (*ace_data)(void *acep, void **datap);
-} acl_ops_t;
-
-/*
- * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's.
- * Each node will have one or more ACEs associated with it. You will
- * only have multiple nodes during a chmod operation. Normally only
- * one node is required.
- */
-typedef struct zfs_acl_node {
- list_node_t z_next; /* Next chunk of ACEs */
- void *z_acldata; /* pointer into actual ACE(s) */
- void *z_allocdata; /* pointer to kmem allocated memory */
- size_t z_allocsize; /* Size of blob in bytes */
- size_t z_size; /* length of ACL data */
- uint64_t z_ace_count; /* number of ACEs in this acl node */
- int z_ace_idx; /* ace iterator positioned on */
-} zfs_acl_node_t;
-
-typedef struct zfs_acl {
- uint64_t z_acl_count; /* Number of ACEs */
- size_t z_acl_bytes; /* Number of bytes in ACL */
- uint_t z_version; /* version of ACL */
- void *z_next_ace; /* pointer to next ACE */
- uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
- zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
- list_t z_acl; /* chunks of ACE data */
- acl_ops_t z_ops; /* ACL operations */
-} zfs_acl_t;
-
-typedef struct acl_locator_cb {
- zfs_acl_t *cb_aclp;
- zfs_acl_node_t *cb_acl_node;
-} zfs_acl_locator_cb_t;
-
-#define ACL_DATA_ALLOCED 0x1
-#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
-
-struct zfs_fuid_info;
-
-typedef struct zfs_acl_ids {
- uint64_t z_fuid; /* file owner fuid */
- uint64_t z_fgid; /* file group owner fuid */
- uint64_t z_mode; /* mode to set on create */
- zfs_acl_t *z_aclp; /* ACL to create with file */
- struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
-} zfs_acl_ids_t;
-
-/*
- * Property values for acl_mode and acl_inherit.
- *
- * acl_mode can take discard, noallow, groupmask and passthrough.
- * whereas acl_inherit has secure instead of groupmask.
- */
-
-#define ZFS_ACL_DISCARD 0
-#define ZFS_ACL_NOALLOW 1
-#define ZFS_ACL_GROUPMASK 2
-#define ZFS_ACL_PASSTHROUGH 3
-#define ZFS_ACL_RESTRICTED 4
-#define ZFS_ACL_PASSTHROUGH_X 5
-
-struct znode;
-struct zfsvfs;
-
-#ifdef _KERNEL
-int zfs_acl_ids_create(struct znode *, int, vattr_t *,
- cred_t *, vsecattr_t *, zfs_acl_ids_t *);
-void zfs_acl_ids_free(zfs_acl_ids_t *);
-boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *);
-int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
-int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
-void zfs_acl_rele(void *);
-void zfs_oldace_byteswap(ace_t *, int);
-void zfs_ace_byteswap(void *, size_t, boolean_t);
-extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
-extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
-#ifdef illumos
-int zfs_fastaccesschk_execute(struct znode *, cred_t *);
-#endif
-int zfs_freebsd_fastaccesschk_execute(struct vnode *, cred_t *);
-extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
-extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
-extern int zfs_acl_access(struct znode *, int, cred_t *);
-int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
-int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
-int zfs_zaccess_rename(struct znode *, struct znode *,
- struct znode *, struct znode *, cred_t *cr);
-void zfs_acl_free(zfs_acl_t *);
-int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
- struct zfs_fuid_info **, zfs_acl_t **);
-int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
-uint64_t zfs_external_acl(struct znode *);
-int zfs_znode_acl_version(struct znode *);
-int zfs_acl_size(struct znode *, int *);
-zfs_acl_t *zfs_acl_alloc(int);
-zfs_acl_node_t *zfs_acl_node_alloc(size_t);
-void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
-void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
-uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
- uint64_t *, uint64_t, uint64_t);
-int zfs_acl_chown_setattr(struct znode *);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-#endif /* _SYS_FS_ZFS_ACL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -1,146 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZFS_CONTEXT_H
-#define _SYS_ZFS_CONTEXT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/param.h>
-#include <sys/stdint.h>
-#include <sys/note.h>
-#include <sys/kernel.h>
-#include <sys/debug.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/sysmacros.h>
-#include <sys/bitmap.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/taskqueue.h>
-#include <sys/systm.h>
-#include <sys/conf.h>
-#include <sys/mutex.h>
-#include <sys/rwlock.h>
-#include <sys/kcondvar.h>
-#include <sys/random.h>
-#include <sys/byteorder.h>
-#include <sys/systm.h>
-#include <sys/list.h>
-#include <sys/zfs_debug.h>
-#include <sys/sysevent.h>
-#include <sys/uio.h>
-#include <sys/dirent.h>
-#include <sys/time.h>
-#include <sys/uio.h>
-#include <sys/fcntl.h>
-#include <sys/limits.h>
-#include <sys/string.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/cred.h>
-#include <sys/sdt.h>
-#include <sys/file.h>
-#include <sys/vfs.h>
-#include <sys/sysctl.h>
-#include <sys/sbuf.h>
-#include <sys/priv.h>
-#include <sys/kdb.h>
-#include <sys/ktr.h>
-#include <sys/stack.h>
-#include <sys/lockf.h>
-#include <sys/pathname.h>
-#include <sys/policy.h>
-#include <sys/refstr.h>
-#include <sys/zone.h>
-#include <sys/eventhandler.h>
-#include <sys/extattr.h>
-#include <sys/misc.h>
-#include <sys/sig.h>
-#include <sys/osd.h>
-#include <sys/sysevent/dev.h>
-#include <sys/sysevent/eventdefs.h>
-#include <sys/u8_textprep.h>
-#include <sys/fm/util.h>
-#include <sys/sunddi.h>
-#ifdef illumos
-#include <sys/cyclic.h>
-#endif
-#include <sys/callo.h>
-#include <sys/disp.h>
-#include <machine/_inttypes.h>
-#include <machine/stdarg.h>
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_map.h>
-#include <vm/vm_extern.h>
-#include <vm/vnode_pager.h>
-
-#define boot_ncpus (mp_ncpus)
-
-#define CPU_SEQID (curcpu)
-
-#define tsd_create(keyp, destructor) do { \
- *(keyp) = osd_thread_register((destructor)); \
- KASSERT(*(keyp) > 0, ("cannot register OSD")); \
-} while (0)
-#define tsd_destroy(keyp) osd_thread_deregister(*(keyp))
-#define tsd_get(key) osd_thread_get(curthread, (key))
-#define tsd_set(key, value) osd_thread_set(curthread, (key), (value))
-
-#ifdef __cplusplus
-}
-#endif
-
-extern int zfs_debug_level;
-extern struct mtx zfs_debug_mtx;
-#define ZFS_LOG(lvl, ...) do { \
- if (((lvl) & 0xff) <= zfs_debug_level) { \
- mtx_lock(&zfs_debug_mtx); \
- printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \
- printf(__VA_ARGS__); \
- printf("\n"); \
- if ((lvl) & 0x100) \
- kdb_backtrace(); \
- mtx_unlock(&zfs_debug_mtx); \
- } \
-} while (0)
-
-#define sys_shutdown rebooting
-
-#define noinline __attribute__((noinline))
-#define likely(x) __builtin_expect((x), 1)
-
-#endif /* _SYS_ZFS_CONTEXT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -1,65 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _ZFS_CTLDIR_H
-#define _ZFS_CTLDIR_H
-
-#include <sys/vnode.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_znode.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZFS_CTLDIR_NAME ".zfs"
-
-#define zfs_has_ctldir(zdp) \
- ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
- ((zdp)->z_zfsvfs->z_ctldir != NULL))
-#define zfs_show_ctldir(zdp) \
- (zfs_has_ctldir(zdp) && \
- ((zdp)->z_zfsvfs->z_show_ctldir))
-
-void zfsctl_create(zfsvfs_t *);
-void zfsctl_destroy(zfsvfs_t *);
-int zfsctl_root(zfsvfs_t *, int, vnode_t **);
-void zfsctl_init(void);
-void zfsctl_fini(void);
-boolean_t zfsctl_is_node(vnode_t *);
-
-int zfsctl_rename_snapshot(const char *from, const char *to);
-int zfsctl_destroy_snapshot(const char *snapname, int force);
-int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
-
-int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
-
-#define ZFSCTL_INO_ROOT 0x1
-#define ZFSCTL_INO_SNAPDIR 0x2
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_CTLDIR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
@@ -1,99 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZFS_DEBUG_H
-#define _SYS_ZFS_DEBUG_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef TRUE
-#define TRUE 1
-#endif
-
-#ifndef FALSE
-#define FALSE 0
-#endif
-
-/*
- * ZFS debugging
- */
-
-#if defined(DEBUG) || !defined(_KERNEL)
-#if !defined(ZFS_DEBUG)
-#define ZFS_DEBUG
-#endif
-#endif
-
-extern int zfs_flags;
-extern boolean_t zfs_recover;
-extern boolean_t zfs_free_leak_on_eio;
-
-#define ZFS_DEBUG_DPRINTF (1 << 0)
-#define ZFS_DEBUG_DBUF_VERIFY (1 << 1)
-#define ZFS_DEBUG_DNODE_VERIFY (1 << 2)
-#define ZFS_DEBUG_SNAPNAMES (1 << 3)
-#define ZFS_DEBUG_MODIFY (1 << 4)
-/* 1<<5 was previously used, try not to reuse */
-#define ZFS_DEBUG_ZIO_FREE (1 << 6)
-#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7)
-#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
-#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9)
-
-#ifdef ZFS_DEBUG
-extern void __dprintf(const char *file, const char *func,
- int line, const char *fmt, ...);
-#define dprintf(...) \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) \
- __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
-#else
-#define dprintf(...) ((void)0)
-#endif /* ZFS_DEBUG */
-
-extern void zfs_panic_recover(const char *fmt, ...);
-
-typedef struct zfs_dbgmsg {
- list_node_t zdm_node;
- time_t zdm_timestamp;
- char zdm_msg[1]; /* variable length allocation */
-} zfs_dbgmsg_t;
-
-extern void zfs_dbgmsg_init(void);
-extern void zfs_dbgmsg_fini(void);
-extern void zfs_dbgmsg(const char *fmt, ...);
-extern void zfs_dbgmsg_print(const char *tag);
-
-#ifdef illumos
-#ifndef _KERNEL
-extern int dprintf_find_string(const char *string);
-#endif
-#endif /* illumos */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_DEBUG_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -1,74 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_DIR_H
-#define _SYS_FS_ZFS_DIR_H
-
-#include <sys/pathname.h>
-#include <sys/dmu.h>
-#include <sys/zfs_znode.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* zfs_dirent_lock() flags */
-#define ZNEW 0x0001 /* entry should not exist */
-#define ZEXISTS 0x0002 /* entry should exist */
-#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
-#define ZXATTR 0x0008 /* we want the xattr dir */
-#define ZRENAMING 0x0010 /* znode is being renamed */
-#define ZCILOOK 0x0020 /* case-insensitive lookup requested */
-#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */
-#define ZHAVELOCK 0x0080 /* z_name_lock is already held */
-
-/* mknode flags */
-#define IS_ROOT_NODE 0x01 /* create a root node */
-#define IS_XATTR 0x02 /* create an extended attribute node */
-
-extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
-extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
- boolean_t *);
-#if 0
-extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
-#else
-extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
-#endif
-extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
- uint_t, znode_t **, zfs_acl_ids_t *);
-extern void zfs_rmnode(znode_t *);
-extern boolean_t zfs_dirempty(znode_t *);
-extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
-extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
-extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
-extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int);
-extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_DIR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
@@ -1,132 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_FUID_H
-#define _SYS_FS_ZFS_FUID_H
-
-#include <sys/types.h>
-#ifdef _KERNEL
-#include <sys/kidmap.h>
-#include <sys/dmu.h>
-#include <sys/zfs_vfsops.h>
-#endif
-#include <sys/avl.h>
-#include <sys/list.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
- ZFS_OWNER,
- ZFS_GROUP,
- ZFS_ACE_USER,
- ZFS_ACE_GROUP
-} zfs_fuid_type_t;
-
-/*
- * Estimate space needed for one more fuid table entry.
- * for now assume its current size + 1K
- */
-#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
-
-#define FUID_INDEX(x) ((x) >> 32)
-#define FUID_RID(x) ((x) & 0xffffffff)
-#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))
-/*
- * FUIDs cause problems for the intent log
- * we need to replay the creation of the FUID,
- * but we can't count on the idmapper to be around
- * and during replay the FUID index may be different than
- * before. Also, if an ACL has 100 ACEs and 12 different
- * domains we don't want to log 100 domain strings, but rather
- * just the unique 12.
- */
-
-/*
- * The FUIDs in the log will index into
- * domain string table and the bottom half will be the rid.
- * Used for mapping ephemeral uid/gid during ACL setting to FUIDs
- */
-typedef struct zfs_fuid {
- list_node_t z_next;
- uint64_t z_id; /* uid/gid being converted to fuid */
- uint64_t z_domidx; /* index in AVL domain table */
- uint64_t z_logfuid; /* index for domain in log */
-} zfs_fuid_t;
-
-/* list of unique domains */
-typedef struct zfs_fuid_domain {
- list_node_t z_next;
- uint64_t z_domidx; /* AVL tree idx */
- const char *z_domain; /* domain string */
-} zfs_fuid_domain_t;
-
-/*
- * FUID information necessary for logging create, setattr, and setacl.
- */
-typedef struct zfs_fuid_info {
- list_t z_fuids;
- list_t z_domains;
- uint64_t z_fuid_owner;
- uint64_t z_fuid_group;
- char **z_domain_table; /* Used during replay */
- uint32_t z_fuid_cnt; /* How many fuids in z_fuids */
- uint32_t z_domain_cnt; /* How many domains */
- size_t z_domain_str_sz; /* len of domain strings z_domain list */
-} zfs_fuid_info_t;
-
-#ifdef _KERNEL
-struct znode;
-extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
-extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
- uint64_t, uint64_t, zfs_fuid_type_t);
-extern void zfs_fuid_destroy(zfsvfs_t *);
-extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
- cred_t *, zfs_fuid_info_t **);
-extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t,
- zfs_fuid_info_t **);
-extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr,
- uid_t *uid, uid_t *gid);
-extern zfs_fuid_info_t *zfs_fuid_info_alloc(void);
-extern void zfs_fuid_info_free(zfs_fuid_info_t *);
-extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *);
-void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *);
-extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
- char **retdomain, boolean_t addok);
-extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
-extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
-#endif
-
-char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
-void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *);
-uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *);
-void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_FUID_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -1,466 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright 2016 RackTop Systems.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#ifndef _SYS_ZFS_IOCTL_H
-#define _SYS_ZFS_IOCTL_H
-
-#include <sys/cred.h>
-#include <sys/dmu.h>
-#include <sys/zio.h>
-#include <sys/dsl_deleg.h>
-#include <sys/spa.h>
-#include <sys/zfs_stat.h>
-
-#ifdef _KERNEL
-#include <sys/nvpair.h>
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The structures in this file are passed between userland and the
- * kernel. Userland may be running a 32-bit process, while the kernel
- * is 64-bit. Therefore, these structures need to compile the same in
- * 32-bit and 64-bit. This means not using type "long", and adding
- * explicit padding so that the 32-bit structure will not be packed more
- * tightly than the 64-bit structure (which requires 64-bit alignment).
- */
-
-/*
- * Property values for snapdir
- */
-#define ZFS_SNAPDIR_HIDDEN 0
-#define ZFS_SNAPDIR_VISIBLE 1
-
-/*
- * Field manipulation macros for the drr_versioninfo field of the
- * send stream header.
- */
-
-/*
- * Header types for zfs send streams.
- */
-typedef enum drr_headertype {
- DMU_SUBSTREAM = 0x1,
- DMU_COMPOUNDSTREAM = 0x2
-} drr_headertype_t;
-
-#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2)
-#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x)
-
-#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30)
-#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x)
-
-/*
- * Feature flags for zfs send streams (flags in drr_versioninfo)
- */
-
-#define DMU_BACKUP_FEATURE_DEDUP (1 << 0)
-#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1)
-#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2)
-/* flags #3 - #15 are reserved for incompatible closed-source implementations */
-#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16)
-#define DMU_BACKUP_FEATURE_LZ4 (1 << 17)
-/* flag #18 is reserved for a Delphix feature */
-#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
-#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
-/* flag #21 is reserved for a Delphix feature */
-#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
-#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23)
-/* flag #24 is reserved for the raw send feature */
-/* flag #25 is reserved for the ZSTD compression feature */
-
-/*
- * Mask of all supported backup features
- */
-#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
- DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
- DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \
- DMU_BACKUP_FEATURE_RESUMING | \
- DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \
- DMU_BACKUP_FEATURE_COMPRESSED)
-
-/* Are all features in the given flag word currently supported? */
-#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
-
-typedef enum dmu_send_resume_token_version {
- ZFS_SEND_RESUME_TOKEN_VERSION = 1
-} dmu_send_resume_token_version_t;
-
-/*
- * The drr_versioninfo field of the dmu_replay_record has the
- * following layout:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * | reserved | feature-flags |C|S|
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * The low order two bits indicate the header type: SUBSTREAM (0x1)
- * or COMPOUNDSTREAM (0x2). Using two bits for this is historical:
- * this field used to be a version number, where the two version types
- * were 1 and 2. Using two bits for this allows earlier versions of
- * the code to be able to recognize send streams that don't use any
- * of the features indicated by feature flags.
- */
-
-#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
-
-/*
- * Send stream flags. Bits 24-31 are reserved for vendor-specific
- * implementations and should not be used.
- */
-#define DRR_FLAG_CLONE (1<<0)
-#define DRR_FLAG_CI_DATA (1<<1)
-/*
- * This send stream, if it is a full send, includes the FREE and FREEOBJECT
- * records that are created by the sending process. This means that the send
- * stream can be received as a clone, even though it is not an incremental.
- * This is not implemented as a feature flag, because the receiving side does
- * not need to have implemented it to receive this stream; it is fully backwards
- * compatible. We need a flag, though, because full send streams without it
- * cannot necessarily be received as a clone correctly.
- */
-#define DRR_FLAG_FREERECORDS (1<<2)
-
-/*
- * flags in the drr_checksumflags field in the DRR_WRITE and
- * DRR_WRITE_BYREF blocks
- */
-#define DRR_CHECKSUM_DEDUP (1<<0)
-
-#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
-
-/* deal with compressed drr_write replay records */
-#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0)
-#define DRR_WRITE_PAYLOAD_SIZE(drrw) \
- (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \
- (drrw)->drr_logical_size)
-
-typedef struct dmu_replay_record {
- enum {
- DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
- DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
- DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
- } drr_type;
- uint32_t drr_payloadlen;
- union {
- struct drr_begin {
- uint64_t drr_magic;
- uint64_t drr_versioninfo; /* was drr_version */
- uint64_t drr_creation_time;
- dmu_objset_type_t drr_type;
- uint32_t drr_flags;
- uint64_t drr_toguid;
- uint64_t drr_fromguid;
- char drr_toname[MAXNAMELEN];
- } drr_begin;
- struct drr_end {
- zio_cksum_t drr_checksum;
- uint64_t drr_toguid;
- } drr_end;
- struct drr_object {
- uint64_t drr_object;
- dmu_object_type_t drr_type;
- dmu_object_type_t drr_bonustype;
- uint32_t drr_blksz;
- uint32_t drr_bonuslen;
- uint8_t drr_checksumtype;
- uint8_t drr_compress;
- uint8_t drr_dn_slots;
- uint8_t drr_pad[5];
- uint64_t drr_toguid;
- /* bonus content follows */
- } drr_object;
- struct drr_freeobjects {
- uint64_t drr_firstobj;
- uint64_t drr_numobjs;
- uint64_t drr_toguid;
- } drr_freeobjects;
- struct drr_write {
- uint64_t drr_object;
- dmu_object_type_t drr_type;
- uint32_t drr_pad;
- uint64_t drr_offset;
- uint64_t drr_logical_size;
- uint64_t drr_toguid;
- uint8_t drr_checksumtype;
- uint8_t drr_checksumflags;
- uint8_t drr_compressiontype;
- uint8_t drr_pad2[5];
- /* deduplication key */
- ddt_key_t drr_key;
- /* only nonzero if drr_compressiontype is not 0 */
- uint64_t drr_compressed_size;
- /* content follows */
- } drr_write;
- struct drr_free {
- uint64_t drr_object;
- uint64_t drr_offset;
- uint64_t drr_length;
- uint64_t drr_toguid;
- } drr_free;
- struct drr_write_byref {
- /* where to put the data */
- uint64_t drr_object;
- uint64_t drr_offset;
- uint64_t drr_length;
- uint64_t drr_toguid;
- /* where to find the prior copy of the data */
- uint64_t drr_refguid;
- uint64_t drr_refobject;
- uint64_t drr_refoffset;
- /* properties of the data */
- uint8_t drr_checksumtype;
- uint8_t drr_checksumflags;
- uint8_t drr_pad2[6];
- ddt_key_t drr_key; /* deduplication key */
- } drr_write_byref;
- struct drr_spill {
- uint64_t drr_object;
- uint64_t drr_length;
- uint64_t drr_toguid;
- uint64_t drr_pad[4]; /* needed for crypto */
- /* spill data follows */
- } drr_spill;
- struct drr_write_embedded {
- uint64_t drr_object;
- uint64_t drr_offset;
- /* logical length, should equal blocksize */
- uint64_t drr_length;
- uint64_t drr_toguid;
- uint8_t drr_compression;
- uint8_t drr_etype;
- uint8_t drr_pad[6];
- uint32_t drr_lsize; /* uncompressed size of payload */
- uint32_t drr_psize; /* compr. (real) size of payload */
- /* (possibly compressed) content follows */
- } drr_write_embedded;
-
- /*
- * Nore: drr_checksum is overlaid with all record types
- * except DRR_BEGIN. Therefore its (non-pad) members
- * must not overlap with members from the other structs.
- * We accomplish this by putting its members at the very
- * end of the struct.
- */
- struct drr_checksum {
- uint64_t drr_pad[34];
- /*
- * fletcher-4 checksum of everything preceding the
- * checksum.
- */
- zio_cksum_t drr_checksum;
- } drr_checksum;
- } drr_u;
-} dmu_replay_record_t;
-
-/* diff record range types */
-typedef enum diff_type {
- DDR_NONE = 0x1,
- DDR_INUSE = 0x2,
- DDR_FREE = 0x4
-} diff_type_t;
-
-/*
- * The diff reports back ranges of free or in-use objects.
- */
-typedef struct dmu_diff_record {
- uint64_t ddr_type;
- uint64_t ddr_first;
- uint64_t ddr_last;
-} dmu_diff_record_t;
-
-typedef struct zinject_record {
- uint64_t zi_objset;
- uint64_t zi_object;
- uint64_t zi_start;
- uint64_t zi_end;
- uint64_t zi_guid;
- uint32_t zi_level;
- uint32_t zi_error;
- uint64_t zi_type;
- uint32_t zi_freq;
- uint32_t zi_failfast;
- char zi_func[MAXNAMELEN];
- uint32_t zi_iotype;
- int32_t zi_duration;
- uint64_t zi_timer;
- uint64_t zi_nlanes;
- uint32_t zi_cmd;
- uint32_t zi_pad;
-} zinject_record_t;
-
-#define ZINJECT_NULL 0x1
-#define ZINJECT_FLUSH_ARC 0x2
-#define ZINJECT_UNLOAD_SPA 0x4
-
-typedef enum zinject_type {
- ZINJECT_UNINITIALIZED,
- ZINJECT_DATA_FAULT,
- ZINJECT_DEVICE_FAULT,
- ZINJECT_LABEL_FAULT,
- ZINJECT_IGNORED_WRITES,
- ZINJECT_PANIC,
- ZINJECT_DELAY_IO,
-} zinject_type_t;
-
-typedef struct zfs_share {
- uint64_t z_exportdata;
- uint64_t z_sharedata;
- uint64_t z_sharetype; /* 0 = share, 1 = unshare */
- uint64_t z_sharemax; /* max length of share string */
-} zfs_share_t;
-
-/*
- * ZFS file systems may behave the usual, POSIX-compliant way, where
- * name lookups are case-sensitive. They may also be set up so that
- * all the name lookups are case-insensitive, or so that only some
- * lookups, the ones that set an FIGNORECASE flag, are case-insensitive.
- */
-typedef enum zfs_case {
- ZFS_CASE_SENSITIVE,
- ZFS_CASE_INSENSITIVE,
- ZFS_CASE_MIXED
-} zfs_case_t;
-
-/*
- * Note: this struct must have the same layout in 32-bit and 64-bit, so
- * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit
- * kernel. Therefore, we add padding to it so that no "hidden" padding
- * is automatically added on 64-bit (but not on 32-bit).
- */
-typedef struct zfs_cmd {
- char zc_name[MAXPATHLEN]; /* name of pool or dataset */
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
- int zc_pad2;
-
- /*
- * The following members are for legacy ioctls which haven't been
- * converted to the new method.
- */
- uint64_t zc_history; /* really (char *) */
- char zc_value[MAXPATHLEN * 2];
- char zc_string[MAXNAMELEN];
- uint64_t zc_guid;
- uint64_t zc_nvlist_conf; /* really (char *) */
- uint64_t zc_nvlist_conf_size;
- uint64_t zc_cookie;
- uint64_t zc_objset_type;
- uint64_t zc_perm_action;
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
- zfs_share_t zc_share;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- dmu_replay_record_t zc_begin_record;
- zinject_record_t zc_inject_record;
- uint32_t zc_defer_destroy;
- uint32_t zc_flags;
- uint64_t zc_action_handle;
- int zc_cleanup_fd;
- uint8_t zc_simple;
- uint8_t zc_pad3[3];
- boolean_t zc_resumable;
- uint32_t zc_pad4;
- uint64_t zc_sendobj;
- uint64_t zc_fromobj;
- uint64_t zc_createtxg;
- zfs_stat_t zc_stat;
-} zfs_cmd_t;
-
-typedef struct zfs_useracct {
- char zu_domain[256];
- uid_t zu_rid;
- uint32_t zu_pad;
- uint64_t zu_space;
-} zfs_useracct_t;
-
-#define ZFSDEV_MAX_MINOR (1 << 16)
-#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1)
-
-#define ZPOOL_EXPORT_AFTER_SPLIT 0x1
-
-#ifdef _KERNEL
-struct objset;
-struct zfsvfs;
-
-typedef struct zfs_creat {
- nvlist_t *zct_zplprops;
- nvlist_t *zct_props;
-} zfs_creat_t;
-
-extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *);
-extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *);
-extern int zfs_secpolicy_destroy_perms(const char *, cred_t *);
-extern int zfs_busy(void);
-extern void zfs_unmount_snap(const char *);
-extern void zfs_destroy_unmount_origin(const char *);
-#ifdef illumos
-extern int getzfsvfs_impl(struct objset *, struct zfsvfs **);
-#else
-extern int getzfsvfs_impl(struct objset *, vfs_t **);
-#endif
-extern int getzfsvfs(const char *, struct zfsvfs **);
-
-/*
- * ZFS minor numbers can refer to either a control device instance or
- * a zvol. Depending on the value of zss_type, zss_data points to either
- * a zvol_state_t or a zfs_onexit_t.
- */
-enum zfs_soft_state_type {
- ZSST_ZVOL,
- ZSST_CTLDEV
-};
-
-typedef struct zfs_soft_state {
- enum zfs_soft_state_type zss_type;
- void *zss_data;
-} zfs_soft_state_t;
-
-extern void *zfsdev_get_soft_state(minor_t minor,
- enum zfs_soft_state_type which);
-extern minor_t zfsdev_minor_alloc(void);
-
-extern void *zfsdev_state;
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_IOCTL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
@@ -1,66 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _SYS_ZFS_ONEXIT_H
-#define _SYS_ZFS_ONEXIT_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-
-typedef struct zfs_onexit {
- kmutex_t zo_lock;
- list_t zo_actions;
-} zfs_onexit_t;
-
-typedef struct zfs_onexit_action_node {
- list_node_t za_link;
- void (*za_func)(void *);
- void *za_data;
-} zfs_onexit_action_node_t;
-
-extern void zfs_onexit_init(zfs_onexit_t **zo);
-extern void zfs_onexit_destroy(zfs_onexit_t *zo);
-
-#endif
-
-extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
-extern void zfs_onexit_fd_rele(int fd);
-extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
- uint64_t *action_handle);
-extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
- boolean_t fire);
-extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
- void **data);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_ONEXIT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
@@ -1,90 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2018 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_FS_ZFS_RLOCK_H
-#define _SYS_FS_ZFS_RLOCK_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef __FreeBSD__
-#define rangelock_init zfs_rangelock_init
-#define rangelock_fini zfs_rangelock_fini
-#endif
-
-typedef enum {
- RL_READER,
- RL_WRITER,
- RL_APPEND
-} rangelock_type_t;
-
-struct locked_range;
-
-typedef void (rangelock_cb_t)(struct locked_range *, void *);
-
-#ifdef __FreeBSD__
-typedef struct zfs_rangelock {
-#else
-typedef struct rangelock {
-#endif
- avl_tree_t rl_tree; /* contains locked_range_t */
- kmutex_t rl_lock;
- rangelock_cb_t *rl_cb;
- void *rl_arg;
-} rangelock_t;
-
-typedef struct locked_range {
- rangelock_t *lr_rangelock; /* rangelock that this lock applies to */
- avl_node_t lr_node; /* avl node link */
- uint64_t lr_offset; /* file range offset */
- uint64_t lr_length; /* file range length */
- uint_t lr_count; /* range reference count in tree */
- rangelock_type_t lr_type; /* range type */
- kcondvar_t lr_write_cv; /* cv for waiting writers */
- kcondvar_t lr_read_cv; /* cv for waiting readers */
- uint8_t lr_proxy; /* acting for original range */
- uint8_t lr_write_wanted; /* writer wants to lock this range */
- uint8_t lr_read_wanted; /* reader wants to lock this range */
-} locked_range_t;
-
-void rangelock_init(rangelock_t *, rangelock_cb_t *, void *);
-void rangelock_fini(rangelock_t *);
-
-locked_range_t *rangelock_enter(rangelock_t *,
- uint64_t, uint64_t, rangelock_type_t);
-locked_range_t *rangelock_tryenter(rangelock_t *,
- uint64_t, uint64_t, rangelock_type_t);
-void rangelock_exit(locked_range_t *);
-void rangelock_reduce(locked_range_t *, uint64_t, uint64_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_RLOCK_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
@@ -1,142 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_SA_H
-#define _SYS_ZFS_SA_H
-
-#ifdef _KERNEL
-#include <sys/list.h>
-#include <sys/dmu.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_znode.h>
-#include <sys/sa.h>
-#include <sys/zil.h>
-
-
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * This is the list of known attributes
- * to the ZPL. The values of the actual
- * attributes are not defined by the order
- * the enums. It is controlled by the attribute
- * registration mechanism. Two different file system
- * could have different numeric values for the same
- * attributes. this list is only used for dereferencing
- * into the table that will hold the actual numeric value.
- */
-typedef enum zpl_attr {
- ZPL_ATIME,
- ZPL_MTIME,
- ZPL_CTIME,
- ZPL_CRTIME,
- ZPL_GEN,
- ZPL_MODE,
- ZPL_SIZE,
- ZPL_PARENT,
- ZPL_LINKS,
- ZPL_XATTR,
- ZPL_RDEV,
- ZPL_FLAGS,
- ZPL_UID,
- ZPL_GID,
- ZPL_PAD,
- ZPL_ZNODE_ACL,
- ZPL_DACL_COUNT,
- ZPL_SYMLINK,
- ZPL_SCANSTAMP,
- ZPL_DACL_ACES,
- ZPL_END
-} zpl_attr_t;
-
-#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
-#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
- sizeof (zfs_acl_phys_t))
-
-#define SA_MODE_OFFSET 0
-#define SA_SIZE_OFFSET 8
-#define SA_GEN_OFFSET 16
-#define SA_UID_OFFSET 24
-#define SA_GID_OFFSET 32
-#define SA_PARENT_OFFSET 40
-
-extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
-extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
-
-/*
- * This is a deprecated data structure that only exists for
- * dealing with file systems create prior to ZPL version 5.
- */
-typedef struct znode_phys {
- uint64_t zp_atime[2]; /* 0 - last file access time */
- uint64_t zp_mtime[2]; /* 16 - last file modification time */
- uint64_t zp_ctime[2]; /* 32 - last file change time */
- uint64_t zp_crtime[2]; /* 48 - creation time */
- uint64_t zp_gen; /* 64 - generation (txg of creation) */
- uint64_t zp_mode; /* 72 - file mode bits */
- uint64_t zp_size; /* 80 - size of file */
- uint64_t zp_parent; /* 88 - directory parent (`..') */
- uint64_t zp_links; /* 96 - number of links to file */
- uint64_t zp_xattr; /* 104 - DMU object for xattrs */
- uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
- uint64_t zp_flags; /* 120 - persistent flags */
- uint64_t zp_uid; /* 128 - file owner */
- uint64_t zp_gid; /* 136 - owning group */
- uint64_t zp_zap; /* 144 - extra attributes */
- uint64_t zp_pad[3]; /* 152 - future */
- zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
- /*
- * Data may pad out any remaining bytes in the znode buffer, eg:
- *
- * |<---------------------- dnode_phys (512) ------------------------>|
- * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
- * |<---- znode (264) ---->|<---- data (56) ---->|
- *
- * At present, we use this space for the following:
- * - symbolic links
- * - 32-byte anti-virus scanstamp (regular files only)
- */
-} znode_phys_t;
-
-#ifdef _KERNEL
-int zfs_sa_readlink(struct znode *, uio_t *);
-void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
-void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
-void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
-void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
-void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *);
-void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
-void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_SA_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
@@ -1,55 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _SYS_FS_ZFS_STAT_H
-#define _SYS_FS_ZFS_STAT_H
-
-#ifdef _KERNEL
-#include <sys/isa_defs.h>
-#include <sys/dmu.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * A limited number of zpl level stats are retrievable
- * with an ioctl. zfs diff is the current consumer.
- */
-typedef struct zfs_stat {
- uint64_t zs_gen;
- uint64_t zs_mode;
- uint64_t zs_links;
- uint64_t zs_ctime[2];
-} zfs_stat_t;
-
-extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
- char *buf, int len);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_STAT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -1,192 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
- */
-
-#ifndef _SYS_FS_ZFS_VFSOPS_H
-#define _SYS_FS_ZFS_VFSOPS_H
-
-#include <sys/list.h>
-#include <sys/vfs.h>
-#include <sys/zil.h>
-#include <sys/sa.h>
-#include <sys/rrwlock.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/rmlock.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct zfsvfs zfsvfs_t;
-struct znode;
-
-struct zfsvfs {
- vfs_t *z_vfs; /* generic fs struct */
- zfsvfs_t *z_parent; /* parent fs */
- objset_t *z_os; /* objset reference */
- uint64_t z_root; /* id of root znode */
- uint64_t z_unlinkedobj; /* id of unlinked zapobj */
- uint64_t z_max_blksz; /* maximum block size for files */
- uint64_t z_fuid_obj; /* fuid table object number */
- uint64_t z_fuid_size; /* fuid table size */
- avl_tree_t z_fuid_idx; /* fuid tree keyed by index */
- avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */
- krwlock_t z_fuid_lock; /* fuid lock */
- boolean_t z_fuid_loaded; /* fuid tables are loaded */
- boolean_t z_fuid_dirty; /* need to sync fuid table ? */
- struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
- zilog_t *z_log; /* intent log pointer */
- uint_t z_acl_mode; /* acl chmod/mode behavior */
- uint_t z_acl_inherit; /* acl inheritance behavior */
- zfs_case_t z_case; /* case-sense */
- boolean_t z_utf8; /* utf8-only */
- int z_norm; /* normalization flags */
- boolean_t z_atime; /* enable atimes mount option */
- boolean_t z_unmounted; /* unmounted */
- rrmlock_t z_teardown_lock;
- struct rmslock z_teardown_inactive_lock;
- list_t z_all_znodes; /* all vnodes in the fs */
- kmutex_t z_znodes_lock; /* lock for z_all_znodes */
- struct zfsctl_root *z_ctldir; /* .zfs directory pointer */
- boolean_t z_show_ctldir; /* expose .zfs in the root dir */
- boolean_t z_issnap; /* true if this is a snapshot */
- boolean_t z_vscan; /* virus scan on/off */
- boolean_t z_use_fuids; /* version allows fuids */
- boolean_t z_replay; /* set during ZIL replay */
- boolean_t z_use_sa; /* version allow system attributes */
- boolean_t z_use_namecache;/* make use of FreeBSD name cache */
- uint64_t z_version; /* ZPL version */
- uint64_t z_shares_dir; /* hidden shares dir */
- kmutex_t z_lock;
- uint64_t z_userquota_obj;
- uint64_t z_groupquota_obj;
- uint64_t z_replay_eof; /* New end of file - replay only */
- sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
-#define ZFS_OBJ_MTX_SZ 64
- kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
-#if defined(__FreeBSD__)
- struct task z_unlinked_drain_task;
-#endif
-};
-
-#define ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs) \
- rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock)
-
-#define ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs) \
- rms_rlock(&(zfsvfs)->z_teardown_inactive_lock)
-
-#define ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \
- rms_runlock(&(zfsvfs)->z_teardown_inactive_lock)
-
-#define ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs) \
- rms_wlock(&(zfsvfs)->z_teardown_inactive_lock)
-
-#define ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \
- rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock)
-
-#define ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs) \
- rms_wowned(&(zfsvfs)->z_teardown_inactive_lock)
-
-/*
- * Normal filesystems (those not under .zfs/snapshot) have a total
- * file ID size limited to 12 bytes (including the length field) due to
- * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical
- * reasons, this same limit is being imposed by the Solaris NFSv3 implementation
- * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It
- * is not possible to expand beyond 12 bytes without abandoning support
- * of NFSv2.
- *
- * For normal filesystems, we partition up the available space as follows:
- * 2 bytes fid length (required)
- * 6 bytes object number (48 bits)
- * 4 bytes generation number (32 bits)
- *
- * We reserve only 48 bits for the object number, as this is the limit
- * currently defined and imposed by the DMU.
- */
-typedef struct zfid_short {
- uint16_t zf_len;
- uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
- uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
-} zfid_short_t;
-
-/*
- * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes
- * (including the length field). This makes files under .zfs/snapshot
- * accessible by NFSv3 and NFSv4, but not NFSv2.
- *
- * For files under .zfs/snapshot, we partition up the available space
- * as follows:
- * 2 bytes fid length (required)
- * 6 bytes object number (48 bits)
- * 4 bytes generation number (32 bits)
- * 6 bytes objset id (48 bits)
- * 4 bytes[**] currently just zero (32 bits)
- *
- * We reserve only 48 bits for the object number and objset id, as these are
- * the limits currently defined and imposed by the DMU.
- *
- * [*] 20 bytes on FreeBSD to fit into the size of struct fid.
- * [**] 2 bytes on FreeBSD for the above reason.
- */
-typedef struct zfid_long {
- zfid_short_t z_fid;
- uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
- uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */
-} zfid_long_t;
-
-#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
-#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
-
-extern uint_t zfs_fsyncer_key;
-extern int zfs_super_owner;
-
-extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
-extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds);
-extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- const char *domain, uint64_t rid, uint64_t *valuep);
-extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
-extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- const char *domain, uint64_t rid, uint64_t quota);
-extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
- boolean_t isgroup);
-extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
- uint64_t fuid);
-extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
-extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
-extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os);
-extern void zfsvfs_free(zfsvfs_t *zfsvfs);
-extern int zfs_check_global_label(const char *dsname, const char *hexsl);
-
-#ifdef _KERNEL
-extern void zfsvfs_update_fromname(const char *oldname, const char *newname);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_VFSOPS_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -1,374 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- */
-
-#ifndef _SYS_FS_ZFS_ZNODE_H
-#define _SYS_FS_ZFS_ZNODE_H
-
-#ifdef _KERNEL
-#include <sys/list.h>
-#include <sys/dmu.h>
-#include <sys/sa.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/rrwlock.h>
-#include <sys/zfs_sa.h>
-#include <sys/zfs_stat.h>
-#include <sys/zfs_rlock.h>
-#endif
-#include <sys/zfs_acl.h>
-#include <sys/zil.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Additional file level attributes, that are stored
- * in the upper half of zp_flags
- */
-#define ZFS_READONLY 0x0000000100000000
-#define ZFS_HIDDEN 0x0000000200000000
-#define ZFS_SYSTEM 0x0000000400000000
-#define ZFS_ARCHIVE 0x0000000800000000
-#define ZFS_IMMUTABLE 0x0000001000000000
-#define ZFS_NOUNLINK 0x0000002000000000
-#define ZFS_APPENDONLY 0x0000004000000000
-#define ZFS_NODUMP 0x0000008000000000
-#define ZFS_OPAQUE 0x0000010000000000
-#define ZFS_AV_QUARANTINED 0x0000020000000000
-#define ZFS_AV_MODIFIED 0x0000040000000000
-#define ZFS_REPARSE 0x0000080000000000
-#define ZFS_OFFLINE 0x0000100000000000
-#define ZFS_SPARSE 0x0000200000000000
-
-#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
-{ \
- if (value) \
- pflags |= attr; \
- else \
- pflags &= ~attr; \
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \
- &pflags, sizeof (pflags), tx)); \
-}
-
-/*
- * Define special zfs pflags
- */
-#define ZFS_XATTR 0x1 /* is an extended attribute */
-#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
-#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
-#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */
-#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */
-#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
-#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
-#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
-#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
-
-#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME]
-#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME]
-#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME]
-#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME]
-#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN]
-#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES]
-#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR]
-#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK]
-#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV]
-#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP]
-#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID]
-#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID]
-#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT]
-#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS]
-#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE]
-#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT]
-#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
-#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
-#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
-#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]
-
-/*
- * Is ID ephemeral?
- */
-#define IS_EPHEMERAL(x) (x > MAXUID)
-
-/*
- * Should we use FUIDs?
- */
-#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \
- spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
-#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \
- spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
-
-#define MASTER_NODE_OBJ 1
-
-/*
- * Special attributes for master node.
- * "userquota@" and "groupquota@" are also valid (from
- * zfs_userquota_prop_prefixes[]).
- */
-#define ZFS_FSID "FSID"
-#define ZFS_UNLINKED_SET "DELETE_QUEUE"
-#define ZFS_ROOT_OBJ "ROOT"
-#define ZPL_VERSION_STR "VERSION"
-#define ZFS_FUID_TABLES "FUID"
-#define ZFS_SHARES_DIR "SHARES"
-#define ZFS_SA_ATTRS "SA_ATTRS"
-
-/*
- * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in
- * the directory entries.
- */
-#ifndef IFTODT
-#define IFTODT(mode) (((mode) & S_IFMT) >> 12)
-#endif
-
-/*
- * The directory entry has the type (currently unused on Solaris) in the
- * top 4 bits, and the object number in the low 48 bits. The "middle"
- * 12 bits are unused.
- */
-#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
-#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
-
-/*
- * Directory entry locks control access to directory entries.
- * They are used to protect creates, deletes, and renames.
- * Each directory znode has a mutex and a list of locked names.
- */
-#ifdef _KERNEL
-typedef struct zfs_dirlock {
- char *dl_name; /* directory entry being locked */
- uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
- uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */
- uint16_t dl_namesize; /* set if dl_name was allocated */
- kcondvar_t dl_cv; /* wait for entry to be unlocked */
- struct znode *dl_dzp; /* directory znode */
- struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
-} zfs_dirlock_t;
-
-typedef struct znode {
- struct zfsvfs *z_zfsvfs;
- vnode_t *z_vnode;
- uint64_t z_id; /* object ID for this znode */
-#ifdef illumos
- kmutex_t z_lock; /* znode modification lock */
- krwlock_t z_parent_lock; /* parent lock for directories */
- krwlock_t z_name_lock; /* "master" lock for dirent locks */
- zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
-#endif
- rangelock_t z_rangelock; /* file range locks */
- uint8_t z_unlinked; /* file has been unlinked */
- uint8_t z_atime_dirty; /* atime needs to be synced */
- uint8_t z_zn_prefetch; /* Prefetch znodes? */
- uint8_t z_moved; /* Has this znode been moved? */
- uint_t z_blksz; /* block size in bytes */
- uint_t z_seq; /* modification sequence number */
- uint64_t z_mapcnt; /* number of pages mapped to file */
- uint64_t z_dnodesize; /* dnode size */
- uint64_t z_gen; /* generation (cached) */
- uint64_t z_size; /* file size (cached) */
- uint64_t z_atime[2]; /* atime (cached) */
- uint64_t z_links; /* file links (cached) */
- uint64_t z_pflags; /* pflags (cached) */
- uint64_t z_uid; /* uid fuid (cached) */
- uint64_t z_gid; /* gid fuid (cached) */
- mode_t z_mode; /* mode (cached) */
- uint32_t z_sync_cnt; /* synchronous open count */
- kmutex_t z_acl_lock; /* acl data lock */
- zfs_acl_t *z_acl_cached; /* cached acl */
- list_node_t z_link_node; /* all znodes in fs link */
- sa_handle_t *z_sa_hdl; /* handle to sa data */
- boolean_t z_is_sa; /* are we native sa? */
-} znode_t;
-
-#define ZFS_LINK_MAX UINT64_MAX
-
-/*
- * Range locking rules
- * --------------------
- * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
- * file range needs to be locked as RL_WRITER. Only then can the pages be
- * freed etc and zp_size reset. zp_size must be set within range lock.
- * 2. For writes and punching holes (zfs_write & zfs_space) just the range
- * being written or freed needs to be locked as RL_WRITER.
- * Multiple writes at the end of the file must coordinate zp_size updates
- * to ensure data isn't lost. A compare and swap loop is currently used
- * to ensure the file size is at least the offset last written.
- * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
- * read needs to be locked as RL_READER. A check against zp_size can then
- * be made for reading beyond end of file.
- */
-
-/*
- * Convert between znode pointers and vnode pointers
- */
-#ifdef DEBUG
-static __inline vnode_t *
-ZTOV(znode_t *zp)
-{
- vnode_t *vp = zp->z_vnode;
-
- ASSERT(vp != NULL && vp->v_data == zp);
- return (vp);
-}
-static __inline znode_t *
-VTOZ(vnode_t *vp)
-{
- znode_t *zp = (znode_t *)vp->v_data;
-
- ASSERT(zp != NULL && zp->z_vnode == vp);
- return (zp);
-}
-#else
-#define ZTOV(ZP) ((ZP)->z_vnode)
-#define VTOZ(VP) ((znode_t *)(VP)->v_data)
-#endif
-
-#define VTOZ_SMR(VP) ((znode_t *)vn_load_v_data_smr(VP))
-
-/* Called on entry to each ZFS vnode and vfs operation */
-#define ZFS_ENTER(zfsvfs) \
- { \
- rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
- if ((zfsvfs)->z_unmounted) { \
- ZFS_EXIT(zfsvfs); \
- return (EIO); \
- } \
- }
-
-/* Must be called before exiting the vop */
-#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
-
-/* Verifies the znode is valid */
-#define ZFS_VERIFY_ZP(zp) \
- if ((zp)->z_sa_hdl == NULL) { \
- ZFS_EXIT((zp)->z_zfsvfs); \
- return (EIO); \
- } \
-
-/*
- * Macros for dealing with dmu_buf_hold
- */
-#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1))
-#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \
- (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
-#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
- mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
-#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \
- mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
-#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
- mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
-
-/* Encode ZFS stored time values from a struct timespec */
-#define ZFS_TIME_ENCODE(tp, stmp) \
-{ \
- (stmp)[0] = (uint64_t)(tp)->tv_sec; \
- (stmp)[1] = (uint64_t)(tp)->tv_nsec; \
-}
-
-/* Decode ZFS stored time values to a struct timespec */
-#define ZFS_TIME_DECODE(tp, stmp) \
-{ \
- (tp)->tv_sec = (time_t)(stmp)[0]; \
- (tp)->tv_nsec = (long)(stmp)[1]; \
-}
-
-/*
- * Timestamp defines
- */
-#define ACCESSED (AT_ATIME)
-#define STATE_CHANGED (AT_CTIME)
-#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
-
-#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
- if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
- zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
-
-extern int zfs_init_fs(zfsvfs_t *, znode_t **);
-extern void zfs_set_dataprop(objset_t *);
-extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
- dmu_tx_t *tx);
-extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
- uint64_t [2], boolean_t);
-extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
-extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
-extern void zfs_znode_init(void);
-extern void zfs_znode_fini(void);
-extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
-extern int zfs_rezget(znode_t *);
-extern void zfs_zinactive(znode_t *);
-extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
-extern void zfs_znode_free(znode_t *);
-extern void zfs_remove_op_tables();
-extern int zfs_create_op_tables();
-extern dev_t zfs_cmpldev(uint64_t);
-extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
-extern int zfs_get_stats(objset_t *os, nvlist_t *nv);
-extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os);
-extern void zfs_znode_dmu_fini(znode_t *);
-
-extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *,
- vattr_t *vap);
-extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
- vattr_t *vap);
-extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name, uint64_t foid);
-#define ZFS_NO_OBJECT 0 /* no object id */
-extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name);
-extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name, char *link);
-extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
-extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t len, int ioflag);
-extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len);
-extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
-#ifndef ZFS_NO_ACL
-extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
- vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
-#endif
-extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
-extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
-extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
-
-extern zil_get_data_t zfs_get_data;
-extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
-extern int zfsfstype;
-
-extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf);
-
-#endif /* _KERNEL */
-
-extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_ZNODE_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -1,464 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#ifndef _SYS_ZIL_H
-#define _SYS_ZIL_H
-
-#include <sys/types.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_pool;
-struct dsl_dataset;
-struct lwb;
-
-/*
- * Intent log format:
- *
- * Each objset has its own intent log. The log header (zil_header_t)
- * for objset N's intent log is kept in the Nth object of the SPA's
- * intent_log objset. The log header points to a chain of log blocks,
- * each of which contains log records (i.e., transactions) followed by
- * a log block trailer (zil_trailer_t). The format of a log record
- * depends on the record (or transaction) type, but all records begin
- * with a common structure that defines the type, length, and txg.
- */
-
-/*
- * Intent log header - this on disk structure holds fields to manage
- * the log. All fields are 64 bit to easily handle cross architectures.
- */
-typedef struct zil_header {
- uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
- uint64_t zh_replay_seq; /* highest replayed sequence number */
- blkptr_t zh_log; /* log chain */
- uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
- uint64_t zh_flags; /* header flags */
- uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
- uint64_t zh_pad[3];
-} zil_header_t;
-
-/*
- * zh_flags bit settings
- */
-#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
-#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
-
-/*
- * Log block chaining.
- *
- * Log blocks are chained together. Originally they were chained at the
- * end of the block. For performance reasons the chain was moved to the
- * beginning of the block which allows writes for only the data being used.
- * The older position is supported for backwards compatability.
- *
- * The zio_eck_t contains a zec_cksum which for the intent log is
- * the sequence number of this log block. A seq of 0 is invalid.
- * The zec_cksum is checked by the SPA against the sequence
- * number passed in the blk_cksum field of the blkptr_t
- */
-typedef struct zil_chain {
- uint64_t zc_pad;
- blkptr_t zc_next_blk; /* next block in chain */
- uint64_t zc_nused; /* bytes in log block used */
- zio_eck_t zc_eck; /* block trailer */
-} zil_chain_t;
-
-#define ZIL_MIN_BLKSZ 4096ULL
-
-/*
- * ziltest is by and large an ugly hack, but very useful in
- * checking replay without tedious work.
- * When running ziltest we want to keep all itx's and so maintain
- * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
- * We subtract TXG_CONCURRENT_STATES to allow for common code.
- */
-#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
-
-/*
- * The words of a log block checksum.
- */
-#define ZIL_ZC_GUID_0 0
-#define ZIL_ZC_GUID_1 1
-#define ZIL_ZC_OBJSET 2
-#define ZIL_ZC_SEQ 3
-
-typedef enum zil_create {
- Z_FILE,
- Z_DIR,
- Z_XATTRDIR,
-} zil_create_t;
-
-/*
- * size of xvattr log section.
- * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
- * for create time and a single 64 bit integer for all of the attributes,
- * and 4 64 bit integers (32 bytes) for the scanstamp.
- *
- */
-
-#define ZIL_XVAT_SIZE(mapsize) \
- sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
- (sizeof (uint64_t) * 7)
-
-/*
- * Size of ACL in log. The ACE data is padded out to properly align
- * on 8 byte boundary.
- */
-
-#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t)))
-
-/*
- * Intent log transaction types and record structures
- */
-#define TX_COMMIT 0 /* Commit marker (no on-disk state) */
-#define TX_CREATE 1 /* Create file */
-#define TX_MKDIR 2 /* Make directory */
-#define TX_MKXATTR 3 /* Make XATTR directory */
-#define TX_SYMLINK 4 /* Create symbolic link to a file */
-#define TX_REMOVE 5 /* Remove file */
-#define TX_RMDIR 6 /* Remove directory */
-#define TX_LINK 7 /* Create hard link to a file */
-#define TX_RENAME 8 /* Rename a file */
-#define TX_WRITE 9 /* File write */
-#define TX_TRUNCATE 10 /* Truncate a file */
-#define TX_SETATTR 11 /* Set file attributes */
-#define TX_ACL_V0 12 /* Set old formatted ACL */
-#define TX_ACL 13 /* Set ACL */
-#define TX_CREATE_ACL 14 /* create with ACL */
-#define TX_CREATE_ATTR 15 /* create + attrs */
-#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
-#define TX_MKDIR_ACL 17 /* mkdir with ACL */
-#define TX_MKDIR_ATTR 18 /* mkdir with attr */
-#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
-#define TX_WRITE2 20 /* dmu_sync EALREADY write */
-#define TX_MAX_TYPE 21 /* Max transaction type */
-
-/*
- * The transactions for mkdir, symlink, remove, rmdir, link, and rename
- * may have the following bit set, indicating the original request
- * specified case-insensitive handling of names.
- */
-#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
-
-/*
- * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
- * out of order. For convenience in the code, all such records must have
- * lr_foid at the same offset.
- */
-#define TX_OOO(txtype) \
- ((txtype) == TX_WRITE || \
- (txtype) == TX_TRUNCATE || \
- (txtype) == TX_SETATTR || \
- (txtype) == TX_ACL_V0 || \
- (txtype) == TX_ACL || \
- (txtype) == TX_WRITE2)
-
-/*
- * The number of dnode slots consumed by the object is stored in the 8
- * unused upper bits of the object ID. We subtract 1 from the value
- * stored on disk for compatibility with implementations that don't
- * support large dnodes. The slot count for a single-slot dnode will
- * contain 0 for those bits to preserve the log record format for
- * "small" dnodes.
- */
-#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1)
-#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1)
-#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT)
-#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x))
-
-/*
- * Format of log records.
- * The fields are carefully defined to allow them to be aligned
- * and sized the same on sparc & intel architectures.
- * Each log record has a common structure at the beginning.
- *
- * The log record on disk (lrc_seq) holds the sequence number of all log
- * records which is used to ensure we don't replay the same record.
- */
-typedef struct { /* common log record header */
- uint64_t lrc_txtype; /* intent log transaction type */
- uint64_t lrc_reclen; /* transaction record length */
- uint64_t lrc_txg; /* dmu transaction group number */
- uint64_t lrc_seq; /* see comment above */
-} lr_t;
-
-/*
- * Common start of all out-of-order record types (TX_OOO() above).
- */
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* object id */
-} lr_ooo_t;
-
-/*
- * Handle option extended vattr attributes.
- *
- * Whenever new attributes are added the version number
- * will need to be updated as will code in
- * zfs_log.c and zfs_replay.c
- */
-typedef struct {
- uint32_t lr_attr_masksize; /* number of elements in array */
- uint32_t lr_attr_bitmap; /* First entry of array */
- /* remainder of array and any additional fields */
-} lr_attr_t;
-
-/*
- * log record for creates without optional ACL.
- * This log record does support optional xvattr_t attributes.
- */
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* object id of directory */
- uint64_t lr_foid; /* object id of created file object */
- uint64_t lr_mode; /* mode of object */
- uint64_t lr_uid; /* uid of object */
- uint64_t lr_gid; /* gid of object */
- uint64_t lr_gen; /* generation (txg of creation) */
- uint64_t lr_crtime[2]; /* creation time */
- uint64_t lr_rdev; /* rdev of object to create */
- /* name of object to create follows this */
- /* for symlinks, link content follows name */
- /* for creates with xvattr data, the name follows the xvattr info */
-} lr_create_t;
-
-/*
- * FUID ACL record will be an array of ACEs from the original ACL.
- * If this array includes ephemeral IDs, the record will also include
- * an array of log-specific FUIDs to replace the ephemeral IDs.
- * Only one copy of each unique domain will be present, so the log-specific
- * FUIDs will use an index into a compressed domain table. On replay this
- * information will be used to construct real FUIDs (and bypass idmap,
- * since it may not be available).
- */
-
-/*
- * Log record for creates with optional ACL
- * This log record is also used for recording any FUID
- * information needed for replaying the create. If the
- * file doesn't have any actual ACEs then the lr_aclcnt
- * would be zero.
- *
- * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's.
- * If create is also setting xvattr's, then acl data follows xvattr.
- * If ACE FUIDs are needed then they will follow the xvattr_t. Following
- * the FUIDs will be the domain table information. The FUIDs for the owner
- * and group will be in lr_create. Name follows ACL data.
- */
-typedef struct {
- lr_create_t lr_create; /* common create portion */
- uint64_t lr_aclcnt; /* number of ACEs in ACL */
- uint64_t lr_domcnt; /* number of unique domains */
- uint64_t lr_fuidcnt; /* number of real fuids */
- uint64_t lr_acl_bytes; /* number of bytes in ACL */
- uint64_t lr_acl_flags; /* ACL flags */
-} lr_acl_create_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* obj id of directory */
- /* name of object to remove follows this */
-} lr_remove_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* obj id of directory */
- uint64_t lr_link_obj; /* obj id of link */
- /* name of object to link follows this */
-} lr_link_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_sdoid; /* obj id of source directory */
- uint64_t lr_tdoid; /* obj id of target directory */
- /* 2 strings: names of source and destination follow this */
-} lr_rename_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* file object to write */
- uint64_t lr_offset; /* offset to write to */
- uint64_t lr_length; /* user data length to write */
- uint64_t lr_blkoff; /* no longer used */
- blkptr_t lr_blkptr; /* spa block pointer for replay */
- /* write data will follow for small writes */
-} lr_write_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* object id of file to truncate */
- uint64_t lr_offset; /* offset to truncate from */
- uint64_t lr_length; /* length to truncate */
-} lr_truncate_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* file object to change attributes */
- uint64_t lr_mask; /* mask of attributes to set */
- uint64_t lr_mode; /* mode to set */
- uint64_t lr_uid; /* uid to set */
- uint64_t lr_gid; /* gid to set */
- uint64_t lr_size; /* size to set */
- uint64_t lr_atime[2]; /* access time */
- uint64_t lr_mtime[2]; /* modification time */
- /* optional attribute lr_attr_t may be here */
-} lr_setattr_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* obj id of file */
- uint64_t lr_aclcnt; /* number of acl entries */
- /* lr_aclcnt number of ace_t entries follow this */
-} lr_acl_v0_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* obj id of file */
- uint64_t lr_aclcnt; /* number of ACEs in ACL */
- uint64_t lr_domcnt; /* number of unique domains */
- uint64_t lr_fuidcnt; /* number of real fuids */
- uint64_t lr_acl_bytes; /* number of bytes in ACL */
- uint64_t lr_acl_flags; /* ACL flags */
- /* lr_acl_bytes number of variable sized ace's follows */
-} lr_acl_t;
-
-/*
- * ZIL structure definitions, interface function prototype and globals.
- */
-
-/*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * In this mode, if we need to commit the write later, then the block
- * is immediately written into the file system (using dmu_sync),
- * and a pointer to the block is put into the log record.
- * When the txg commits the block is linked in.
- * This saves additionally writing the data into the log record.
- * There are a few requirements for this to occur:
- * - write is greater than zfs/zvol_immediate_write_sz
- * - not using slogs (as slogs are assumed to always be faster
- * than writing into the main pool)
- * - the write occupies only one block
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FSYNC or FDSYNC), the we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- */
-typedef enum {
- WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
- /* and put blkptr in log, rather than actual data) */
- WR_COPIED, /* immediate - data is copied into lr_write_t */
- WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
- WR_NUM_STATES /* number of states */
-} itx_wr_state_t;
-
-typedef struct itx {
- list_node_t itx_node; /* linkage on zl_itx_list */
- void *itx_private; /* type-specific opaque data */
- itx_wr_state_t itx_wr_state; /* write state */
- uint8_t itx_sync; /* synchronous transaction */
- uint64_t itx_oid; /* object id */
- lr_t itx_lr; /* common part of log record */
- /* followed by type-specific part of lr_xx_t and its immediate data */
-} itx_t;
-
-typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
- uint64_t txg);
-typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
- uint64_t txg);
-typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap);
-typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf,
- struct lwb *lwb, zio_t *zio);
-
-extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
- zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
-
-extern void zil_init(void);
-extern void zil_fini(void);
-
-extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
-extern void zil_free(zilog_t *zilog);
-
-extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
-extern void zil_close(zilog_t *zilog);
-
-extern void zil_replay(objset_t *os, void *arg,
- zil_replay_func_t *replay_func[TX_MAX_TYPE]);
-extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
-extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
-
-extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
-extern void zil_itx_destroy(itx_t *itx);
-extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
-
-extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
-extern void zil_commit(zilog_t *zilog, uint64_t oid);
-extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
-
-extern int zil_reset(const char *osname, void *txarg);
-extern int zil_claim(struct dsl_pool *dp,
- struct dsl_dataset *ds, void *txarg);
-extern int zil_check_log_chain(struct dsl_pool *dp,
- struct dsl_dataset *ds, void *tx);
-extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
-
-extern int zil_suspend(const char *osname, void **cookiep);
-extern void zil_resume(void *cookie);
-
-extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
-extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
-extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
-
-extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
-
-extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
-
-extern uint64_t zil_max_copied_data(zilog_t *zilog);
-extern uint64_t zil_max_log_data(zilog_t *zilog);
-
-extern int zil_replay_disable;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -1,229 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#ifndef _SYS_ZIL_IMPL_H
-#define _SYS_ZIL_IMPL_H
-
-#include <sys/zil.h>
-#include <sys/dmu_objset.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Possbile states for a given lwb structure.
- *
- * An lwb will start out in the "closed" state, and then transition to
- * the "opened" state via a call to zil_lwb_write_open(). When
- * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
- * must be held.
- *
- * After the lwb is "opened", it can transition into the "issued" state
- * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
- * be held when making this transition.
- *
- * After the lwb's write zio completes, it transitions into the "write
- * done" state via zil_lwb_write_done(); and then into the "flush done"
- * state via zil_lwb_flush_vdevs_done(). When transitioning from
- * "issued" to "write done", and then from "write done" to "flush done",
- * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock".
- *
- * The zilog's "zl_issuer_lock" can become heavily contended in certain
- * workloads, so we specifically avoid acquiring that lock when
- * transitioning an lwb from "issued" to "done". This allows us to avoid
- * having to acquire the "zl_issuer_lock" for each lwb ZIO completion,
- * which would have added more lock contention on an already heavily
- * contended lock.
- *
- * Additionally, correctness when reading an lwb's state is often
- * acheived by exploiting the fact that these state transitions occur in
- * this specific order; i.e. "closed" to "opened" to "issued" to "done".
- *
- * Thus, if an lwb is in the "closed" or "opened" state, holding the
- * "zl_issuer_lock" will prevent a concurrent thread from transitioning
- * that lwb to the "issued" state. Likewise, if an lwb is already in the
- * "issued" state, holding the "zl_lock" will prevent a concurrent
- * thread from transitioning that lwb to the "write done" state.
- */
-typedef enum {
- LWB_STATE_CLOSED,
- LWB_STATE_OPENED,
- LWB_STATE_ISSUED,
- LWB_STATE_WRITE_DONE,
- LWB_STATE_FLUSH_DONE,
- LWB_NUM_STATES
-} lwb_state_t;
-
-/*
- * Log write block (lwb)
- *
- * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it
- * will be protected by the zilog's "zl_issuer_lock". Basically, prior
- * to it being issued, it will only be accessed by the thread that's
- * holding the "zl_issuer_lock". After the lwb is issued, the zilog's
- * "zl_lock" is used to protect the lwb against concurrent access.
- */
-typedef struct lwb {
- zilog_t *lwb_zilog; /* back pointer to log struct */
- blkptr_t lwb_blk; /* on disk address of this log blk */
- boolean_t lwb_slog; /* lwb_blk is on SLOG device */
- int lwb_nused; /* # used bytes in buffer */
- int lwb_sz; /* size of block and buffer */
- lwb_state_t lwb_state; /* the state of this lwb */
- char *lwb_buf; /* log write buffer */
- zio_t *lwb_write_zio; /* zio for the lwb buffer */
- zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
- dmu_tx_t *lwb_tx; /* tx for log block allocation */
- uint64_t lwb_max_txg; /* highest txg in this lwb */
- list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
- list_t lwb_waiters; /* list of zil_commit_waiter's */
- avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
- kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
- hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
-} lwb_t;
-
-/*
- * ZIL commit waiter.
- *
- * This structure is allocated each time zil_commit() is called, and is
- * used by zil_commit() to communicate with other parts of the ZIL, such
- * that zil_commit() can know when it safe for it return. For more
- * details, see the comment above zil_commit().
- *
- * The "zcw_lock" field is used to protect the commit waiter against
- * concurrent access. This lock is often acquired while already holding
- * the zilog's "zl_issuer_lock" or "zl_lock"; see the functions
- * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples
- * of this. Thus, one must be careful not to acquire the
- * "zl_issuer_lock" or "zl_lock" when already holding the "zcw_lock";
- * e.g. see the zil_commit_waiter_timeout() function.
- */
-typedef struct zil_commit_waiter {
- kcondvar_t zcw_cv; /* signalled when "done" */
- kmutex_t zcw_lock; /* protects fields of this struct */
- list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */
- lwb_t *zcw_lwb; /* back pointer to lwb when linked */
- boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */
- int zcw_zio_error; /* contains the zio io_error value */
-} zil_commit_waiter_t;
-
-/*
- * Intent log transaction lists
- */
-typedef struct itxs {
- list_t i_sync_list; /* list of synchronous itxs */
- avl_tree_t i_async_tree; /* tree of foids for async itxs */
-} itxs_t;
-
-typedef struct itxg {
- kmutex_t itxg_lock; /* lock for this structure */
- uint64_t itxg_txg; /* txg for this chain */
- itxs_t *itxg_itxs; /* sync and async itxs */
-} itxg_t;
-
-/* for async nodes we build up an AVL tree of lists of async itxs per file */
-typedef struct itx_async_node {
- uint64_t ia_foid; /* file object id */
- list_t ia_list; /* list of async itxs for this foid */
- avl_node_t ia_node; /* AVL tree linkage */
-} itx_async_node_t;
-
-/*
- * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
- * we've touched so we know which ones need a write cache flush at the end.
- */
-typedef struct zil_vdev_node {
- uint64_t zv_vdev; /* vdev to be flushed */
- avl_node_t zv_node; /* AVL tree linkage */
-} zil_vdev_node_t;
-
-#define ZIL_PREV_BLKS 16
-
-/*
- * Stable storage intent log management structure. One per dataset.
- */
-struct zilog {
- kmutex_t zl_lock; /* protects most zilog_t fields */
- struct dsl_pool *zl_dmu_pool; /* DSL pool */
- spa_t *zl_spa; /* handle for read/write log */
- const zil_header_t *zl_header; /* log header buffer */
- objset_t *zl_os; /* object set we're logging */
- zil_get_data_t *zl_get_data; /* callback to get object content */
- lwb_t *zl_last_lwb_opened; /* most recent lwb opened */
- hrtime_t zl_last_lwb_latency; /* zio latency of last lwb done */
- uint64_t zl_lr_seq; /* on-disk log record sequence number */
- uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
- uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
- uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
- uint64_t zl_replaying_seq; /* current replay seq number */
- uint32_t zl_suspend; /* log suspend count */
- kcondvar_t zl_cv_suspend; /* log suspend completion */
- uint8_t zl_suspending; /* log is currently suspending */
- uint8_t zl_keep_first; /* keep first log block in destroy */
- uint8_t zl_replay; /* replaying records while set */
- uint8_t zl_stop_sync; /* for debugging */
- kmutex_t zl_issuer_lock; /* single writer, per ZIL, at a time */
- uint8_t zl_logbias; /* latency or throughput */
- uint8_t zl_sync; /* synchronous or asynchronous */
- int zl_parse_error; /* last zil_parse() error */
- uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */
- uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
- uint64_t zl_parse_blk_count; /* number of blocks parsed */
- uint64_t zl_parse_lr_count; /* number of log records parsed */
- itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
- list_t zl_itx_commit_list; /* itx list to be committed */
- uint64_t zl_cur_used; /* current commit log size used */
- list_t zl_lwb_list; /* in-flight log write list */
- avl_tree_t zl_bp_tree; /* track bps during log parse */
- clock_t zl_replay_time; /* lbolt of when replay started */
- uint64_t zl_replay_blks; /* number of log blocks replayed */
- zil_header_t zl_old_header; /* debugging aid */
- uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
- uint_t zl_prev_rotor; /* rotor for zl_prev[] */
- txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
- uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */
- /*
- * Max block size for this ZIL. Note that this can not be changed
- * while the ZIL is in use because consumers (ZPL/zvol) need to take
- * this into account when deciding between WR_COPIED and WR_NEED_COPY
- * (see zil_max_copied_data()).
- */
- uint64_t zl_max_block_size;
-};
-
-typedef struct zil_bp_node {
- dva_t zn_dva;
- avl_node_t zn_node;
-} zil_bp_node_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIL_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -1,675 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright 2016 Toomas Soome <tsoome@me.com>
- */
-
-#ifndef _ZIO_H
-#define _ZIO_H
-
-#include <sys/zio_priority.h>
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/kstat.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Embedded checksum
- */
-#define ZEC_MAGIC 0x210da7ab10c7a11ULL
-
-typedef struct zio_eck {
- uint64_t zec_magic; /* for validation, endianness */
- zio_cksum_t zec_cksum; /* 256-bit checksum */
-} zio_eck_t;
-
-/*
- * Gang block headers are self-checksumming and contain an array
- * of block pointers.
- */
-#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
-#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_eck_t)) / sizeof (blkptr_t))
-#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_eck_t) - \
- (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
- sizeof (uint64_t))
-
-typedef struct zio_gbh {
- blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
- uint64_t zg_filler[SPA_GBH_FILLER];
- zio_eck_t zg_tail;
-} zio_gbh_phys_t;
-
-enum zio_checksum {
- ZIO_CHECKSUM_INHERIT = 0,
- ZIO_CHECKSUM_ON,
- ZIO_CHECKSUM_OFF,
- ZIO_CHECKSUM_LABEL,
- ZIO_CHECKSUM_GANG_HEADER,
- ZIO_CHECKSUM_ZILOG,
- ZIO_CHECKSUM_FLETCHER_2,
- ZIO_CHECKSUM_FLETCHER_4,
- ZIO_CHECKSUM_SHA256,
- ZIO_CHECKSUM_ZILOG2,
- ZIO_CHECKSUM_NOPARITY,
- ZIO_CHECKSUM_SHA512,
- ZIO_CHECKSUM_SKEIN,
-#ifdef illumos
- ZIO_CHECKSUM_EDONR,
-#endif
- ZIO_CHECKSUM_FUNCTIONS
-};
-
-/*
- * The number of "legacy" compression functions which can be set on individual
- * objects.
- */
-#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
-
-#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
-#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
-
-#define ZIO_CHECKSUM_MASK 0xffULL
-#define ZIO_CHECKSUM_VERIFY (1 << 8)
-
-#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
-#define ZIO_DEDUPDITTO_MIN 100
-
-/*
- * The number of "legacy" compression functions which can be set on individual
- * objects.
- */
-#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
-
-/*
- * The meaning of "compress = on" selected by the compression features enabled
- * on a given pool.
- */
-#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB
-#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4
-
-#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
-
-#define BOOTFS_COMPRESS_VALID(compress) \
- ((compress) == ZIO_COMPRESS_LZJB || \
- (compress) == ZIO_COMPRESS_LZ4 || \
- (compress) == ZIO_COMPRESS_ON || \
- (compress) == ZIO_COMPRESS_OFF)
-
-#define ZIO_FAILURE_MODE_WAIT 0
-#define ZIO_FAILURE_MODE_CONTINUE 1
-#define ZIO_FAILURE_MODE_PANIC 2
-
-typedef enum zio_suspend_reason {
- ZIO_SUSPEND_NONE = 0,
- ZIO_SUSPEND_IOERR,
- ZIO_SUSPEND_MMP,
-} zio_suspend_reason_t;
-
-enum zio_flag {
- /*
- * Flags inherited by gang, ddt, and vdev children,
- * and that must be equal for two zios to aggregate
- */
- ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
- ZIO_FLAG_IO_REPAIR = 1 << 1,
- ZIO_FLAG_SELF_HEAL = 1 << 2,
- ZIO_FLAG_RESILVER = 1 << 3,
- ZIO_FLAG_SCRUB = 1 << 4,
- ZIO_FLAG_SCAN_THREAD = 1 << 5,
- ZIO_FLAG_PHYSICAL = 1 << 6,
-
-#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
-
- /*
- * Flags inherited by ddt, gang, and vdev children.
- */
- ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
- ZIO_FLAG_SPECULATIVE = 1 << 8,
- ZIO_FLAG_CONFIG_WRITER = 1 << 9,
- ZIO_FLAG_DONT_RETRY = 1 << 10,
- ZIO_FLAG_DONT_CACHE = 1 << 11,
- ZIO_FLAG_NODATA = 1 << 12,
- ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
- ZIO_FLAG_IO_ALLOCATING = 1 << 14,
-
-#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
-#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
-
- /*
- * Flags inherited by vdev children.
- */
- ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
- ZIO_FLAG_PROBE = 1 << 16,
- ZIO_FLAG_TRYHARD = 1 << 17,
- ZIO_FLAG_OPTIONAL = 1 << 18,
-
-#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
-
- /*
- * Flags not inherited by any children.
- */
- ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
- ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
- ZIO_FLAG_IO_BYPASS = 1 << 21,
- ZIO_FLAG_IO_REWRITE = 1 << 22,
- ZIO_FLAG_RAW = 1 << 23,
- ZIO_FLAG_GANG_CHILD = 1 << 24,
- ZIO_FLAG_DDT_CHILD = 1 << 25,
- ZIO_FLAG_GODFATHER = 1 << 26,
- ZIO_FLAG_NOPWRITE = 1 << 27,
- ZIO_FLAG_REEXECUTED = 1 << 28,
- ZIO_FLAG_DELEGATED = 1 << 29,
-};
-
-#define ZIO_FLAG_MUSTSUCCEED 0
-
-#define ZIO_DDT_CHILD_FLAGS(zio) \
- (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
- ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
-
-#define ZIO_GANG_CHILD_FLAGS(zio) \
- (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
- ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
-
-#define ZIO_VDEV_CHILD_FLAGS(zio) \
- (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
- ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
-
-#define ZIO_CHILD_BIT(x) (1 << (x))
-#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
-
-enum zio_child {
- ZIO_CHILD_VDEV = 0,
- ZIO_CHILD_GANG,
- ZIO_CHILD_DDT,
- ZIO_CHILD_LOGICAL,
- ZIO_CHILD_TYPES
-};
-
-#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
-#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
-#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
-#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
-#define ZIO_CHILD_ALL_BITS \
- (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
- ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
-
-enum zio_wait_type {
- ZIO_WAIT_READY = 0,
- ZIO_WAIT_DONE,
- ZIO_WAIT_TYPES
-};
-
-/*
- * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD
- * equivalents. This gives us more useful error messages from strerror(3).
- */
-#define ECKSUM EINTEGRITY
-#define EFRAGS ENOSPC
-
-typedef void zio_done_func_t(zio_t *zio);
-
-extern boolean_t zio_dva_throttle_enabled;
-extern const char *zio_type_name[ZIO_TYPES];
-
-/*
- * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
- * identifies any block in the pool. By convention, the meta-objset (MOS)
- * is objset 0, and the meta-dnode is object 0. This covers all blocks
- * except root blocks and ZIL blocks, which are defined as follows:
- *
- * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
- * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
- * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
- * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
- *
- * Note: this structure is called a bookmark because its original purpose
- * was to remember where to resume a pool-wide traverse.
- *
- * Note: this structure is passed between userland and the kernel, and is
- * stored on disk (by virtue of being incorporated into other on-disk
- * structures, e.g. dsl_scan_phys_t).
- */
-typedef struct zbookmark_phys {
- uint64_t zb_objset;
- uint64_t zb_object;
- int64_t zb_level;
- uint64_t zb_blkid;
-} zbookmark_phys_t;
-
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-#define ZB_DESTROYED_OBJSET (-1ULL)
-
-#define ZB_ROOT_OBJECT (0ULL)
-#define ZB_ROOT_LEVEL (-1LL)
-#define ZB_ROOT_BLKID (0ULL)
-
-#define ZB_ZIL_OBJECT (0ULL)
-#define ZB_ZIL_LEVEL (-2LL)
-
-#define ZB_DNODE_LEVEL (-3LL)
-#define ZB_DNODE_BLKID (0ULL)
-
-#define ZB_IS_ZERO(zb) \
- ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
- (zb)->zb_level == 0 && (zb)->zb_blkid == 0)
-#define ZB_IS_ROOT(zb) \
- ((zb)->zb_object == ZB_ROOT_OBJECT && \
- (zb)->zb_level == ZB_ROOT_LEVEL && \
- (zb)->zb_blkid == ZB_ROOT_BLKID)
-
-typedef struct zio_prop {
- enum zio_checksum zp_checksum;
- enum zio_compress zp_compress;
- dmu_object_type_t zp_type;
- uint8_t zp_level;
- uint8_t zp_copies;
- boolean_t zp_dedup;
- boolean_t zp_dedup_verify;
- boolean_t zp_nopwrite;
- uint32_t zp_zpl_smallblk;
-} zio_prop_t;
-
-typedef struct zio_cksum_report zio_cksum_report_t;
-
-typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
- const void *good_data);
-typedef void zio_cksum_free_f(void *cbdata, size_t size);
-
-struct zio_bad_cksum; /* defined in zio_checksum.h */
-struct dnode_phys;
-struct abd;
-
-struct zio_cksum_report {
- struct zio_cksum_report *zcr_next;
- nvlist_t *zcr_ereport;
- nvlist_t *zcr_detector;
- void *zcr_cbdata;
- size_t zcr_cbinfo; /* passed to zcr_free() */
- uint64_t zcr_align;
- uint64_t zcr_length;
- zio_cksum_finish_f *zcr_finish;
- zio_cksum_free_f *zcr_free;
-
- /* internal use only */
- struct zio_bad_cksum *zcr_ckinfo; /* information from failure */
-};
-
-typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
- void *arg);
-
-zio_vsd_cksum_report_f zio_vsd_default_cksum_report;
-
-typedef struct zio_vsd_ops {
- zio_done_func_t *vsd_free;
- zio_vsd_cksum_report_f *vsd_cksum_report;
-} zio_vsd_ops_t;
-
-typedef struct zio_gang_node {
- zio_gbh_phys_t *gn_gbh;
- struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
-} zio_gang_node_t;
-
-typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
- zio_gang_node_t *gn, struct abd *data, uint64_t offset);
-
-typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
-
-typedef struct zio_transform {
- struct abd *zt_orig_abd;
- uint64_t zt_orig_size;
- uint64_t zt_bufsize;
- zio_transform_func_t *zt_transform;
- struct zio_transform *zt_next;
-} zio_transform_t;
-
-typedef zio_t *zio_pipe_stage_t(zio_t *zio);
-
-/*
- * The io_reexecute flags are distinct from io_flags because the child must
- * be able to propagate them to the parent. The normal io_flags are local
- * to the zio, not protected by any lock, and not modifiable by children;
- * the reexecute flags are protected by io_lock, modifiable by children,
- * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
- */
-#define ZIO_REEXECUTE_NOW 0x01
-#define ZIO_REEXECUTE_SUSPEND 0x02
-
-typedef struct zio_alloc_list {
- list_t zal_list;
- uint64_t zal_size;
-} zio_alloc_list_t;
-
-typedef struct zio_link {
- zio_t *zl_parent;
- zio_t *zl_child;
- list_node_t zl_parent_node;
- list_node_t zl_child_node;
-} zio_link_t;
-
-/*
- * Used for TRIM kstat.
- */
-typedef struct zio_trim_stats {
- /*
- * Number of bytes successfully TRIMmed.
- */
- kstat_named_t bytes;
-
- /*
- * Number of successful TRIM requests.
- */
- kstat_named_t success;
-
- /*
- * Number of TRIM requests that failed because TRIM is not
- * supported.
- */
- kstat_named_t unsupported;
-
- /*
- * Number of TRIM requests that failed for other reasons.
- */
- kstat_named_t failed;
-} zio_trim_stats_t;
-
-extern zio_trim_stats_t zio_trim_stats;
-
-#define ZIO_TRIM_STAT_INCR(stat, val) \
- atomic_add_64(&zio_trim_stats.stat.value.ui64, (val));
-#define ZIO_TRIM_STAT_BUMP(stat) \
- ZIO_TRIM_STAT_INCR(stat, 1);
-
-struct zio {
- /* Core information about this I/O */
- zbookmark_phys_t io_bookmark;
- zio_prop_t io_prop;
- zio_type_t io_type;
- enum zio_child io_child_type;
- int io_cmd;
- zio_priority_t io_priority;
- uint8_t io_reexecute;
- uint8_t io_state[ZIO_WAIT_TYPES];
- uint64_t io_txg;
- spa_t *io_spa;
- blkptr_t *io_bp;
- blkptr_t *io_bp_override;
- blkptr_t io_bp_copy;
- list_t io_parent_list;
- list_t io_child_list;
- zio_t *io_logical;
- zio_transform_t *io_transform_stack;
-
- /* Callback info */
- zio_done_func_t *io_ready;
- zio_done_func_t *io_children_ready;
- zio_done_func_t *io_physdone;
- zio_done_func_t *io_done;
- void *io_private;
- int64_t io_prev_space_delta; /* DMU private */
- blkptr_t io_bp_orig;
-
- /* Data represented by this I/O */
- struct abd *io_abd;
- struct abd *io_orig_abd;
- uint64_t io_size;
- uint64_t io_orig_size;
- /* io_lsize != io_orig_size iff this is a raw write */
- uint64_t io_lsize;
-
- /* Stuff for the vdev stack */
- vdev_t *io_vd;
- void *io_vsd;
- const zio_vsd_ops_t *io_vsd_ops;
- metaslab_class_t *io_metaslab_class; /* dva throttle class */
-
- uint64_t io_offset;
- hrtime_t io_timestamp;
- hrtime_t io_queued_timestamp;
- hrtime_t io_target_timestamp;
- avl_node_t io_queue_node;
- avl_node_t io_offset_node;
- avl_node_t io_alloc_node;
- zio_alloc_list_t io_alloc_list;
-
-#ifdef __FreeBSD__
- struct bio *io_bio;
-#ifdef _KERNEL
- struct callout io_timer;
-#endif
-#endif
-
- /* Internal pipeline state */
- enum zio_flag io_flags;
- enum zio_stage io_stage;
- enum zio_stage io_pipeline;
- enum zio_flag io_orig_flags;
- enum zio_stage io_orig_stage;
- enum zio_stage io_orig_pipeline;
- enum zio_stage io_pipeline_trace;
- int io_error;
- int io_child_error[ZIO_CHILD_TYPES];
- uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
- uint64_t io_child_count;
- uint64_t io_phys_children;
- uint64_t io_parent_count;
- uint64_t *io_stall;
- zio_t *io_gang_leader;
- zio_gang_node_t *io_gang_tree;
- void *io_executor;
- void *io_waiter;
- kmutex_t io_lock;
- kcondvar_t io_cv;
- int io_allocator;
-
- /* FMA state */
- zio_cksum_report_t *io_cksum_report;
- uint64_t io_ena;
-
- /* Taskq dispatching state */
- taskq_ent_t io_tqent;
-
- avl_node_t io_trim_node;
- list_node_t io_trim_link;
-};
-
-extern int zio_bookmark_compare(const void *, const void *);
-
-extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
- zio_done_func_t *done, void *priv, enum zio_flag flags);
-
-extern zio_t *zio_root(spa_t *spa,
- zio_done_func_t *done, void *priv, enum zio_flag flags);
-
-extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
- zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
-
-extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
- zio_done_func_t *ready, zio_done_func_t *children_ready,
- zio_done_func_t *physdone, zio_done_func_t *done,
- void *priv, zio_priority_t priority, enum zio_flag flags,
- const zbookmark_phys_t *zb);
-
-extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
- zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
-
-extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
- boolean_t nopwrite);
-
-extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
-
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
- const blkptr_t *bp,
- zio_done_func_t *done, void *priv, enum zio_flag flags);
-
-extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
- zio_priority_t priority, enum zio_flag flags);
-
-extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, struct abd *data, int checksum,
- zio_done_func_t *done, void *priv, zio_priority_t priority,
- enum zio_flag flags, boolean_t labels);
-
-extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, struct abd *data, int checksum,
- zio_done_func_t *done, void *priv, zio_priority_t priority,
- enum zio_flag flags, boolean_t labels);
-
-extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
- const blkptr_t *bp, uint64_t size, enum zio_flag flags);
-
-extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg,
- blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog);
-extern void zio_flush(zio_t *zio, vdev_t *vd);
-extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
- uint64_t size);
-extern void zio_shrink(zio_t *zio, uint64_t size);
-
-extern int zio_wait(zio_t *zio);
-extern void zio_nowait(zio_t *zio);
-extern void zio_execute(zio_t *zio);
-extern void zio_interrupt(zio_t *zio);
-extern void zio_delay_init(zio_t *zio);
-extern void zio_delay_interrupt(zio_t *zio);
-
-extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
-extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
-extern zio_t *zio_unique_parent(zio_t *cio);
-extern void zio_add_child(zio_t *pio, zio_t *cio);
-
-extern void *zio_buf_alloc(size_t size);
-extern void zio_buf_free(void *buf, size_t size);
-extern void *zio_data_buf_alloc(size_t size);
-extern void zio_data_buf_free(void *buf, size_t size);
-
-extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
- uint64_t bufsize, zio_transform_func_t *transform);
-extern void zio_pop_transforms(zio_t *zio);
-
-extern void zio_resubmit_stage_async(void *);
-
-extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
- uint64_t offset, struct abd *data, uint64_t size, int type,
- zio_priority_t priority, enum zio_flag flags,
- zio_done_func_t *done, void *priv);
-
-extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
- enum zio_flag flags, zio_done_func_t *done, void *priv);
-
-extern void zio_vdev_io_bypass(zio_t *zio);
-extern void zio_vdev_io_reissue(zio_t *zio);
-extern void zio_vdev_io_redone(zio_t *zio);
-
-extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
-
-extern void zio_checksum_verified(zio_t *zio);
-extern int zio_worst_error(int e1, int e2);
-
-extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
- enum zio_checksum parent);
-extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
- enum zio_checksum child, enum zio_checksum parent);
-extern enum zio_compress zio_compress_select(spa_t *spa,
- enum zio_compress child, enum zio_compress parent);
-
-extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
-extern int zio_resume(spa_t *spa);
-extern void zio_resume_wait(spa_t *spa);
-
-/*
- * Initial setup and teardown.
- */
-extern void zio_init(void);
-extern void zio_fini(void);
-
-/*
- * Fault injection
- */
-struct zinject_record;
-extern uint32_t zio_injection_enabled;
-extern int zio_inject_fault(char *name, int flags, int *id,
- struct zinject_record *record);
-extern int zio_inject_list_next(int *id, char *name, size_t buflen,
- struct zinject_record *record);
-extern int zio_clear_fault(int id);
-extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
-extern int zio_handle_fault_injection(zio_t *zio, int error);
-extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
-extern int zio_handle_label_injection(zio_t *zio, int error);
-extern void zio_handle_ignored_writes(zio_t *zio);
-extern hrtime_t zio_handle_io_delay(zio_t *zio);
-
-/*
- * Checksum ereport functions
- */
-extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
- uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
-extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
- const void *good_data, const void *bad_data, boolean_t drop_if_identical);
-
-extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
-extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
-
-/* If we have the good data in hand, this function can be used */
-extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t offset, uint64_t length,
- const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
-
-/* Called from spa_sync(), but primarily an injection handler */
-extern void spa_handle_ignored_writes(spa_t *spa);
-
-/* zbookmark_phys functions */
-boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
- const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
-int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
- uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZIO_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -1,119 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
- * Copyright Saso Kiselkov 2013, All rights reserved.
- */
-
-#ifndef _SYS_ZIO_CHECKSUM_H
-#define _SYS_ZIO_CHECKSUM_H
-
-#include <sys/zio.h>
-#include <zfeature_common.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct abd;
-
-/*
- * Signature for checksum functions.
- */
-typedef void zio_checksum_t(struct abd *, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp);
-typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
-typedef void zio_checksum_tmpl_free_t(void *ctx_template);
-
-typedef enum zio_checksum_flags {
- /* Strong enough for metadata? */
- ZCHECKSUM_FLAG_METADATA = (1 << 1),
- /* ZIO embedded checksum */
- ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
- /* Strong enough for dedup (without verification)? */
- ZCHECKSUM_FLAG_DEDUP = (1 << 3),
- /* Uses salt value */
- ZCHECKSUM_FLAG_SALTED = (1 << 4),
- /* Strong enough for nopwrite? */
- ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
-} zio_checksum_flags_t;
-
-/*
- * Information about each checksum function.
- */
-typedef struct zio_checksum_info {
- /* checksum function for each byteorder */
- zio_checksum_t *ci_func[2];
- zio_checksum_tmpl_init_t *ci_tmpl_init;
- zio_checksum_tmpl_free_t *ci_tmpl_free;
- zio_checksum_flags_t ci_flags;
- char *ci_name; /* descriptive name */
-} zio_checksum_info_t;
-
-typedef struct zio_bad_cksum {
- zio_cksum_t zbc_expected;
- zio_cksum_t zbc_actual;
- const char *zbc_checksum_name;
- uint8_t zbc_byteswapped;
- uint8_t zbc_injected;
- uint8_t zbc_has_cksum; /* expected/actual valid */
-} zio_bad_cksum_t;
-
-extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
-
-/*
- * Checksum routines.
- */
-extern zio_checksum_t abd_checksum_SHA256;
-extern zio_checksum_t abd_checksum_SHA512_native;
-extern zio_checksum_t abd_checksum_SHA512_byteswap;
-
-/* Skein */
-extern zio_checksum_t abd_checksum_skein_native;
-extern zio_checksum_t abd_checksum_skein_byteswap;
-extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init;
-extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free;
-
-#ifdef illumos
-/* Edon-R */
-extern zio_checksum_t abd_checksum_edonr_native;
-extern zio_checksum_t abd_checksum_edonr_byteswap;
-extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
-extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
-#endif
-
-extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
- void *, uint64_t, uint64_t, zio_bad_cksum_t *);
-extern void zio_checksum_compute(zio_t *, enum zio_checksum,
- struct abd *, uint64_t);
-extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
- struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *);
-extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
-extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
-extern void zio_checksum_templates_free(spa_t *spa);
-extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIO_CHECKSUM_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
@@ -1,128 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZIO_COMPRESS_H
-#define _SYS_ZIO_COMPRESS_H
-
-#include <sys/abd.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-enum zio_compress {
- ZIO_COMPRESS_INHERIT = 0,
- ZIO_COMPRESS_ON,
- ZIO_COMPRESS_OFF,
- ZIO_COMPRESS_LZJB,
- ZIO_COMPRESS_EMPTY,
- ZIO_COMPRESS_GZIP_1,
- ZIO_COMPRESS_GZIP_2,
- ZIO_COMPRESS_GZIP_3,
- ZIO_COMPRESS_GZIP_4,
- ZIO_COMPRESS_GZIP_5,
- ZIO_COMPRESS_GZIP_6,
- ZIO_COMPRESS_GZIP_7,
- ZIO_COMPRESS_GZIP_8,
- ZIO_COMPRESS_GZIP_9,
- ZIO_COMPRESS_ZLE,
- ZIO_COMPRESS_LZ4,
- ZIO_COMPRESS_FUNCTIONS
-};
-
-/* Common signature for all zio compress functions. */
-typedef size_t zio_compress_func_t(void *src, void *dst,
- size_t s_len, size_t d_len, int);
-/* Common signature for all zio decompress functions. */
-typedef int zio_decompress_func_t(void *src, void *dst,
- size_t s_len, size_t d_len, int);
-/*
- * Common signature for all zio decompress functions using an ABD as input.
- * This is helpful if you have both compressed ARC and scatter ABDs enabled,
- * but is not a requirement for all compression algorithms.
- */
-typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
- size_t s_len, size_t d_len, int);
-
-/*
- * Information about each compression function.
- */
-typedef struct zio_compress_info {
- char *ci_name;
- int ci_level;
- zio_compress_func_t *ci_compress;
- zio_decompress_func_t *ci_decompress;
-} zio_compress_info_t;
-
-extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
-
-/*
- * Compression routines.
- */
-extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern void lz4_init(void);
-extern void lz4_fini(void);
-extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-
-/*
- * Compress and decompress data if necessary.
- */
-extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
- size_t s_len);
-extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
- size_t s_len, size_t d_len);
-extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
- size_t s_len, size_t d_len);
-
-/*
- * Module lifetime management.
- */
-extern void zio_compress_init(void);
-extern void zio_compress_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIO_COMPRESS_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -1,256 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _ZIO_IMPL_H
-#define _ZIO_IMPL_H
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
- *
- * The ZFS I/O pipeline is comprised of various stages which are defined
- * in the zio_stage enum below. The individual stages are used to construct
- * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
- *
- * I/O operations: (XXX - provide detail for each of the operations)
- *
- * Read:
- * Write:
- * Free:
- * Claim:
- * Ioctl:
- *
- * Although the most common pipeline are used by the basic I/O operations
- * above, there are some helper pipelines (one could consider them
- * sub-pipelines) which are used internally by the ZIO module and are
- * explained below:
- *
- * Interlock Pipeline:
- * The interlock pipeline is the most basic pipeline and is used by all
- * of the I/O operations. The interlock pipeline does not perform any I/O
- * and is used to coordinate the dependencies between I/Os that are being
- * issued (i.e. the parent/child relationship).
- *
- * Vdev child Pipeline:
- * The vdev child pipeline is responsible for performing the physical I/O.
- * It is in this pipeline where the I/O are queued and possibly cached.
- *
- * In addition to performing I/O, the pipeline is also responsible for
- * data transformations. The transformations performed are based on the
- * specific properties that user may have selected and modify the
- * behavior of the pipeline. Examples of supported transformations are
- * compression, dedup, and nop writes. Transformations will either modify
- * the data or the pipeline. This list below further describes each of
- * the supported transformations:
- *
- * Compression:
- * ZFS supports three different flavors of compression -- gzip, lzjb, and
- * zle. Compression occurs as part of the write pipeline and is performed
- * in the ZIO_STAGE_WRITE_BP_INIT stage.
- *
- * Dedup:
- * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
- * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
- * read pipeline if the dedup bit is set on the block pointer.
- * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage
- * and added to a write pipeline if a user has enabled dedup on that
- * particular dataset.
- *
- * NOP Write:
- * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
- * and is added to an existing write pipeline if a crypographically
- * secure checksum (i.e. SHA256) is enabled and compression is turned on.
- * The NOP write stage will compare the checksums of the current data
- * on-disk (level-0 blocks only) and the data that is currently being written.
- * If the checksum values are identical then the pipeline is converted to
- * an interlock pipeline skipping block allocation and bypassing the
- * physical I/O. The nop write feature can handle writes in either
- * syncing or open context (i.e. zil writes) and as a result is mutually
- * exclusive with dedup.
- */
-
-/*
- * zio pipeline stage definitions
- */
-enum zio_stage {
- ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
-
- ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
- ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
- ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
- ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
- ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
-
- ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
-
- ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
-
- ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
- ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
- ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
- ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
-
- ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
- ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
-
- ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
- ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
- ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
- ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
-
- ZIO_STAGE_READY = 1 << 18, /* RWFCI */
-
- ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */
- ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */
- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */
-
- ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
-
- ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
-};
-
-#define ZIO_INTERLOCK_STAGES \
- (ZIO_STAGE_READY | \
- ZIO_STAGE_DONE)
-
-#define ZIO_INTERLOCK_PIPELINE \
- ZIO_INTERLOCK_STAGES
-
-#define ZIO_VDEV_IO_STAGES \
- (ZIO_STAGE_VDEV_IO_START | \
- ZIO_STAGE_VDEV_IO_DONE | \
- ZIO_STAGE_VDEV_IO_ASSESS)
-
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_VDEV_IO_STAGES | \
- ZIO_STAGE_DONE)
-
-#define ZIO_READ_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- ZIO_STAGE_CHECKSUM_VERIFY)
-
-#define ZIO_READ_PHYS_PIPELINE \
- ZIO_READ_COMMON_STAGES
-
-#define ZIO_READ_PIPELINE \
- (ZIO_READ_COMMON_STAGES | \
- ZIO_STAGE_READ_BP_INIT)
-
-#define ZIO_DDT_CHILD_READ_PIPELINE \
- ZIO_READ_COMMON_STAGES
-
-#define ZIO_DDT_READ_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_READ_BP_INIT | \
- ZIO_STAGE_DDT_READ_START | \
- ZIO_STAGE_DDT_READ_DONE)
-
-#define ZIO_WRITE_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- ZIO_STAGE_ISSUE_ASYNC | \
- ZIO_STAGE_CHECKSUM_GENERATE)
-
-#define ZIO_WRITE_PHYS_PIPELINE \
- ZIO_WRITE_COMMON_STAGES
-
-#define ZIO_REWRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- ZIO_STAGE_WRITE_COMPRESS | \
- ZIO_STAGE_WRITE_BP_INIT)
-
-#define ZIO_WRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- ZIO_STAGE_WRITE_BP_INIT | \
- ZIO_STAGE_WRITE_COMPRESS | \
- ZIO_STAGE_DVA_THROTTLE | \
- ZIO_STAGE_DVA_ALLOCATE)
-
-#define ZIO_DDT_CHILD_WRITE_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- ZIO_STAGE_DVA_THROTTLE | \
- ZIO_STAGE_DVA_ALLOCATE)
-
-#define ZIO_DDT_WRITE_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_WRITE_BP_INIT | \
- ZIO_STAGE_ISSUE_ASYNC | \
- ZIO_STAGE_WRITE_COMPRESS | \
- ZIO_STAGE_CHECKSUM_GENERATE | \
- ZIO_STAGE_DDT_WRITE)
-
-#define ZIO_GANG_STAGES \
- (ZIO_STAGE_GANG_ASSEMBLE | \
- ZIO_STAGE_GANG_ISSUE)
-
-#define ZIO_FREE_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_FREE_BP_INIT | \
- ZIO_STAGE_DVA_FREE)
-
-#define ZIO_FREE_PHYS_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES)
-
-#define ZIO_DDT_FREE_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_FREE_BP_INIT | \
- ZIO_STAGE_ISSUE_ASYNC | \
- ZIO_STAGE_DDT_FREE)
-
-#define ZIO_CLAIM_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_DVA_CLAIM)
-
-#define ZIO_IOCTL_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_VDEV_IO_START | \
- ZIO_STAGE_VDEV_IO_ASSESS)
-
-#define ZIO_BLOCKING_STAGES \
- (ZIO_STAGE_DVA_ALLOCATE | \
- ZIO_STAGE_DVA_CLAIM | \
- ZIO_STAGE_VDEV_IO_START)
-
-extern void zio_inject_init(void);
-extern void zio_inject_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZIO_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
@@ -1,43 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
- */
-#ifndef _ZIO_PRIORITY_H
-#define _ZIO_PRIORITY_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum zio_priority {
- ZIO_PRIORITY_SYNC_READ,
- ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
- ZIO_PRIORITY_ASYNC_READ, /* prefetch */
- ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
- ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
- ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
- ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
- ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
- ZIO_PRIORITY_NUM_QUEUEABLE,
-
- ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
-} zio_priority_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZIO_PRIORITY_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZRLOCK_H
-#define _SYS_ZRLOCK_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct zrlock {
- kmutex_t zr_mtx;
- volatile int32_t zr_refcount;
- kcondvar_t zr_cv;
- uint16_t zr_pad;
-#ifdef ZFS_DEBUG
- kthread_t *zr_owner;
- const char *zr_caller;
-#endif
-} zrlock_t;
-
-extern void zrl_init(zrlock_t *);
-extern void zrl_destroy(zrlock_t *);
-#define zrl_add(_z) zrl_add_impl((_z), __func__)
-extern void zrl_add_impl(zrlock_t *, const char *);
-extern void zrl_remove(zrlock_t *);
-extern int zrl_tryenter(zrlock_t *);
-extern void zrl_exit(zrlock_t *);
-extern int zrl_is_zero(zrlock_t *);
-extern int zrl_is_locked(zrlock_t *);
-#ifdef ZFS_DEBUG
-extern kthread_t *zrl_owner(zrlock_t *);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZRLOCK_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h
@@ -1,39 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_ZTHR_H
-#define _SYS_ZTHR_H
-
-typedef struct zthr zthr_t;
-typedef void (zthr_func_t)(void *, zthr_t *);
-typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *);
-
-extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc,
- zthr_func_t *func, void *arg);
-extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc,
- zthr_func_t *func, void *arg, hrtime_t nano_wait);
-extern void zthr_destroy(zthr_t *t);
-
-extern void zthr_wakeup(zthr_t *t);
-extern void zthr_cancel(zthr_t *t);
-extern void zthr_resume(zthr_t *t);
-
-extern boolean_t zthr_iscancelled(zthr_t *t);
-
-#endif /* _SYS_ZTHR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
@@ -1,85 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- */
-
-#ifndef _SYS_ZVOL_H
-#define _SYS_ZVOL_H
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZVOL_OBJ 1ULL
-#define ZVOL_ZAP_OBJ 2ULL
-
-#ifdef _KERNEL
-extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
-extern int zvol_check_volblocksize(uint64_t volblocksize);
-extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
-extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-extern int zvol_set_volsize(const char *, uint64_t);
-
-#ifdef illumos
-extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
-extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
-extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
-extern int zvol_strategy(buf_t *bp);
-extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
-extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
-extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
-extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
-#endif /* illumos */
-extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
- int *rvalp);
-extern int zvol_busy(void);
-extern void zvol_init(void);
-extern void zvol_fini(void);
-
-#ifdef illumos
-extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
- uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
- void **rl_hdl, void **bonus_hdl);
-extern uint64_t zvol_get_volume_size(void *minor_hdl);
-extern int zvol_get_volume_wce(void *minor_hdl);
-extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
- ssize_t resid, boolean_t sync);
-#endif /* illumos */
-
-#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
-extern void zvol_create_minors(spa_t *spa, const char *name);
-extern void zvol_remove_minors(spa_t *spa, const char *name);
-extern void zvol_rename_minors(spa_t *spa, const char *oldname,
- const char *newname);
-#endif
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZVOL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
@@ -1,634 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/trim_map.h>
-#include <sys/time.h>
-
-/*
- * Calculate the zio end, upgrading based on ashift which would be
- * done by zio_vdev_io_start.
- *
- * This makes free range consolidation much more effective
- * than it would otherwise be as well as ensuring that entire
- * blocks are invalidated by writes.
- */
-#define TRIM_ZIO_END(vd, offset, size) (offset + \
- P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
-
-/* Maximal segment size for ATA TRIM. */
-#define TRIM_MAP_SIZE_FACTOR (512 << 16)
-
-#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR)
-
-#define TRIM_MAP_ADD(tm, ts) do { \
- list_insert_tail(&(tm)->tm_head, (ts)); \
- (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
-} while (0)
-
-#define TRIM_MAP_REM(tm, ts) do { \
- list_remove(&(tm)->tm_head, (ts)); \
- (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
-} while (0)
-
-typedef struct trim_map {
- list_t tm_head; /* List of segments sorted by txg. */
- avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */
- avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */
- avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */
- list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
- kmutex_t tm_lock;
- uint64_t tm_pending; /* Count of pending TRIMs. */
-} trim_map_t;
-
-typedef struct trim_seg {
- avl_node_t ts_node; /* AVL node. */
- list_node_t ts_next; /* List element. */
- uint64_t ts_start; /* Starting offset of this segment. */
- uint64_t ts_end; /* Ending offset (non-inclusive). */
- uint64_t ts_txg; /* Segment creation txg. */
- hrtime_t ts_time; /* Segment creation time. */
-} trim_seg_t;
-
-extern boolean_t zfs_trim_enabled;
-
-static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */
-static u_int trim_timeout = 30; /* Keep deleted data up to 30s */
-static u_int trim_max_interval = 1; /* 1s delays between TRIMs */
-static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "ZFS TRIM");
-
-SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
- 0, "Delay TRIMs by up to this many TXGs");
-SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
- "Delay TRIMs by up to this many seconds");
-SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
- &trim_max_interval, 0,
- "Maximum interval between TRIM queue processing (seconds)");
-
-SYSCTL_DECL(_vfs_zfs_vdev);
-SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
- &trim_vdev_max_pending, 0,
- "Maximum pending TRIM segments for a vdev");
-
-static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
-
-static int
-trim_map_seg_compare(const void *x1, const void *x2)
-{
- const trim_seg_t *s1 = x1;
- const trim_seg_t *s2 = x2;
-
- if (s1->ts_start < s2->ts_start) {
- if (s1->ts_end > s2->ts_start)
- return (0);
- return (-1);
- }
- if (s1->ts_start > s2->ts_start) {
- if (s1->ts_start < s2->ts_end)
- return (0);
- return (1);
- }
- return (0);
-}
-
-static int
-trim_map_zio_compare(const void *x1, const void *x2)
-{
- const zio_t *z1 = x1;
- const zio_t *z2 = x2;
-
- if (z1->io_offset < z2->io_offset) {
- if (z1->io_offset + z1->io_size > z2->io_offset)
- return (0);
- return (-1);
- }
- if (z1->io_offset > z2->io_offset) {
- if (z1->io_offset < z2->io_offset + z2->io_size)
- return (0);
- return (1);
- }
- return (0);
-}
-
-void
-trim_map_create(vdev_t *vd)
-{
- trim_map_t *tm;
-
- ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
- vd->vdev_ops->vdev_op_leaf);
-
- tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
- mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&tm->tm_head, sizeof (trim_seg_t),
- offsetof(trim_seg_t, ts_next));
- list_create(&tm->tm_pending_writes, sizeof (zio_t),
- offsetof(zio_t, io_trim_link));
- avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
- sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
- avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
- sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
- avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
- sizeof (zio_t), offsetof(zio_t, io_trim_node));
- vd->vdev_trimmap = tm;
-}
-
-void
-trim_map_destroy(vdev_t *vd)
-{
- trim_map_t *tm;
- trim_seg_t *ts;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- if (!zfs_trim_enabled)
- return;
-
- tm = vd->vdev_trimmap;
- if (tm == NULL)
- return;
-
- /*
- * We may have been called before trim_map_vdev_commit_done()
- * had a chance to run, so do it now to prune the remaining
- * inflight frees.
- */
- trim_map_vdev_commit_done(vd->vdev_spa, vd);
-
- mutex_enter(&tm->tm_lock);
- while ((ts = list_head(&tm->tm_head)) != NULL) {
- avl_remove(&tm->tm_queued_frees, ts);
- TRIM_MAP_REM(tm, ts);
- kmem_free(ts, sizeof (*ts));
- }
- mutex_exit(&tm->tm_lock);
-
- avl_destroy(&tm->tm_queued_frees);
- avl_destroy(&tm->tm_inflight_frees);
- avl_destroy(&tm->tm_inflight_writes);
- list_destroy(&tm->tm_pending_writes);
- list_destroy(&tm->tm_head);
- mutex_destroy(&tm->tm_lock);
- kmem_free(tm, sizeof (*tm));
- vd->vdev_trimmap = NULL;
-}
-
-static void
-trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
-{
- avl_index_t where;
- trim_seg_t tsearch, *ts_before, *ts_after, *ts;
- boolean_t merge_before, merge_after;
- hrtime_t time;
-
- ASSERT(MUTEX_HELD(&tm->tm_lock));
- VERIFY(start < end);
-
- time = gethrtime();
- tsearch.ts_start = start;
- tsearch.ts_end = end;
-
- ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
- if (ts != NULL) {
- if (start < ts->ts_start)
- trim_map_segment_add(tm, start, ts->ts_start, txg);
- if (end > ts->ts_end)
- trim_map_segment_add(tm, ts->ts_end, end, txg);
- return;
- }
-
- ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
- ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
-
- merge_before = (ts_before != NULL && ts_before->ts_end == start);
- merge_after = (ts_after != NULL && ts_after->ts_start == end);
-
- if (merge_before && merge_after) {
- avl_remove(&tm->tm_queued_frees, ts_before);
- TRIM_MAP_REM(tm, ts_before);
- TRIM_MAP_REM(tm, ts_after);
- ts_after->ts_start = ts_before->ts_start;
- ts_after->ts_txg = txg;
- ts_after->ts_time = time;
- TRIM_MAP_ADD(tm, ts_after);
- kmem_free(ts_before, sizeof (*ts_before));
- } else if (merge_before) {
- TRIM_MAP_REM(tm, ts_before);
- ts_before->ts_end = end;
- ts_before->ts_txg = txg;
- ts_before->ts_time = time;
- TRIM_MAP_ADD(tm, ts_before);
- } else if (merge_after) {
- TRIM_MAP_REM(tm, ts_after);
- ts_after->ts_start = start;
- ts_after->ts_txg = txg;
- ts_after->ts_time = time;
- TRIM_MAP_ADD(tm, ts_after);
- } else {
- ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
- ts->ts_start = start;
- ts->ts_end = end;
- ts->ts_txg = txg;
- ts->ts_time = time;
- avl_insert(&tm->tm_queued_frees, ts, where);
- TRIM_MAP_ADD(tm, ts);
- }
-}
-
-static void
-trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
- uint64_t end)
-{
- trim_seg_t *nts;
- boolean_t left_over, right_over;
-
- ASSERT(MUTEX_HELD(&tm->tm_lock));
-
- left_over = (ts->ts_start < start);
- right_over = (ts->ts_end > end);
-
- TRIM_MAP_REM(tm, ts);
- if (left_over && right_over) {
- nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
- nts->ts_start = end;
- nts->ts_end = ts->ts_end;
- nts->ts_txg = ts->ts_txg;
- nts->ts_time = ts->ts_time;
- ts->ts_end = start;
- avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
- TRIM_MAP_ADD(tm, ts);
- TRIM_MAP_ADD(tm, nts);
- } else if (left_over) {
- ts->ts_end = start;
- TRIM_MAP_ADD(tm, ts);
- } else if (right_over) {
- ts->ts_start = end;
- TRIM_MAP_ADD(tm, ts);
- } else {
- avl_remove(&tm->tm_queued_frees, ts);
- kmem_free(ts, sizeof (*ts));
- }
-}
-
-static void
-trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
-{
- zio_t zsearch, *zs;
-
- ASSERT(MUTEX_HELD(&tm->tm_lock));
-
- zsearch.io_offset = start;
- zsearch.io_size = end - start;
-
- zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
- if (zs == NULL) {
- trim_map_segment_add(tm, start, end, txg);
- return;
- }
- if (start < zs->io_offset)
- trim_map_free_locked(tm, start, zs->io_offset, txg);
- if (zs->io_offset + zs->io_size < end)
- trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
-}
-
-void
-trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
-{
- trim_map_t *tm = vd->vdev_trimmap;
-
- if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
- return;
-
- mutex_enter(&tm->tm_lock);
- trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
- mutex_exit(&tm->tm_lock);
-}
-
-boolean_t
-trim_map_write_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- trim_map_t *tm = vd->vdev_trimmap;
- trim_seg_t tsearch, *ts;
- boolean_t left_over, right_over;
- uint64_t start, end;
-
- if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
- return (B_TRUE);
-
- start = zio->io_offset;
- end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
- tsearch.ts_start = start;
- tsearch.ts_end = end;
-
- mutex_enter(&tm->tm_lock);
-
- /*
- * Checking for colliding in-flight frees.
- */
- ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
- if (ts != NULL) {
- list_insert_tail(&tm->tm_pending_writes, zio);
- mutex_exit(&tm->tm_lock);
- return (B_FALSE);
- }
-
- /*
- * Loop until all overlapping segments are removed.
- */
- while ((ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL)) != NULL) {
- trim_map_segment_remove(tm, ts, start, end);
- }
-
- avl_add(&tm->tm_inflight_writes, zio);
-
- mutex_exit(&tm->tm_lock);
-
- return (B_TRUE);
-}
-
-void
-trim_map_write_done(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- trim_map_t *tm = vd->vdev_trimmap;
-
- /*
- * Don't check for vdev_notrim, since the write could have
- * started before vdev_notrim was set.
- */
- if (!zfs_trim_enabled || tm == NULL)
- return;
-
- mutex_enter(&tm->tm_lock);
- /*
- * Don't fail if the write isn't in the tree, since the write
- * could have started after vdev_notrim was set.
- */
- if (zio->io_trim_node.avl_child[0] ||
- zio->io_trim_node.avl_child[1] ||
- AVL_XPARENT(&zio->io_trim_node) ||
- tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
- avl_remove(&tm->tm_inflight_writes, zio);
- mutex_exit(&tm->tm_lock);
-}
-
-/*
- * Return the oldest segment (the one with the lowest txg / time) or NULL if:
- * 1. The list is empty
- * 2. The first element's txg is greater than txgsafe
- * 3. The first element's txg is not greater than the txg argument and the
- * the first element's time is not greater than time argument
- */
-static trim_seg_t *
-trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
- boolean_t force)
-{
- trim_seg_t *ts;
-
- ASSERT(MUTEX_HELD(&tm->tm_lock));
- VERIFY(txgsafe >= txg);
-
- ts = list_head(&tm->tm_head);
- if (ts != NULL && ts->ts_txg <= txgsafe &&
- (ts->ts_txg <= txg || ts->ts_time <= time || force))
- return (ts);
- return (NULL);
-}
-
-static void
-trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
-{
- trim_map_t *tm = vd->vdev_trimmap;
- trim_seg_t *ts;
- uint64_t size, offset, txgtarget, txgsafe;
- int64_t hard, soft;
- hrtime_t timelimit;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- if (tm == NULL)
- return;
-
- timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
- if (vd->vdev_isl2cache) {
- txgsafe = UINT64_MAX;
- txgtarget = UINT64_MAX;
- } else {
- txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
- if (txgsafe > trim_txg_delay)
- txgtarget = txgsafe - trim_txg_delay;
- else
- txgtarget = 0;
- }
-
- mutex_enter(&tm->tm_lock);
- hard = 0;
- if (tm->tm_pending > trim_vdev_max_pending)
- hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
- soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
- /* Loop until we have sent all outstanding free's */
- while (soft > 0 &&
- (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
- != NULL) {
- TRIM_MAP_REM(tm, ts);
- avl_remove(&tm->tm_queued_frees, ts);
- avl_add(&tm->tm_inflight_frees, ts);
- size = ts->ts_end - ts->ts_start;
- offset = ts->ts_start;
- /*
- * We drop the lock while we call zio_nowait as the IO
- * scheduler can result in a different IO being run e.g.
- * a write which would result in a recursive lock.
- */
- mutex_exit(&tm->tm_lock);
-
- zio_nowait(zio_trim(zio, spa, vd, offset, size));
-
- soft -= TRIM_MAP_SEGS(size);
- hard -= TRIM_MAP_SEGS(size);
- mutex_enter(&tm->tm_lock);
- }
- mutex_exit(&tm->tm_lock);
-}
-
-static void
-trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
-{
- trim_map_t *tm = vd->vdev_trimmap;
- trim_seg_t *ts;
- list_t pending_writes;
- zio_t *zio;
- uint64_t start, size;
- void *cookie;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- if (tm == NULL)
- return;
-
- mutex_enter(&tm->tm_lock);
- if (!avl_is_empty(&tm->tm_inflight_frees)) {
- cookie = NULL;
- while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
- &cookie)) != NULL) {
- kmem_free(ts, sizeof (*ts));
- }
- }
- list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
- io_trim_link));
- list_move_tail(&pending_writes, &tm->tm_pending_writes);
- mutex_exit(&tm->tm_lock);
-
- while ((zio = list_remove_head(&pending_writes)) != NULL) {
- zio_vdev_io_reissue(zio);
- zio_execute(zio);
- }
- list_destroy(&pending_writes);
-}
-
-static void
-trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
-{
- int c;
-
- if (vd == NULL)
- return;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- trim_map_vdev_commit(spa, zio, vd);
- } else {
- for (c = 0; c < vd->vdev_children; c++)
- trim_map_commit(spa, zio, vd->vdev_child[c]);
- }
-}
-
-static void
-trim_map_commit_done(spa_t *spa, vdev_t *vd)
-{
- int c;
-
- if (vd == NULL)
- return;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- trim_map_vdev_commit_done(spa, vd);
- } else {
- for (c = 0; c < vd->vdev_children; c++)
- trim_map_commit_done(spa, vd->vdev_child[c]);
- }
-}
-
-static void
-trim_thread(void *arg)
-{
- spa_t *spa = arg;
- zio_t *zio;
-
-#ifdef _KERNEL
- (void) snprintf(curthread->td_name, sizeof(curthread->td_name),
- "trim %s", spa_name(spa));
-#endif
-
- for (;;) {
- mutex_enter(&spa->spa_trim_lock);
- if (spa->spa_trim_thread == NULL) {
- spa->spa_trim_thread = curthread;
- cv_signal(&spa->spa_trim_cv);
- mutex_exit(&spa->spa_trim_lock);
- thread_exit();
- }
-
- (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
- hz * trim_max_interval);
- mutex_exit(&spa->spa_trim_lock);
-
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- trim_map_commit(spa, zio, spa->spa_root_vdev);
- (void) zio_wait(zio);
- trim_map_commit_done(spa, spa->spa_root_vdev);
- spa_config_exit(spa, SCL_STATE, FTAG);
- }
-}
-
-void
-trim_thread_create(spa_t *spa)
-{
-
- if (!zfs_trim_enabled)
- return;
-
- mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
- mutex_enter(&spa->spa_trim_lock);
- spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
- TS_RUN, minclsyspri);
- mutex_exit(&spa->spa_trim_lock);
-}
-
-void
-trim_thread_destroy(spa_t *spa)
-{
-
- if (!zfs_trim_enabled)
- return;
- if (spa->spa_trim_thread == NULL)
- return;
-
- mutex_enter(&spa->spa_trim_lock);
- /* Setting spa_trim_thread to NULL tells the thread to stop. */
- spa->spa_trim_thread = NULL;
- cv_signal(&spa->spa_trim_cv);
- /* The thread will set it back to != NULL on exit. */
- while (spa->spa_trim_thread == NULL)
- cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
- spa->spa_trim_thread = NULL;
- mutex_exit(&spa->spa_trim_lock);
-
- cv_destroy(&spa->spa_trim_cv);
- mutex_destroy(&spa->spa_trim_lock);
-}
-
-void
-trim_thread_wakeup(spa_t *spa)
-{
-
- if (!zfs_trim_enabled)
- return;
- if (spa->spa_trim_thread == NULL)
- return;
-
- mutex_enter(&spa->spa_trim_lock);
- cv_signal(&spa->spa_trim_cv);
- mutex_exit(&spa->spa_trim_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -1,977 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org>
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/txg_impl.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_scan.h>
-#include <sys/zil.h>
-#include <sys/callb.h>
-
-/*
- * ZFS Transaction Groups
- * ----------------------
- *
- * ZFS transaction groups are, as the name implies, groups of transactions
- * that act on persistent state. ZFS asserts consistency at the granularity of
- * these transaction groups. Each successive transaction group (txg) is
- * assigned a 64-bit consecutive identifier. There are three active
- * transaction group states: open, quiescing, or syncing. At any given time,
- * there may be an active txg associated with each state; each active txg may
- * either be processing, or blocked waiting to enter the next state. There may
- * be up to three active txgs, and there is always a txg in the open state
- * (though it may be blocked waiting to enter the quiescing state). In broad
- * strokes, transactions -- operations that change in-memory structures -- are
- * accepted into the txg in the open state, and are completed while the txg is
- * in the open or quiescing states. The accumulated changes are written to
- * disk in the syncing state.
- *
- * Open
- *
- * When a new txg becomes active, it first enters the open state. New
- * transactions -- updates to in-memory structures -- are assigned to the
- * currently open txg. There is always a txg in the open state so that ZFS can
- * accept new changes (though the txg may refuse new changes if it has hit
- * some limit). ZFS advances the open txg to the next state for a variety of
- * reasons such as it hitting a time or size threshold, or the execution of an
- * administrative action that must be completed in the syncing state.
- *
- * Quiescing
- *
- * After a txg exits the open state, it enters the quiescing state. The
- * quiescing state is intended to provide a buffer between accepting new
- * transactions in the open state and writing them out to stable storage in
- * the syncing state. While quiescing, transactions can continue their
- * operation without delaying either of the other states. Typically, a txg is
- * in the quiescing state very briefly since the operations are bounded by
- * software latencies rather than, say, slower I/O latencies. After all
- * transactions complete, the txg is ready to enter the next state.
- *
- * Syncing
- *
- * In the syncing state, the in-memory state built up during the open and (to
- * a lesser degree) the quiescing states is written to stable storage. The
- * process of writing out modified data can, in turn modify more data. For
- * example when we write new blocks, we need to allocate space for them; those
- * allocations modify metadata (space maps)... which themselves must be
- * written to stable storage. During the sync state, ZFS iterates, writing out
- * data until it converges and all in-memory changes have been written out.
- * The first such pass is the largest as it encompasses all the modified user
- * data (as opposed to filesystem metadata). Subsequent passes typically have
- * far less data to write as they consist exclusively of filesystem metadata.
- *
- * To ensure convergence, after a certain number of passes ZFS begins
- * overwriting locations on stable storage that had been allocated earlier in
- * the syncing state (and subsequently freed). ZFS usually allocates new
- * blocks to optimize for large, continuous, writes. For the syncing state to
- * converge however it must complete a pass where no new blocks are allocated
- * since each allocation requires a modification of persistent metadata.
- * Further, to hasten convergence, after a prescribed number of passes, ZFS
- * also defers frees, and stops compressing.
- *
- * In addition to writing out user data, we must also execute synctasks during
- * the syncing context. A synctask is the mechanism by which some
- * administrative activities work such as creating and destroying snapshots or
- * datasets. Note that when a synctask is initiated it enters the open txg,
- * and ZFS then pushes that txg as quickly as possible to completion of the
- * syncing state in order to reduce the latency of the administrative
- * activity. To complete the syncing state, ZFS writes out a new uberblock,
- * the root of the tree of blocks that comprise all state stored on the ZFS
- * pool. Finally, if there is a quiesced txg waiting, we signal that it can
- * now transition to the syncing state.
- */
-
-static void txg_sync_thread(void *arg);
-static void txg_quiesce_thread(void *arg);
-
-int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS TXG");
-SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0,
- "Maximum seconds worth of delta per txg");
-
-/*
- * Prepare the txg subsystem.
- */
-void
-txg_init(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
- int c;
- bzero(tx, sizeof (tx_state_t));
-
- tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
-
- for (c = 0; c < max_ncpus; c++) {
- int i;
-
- mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
- NULL);
- for (i = 0; i < TXG_SIZE; i++) {
- cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
- NULL);
- list_create(&tx->tx_cpu[c].tc_callbacks[i],
- sizeof (dmu_tx_callback_t),
- offsetof(dmu_tx_callback_t, dcb_node));
- }
- }
-
- mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
-
- cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
-
- tx->tx_open_txg = txg;
-}
-
-/*
- * Close down the txg subsystem.
- */
-void
-txg_fini(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- int c;
-
- ASSERT0(tx->tx_threads);
-
- mutex_destroy(&tx->tx_sync_lock);
-
- cv_destroy(&tx->tx_sync_more_cv);
- cv_destroy(&tx->tx_sync_done_cv);
- cv_destroy(&tx->tx_quiesce_more_cv);
- cv_destroy(&tx->tx_quiesce_done_cv);
- cv_destroy(&tx->tx_exit_cv);
-
- for (c = 0; c < max_ncpus; c++) {
- int i;
-
- mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
- mutex_destroy(&tx->tx_cpu[c].tc_lock);
- for (i = 0; i < TXG_SIZE; i++) {
- cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
- list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
- }
- }
-
- if (tx->tx_commit_cb_taskq != NULL)
- taskq_destroy(tx->tx_commit_cb_taskq);
-
- kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
-
- bzero(tx, sizeof (tx_state_t));
-}
-
-/*
- * Start syncing transaction groups.
- */
-void
-txg_sync_start(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- mutex_enter(&tx->tx_sync_lock);
-
- dprintf("pool %p\n", dp);
-
- ASSERT0(tx->tx_threads);
-
- tx->tx_threads = 2;
-
- tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
- dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri);
-
- /*
- * The sync thread can need a larger-than-default stack size on
- * 32-bit x86. This is due in part to nested pools and
- * scrub_visitbp() recursion.
- */
- tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
- dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri);
-
- mutex_exit(&tx->tx_sync_lock);
-}
-
-static void
-txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
-{
- CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
- mutex_enter(&tx->tx_sync_lock);
-}
-
-static void
-txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
-{
- ASSERT(*tpp != NULL);
- *tpp = NULL;
- tx->tx_threads--;
- cv_broadcast(&tx->tx_exit_cv);
- CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
- thread_exit();
-}
-
-static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
-{
- CALLB_CPR_SAFE_BEGIN(cpr);
-
- if (time)
- (void) cv_timedwait(cv, &tx->tx_sync_lock, time);
- else
- cv_wait(cv, &tx->tx_sync_lock);
-
- CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
-}
-
-/*
- * Stop syncing transaction groups.
- */
-void
-txg_sync_stop(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- dprintf("pool %p\n", dp);
- /*
- * Finish off any work in progress.
- */
- ASSERT3U(tx->tx_threads, ==, 2);
-
- /*
- * We need to ensure that we've vacated the deferred space_maps.
- */
- txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
-
- /*
- * Wake all sync threads and wait for them to die.
- */
- mutex_enter(&tx->tx_sync_lock);
-
- ASSERT3U(tx->tx_threads, ==, 2);
-
- tx->tx_exiting = 1;
-
- cv_broadcast(&tx->tx_quiesce_more_cv);
- cv_broadcast(&tx->tx_quiesce_done_cv);
- cv_broadcast(&tx->tx_sync_more_cv);
-
- while (tx->tx_threads != 0)
- cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
-
- tx->tx_exiting = 0;
-
- mutex_exit(&tx->tx_sync_lock);
-}
-
-uint64_t
-txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
-{
- tx_state_t *tx = &dp->dp_tx;
- tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
- uint64_t txg;
-
- mutex_enter(&tc->tc_open_lock);
- txg = tx->tx_open_txg;
-
- mutex_enter(&tc->tc_lock);
- tc->tc_count[txg & TXG_MASK]++;
- mutex_exit(&tc->tc_lock);
-
- th->th_cpu = tc;
- th->th_txg = txg;
-
- return (txg);
-}
-
-void
-txg_rele_to_quiesce(txg_handle_t *th)
-{
- tx_cpu_t *tc = th->th_cpu;
-
- ASSERT(!MUTEX_HELD(&tc->tc_lock));
- mutex_exit(&tc->tc_open_lock);
-}
-
-void
-txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
-{
- tx_cpu_t *tc = th->th_cpu;
- int g = th->th_txg & TXG_MASK;
-
- mutex_enter(&tc->tc_lock);
- list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
- mutex_exit(&tc->tc_lock);
-}
-
-void
-txg_rele_to_sync(txg_handle_t *th)
-{
- tx_cpu_t *tc = th->th_cpu;
- int g = th->th_txg & TXG_MASK;
-
- mutex_enter(&tc->tc_lock);
- ASSERT(tc->tc_count[g] != 0);
- if (--tc->tc_count[g] == 0)
- cv_broadcast(&tc->tc_cv[g]);
- mutex_exit(&tc->tc_lock);
-
- th->th_cpu = NULL; /* defensive */
-}
-
-/*
- * Blocks until all transactions in the group are committed.
- *
- * On return, the transaction group has reached a stable state in which it can
- * then be passed off to the syncing context.
- */
-static __noinline void
-txg_quiesce(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
- int g = txg & TXG_MASK;
- int c;
-
- /*
- * Grab all tc_open_locks so nobody else can get into this txg.
- */
- for (c = 0; c < max_ncpus; c++)
- mutex_enter(&tx->tx_cpu[c].tc_open_lock);
-
- ASSERT(txg == tx->tx_open_txg);
- tx->tx_open_txg++;
- tx->tx_open_time = gethrtime();
-
- DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
- DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
-
- /*
- * Now that we've incremented tx_open_txg, we can let threads
- * enter the next transaction group.
- */
- for (c = 0; c < max_ncpus; c++)
- mutex_exit(&tx->tx_cpu[c].tc_open_lock);
-
- /*
- * Quiesce the transaction group by waiting for everyone to txg_exit().
- */
- for (c = 0; c < max_ncpus; c++) {
- tx_cpu_t *tc = &tx->tx_cpu[c];
- mutex_enter(&tc->tc_lock);
- while (tc->tc_count[g] != 0)
- cv_wait(&tc->tc_cv[g], &tc->tc_lock);
- mutex_exit(&tc->tc_lock);
- }
-}
-
-static void
-txg_do_callbacks(void *arg)
-{
- list_t *cb_list = arg;
-
- dmu_tx_do_callbacks(cb_list, 0);
-
- list_destroy(cb_list);
-
- kmem_free(cb_list, sizeof (list_t));
-}
-
-/*
- * Dispatch the commit callbacks registered on this txg to worker threads.
- *
- * If no callbacks are registered for a given TXG, nothing happens.
- * This function creates a taskq for the associated pool, if needed.
- */
-static void
-txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
-{
- int c;
- tx_state_t *tx = &dp->dp_tx;
- list_t *cb_list;
-
- for (c = 0; c < max_ncpus; c++) {
- tx_cpu_t *tc = &tx->tx_cpu[c];
- /*
- * No need to lock tx_cpu_t at this point, since this can
- * only be called once a txg has been synced.
- */
-
- int g = txg & TXG_MASK;
-
- if (list_is_empty(&tc->tc_callbacks[g]))
- continue;
-
- if (tx->tx_commit_cb_taskq == NULL) {
- /*
- * Commit callback taskq hasn't been created yet.
- */
- tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
- max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
- TASKQ_PREPOPULATE);
- }
-
- cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
- list_create(cb_list, sizeof (dmu_tx_callback_t),
- offsetof(dmu_tx_callback_t, dcb_node));
-
- list_move_tail(cb_list, &tc->tc_callbacks[g]);
-
- (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
- txg_do_callbacks, cb_list, TQ_SLEEP);
- }
-}
-
-static boolean_t
-txg_is_syncing(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
- return (tx->tx_syncing_txg != 0);
-}
-
-static boolean_t
-txg_is_quiescing(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
- return (tx->tx_quiescing_txg != 0);
-}
-
-static boolean_t
-txg_has_quiesced_to_sync(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
- return (tx->tx_quiesced_txg != 0);
-}
-
-static void
-txg_sync_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- spa_t *spa = dp->dp_spa;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
- uint64_t start, delta;
-
- txg_thread_enter(tx, &cpr);
-
- start = delta = 0;
- for (;;) {
- uint64_t timeout = zfs_txg_timeout * hz;
- uint64_t timer;
- uint64_t txg;
- uint64_t dirty_min_bytes =
- zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;
-
- /*
- * We sync when we're scanning, there's someone waiting
- * on us, or the quiesce thread has handed off a txg to
- * us, or we have reached our timeout.
- */
- timer = (delta >= timeout ? 0 : timeout - delta);
- while (!dsl_scan_active(dp->dp_scan) &&
- !tx->tx_exiting && timer > 0 &&
- tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- !txg_has_quiesced_to_sync(dp) &&
- dp->dp_dirty_total < dirty_min_bytes) {
- dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
- tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
- txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
- delta = ddi_get_lbolt() - start;
- timer = (delta > timeout ? 0 : timeout - delta);
- }
-
- /*
- * Wait until the quiesce thread hands off a txg to us,
- * prompting it to do so if necessary.
- */
- while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
- if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
- tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
- cv_broadcast(&tx->tx_quiesce_more_cv);
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
- }
-
- if (tx->tx_exiting)
- txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
-
- /*
- * Consume the quiesced txg which has been handed off to
- * us. This may cause the quiescing thread to now be
- * able to quiesce another txg, so we must signal it.
- */
- ASSERT(tx->tx_quiesced_txg != 0);
- txg = tx->tx_quiesced_txg;
- tx->tx_quiesced_txg = 0;
- tx->tx_syncing_txg = txg;
- DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
- cv_broadcast(&tx->tx_quiesce_more_cv);
-
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
- mutex_exit(&tx->tx_sync_lock);
-
- start = ddi_get_lbolt();
- spa_sync(spa, txg);
- delta = ddi_get_lbolt() - start;
-
- mutex_enter(&tx->tx_sync_lock);
- tx->tx_synced_txg = txg;
- tx->tx_syncing_txg = 0;
- DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
- cv_broadcast(&tx->tx_sync_done_cv);
-
- /*
- * Dispatch commit callbacks to worker threads.
- */
- txg_dispatch_callbacks(dp, txg);
- }
-}
-
-static void
-txg_quiesce_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
-
- txg_thread_enter(tx, &cpr);
-
- for (;;) {
- uint64_t txg;
-
- /*
- * We quiesce when there's someone waiting on us.
- * However, we can only have one txg in "quiescing" or
- * "quiesced, waiting to sync" state. So we wait until
- * the "quiesced, waiting to sync" txg has been consumed
- * by the sync thread.
- */
- while (!tx->tx_exiting &&
- (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
- txg_has_quiesced_to_sync(dp)))
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
-
- if (tx->tx_exiting)
- txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
-
- txg = tx->tx_open_txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting,
- tx->tx_sync_txg_waiting);
- tx->tx_quiescing_txg = txg;
-
- mutex_exit(&tx->tx_sync_lock);
- txg_quiesce(dp, txg);
- mutex_enter(&tx->tx_sync_lock);
-
- /*
- * Hand this txg off to the sync thread.
- */
- dprintf("quiesce done, handing off txg %llu\n", txg);
- tx->tx_quiescing_txg = 0;
- tx->tx_quiesced_txg = txg;
- DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
- cv_broadcast(&tx->tx_sync_more_cv);
- cv_broadcast(&tx->tx_quiesce_done_cv);
- }
-}
-
-/*
- * Delay this thread by delay nanoseconds if we are still in the open
- * transaction group and there is already a waiting txg quiesing or quiesced.
- * Abort the delay if this txg stalls or enters the quiesing state.
- */
-void
-txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
-{
- tx_state_t *tx = &dp->dp_tx;
- hrtime_t start = gethrtime();
-
- /* don't delay if this txg could transition to quiescing immediately */
- if (tx->tx_open_txg > txg ||
- tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
- return;
-
- mutex_enter(&tx->tx_sync_lock);
- if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
- mutex_exit(&tx->tx_sync_lock);
- return;
- }
-
- while (gethrtime() - start < delay &&
- tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
- (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
- &tx->tx_sync_lock, delay, resolution, 0);
- }
-
- mutex_exit(&tx->tx_sync_lock);
-}
-
-static boolean_t
-txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- ASSERT(!dsl_pool_config_held(dp));
-
- mutex_enter(&tx->tx_sync_lock);
- ASSERT3U(tx->tx_threads, ==, 2);
- if (txg == 0)
- txg = tx->tx_open_txg + TXG_DEFER_SIZE;
- if (tx->tx_sync_txg_waiting < txg)
- tx->tx_sync_txg_waiting = txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
- while (tx->tx_synced_txg < txg) {
- dprintf("broadcasting sync more "
- "tx_synced=%llu waiting=%llu dp=%p\n",
- tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
- cv_broadcast(&tx->tx_sync_more_cv);
- if (wait_sig) {
- /*
- * Condition wait here but stop if the thread receives a
- * signal. The caller may call txg_wait_synced*() again
- * to resume waiting for this txg.
- */
-#ifdef __FreeBSD__
- /*
- * FreeBSD returns EINTR or ERESTART if there is
- * a pending signal, zero if the conditional variable
- * is signaled. illumos returns zero in the former case
- * and >0 in the latter.
- */
- if (cv_wait_sig(&tx->tx_sync_done_cv,
- &tx->tx_sync_lock) != 0) {
-#else
- if (cv_wait_sig(&tx->tx_sync_done_cv,
- &tx->tx_sync_lock) == 0) {
-#endif
-
- mutex_exit(&tx->tx_sync_lock);
- return (B_TRUE);
- }
- } else {
- cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
- }
- }
- mutex_exit(&tx->tx_sync_lock);
- return (B_FALSE);
-}
-
-void
-txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
-{
- VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
-}
-
-/*
- * Similar to a txg_wait_synced but it can be interrupted from a signal.
- * Returns B_TRUE if the thread was signaled while waiting.
- */
-boolean_t
-txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
-{
- return (txg_wait_synced_impl(dp, txg, B_TRUE));
-}
-
-void
-txg_wait_open(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- ASSERT(!dsl_pool_config_held(dp));
-
- mutex_enter(&tx->tx_sync_lock);
- ASSERT3U(tx->tx_threads, ==, 2);
- if (txg == 0)
- txg = tx->tx_open_txg + 1;
- if (tx->tx_quiesce_txg_waiting < txg)
- tx->tx_quiesce_txg_waiting = txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
- while (tx->tx_open_txg < txg) {
- cv_broadcast(&tx->tx_quiesce_more_cv);
- cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
- }
- mutex_exit(&tx->tx_sync_lock);
-}
-
-/*
- * If there isn't a txg syncing or in the pipeline, push another txg through
- * the pipeline by queiscing the open txg.
- */
-void
-txg_kick(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- ASSERT(!dsl_pool_config_held(dp));
-
- mutex_enter(&tx->tx_sync_lock);
- if (!txg_is_syncing(dp) &&
- !txg_is_quiescing(dp) &&
- tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
- tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
- tx->tx_quiesced_txg <= tx->tx_synced_txg) {
- tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
- cv_broadcast(&tx->tx_quiesce_more_cv);
- }
- mutex_exit(&tx->tx_sync_lock);
-}
-
-boolean_t
-txg_stalled(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
-}
-
-boolean_t
-txg_sync_waiting(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
- tx->tx_quiesced_txg != 0);
-}
-
-/*
- * Verify that this txg is active (open, quiescing, syncing). Non-active
- * txg's should not be manipulated.
- */
-void
-txg_verify(spa_t *spa, uint64_t txg)
-{
- dsl_pool_t *dp = spa_get_dsl(spa);
- if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
- return;
- ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
- ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
- ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
-}
-
-/*
- * Per-txg object lists.
- */
-void
-txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
-{
- int t;
-
- mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
-
- tl->tl_offset = offset;
- tl->tl_spa = spa;
-
- for (t = 0; t < TXG_SIZE; t++)
- tl->tl_head[t] = NULL;
-}
-
-void
-txg_list_destroy(txg_list_t *tl)
-{
- int t;
-
- for (t = 0; t < TXG_SIZE; t++)
- ASSERT(txg_list_empty(tl, t));
-
- mutex_destroy(&tl->tl_lock);
-}
-
-boolean_t
-txg_list_empty(txg_list_t *tl, uint64_t txg)
-{
- txg_verify(tl->tl_spa, txg);
- return (tl->tl_head[txg & TXG_MASK] == NULL);
-}
-
-/*
- * Returns true if all txg lists are empty.
- *
- * Warning: this is inherently racy (an item could be added immediately
- * after this function returns). We don't bother with the lock because
- * it wouldn't change the semantics.
- */
-boolean_t
-txg_all_lists_empty(txg_list_t *tl)
-{
- for (int i = 0; i < TXG_SIZE; i++) {
- if (!txg_list_empty(tl, i)) {
- return (B_FALSE);
- }
- }
- return (B_TRUE);
-}
-
-/*
- * Add an entry to the list (unless it's already on the list).
- * Returns B_TRUE if it was actually added.
- */
-boolean_t
-txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
- boolean_t add;
-
- txg_verify(tl->tl_spa, txg);
- mutex_enter(&tl->tl_lock);
- add = (tn->tn_member[t] == 0);
- if (add) {
- tn->tn_member[t] = 1;
- tn->tn_next[t] = tl->tl_head[t];
- tl->tl_head[t] = tn;
- }
- mutex_exit(&tl->tl_lock);
-
- return (add);
-}
-
-/*
- * Add an entry to the end of the list, unless it's already on the list.
- * (walks list to find end)
- * Returns B_TRUE if it was actually added.
- */
-boolean_t
-txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
- boolean_t add;
-
- txg_verify(tl->tl_spa, txg);
- mutex_enter(&tl->tl_lock);
- add = (tn->tn_member[t] == 0);
- if (add) {
- txg_node_t **tp;
-
- for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
- continue;
-
- tn->tn_member[t] = 1;
- tn->tn_next[t] = NULL;
- *tp = tn;
- }
- mutex_exit(&tl->tl_lock);
-
- return (add);
-}
-
-/*
- * Remove the head of the list and return it.
- */
-void *
-txg_list_remove(txg_list_t *tl, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn;
- void *p = NULL;
-
- txg_verify(tl->tl_spa, txg);
- mutex_enter(&tl->tl_lock);
- if ((tn = tl->tl_head[t]) != NULL) {
- ASSERT(tn->tn_member[t]);
- ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
- p = (char *)tn - tl->tl_offset;
- tl->tl_head[t] = tn->tn_next[t];
- tn->tn_next[t] = NULL;
- tn->tn_member[t] = 0;
- }
- mutex_exit(&tl->tl_lock);
-
- return (p);
-}
-
-/*
- * Remove a specific item from the list and return it.
- */
-void *
-txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn, **tp;
-
- txg_verify(tl->tl_spa, txg);
- mutex_enter(&tl->tl_lock);
-
- for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
- if ((char *)tn - tl->tl_offset == p) {
- *tp = tn->tn_next[t];
- tn->tn_next[t] = NULL;
- tn->tn_member[t] = 0;
- mutex_exit(&tl->tl_lock);
- return (p);
- }
- }
-
- mutex_exit(&tl->tl_lock);
-
- return (NULL);
-}
-
-boolean_t
-txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-
- txg_verify(tl->tl_spa, txg);
- return (tn->tn_member[t] != 0);
-}
-
-/*
- * Walk a txg list -- only safe if you know it's not changing.
- */
-void *
-txg_list_head(txg_list_t *tl, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = tl->tl_head[t];
-
- txg_verify(tl->tl_spa, txg);
- return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
-}
-
-void *
-txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-
- txg_verify(tl->tl_spa, txg);
- tn = tn->tn_next[t];
-
- return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
@@ -1,74 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/uberblock_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/mmp.h>
-
-int
-uberblock_verify(uberblock_t *ub)
-{
- if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
- byteswap_uint64_array(ub, sizeof (uberblock_t));
-
- if (ub->ub_magic != UBERBLOCK_MAGIC)
- return (SET_ERROR(EINVAL));
-
- return (0);
-}
-
-/*
- * Update the uberblock and return TRUE if anything changed in this
- * transaction group.
- */
-boolean_t
-uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
-{
- ASSERT(ub->ub_txg < txg);
-
- /*
- * We explicitly do not set ub_version here, so that older versions
- * continue to be written with the previous uberblock version.
- */
- ub->ub_magic = UBERBLOCK_MAGIC;
- ub->ub_txg = txg;
- ub->ub_guid_sum = rvd->vdev_guid_sum;
- ub->ub_timestamp = gethrestime_sec();
- ub->ub_software_version = SPA_VERSION;
- ub->ub_mmp_magic = MMP_MAGIC;
- if (spa_multihost(rvd->vdev_spa)) {
- ub->ub_mmp_delay = mmp_delay;
- ub->ub_mmp_config = MMP_SEQ_SET(0) |
- MMP_INTERVAL_SET(zfs_multihost_interval) |
- MMP_FAIL_INT_SET(zfs_multihost_fail_intervals);
- } else {
- ub->ub_mmp_delay = 0;
- ub->ub_mmp_config = 0;
- }
- ub->ub_checkpoint_txg = 0;
-
- return (ub->ub_rootbp.blk_birth == txg);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
@@ -1,112 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/unique.h>
-
-static avl_tree_t unique_avl;
-static kmutex_t unique_mtx;
-
-typedef struct unique {
- avl_node_t un_link;
- uint64_t un_value;
-} unique_t;
-
-#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
-
-static int
-unique_compare(const void *a, const void *b)
-{
- const unique_t *una = (const unique_t *)a;
- const unique_t *unb = (const unique_t *)b;
-
- return (AVL_CMP(una->un_value, unb->un_value));
-}
-
-void
-unique_init(void)
-{
- avl_create(&unique_avl, unique_compare,
- sizeof (unique_t), offsetof(unique_t, un_link));
- mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-unique_fini(void)
-{
- avl_destroy(&unique_avl);
- mutex_destroy(&unique_mtx);
-}
-
-uint64_t
-unique_create(void)
-{
- uint64_t value = unique_insert(0);
- unique_remove(value);
- return (value);
-}
-
-uint64_t
-unique_insert(uint64_t value)
-{
- avl_index_t idx;
- unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
-
- un->un_value = value;
-
- mutex_enter(&unique_mtx);
- while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
- avl_find(&unique_avl, un, &idx)) {
- mutex_exit(&unique_mtx);
- (void) random_get_pseudo_bytes((void*)&un->un_value,
- sizeof (un->un_value));
- un->un_value &= UNIQUE_MASK;
- mutex_enter(&unique_mtx);
- }
-
- avl_insert(&unique_avl, un, idx);
- mutex_exit(&unique_mtx);
-
- return (un->un_value);
-}
-
-void
-unique_remove(uint64_t value)
-{
- unique_t un_tofind;
- unique_t *un;
-
- un_tofind.un_value = value;
- mutex_enter(&unique_mtx);
- un = avl_find(&unique_avl, &un_tofind, NULL);
- if (un != NULL) {
- avl_remove(&unique_avl, un);
- kmem_free(un, sizeof (unique_t));
- }
- mutex_exit(&unique_mtx);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -1,4520 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Toomas Soome <tsoome@me.com>
- * Copyright 2019 Joyent, Inc.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/bpobj.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dir.h>
-#include <sys/vdev_impl.h>
-#include <sys/uberblock_impl.h>
-#include <sys/metaslab.h>
-#include <sys/metaslab_impl.h>
-#include <sys/space_map.h>
-#include <sys/space_reftree.h>
-#include <sys/zio.h>
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-#include <sys/arc.h>
-#include <sys/zil.h>
-#include <sys/dsl_scan.h>
-#include <sys/abd.h>
-#include <sys/trim_map.h>
-#include <sys/vdev_initialize.h>
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS VDEV");
-
-/*
- * Virtual device management.
- */
-
-/*
- * The limit for ZFS to automatically increase a top-level vdev's ashift
- * from logical ashift to physical ashift.
- *
- * Example: one or more 512B emulation child vdevs
- * child->vdev_ashift = 9 (512 bytes)
- * child->vdev_physical_ashift = 12 (4096 bytes)
- * zfs_max_auto_ashift = 11 (2048 bytes)
- * zfs_min_auto_ashift = 9 (512 bytes)
- *
- * On pool creation or the addition of a new top-level vdev, ZFS will
- * increase the ashift of the top-level vdev to 2048 as limited by
- * zfs_max_auto_ashift.
- *
- * Example: one or more 512B emulation child vdevs
- * child->vdev_ashift = 9 (512 bytes)
- * child->vdev_physical_ashift = 12 (4096 bytes)
- * zfs_max_auto_ashift = 13 (8192 bytes)
- * zfs_min_auto_ashift = 9 (512 bytes)
- *
- * On pool creation or the addition of a new top-level vdev, ZFS will
- * increase the ashift of the top-level vdev to 4096 to match the
- * max vdev_physical_ashift.
- *
- * Example: one or more 512B emulation child vdevs
- * child->vdev_ashift = 9 (512 bytes)
- * child->vdev_physical_ashift = 9 (512 bytes)
- * zfs_max_auto_ashift = 13 (8192 bytes)
- * zfs_min_auto_ashift = 12 (4096 bytes)
- *
- * On pool creation or the addition of a new top-level vdev, ZFS will
- * increase the ashift of the top-level vdev to 4096 to match the
- * zfs_min_auto_ashift.
- */
-static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
-static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
-
-static int
-sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = zfs_max_auto_ashift;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
- return (EINVAL);
-
- zfs_max_auto_ashift = val;
-
- return (0);
-}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
- sysctl_vfs_zfs_max_auto_ashift, "QU",
- "Max ashift used when optimising for logical -> physical sectors size on "
- "new top-level vdevs.");
-
-static int
-sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = zfs_min_auto_ashift;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
- return (EINVAL);
-
- zfs_min_auto_ashift = val;
-
- return (0);
-}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
- sysctl_vfs_zfs_min_auto_ashift, "QU",
- "Min ashift used when creating new top-level vdevs.");
-
-static vdev_ops_t *vdev_ops_table[] = {
- &vdev_root_ops,
- &vdev_raidz_ops,
- &vdev_mirror_ops,
- &vdev_replacing_ops,
- &vdev_spare_ops,
-#ifdef _KERNEL
- &vdev_geom_ops,
-#else
- &vdev_disk_ops,
-#endif
- &vdev_file_ops,
- &vdev_missing_ops,
- &vdev_hole_ops,
- &vdev_indirect_ops,
- NULL
-};
-
-
-/* default target for number of metaslabs per top-level vdev */
-int zfs_vdev_default_ms_count = 200;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
- &zfs_vdev_default_ms_count, 0,
- "Target number of metaslabs per top-level vdev");
-
-/* minimum number of metaslabs per top-level vdev */
-int zfs_vdev_min_ms_count = 16;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
- &zfs_vdev_min_ms_count, 0,
- "Minimum number of metaslabs per top-level vdev");
-
-/* practical upper limit of total metaslabs per top-level vdev */
-int zfs_vdev_ms_count_limit = 1ULL << 17;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
- &zfs_vdev_ms_count_limit, 0,
- "Maximum number of metaslabs per top-level vdev");
-
-/* lower limit for metaslab size (512M) */
-int zfs_vdev_default_ms_shift = 29;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
- &zfs_vdev_default_ms_shift, 0,
- "Default shift between vdev size and number of metaslabs");
-
-/* upper limit for metaslab size (16G) */
-int zfs_vdev_max_ms_shift = 34;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
- &zfs_vdev_max_ms_shift, 0,
- "Maximum shift between vdev size and number of metaslabs");
-
-boolean_t vdev_validate_skip = B_FALSE;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, validate_skip, CTLFLAG_RWTUN,
- &vdev_validate_skip, 0,
- "Bypass vdev validation");
-
-/*
- * Since the DTL space map of a vdev is not expected to have a lot of
- * entries, we default its block size to 4K.
- */
-int vdev_dtl_sm_blksz = (1 << 12);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN,
- &vdev_dtl_sm_blksz, 0,
- "Block size for DTL space map. Power of 2 and greater than 4096.");
-
-/*
- * vdev-wide space maps that have lots of entries written to them at
- * the end of each transaction can benefit from a higher I/O bandwidth
- * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
- */
-int vdev_standard_sm_blksz = (1 << 17);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN,
- &vdev_standard_sm_blksz, 0,
- "Block size for standard space map. Power of 2 and greater than 4096.");
-
-/*
- * Tunable parameter for debugging or performance analysis. Setting this
- * will cause pool corruption on power loss if a volatile out-of-order
- * write cache is enabled.
- */
-boolean_t zfs_nocacheflush = B_FALSE;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RWTUN,
- &zfs_nocacheflush, 0, "Disable cache flush");
-
-/*PRINTFLIKE2*/
-void
-vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
-{
- va_list adx;
- char buf[256];
-
- va_start(adx, fmt);
- (void) vsnprintf(buf, sizeof (buf), fmt, adx);
- va_end(adx);
-
- if (vd->vdev_path != NULL) {
- zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
- vd->vdev_path, buf);
- } else {
- zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
- vd->vdev_ops->vdev_op_type,
- (u_longlong_t)vd->vdev_id,
- (u_longlong_t)vd->vdev_guid, buf);
- }
-}
-
-void
-vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
-{
- char state[20];
-
- if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
- zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
- vd->vdev_ops->vdev_op_type);
- return;
- }
-
- switch (vd->vdev_state) {
- case VDEV_STATE_UNKNOWN:
- (void) snprintf(state, sizeof (state), "unknown");
- break;
- case VDEV_STATE_CLOSED:
- (void) snprintf(state, sizeof (state), "closed");
- break;
- case VDEV_STATE_OFFLINE:
- (void) snprintf(state, sizeof (state), "offline");
- break;
- case VDEV_STATE_REMOVED:
- (void) snprintf(state, sizeof (state), "removed");
- break;
- case VDEV_STATE_CANT_OPEN:
- (void) snprintf(state, sizeof (state), "can't open");
- break;
- case VDEV_STATE_FAULTED:
- (void) snprintf(state, sizeof (state), "faulted");
- break;
- case VDEV_STATE_DEGRADED:
- (void) snprintf(state, sizeof (state), "degraded");
- break;
- case VDEV_STATE_HEALTHY:
- (void) snprintf(state, sizeof (state), "healthy");
- break;
- default:
- (void) snprintf(state, sizeof (state), "<state %u>",
- (uint_t)vd->vdev_state);
- }
-
- zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
- "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
- vd->vdev_islog ? " (log)" : "",
- (u_longlong_t)vd->vdev_guid,
- vd->vdev_path ? vd->vdev_path : "N/A", state);
-
- for (uint64_t i = 0; i < vd->vdev_children; i++)
- vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
-}
-
-/*
- * Given a vdev type, return the appropriate ops vector.
- */
-static vdev_ops_t *
-vdev_getops(const char *type)
-{
- vdev_ops_t *ops, **opspp;
-
- for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
- if (strcmp(ops->vdev_op_type, type) == 0)
- break;
-
- return (ops);
-}
-
-/*
- * Derive the enumerated alloction bias from string input.
- * String origin is either the per-vdev zap or zpool(1M).
- */
-static vdev_alloc_bias_t
-vdev_derive_alloc_bias(const char *bias)
-{
- vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
-
- if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
- alloc_bias = VDEV_BIAS_LOG;
- else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
- alloc_bias = VDEV_BIAS_SPECIAL;
- else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
- alloc_bias = VDEV_BIAS_DEDUP;
-
- return (alloc_bias);
-}
-
-/* ARGSUSED */
-void
-vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
-{
- res->rs_start = in->rs_start;
- res->rs_end = in->rs_end;
-}
-
-/*
- * Default asize function: return the MAX of psize with the asize of
- * all children. This is what's used by anything other than RAID-Z.
- */
-uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
-{
- uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
- uint64_t csize;
-
- for (int c = 0; c < vd->vdev_children; c++) {
- csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
- asize = MAX(asize, csize);
- }
-
- return (asize);
-}
-
-/*
- * Get the minimum allocatable size. We define the allocatable size as
- * the vdev's asize rounded to the nearest metaslab. This allows us to
- * replace or attach devices which don't have the same physical size but
- * can still satisfy the same number of allocations.
- */
-uint64_t
-vdev_get_min_asize(vdev_t *vd)
-{
- vdev_t *pvd = vd->vdev_parent;
-
- /*
- * If our parent is NULL (inactive spare or cache) or is the root,
- * just return our own asize.
- */
- if (pvd == NULL)
- return (vd->vdev_asize);
-
- /*
- * The top-level vdev just returns the allocatable size rounded
- * to the nearest metaslab.
- */
- if (vd == vd->vdev_top)
- return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
-
- /*
- * The allocatable space for a raidz vdev is N * sizeof(smallest child),
- * so each child must provide at least 1/Nth of its asize.
- */
- if (pvd->vdev_ops == &vdev_raidz_ops)
- return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
- pvd->vdev_children);
-
- return (pvd->vdev_min_asize);
-}
-
-void
-vdev_set_min_asize(vdev_t *vd)
-{
- vd->vdev_min_asize = vdev_get_min_asize(vd);
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_set_min_asize(vd->vdev_child[c]);
-}
-
-vdev_t *
-vdev_lookup_top(spa_t *spa, uint64_t vdev)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-
- if (vdev < rvd->vdev_children) {
- ASSERT(rvd->vdev_child[vdev] != NULL);
- return (rvd->vdev_child[vdev]);
- }
-
- return (NULL);
-}
-
-vdev_t *
-vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
-{
- vdev_t *mvd;
-
- if (vd->vdev_guid == guid)
- return (vd);
-
- for (int c = 0; c < vd->vdev_children; c++)
- if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
- NULL)
- return (mvd);
-
- return (NULL);
-}
-
-static int
-vdev_count_leaves_impl(vdev_t *vd)
-{
- int n = 0;
-
- if (vd->vdev_ops->vdev_op_leaf)
- return (1);
-
- for (int c = 0; c < vd->vdev_children; c++)
- n += vdev_count_leaves_impl(vd->vdev_child[c]);
-
- return (n);
-}
-
-int
-vdev_count_leaves(spa_t *spa)
-{
- return (vdev_count_leaves_impl(spa->spa_root_vdev));
-}
-
-void
-vdev_add_child(vdev_t *pvd, vdev_t *cvd)
-{
- size_t oldsize, newsize;
- uint64_t id = cvd->vdev_id;
- vdev_t **newchild;
- spa_t *spa = cvd->vdev_spa;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- ASSERT(cvd->vdev_parent == NULL);
-
- cvd->vdev_parent = pvd;
-
- if (pvd == NULL)
- return;
-
- ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
-
- oldsize = pvd->vdev_children * sizeof (vdev_t *);
- pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
- newsize = pvd->vdev_children * sizeof (vdev_t *);
-
- newchild = kmem_zalloc(newsize, KM_SLEEP);
- if (pvd->vdev_child != NULL) {
- bcopy(pvd->vdev_child, newchild, oldsize);
- kmem_free(pvd->vdev_child, oldsize);
- }
-
- pvd->vdev_child = newchild;
- pvd->vdev_child[id] = cvd;
-
- cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
- ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
-
- /*
- * Walk up all ancestors to update guid sum.
- */
- for (; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf) {
- list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
- cvd->vdev_spa->spa_leaf_list_gen++;
- }
-}
-
-void
-vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
-{
- int c;
- uint_t id = cvd->vdev_id;
-
- ASSERT(cvd->vdev_parent == pvd);
-
- if (pvd == NULL)
- return;
-
- ASSERT(id < pvd->vdev_children);
- ASSERT(pvd->vdev_child[id] == cvd);
-
- pvd->vdev_child[id] = NULL;
- cvd->vdev_parent = NULL;
-
- for (c = 0; c < pvd->vdev_children; c++)
- if (pvd->vdev_child[c])
- break;
-
- if (c == pvd->vdev_children) {
- kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
- pvd->vdev_child = NULL;
- pvd->vdev_children = 0;
- }
-
- if (cvd->vdev_ops->vdev_op_leaf) {
- spa_t *spa = cvd->vdev_spa;
- list_remove(&spa->spa_leaf_list, cvd);
- spa->spa_leaf_list_gen++;
- }
-
- /*
- * Walk up all ancestors to update guid sum.
- */
- for (; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-}
-
-/*
- * Remove any holes in the child array.
- */
-void
-vdev_compact_children(vdev_t *pvd)
-{
- vdev_t **newchild, *cvd;
- int oldc = pvd->vdev_children;
- int newc;
-
- ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- if (oldc == 0)
- return;
-
- for (int c = newc = 0; c < oldc; c++)
- if (pvd->vdev_child[c])
- newc++;
-
- if (newc > 0) {
- newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
-
- for (int c = newc = 0; c < oldc; c++) {
- if ((cvd = pvd->vdev_child[c]) != NULL) {
- newchild[newc] = cvd;
- cvd->vdev_id = newc++;
- }
- }
- } else {
- newchild = NULL;
- }
-
- kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
- pvd->vdev_child = newchild;
- pvd->vdev_children = newc;
-}
-
-/*
- * Allocate and minimally initialize a vdev_t.
- */
-vdev_t *
-vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
-{
- vdev_t *vd;
- vdev_indirect_config_t *vic;
-
- vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
- vic = &vd->vdev_indirect_config;
-
- if (spa->spa_root_vdev == NULL) {
- ASSERT(ops == &vdev_root_ops);
- spa->spa_root_vdev = vd;
- spa->spa_load_guid = spa_generate_guid(NULL);
- }
-
- if (guid == 0 && ops != &vdev_hole_ops) {
- if (spa->spa_root_vdev == vd) {
- /*
- * The root vdev's guid will also be the pool guid,
- * which must be unique among all pools.
- */
- guid = spa_generate_guid(NULL);
- } else {
- /*
- * Any other vdev's guid must be unique within the pool.
- */
- guid = spa_generate_guid(spa);
- }
- ASSERT(!spa_guid_exists(spa_guid(spa), guid));
- }
-
- vd->vdev_spa = spa;
- vd->vdev_id = id;
- vd->vdev_guid = guid;
- vd->vdev_guid_sum = guid;
- vd->vdev_ops = ops;
- vd->vdev_state = VDEV_STATE_CLOSED;
- vd->vdev_ishole = (ops == &vdev_hole_ops);
- vic->vic_prev_indirect_vdev = UINT64_MAX;
-
- rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
- mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
- vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
-
- list_link_init(&vd->vdev_leaf_node);
- mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
-
- for (int t = 0; t < DTL_TYPES; t++) {
- vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
- }
- txg_list_create(&vd->vdev_ms_list, spa,
- offsetof(struct metaslab, ms_txg_node));
- txg_list_create(&vd->vdev_dtl_list, spa,
- offsetof(struct vdev, vdev_dtl_node));
- vd->vdev_stat.vs_timestamp = gethrtime();
- vdev_queue_init(vd);
- vdev_cache_init(vd);
-
- return (vd);
-}
-
-/*
- * Allocate a new vdev. The 'alloctype' is used to control whether we are
- * creating a new vdev or loading an existing one - the behavior is slightly
- * different for each case.
- */
-int
-vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
- int alloctype)
-{
- vdev_ops_t *ops;
- char *type;
- uint64_t guid = 0, islog, nparity;
- vdev_t *vd;
- vdev_indirect_config_t *vic;
- vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
- boolean_t top_level = (parent && !parent->vdev_parent);
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
- return (SET_ERROR(EINVAL));
-
- if ((ops = vdev_getops(type)) == NULL)
- return (SET_ERROR(EINVAL));
-
- /*
- * If this is a load, get the vdev guid from the nvlist.
- * Otherwise, vdev_alloc_common() will generate one for us.
- */
- if (alloctype == VDEV_ALLOC_LOAD) {
- uint64_t label_id;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
- label_id != id)
- return (SET_ERROR(EINVAL));
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (SET_ERROR(EINVAL));
- } else if (alloctype == VDEV_ALLOC_SPARE) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (SET_ERROR(EINVAL));
- } else if (alloctype == VDEV_ALLOC_L2CACHE) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (SET_ERROR(EINVAL));
- } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * The first allocated vdev must be of type 'root'.
- */
- if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
- return (SET_ERROR(EINVAL));
-
- /*
- * Determine whether we're a log vdev.
- */
- islog = 0;
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
- if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
- return (SET_ERROR(ENOTSUP));
-
- if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
- return (SET_ERROR(ENOTSUP));
-
- /*
- * Set the nparity property for RAID-Z vdevs.
- */
- nparity = -1ULL;
- if (ops == &vdev_raidz_ops) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &nparity) == 0) {
- if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
- return (SET_ERROR(EINVAL));
- /*
- * Previous versions could only support 1 or 2 parity
- * device.
- */
- if (nparity > 1 &&
- spa_version(spa) < SPA_VERSION_RAIDZ2)
- return (SET_ERROR(ENOTSUP));
- if (nparity > 2 &&
- spa_version(spa) < SPA_VERSION_RAIDZ3)
- return (SET_ERROR(ENOTSUP));
- } else {
- /*
- * We require the parity to be specified for SPAs that
- * support multiple parity levels.
- */
- if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
- return (SET_ERROR(EINVAL));
- /*
- * Otherwise, we default to 1 parity device for RAID-Z.
- */
- nparity = 1;
- }
- } else {
- nparity = 0;
- }
- ASSERT(nparity != -1ULL);
-
- /*
- * If creating a top-level vdev, check for allocation classes input
- */
- if (top_level && alloctype == VDEV_ALLOC_ADD) {
- char *bias;
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
- &bias) == 0) {
- alloc_bias = vdev_derive_alloc_bias(bias);
-
- /* spa_vdev_add() expects feature to be enabled */
- if (alloc_bias != VDEV_BIAS_LOG &&
- spa->spa_load_state != SPA_LOAD_CREATE &&
- !spa_feature_is_enabled(spa,
- SPA_FEATURE_ALLOCATION_CLASSES)) {
- return (SET_ERROR(ENOTSUP));
- }
- }
- }
-
- vd = vdev_alloc_common(spa, id, guid, ops);
- vic = &vd->vdev_indirect_config;
-
- vd->vdev_islog = islog;
- vd->vdev_nparity = nparity;
- if (top_level && alloc_bias != VDEV_BIAS_NONE)
- vd->vdev_alloc_bias = alloc_bias;
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
- vd->vdev_path = spa_strdup(vd->vdev_path);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
- vd->vdev_devid = spa_strdup(vd->vdev_devid);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- &vd->vdev_physpath) == 0)
- vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
- vd->vdev_fru = spa_strdup(vd->vdev_fru);
-
- /*
- * Set the whole_disk property. If it's not specified, leave the value
- * as -1.
- */
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &vd->vdev_wholedisk) != 0)
- vd->vdev_wholedisk = -1ULL;
-
- ASSERT0(vic->vic_mapping_object);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
- &vic->vic_mapping_object);
- ASSERT0(vic->vic_births_object);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
- &vic->vic_births_object);
- ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
- &vic->vic_prev_indirect_vdev);
-
- /*
- * Look for the 'not present' flag. This will only be set if the device
- * was not present at the time of import.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
- &vd->vdev_not_present);
-
- /*
- * Get the alignment requirement.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
-
- /*
- * Retrieve the vdev creation time.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
- &vd->vdev_crtxg);
-
- /*
- * If we're a top-level vdev, try to load the allocation parameters.
- */
- if (top_level &&
- (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- &vd->vdev_ms_array);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- &vd->vdev_ms_shift);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
- &vd->vdev_asize);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
- &vd->vdev_removing);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
- &vd->vdev_top_zap);
- } else {
- ASSERT0(vd->vdev_top_zap);
- }
-
- if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
- ASSERT(alloctype == VDEV_ALLOC_LOAD ||
- alloctype == VDEV_ALLOC_ADD ||
- alloctype == VDEV_ALLOC_SPLIT ||
- alloctype == VDEV_ALLOC_ROOTPOOL);
- /* Note: metaslab_group_create() is now deferred */
- }
-
- if (vd->vdev_ops->vdev_op_leaf &&
- (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
- (void) nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
- } else {
- ASSERT0(vd->vdev_leaf_zap);
- }
-
- /*
- * If we're a leaf vdev, try to load the DTL object and other state.
- */
-
- if (vd->vdev_ops->vdev_op_leaf &&
- (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
- alloctype == VDEV_ALLOC_ROOTPOOL)) {
- if (alloctype == VDEV_ALLOC_LOAD) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
- &vd->vdev_dtl_object);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
- &vd->vdev_unspare);
- }
-
- if (alloctype == VDEV_ALLOC_ROOTPOOL) {
- uint64_t spare = 0;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
- &spare) == 0 && spare)
- spa_spare_add(vd);
- }
-
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- &vd->vdev_offline);
-
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
- &vd->vdev_resilver_txg);
-
- /*
- * When importing a pool, we want to ignore the persistent fault
- * state, as the diagnosis made on another system may not be
- * valid in the current context. Local vdevs will
- * remain in the faulted state.
- */
- if (spa_load_state(spa) == SPA_LOAD_OPEN) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
- &vd->vdev_faulted);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
- &vd->vdev_degraded);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
- &vd->vdev_removed);
-
- if (vd->vdev_faulted || vd->vdev_degraded) {
- char *aux;
-
- vd->vdev_label_aux =
- VDEV_AUX_ERR_EXCEEDED;
- if (nvlist_lookup_string(nv,
- ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
- strcmp(aux, "external") == 0)
- vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
- }
- }
- }
-
- /*
- * Add ourselves to the parent's list of children.
- */
- vdev_add_child(parent, vd);
-
- *vdp = vd;
-
- return (0);
-}
-
-void
-vdev_free(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
-
- /*
- * Scan queues are normally destroyed at the end of a scan. If the
- * queue exists here, that implies the vdev is being removed while
- * the scan is still running.
- */
- if (vd->vdev_scan_io_queue != NULL) {
- mutex_enter(&vd->vdev_scan_io_queue_lock);
- dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
- vd->vdev_scan_io_queue = NULL;
- mutex_exit(&vd->vdev_scan_io_queue_lock);
- }
-
- /*
- * vdev_free() implies closing the vdev first. This is simpler than
- * trying to ensure complicated semantics for all callers.
- */
- vdev_close(vd);
-
- ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
- ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
-
- /*
- * Free all children.
- */
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_free(vd->vdev_child[c]);
-
- ASSERT(vd->vdev_child == NULL);
- ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
- ASSERT(vd->vdev_initialize_thread == NULL);
-
- /*
- * Discard allocation state.
- */
- if (vd->vdev_mg != NULL) {
- vdev_metaslab_fini(vd);
- metaslab_group_destroy(vd->vdev_mg);
- }
-
- ASSERT0(vd->vdev_stat.vs_space);
- ASSERT0(vd->vdev_stat.vs_dspace);
- ASSERT0(vd->vdev_stat.vs_alloc);
-
- /*
- * Remove this vdev from its parent's child list.
- */
- vdev_remove_child(vd->vdev_parent, vd);
-
- ASSERT(vd->vdev_parent == NULL);
- ASSERT(!list_link_active(&vd->vdev_leaf_node));
-
- /*
- * Clean up vdev structure.
- */
- vdev_queue_fini(vd);
- vdev_cache_fini(vd);
-
- if (vd->vdev_path)
- spa_strfree(vd->vdev_path);
- if (vd->vdev_devid)
- spa_strfree(vd->vdev_devid);
- if (vd->vdev_physpath)
- spa_strfree(vd->vdev_physpath);
- if (vd->vdev_fru)
- spa_strfree(vd->vdev_fru);
-
- if (vd->vdev_isspare)
- spa_spare_remove(vd);
- if (vd->vdev_isl2cache)
- spa_l2cache_remove(vd);
-
- txg_list_destroy(&vd->vdev_ms_list);
- txg_list_destroy(&vd->vdev_dtl_list);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_close(vd->vdev_dtl_sm);
- for (int t = 0; t < DTL_TYPES; t++) {
- range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
- range_tree_destroy(vd->vdev_dtl[t]);
- }
- mutex_exit(&vd->vdev_dtl_lock);
-
- EQUIV(vd->vdev_indirect_births != NULL,
- vd->vdev_indirect_mapping != NULL);
- if (vd->vdev_indirect_births != NULL) {
- vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
- vdev_indirect_births_close(vd->vdev_indirect_births);
- }
-
- if (vd->vdev_obsolete_sm != NULL) {
- ASSERT(vd->vdev_removing ||
- vd->vdev_ops == &vdev_indirect_ops);
- space_map_close(vd->vdev_obsolete_sm);
- vd->vdev_obsolete_sm = NULL;
- }
- range_tree_destroy(vd->vdev_obsolete_segments);
- rw_destroy(&vd->vdev_indirect_rwlock);
- mutex_destroy(&vd->vdev_obsolete_lock);
-
- mutex_destroy(&vd->vdev_dtl_lock);
- mutex_destroy(&vd->vdev_stat_lock);
- mutex_destroy(&vd->vdev_probe_lock);
- mutex_destroy(&vd->vdev_scan_io_queue_lock);
- mutex_destroy(&vd->vdev_initialize_lock);
- mutex_destroy(&vd->vdev_initialize_io_lock);
- cv_destroy(&vd->vdev_initialize_io_cv);
- cv_destroy(&vd->vdev_initialize_cv);
-
- if (vd == spa->spa_root_vdev)
- spa->spa_root_vdev = NULL;
-
- kmem_free(vd, sizeof (vdev_t));
-}
-
-/*
- * Transfer top-level vdev state from svd to tvd.
- */
-static void
-vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
-{
- spa_t *spa = svd->vdev_spa;
- metaslab_t *msp;
- vdev_t *vd;
- int t;
-
- ASSERT(tvd == tvd->vdev_top);
-
- tvd->vdev_ms_array = svd->vdev_ms_array;
- tvd->vdev_ms_shift = svd->vdev_ms_shift;
- tvd->vdev_ms_count = svd->vdev_ms_count;
- tvd->vdev_top_zap = svd->vdev_top_zap;
-
- svd->vdev_ms_array = 0;
- svd->vdev_ms_shift = 0;
- svd->vdev_ms_count = 0;
- svd->vdev_top_zap = 0;
-
- if (tvd->vdev_mg)
- ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
- tvd->vdev_mg = svd->vdev_mg;
- tvd->vdev_ms = svd->vdev_ms;
-
- svd->vdev_mg = NULL;
- svd->vdev_ms = NULL;
-
- if (tvd->vdev_mg != NULL)
- tvd->vdev_mg->mg_vd = tvd;
-
- tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
- svd->vdev_checkpoint_sm = NULL;
-
- tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
- svd->vdev_alloc_bias = VDEV_BIAS_NONE;
-
- tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
- tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
- tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
-
- svd->vdev_stat.vs_alloc = 0;
- svd->vdev_stat.vs_space = 0;
- svd->vdev_stat.vs_dspace = 0;
-
- /*
- * State which may be set on a top-level vdev that's in the
- * process of being removed.
- */
- ASSERT0(tvd->vdev_indirect_config.vic_births_object);
- ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
- ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
- ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
- ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
- ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
- ASSERT0(tvd->vdev_removing);
- tvd->vdev_removing = svd->vdev_removing;
- tvd->vdev_indirect_config = svd->vdev_indirect_config;
- tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
- tvd->vdev_indirect_births = svd->vdev_indirect_births;
- range_tree_swap(&svd->vdev_obsolete_segments,
- &tvd->vdev_obsolete_segments);
- tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
- svd->vdev_indirect_config.vic_mapping_object = 0;
- svd->vdev_indirect_config.vic_births_object = 0;
- svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
- svd->vdev_indirect_mapping = NULL;
- svd->vdev_indirect_births = NULL;
- svd->vdev_obsolete_sm = NULL;
- svd->vdev_removing = 0;
-
- for (t = 0; t < TXG_SIZE; t++) {
- while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
- (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
- while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
- (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
- if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
- (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
- }
-
- if (list_link_active(&svd->vdev_config_dirty_node)) {
- vdev_config_clean(svd);
- vdev_config_dirty(tvd);
- }
-
- if (list_link_active(&svd->vdev_state_dirty_node)) {
- vdev_state_clean(svd);
- vdev_state_dirty(tvd);
- }
-
- tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
- svd->vdev_deflate_ratio = 0;
-
- tvd->vdev_islog = svd->vdev_islog;
- svd->vdev_islog = 0;
-
- dsl_scan_io_queue_vdev_xfer(svd, tvd);
-}
-
-static void
-vdev_top_update(vdev_t *tvd, vdev_t *vd)
-{
- if (vd == NULL)
- return;
-
- vd->vdev_top = tvd;
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_top_update(tvd, vd->vdev_child[c]);
-}
-
-/*
- * Add a mirror/replacing vdev above an existing vdev.
- */
-vdev_t *
-vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
-{
- spa_t *spa = cvd->vdev_spa;
- vdev_t *pvd = cvd->vdev_parent;
- vdev_t *mvd;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
-
- mvd->vdev_asize = cvd->vdev_asize;
- mvd->vdev_min_asize = cvd->vdev_min_asize;
- mvd->vdev_max_asize = cvd->vdev_max_asize;
- mvd->vdev_psize = cvd->vdev_psize;
- mvd->vdev_ashift = cvd->vdev_ashift;
- mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
- mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
- mvd->vdev_state = cvd->vdev_state;
- mvd->vdev_crtxg = cvd->vdev_crtxg;
-
- vdev_remove_child(pvd, cvd);
- vdev_add_child(pvd, mvd);
- cvd->vdev_id = mvd->vdev_children;
- vdev_add_child(mvd, cvd);
- vdev_top_update(cvd->vdev_top, cvd->vdev_top);
-
- if (mvd == mvd->vdev_top)
- vdev_top_transfer(cvd, mvd);
-
- return (mvd);
-}
-
-/*
- * Remove a 1-way mirror/replacing vdev from the tree.
- */
-void
-vdev_remove_parent(vdev_t *cvd)
-{
- vdev_t *mvd = cvd->vdev_parent;
- vdev_t *pvd = mvd->vdev_parent;
-
- ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- ASSERT(mvd->vdev_children == 1);
- ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
- mvd->vdev_ops == &vdev_replacing_ops ||
- mvd->vdev_ops == &vdev_spare_ops);
- cvd->vdev_ashift = mvd->vdev_ashift;
- cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
- cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
-
- vdev_remove_child(mvd, cvd);
- vdev_remove_child(pvd, mvd);
-
- /*
- * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
- * Otherwise, we could have detached an offline device, and when we
- * go to import the pool we'll think we have two top-level vdevs,
- * instead of a different version of the same top-level vdev.
- */
- if (mvd->vdev_top == mvd) {
- uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
- cvd->vdev_orig_guid = cvd->vdev_guid;
- cvd->vdev_guid += guid_delta;
- cvd->vdev_guid_sum += guid_delta;
- }
- cvd->vdev_id = mvd->vdev_id;
- vdev_add_child(pvd, cvd);
- vdev_top_update(cvd->vdev_top, cvd->vdev_top);
-
- if (cvd == cvd->vdev_top)
- vdev_top_transfer(mvd, cvd);
-
- ASSERT(mvd->vdev_children == 0);
- vdev_free(mvd);
-}
-
-static void
-vdev_metaslab_group_create(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- /*
- * metaslab_group_create was delayed until allocation bias was available
- */
- if (vd->vdev_mg == NULL) {
- metaslab_class_t *mc;
-
- if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
- vd->vdev_alloc_bias = VDEV_BIAS_LOG;
-
- ASSERT3U(vd->vdev_islog, ==,
- (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
-
- switch (vd->vdev_alloc_bias) {
- case VDEV_BIAS_LOG:
- mc = spa_log_class(spa);
- break;
- case VDEV_BIAS_SPECIAL:
- mc = spa_special_class(spa);
- break;
- case VDEV_BIAS_DEDUP:
- mc = spa_dedup_class(spa);
- break;
- default:
- mc = spa_normal_class(spa);
- }
-
- vd->vdev_mg = metaslab_group_create(mc, vd,
- spa->spa_alloc_count);
-
- /*
- * The spa ashift values currently only reflect the
- * general vdev classes. Class destination is late
- * binding so ashift checking had to wait until now
- */
- if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
- mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
- if (vd->vdev_ashift > spa->spa_max_ashift)
- spa->spa_max_ashift = vd->vdev_ashift;
- if (vd->vdev_ashift < spa->spa_min_ashift)
- spa->spa_min_ashift = vd->vdev_ashift;
- }
- }
-}
-
-int
-vdev_metaslab_init(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- uint64_t m;
- uint64_t oldc = vd->vdev_ms_count;
- uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
- metaslab_t **mspp;
- int error;
- boolean_t expanding = (oldc != 0);
-
- ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
-
- /*
- * This vdev is not being allocated from yet or is a hole.
- */
- if (vd->vdev_ms_shift == 0)
- return (0);
-
- ASSERT(!vd->vdev_ishole);
-
- ASSERT(oldc <= newc);
-
- mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
-
- if (expanding) {
- bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
- kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
- }
-
- vd->vdev_ms = mspp;
- vd->vdev_ms_count = newc;
- for (m = oldc; m < newc; m++) {
- uint64_t object = 0;
-
- /*
- * vdev_ms_array may be 0 if we are creating the "fake"
- * metaslabs for an indirect vdev for zdb's leak detection.
- * See zdb_leak_init().
- */
- if (txg == 0 && vd->vdev_ms_array != 0) {
- error = dmu_read(mos, vd->vdev_ms_array,
- m * sizeof (uint64_t), sizeof (uint64_t), &object,
- DMU_READ_PREFETCH);
- if (error != 0) {
- vdev_dbgmsg(vd, "unable to read the metaslab "
- "array [error=%d]", error);
- return (error);
- }
- }
-
-#ifndef _KERNEL
- /*
- * To accomodate zdb_leak_init() fake indirect
- * metaslabs, we allocate a metaslab group for
- * indirect vdevs which normally don't have one.
- */
- if (vd->vdev_mg == NULL) {
- ASSERT0(vdev_is_concrete(vd));
- vdev_metaslab_group_create(vd);
- }
-#endif
- error = metaslab_init(vd->vdev_mg, m, object, txg,
- &(vd->vdev_ms[m]));
- if (error != 0) {
- vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
- error);
- return (error);
- }
- }
-
- if (txg == 0)
- spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
-
- /*
- * If the vdev is being removed we don't activate
- * the metaslabs since we want to ensure that no new
- * allocations are performed on this device.
- */
- if (!expanding && !vd->vdev_removing) {
- metaslab_group_activate(vd->vdev_mg);
- }
-
- if (txg == 0)
- spa_config_exit(spa, SCL_ALLOC, FTAG);
-
- return (0);
-}
-
-void
-vdev_metaslab_fini(vdev_t *vd)
-{
- if (vd->vdev_checkpoint_sm != NULL) {
- ASSERT(spa_feature_is_active(vd->vdev_spa,
- SPA_FEATURE_POOL_CHECKPOINT));
- space_map_close(vd->vdev_checkpoint_sm);
- /*
- * Even though we close the space map, we need to set its
- * pointer to NULL. The reason is that vdev_metaslab_fini()
- * may be called multiple times for certain operations
- * (i.e. when destroying a pool) so we need to ensure that
- * this clause never executes twice. This logic is similar
- * to the one used for the vdev_ms clause below.
- */
- vd->vdev_checkpoint_sm = NULL;
- }
-
- if (vd->vdev_ms != NULL) {
- metaslab_group_t *mg = vd->vdev_mg;
- metaslab_group_passivate(mg);
-
- uint64_t count = vd->vdev_ms_count;
- for (uint64_t m = 0; m < count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- if (msp != NULL)
- metaslab_fini(msp);
- }
- kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
- vd->vdev_ms = NULL;
-
- vd->vdev_ms_count = 0;
-
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- ASSERT0(mg->mg_histogram[i]);
- }
- ASSERT0(vd->vdev_ms_count);
-}
-
-typedef struct vdev_probe_stats {
- boolean_t vps_readable;
- boolean_t vps_writeable;
- int vps_flags;
-} vdev_probe_stats_t;
-
-static void
-vdev_probe_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- vdev_t *vd = zio->io_vd;
- vdev_probe_stats_t *vps = zio->io_private;
-
- ASSERT(vd->vdev_probe_zio != NULL);
-
- if (zio->io_type == ZIO_TYPE_READ) {
- if (zio->io_error == 0)
- vps->vps_readable = 1;
- if (zio->io_error == 0 && spa_writeable(spa)) {
- zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
- zio->io_offset, zio->io_size, zio->io_abd,
- ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
- ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
- } else {
- abd_free(zio->io_abd);
- }
- } else if (zio->io_type == ZIO_TYPE_WRITE) {
- if (zio->io_error == 0)
- vps->vps_writeable = 1;
- abd_free(zio->io_abd);
- } else if (zio->io_type == ZIO_TYPE_NULL) {
- zio_t *pio;
-
- vd->vdev_cant_read |= !vps->vps_readable;
- vd->vdev_cant_write |= !vps->vps_writeable;
-
- if (vdev_readable(vd) &&
- (vdev_writeable(vd) || !spa_writeable(spa))) {
- zio->io_error = 0;
- } else {
- ASSERT(zio->io_error != 0);
- vdev_dbgmsg(vd, "failed probe");
- zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
- spa, vd, NULL, 0, 0);
- zio->io_error = SET_ERROR(ENXIO);
- }
-
- mutex_enter(&vd->vdev_probe_lock);
- ASSERT(vd->vdev_probe_zio == zio);
- vd->vdev_probe_zio = NULL;
- mutex_exit(&vd->vdev_probe_lock);
-
- zio_link_t *zl = NULL;
- while ((pio = zio_walk_parents(zio, &zl)) != NULL)
- if (!vdev_accessible(vd, pio))
- pio->io_error = SET_ERROR(ENXIO);
-
- kmem_free(vps, sizeof (*vps));
- }
-}
-
-/*
- * Determine whether this device is accessible.
- *
- * Read and write to several known locations: the pad regions of each
- * vdev label but the first, which we leave alone in case it contains
- * a VTOC.
- */
-zio_t *
-vdev_probe(vdev_t *vd, zio_t *zio)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_probe_stats_t *vps = NULL;
- zio_t *pio;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- /*
- * Don't probe the probe.
- */
- if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
- return (NULL);
-
- /*
- * To prevent 'probe storms' when a device fails, we create
- * just one probe i/o at a time. All zios that want to probe
- * this vdev will become parents of the probe io.
- */
- mutex_enter(&vd->vdev_probe_lock);
-
- if ((pio = vd->vdev_probe_zio) == NULL) {
- vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
-
- vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
- ZIO_FLAG_TRYHARD;
-
- if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
- /*
- * vdev_cant_read and vdev_cant_write can only
- * transition from TRUE to FALSE when we have the
- * SCL_ZIO lock as writer; otherwise they can only
- * transition from FALSE to TRUE. This ensures that
- * any zio looking at these values can assume that
- * failures persist for the life of the I/O. That's
- * important because when a device has intermittent
- * connectivity problems, we want to ensure that
- * they're ascribed to the device (ENXIO) and not
- * the zio (EIO).
- *
- * Since we hold SCL_ZIO as writer here, clear both
- * values so the probe can reevaluate from first
- * principles.
- */
- vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
- vd->vdev_cant_read = B_FALSE;
- vd->vdev_cant_write = B_FALSE;
- }
-
- vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
- vdev_probe_done, vps,
- vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
-
- /*
- * We can't change the vdev state in this context, so we
- * kick off an async task to do it on our behalf.
- */
- if (zio != NULL) {
- vd->vdev_probe_wanted = B_TRUE;
- spa_async_request(spa, SPA_ASYNC_PROBE);
- }
- }
-
- if (zio != NULL)
- zio_add_child(zio, pio);
-
- mutex_exit(&vd->vdev_probe_lock);
-
- if (vps == NULL) {
- ASSERT(zio != NULL);
- return (NULL);
- }
-
- for (int l = 1; l < VDEV_LABELS; l++) {
- zio_nowait(zio_read_phys(pio, vd,
- vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
- abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
- ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
- ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
- }
-
- if (zio == NULL)
- return (pio);
-
- zio_nowait(pio);
- return (NULL);
-}
-
-static void
-vdev_open_child(void *arg)
-{
- vdev_t *vd = arg;
-
- vd->vdev_open_thread = curthread;
- vd->vdev_open_error = vdev_open(vd);
- vd->vdev_open_thread = NULL;
-}
-
-boolean_t
-vdev_uses_zvols(vdev_t *vd)
-{
- if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
- strlen(ZVOL_DIR)) == 0)
- return (B_TRUE);
- for (int c = 0; c < vd->vdev_children; c++)
- if (vdev_uses_zvols(vd->vdev_child[c]))
- return (B_TRUE);
- return (B_FALSE);
-}
-
-void
-vdev_open_children(vdev_t *vd)
-{
- taskq_t *tq;
- int children = vd->vdev_children;
-
- vd->vdev_nonrot = B_TRUE;
-
- /*
- * in order to handle pools on top of zvols, do the opens
- * in a single thread so that the same thread holds the
- * spa_namespace_lock
- */
- if (B_TRUE || vdev_uses_zvols(vd)) {
- for (int c = 0; c < children; c++) {
- vd->vdev_child[c]->vdev_open_error =
- vdev_open(vd->vdev_child[c]);
- vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
- }
- return;
- }
- tq = taskq_create("vdev_open", children, minclsyspri,
- children, children, TASKQ_PREPOPULATE);
-
- for (int c = 0; c < children; c++)
- VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
- TQ_SLEEP) != 0);
-
- taskq_destroy(tq);
-
- for (int c = 0; c < children; c++)
- vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
-}
-
-/*
- * Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
- */
-static void
-vdev_set_deflate_ratio(vdev_t *vd)
-{
- if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
- vd->vdev_deflate_ratio = (1 << 17) /
- (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
- }
-}
-
-/*
- * Prepare a virtual device for access.
- */
-int
-vdev_open(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- int error;
- uint64_t osize = 0;
- uint64_t max_osize = 0;
- uint64_t asize, max_asize, psize;
- uint64_t logical_ashift = 0;
- uint64_t physical_ashift = 0;
-
- ASSERT(vd->vdev_open_thread == curthread ||
- spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
- ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
- vd->vdev_state == VDEV_STATE_CANT_OPEN ||
- vd->vdev_state == VDEV_STATE_OFFLINE);
-
- vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
- vd->vdev_cant_read = B_FALSE;
- vd->vdev_cant_write = B_FALSE;
- vd->vdev_notrim = B_FALSE;
- vd->vdev_min_asize = vdev_get_min_asize(vd);
-
- /*
- * If this vdev is not removed, check its fault status. If it's
- * faulted, bail out of the open.
- */
- if (!vd->vdev_removed && vd->vdev_faulted) {
- ASSERT(vd->vdev_children == 0);
- ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
- vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
- vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
- vd->vdev_label_aux);
- return (SET_ERROR(ENXIO));
- } else if (vd->vdev_offline) {
- ASSERT(vd->vdev_children == 0);
- vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
- return (SET_ERROR(ENXIO));
- }
-
- error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
- &logical_ashift, &physical_ashift);
-
- /*
- * Reset the vdev_reopening flag so that we actually close
- * the vdev on error.
- */
- vd->vdev_reopening = B_FALSE;
- if (zio_injection_enabled && error == 0)
- error = zio_handle_device_injection(vd, NULL, ENXIO);
-
- if (error) {
- if (vd->vdev_removed &&
- vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
- vd->vdev_removed = B_FALSE;
-
- if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
- vd->vdev_stat.vs_aux);
- } else {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- vd->vdev_stat.vs_aux);
- }
- return (error);
- }
-
- vd->vdev_removed = B_FALSE;
-
- /*
- * Recheck the faulted flag now that we have confirmed that
- * the vdev is accessible. If we're faulted, bail.
- */
- if (vd->vdev_faulted) {
- ASSERT(vd->vdev_children == 0);
- ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
- vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
- vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
- vd->vdev_label_aux);
- return (SET_ERROR(ENXIO));
- }
-
- if (vd->vdev_degraded) {
- ASSERT(vd->vdev_children == 0);
- vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
- } else {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
- }
-
- /*
- * For hole or missing vdevs we just return success.
- */
- if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
- return (0);
-
- if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
- trim_map_create(vd);
-
- for (int c = 0; c < vd->vdev_children; c++) {
- if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
- VDEV_AUX_NONE);
- break;
- }
- }
-
- osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
- max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
-
- if (vd->vdev_children == 0) {
- if (osize < SPA_MINDEVSIZE) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_TOO_SMALL);
- return (SET_ERROR(EOVERFLOW));
- }
- psize = osize;
- asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
- max_asize = max_osize - (VDEV_LABEL_START_SIZE +
- VDEV_LABEL_END_SIZE);
- } else {
- if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
- (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_TOO_SMALL);
- return (SET_ERROR(EOVERFLOW));
- }
- psize = 0;
- asize = osize;
- max_asize = max_osize;
- }
-
- vd->vdev_psize = psize;
-
- /*
- * Make sure the allocatable size hasn't shrunk too much.
- */
- if (asize < vd->vdev_min_asize) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (SET_ERROR(EINVAL));
- }
-
- vd->vdev_physical_ashift =
- MAX(physical_ashift, vd->vdev_physical_ashift);
- vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
- vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
-
- if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_ASHIFT_TOO_BIG);
- return (EINVAL);
- }
-
- if (vd->vdev_asize == 0) {
- /*
- * This is the first-ever open, so use the computed values.
- * For testing purposes, a higher ashift can be requested.
- */
- vd->vdev_asize = asize;
- vd->vdev_max_asize = max_asize;
- } else {
- /*
- * Make sure the alignment requirement hasn't increased.
- */
- if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
- vd->vdev_ops->vdev_op_leaf) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
- vd->vdev_max_asize = max_asize;
- }
-
- /*
- * If all children are healthy we update asize if either:
- * The asize has increased, due to a device expansion caused by dynamic
- * LUN growth or vdev replacement, and automatic expansion is enabled;
- * making the additional space available.
- *
- * The asize has decreased, due to a device shrink usually caused by a
- * vdev replace with a smaller device. This ensures that calculations
- * based of max_asize and asize e.g. esize are always valid. It's safe
- * to do this as we've already validated that asize is greater than
- * vdev_min_asize.
- */
- if (vd->vdev_state == VDEV_STATE_HEALTHY &&
- ((asize > vd->vdev_asize &&
- (vd->vdev_expanding || spa->spa_autoexpand)) ||
- (asize < vd->vdev_asize)))
- vd->vdev_asize = asize;
-
- vdev_set_min_asize(vd);
-
- /*
- * Ensure we can issue some IO before declaring the
- * vdev open for business.
- */
- if (vd->vdev_ops->vdev_op_leaf &&
- (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
- VDEV_AUX_ERR_EXCEEDED);
- return (error);
- }
-
- /*
- * Track the min and max ashift values for normal data devices.
- *
- * DJB - TBD these should perhaps be tracked per allocation class
- * (e.g. spa_min_ashift is used to round up post compression buffers)
- */
- if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
- vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
- vd->vdev_aux == NULL) {
- if (vd->vdev_ashift > spa->spa_max_ashift)
- spa->spa_max_ashift = vd->vdev_ashift;
- if (vd->vdev_ashift < spa->spa_min_ashift)
- spa->spa_min_ashift = vd->vdev_ashift;
- }
-
- /*
- * If a leaf vdev has a DTL, and seems healthy, then kick off a
- * resilver. But don't do this if we are doing a reopen for a scrub,
- * since this would just restart the scrub we are already doing.
- */
- if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
- vdev_resilver_needed(vd, NULL, NULL))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
-
- return (0);
-}
-
-/*
- * Called once the vdevs are all opened, this routine validates the label
- * contents. This needs to be done before vdev_load() so that we don't
- * inadvertently do repair I/Os to the wrong device.
- *
- * This function will only return failure if one of the vdevs indicates that it
- * has since been destroyed or exported. This is only possible if
- * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
- * will be updated but the function will return 0.
- */
-int
-vdev_validate(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- nvlist_t *label;
- uint64_t guid = 0, aux_guid = 0, top_guid;
- uint64_t state;
- nvlist_t *nvl;
- uint64_t txg;
-
- if (vdev_validate_skip)
- return (0);
-
- for (uint64_t c = 0; c < vd->vdev_children; c++)
- if (vdev_validate(vd->vdev_child[c]) != 0)
- return (SET_ERROR(EBADF));
-
- /*
- * If the device has already failed, or was marked offline, don't do
- * any further validation. Otherwise, label I/O will fail and we will
- * overwrite the previous state.
- */
- if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
- return (0);
-
- /*
- * If we are performing an extreme rewind, we allow for a label that
- * was modified at a point after the current txg.
- * If config lock is not held do not check for the txg. spa_sync could
- * be updating the vdev's label before updating spa_last_synced_txg.
- */
- if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
- spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
- txg = UINT64_MAX;
- else
- txg = spa_last_synced_txg(spa);
-
- if ((label = vdev_label_read_config(vd, txg)) == NULL) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
- "txg %llu", (u_longlong_t)txg);
- return (0);
- }
-
- /*
- * Determine if this vdev has been split off into another
- * pool. If so, then refuse to open it.
- */
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
- &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_SPLIT_POOL);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
- ZPOOL_CONFIG_POOL_GUID);
- return (0);
- }
-
- /*
- * If config is not trusted then ignore the spa guid check. This is
- * necessary because if the machine crashed during a re-guid the new
- * guid might have been written to all of the vdev labels, but not the
- * cached config. The check will be performed again once we have the
- * trusted config from the MOS.
- */
- if (spa->spa_trust_config && guid != spa_guid(spa)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
- "match config (%llu != %llu)", (u_longlong_t)guid,
- (u_longlong_t)spa_guid(spa));
- return (0);
- }
-
- if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
- != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
- &aux_guid) != 0)
- aux_guid = 0;
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
- ZPOOL_CONFIG_GUID);
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
- != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
- ZPOOL_CONFIG_TOP_GUID);
- return (0);
- }
-
- /*
- * If this vdev just became a top-level vdev because its sibling was
- * detached, it will have adopted the parent's vdev guid -- but the
- * label may or may not be on disk yet. Fortunately, either version
- * of the label will have the same top guid, so if we're a top-level
- * vdev, we can safely compare to that instead.
- * However, if the config comes from a cachefile that failed to update
- * after the detach, a top-level vdev will appear as a non top-level
- * vdev in the config. Also relax the constraints if we perform an
- * extreme rewind.
- *
- * If we split this vdev off instead, then we also check the
- * original pool's guid. We don't want to consider the vdev
- * corrupt if it is partway through a split operation.
- */
- if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
- boolean_t mismatch = B_FALSE;
- if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
- if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
- mismatch = B_TRUE;
- } else {
- if (vd->vdev_guid != top_guid &&
- vd->vdev_top->vdev_guid != guid)
- mismatch = B_TRUE;
- }
-
- if (mismatch) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: config guid "
- "doesn't match label guid");
- vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
- (u_longlong_t)vd->vdev_guid,
- (u_longlong_t)vd->vdev_top->vdev_guid);
- vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
- "aux_guid %llu", (u_longlong_t)guid,
- (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
- return (0);
- }
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
- ZPOOL_CONFIG_POOL_STATE);
- return (0);
- }
-
- nvlist_free(label);
-
- /*
- * If this is a verbatim import, no need to check the
- * state of the pool.
- */
- if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
- spa_load_state(spa) == SPA_LOAD_OPEN &&
- state != POOL_STATE_ACTIVE) {
- vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
- "for spa %s", (u_longlong_t)state, spa->spa_name);
- return (SET_ERROR(EBADF));
- }
-
- /*
- * If we were able to open and validate a vdev that was
- * previously marked permanently unavailable, clear that state
- * now.
- */
- if (vd->vdev_not_present)
- vd->vdev_not_present = 0;
-
- return (0);
-}
-
-static void
-vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
-{
- if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
- if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
- zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
- "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
- dvd->vdev_path, svd->vdev_path);
- spa_strfree(dvd->vdev_path);
- dvd->vdev_path = spa_strdup(svd->vdev_path);
- }
- } else if (svd->vdev_path != NULL) {
- dvd->vdev_path = spa_strdup(svd->vdev_path);
- zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
- (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
- }
-}
-
-/*
- * Recursively copy vdev paths from one vdev to another. Source and destination
- * vdev trees must have same geometry otherwise return error. Intended to copy
- * paths from userland config into MOS config.
- */
-int
-vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
-{
- if ((svd->vdev_ops == &vdev_missing_ops) ||
- (svd->vdev_ishole && dvd->vdev_ishole) ||
- (dvd->vdev_ops == &vdev_indirect_ops))
- return (0);
-
- if (svd->vdev_ops != dvd->vdev_ops) {
- vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
- svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
- return (SET_ERROR(EINVAL));
- }
-
- if (svd->vdev_guid != dvd->vdev_guid) {
- vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
- "%llu)", (u_longlong_t)svd->vdev_guid,
- (u_longlong_t)dvd->vdev_guid);
- return (SET_ERROR(EINVAL));
- }
-
- if (svd->vdev_children != dvd->vdev_children) {
- vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
- "%llu != %llu", (u_longlong_t)svd->vdev_children,
- (u_longlong_t)dvd->vdev_children);
- return (SET_ERROR(EINVAL));
- }
-
- for (uint64_t i = 0; i < svd->vdev_children; i++) {
- int error = vdev_copy_path_strict(svd->vdev_child[i],
- dvd->vdev_child[i]);
- if (error != 0)
- return (error);
- }
-
- if (svd->vdev_ops->vdev_op_leaf)
- vdev_copy_path_impl(svd, dvd);
-
- return (0);
-}
-
-static void
-vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
-{
- ASSERT(stvd->vdev_top == stvd);
- ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
-
- for (uint64_t i = 0; i < dvd->vdev_children; i++) {
- vdev_copy_path_search(stvd, dvd->vdev_child[i]);
- }
-
- if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
- return;
-
- /*
- * The idea here is that while a vdev can shift positions within
- * a top vdev (when replacing, attaching mirror, etc.) it cannot
- * step outside of it.
- */
- vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
-
- if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
- return;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- vdev_copy_path_impl(vd, dvd);
-}
-
-/*
- * Recursively copy vdev paths from one root vdev to another. Source and
- * destination vdev trees may differ in geometry. For each destination leaf
- * vdev, search a vdev with the same guid and top vdev id in the source.
- * Intended to copy paths from userland config into MOS config.
- */
-void
-vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
-{
- uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
- ASSERT(srvd->vdev_ops == &vdev_root_ops);
- ASSERT(drvd->vdev_ops == &vdev_root_ops);
-
- for (uint64_t i = 0; i < children; i++) {
- vdev_copy_path_search(srvd->vdev_child[i],
- drvd->vdev_child[i]);
- }
-}
-
-/*
- * Close a virtual device.
- */
-void
-vdev_close(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *pvd = vd->vdev_parent;
-
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
- /*
- * If our parent is reopening, then we are as well, unless we are
- * going offline.
- */
- if (pvd != NULL && pvd->vdev_reopening)
- vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
-
- vd->vdev_ops->vdev_op_close(vd);
-
- vdev_cache_purge(vd);
-
- if (vd->vdev_ops->vdev_op_leaf)
- trim_map_destroy(vd);
-
- /*
- * We record the previous state before we close it, so that if we are
- * doing a reopen(), we don't generate FMA ereports if we notice that
- * it's still faulted.
- */
- vd->vdev_prevstate = vd->vdev_state;
-
- if (vd->vdev_offline)
- vd->vdev_state = VDEV_STATE_OFFLINE;
- else
- vd->vdev_state = VDEV_STATE_CLOSED;
- vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
-}
-
-void
-vdev_hold(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_is_root(spa));
- if (spa->spa_state == POOL_STATE_UNINITIALIZED)
- return;
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_hold(vd->vdev_child[c]);
-
- if (vd->vdev_ops->vdev_op_leaf)
- vd->vdev_ops->vdev_op_hold(vd);
-}
-
-void
-vdev_rele(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_is_root(spa));
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_rele(vd->vdev_child[c]);
-
- if (vd->vdev_ops->vdev_op_leaf)
- vd->vdev_ops->vdev_op_rele(vd);
-}
-
-/*
- * Reopen all interior vdevs and any unopened leaves. We don't actually
- * reopen leaf vdevs which had previously been opened as they might deadlock
- * on the spa_config_lock. Instead we only obtain the leaf's physical size.
- * If the leaf has never been opened then open it, as usual.
- */
-void
-vdev_reopen(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
- /* set the reopening flag unless we're taking the vdev offline */
- vd->vdev_reopening = !vd->vdev_offline;
- vdev_close(vd);
- (void) vdev_open(vd);
-
- /*
- * Call vdev_validate() here to make sure we have the same device.
- * Otherwise, a device with an invalid label could be successfully
- * opened in response to vdev_reopen().
- */
- if (vd->vdev_aux) {
- (void) vdev_validate_aux(vd);
- if (vdev_readable(vd) && vdev_writeable(vd) &&
- vd->vdev_aux == &spa->spa_l2cache &&
- !l2arc_vdev_present(vd))
- l2arc_add_vdev(spa, vd);
- } else {
- (void) vdev_validate(vd);
- }
-
- /*
- * Reassess parent vdev's health.
- */
- vdev_propagate_state(vd);
-}
-
-int
-vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
-{
- int error;
-
- /*
- * Normally, partial opens (e.g. of a mirror) are allowed.
- * For a create, however, we want to fail the request if
- * there are any components we can't open.
- */
- error = vdev_open(vd);
-
- if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
- vdev_close(vd);
- return (error ? error : ENXIO);
- }
-
- /*
- * Recursively load DTLs and initialize all labels.
- */
- if ((error = vdev_dtl_load(vd)) != 0 ||
- (error = vdev_label_init(vd, txg, isreplacing ?
- VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
- vdev_close(vd);
- return (error);
- }
-
- return (0);
-}
-
-void
-vdev_metaslab_set_size(vdev_t *vd)
-{
- uint64_t asize = vd->vdev_asize;
- uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
- uint64_t ms_shift;
-
- /*
- * There are two dimensions to the metaslab sizing calculation:
- * the size of the metaslab and the count of metaslabs per vdev.
- *
- * The default values used below are a good balance between memory
- * usage (larger metaslab size means more memory needed for loaded
- * metaslabs; more metaslabs means more memory needed for the
- * metaslab_t structs), metaslab load time (larger metaslabs take
- * longer to load), and metaslab sync time (more metaslabs means
- * more time spent syncing all of them).
- *
- * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
- * The range of the dimensions are as follows:
- *
- * 2^29 <= ms_size <= 2^34
- * 16 <= ms_count <= 131,072
- *
- * On the lower end of vdev sizes, we aim for metaslabs sizes of
- * at least 512MB (2^29) to minimize fragmentation effects when
- * testing with smaller devices. However, the count constraint
- * of at least 16 metaslabs will override this minimum size goal.
- *
- * On the upper end of vdev sizes, we aim for a maximum metaslab
- * size of 16GB. However, we will cap the total count to 2^17
- * metaslabs to keep our memory footprint in check and let the
- * metaslab size grow from there if that limit is hit.
- *
- * The net effect of applying above constrains is summarized below.
- *
- * vdev size metaslab count
- * --------------|-----------------
- * < 8GB ~16
- * 8GB - 100GB one per 512MB
- * 100GB - 3TB ~200
- * 3TB - 2PB one per 16GB
- * > 2PB ~131,072
- * --------------------------------
- *
- * Finally, note that all of the above calculate the initial
- * number of metaslabs. Expanding a top-level vdev will result
- * in additional metaslabs being allocated making it possible
- * to exceed the zfs_vdev_ms_count_limit.
- */
-
- if (ms_count < zfs_vdev_min_ms_count)
- ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
- else if (ms_count > zfs_vdev_default_ms_count)
- ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
- else
- ms_shift = zfs_vdev_default_ms_shift;
-
- if (ms_shift < SPA_MAXBLOCKSHIFT) {
- ms_shift = SPA_MAXBLOCKSHIFT;
- } else if (ms_shift > zfs_vdev_max_ms_shift) {
- ms_shift = zfs_vdev_max_ms_shift;
- /* cap the total count to constrain memory footprint */
- if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
- ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
- }
-
- vd->vdev_ms_shift = ms_shift;
- ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
-}
-
-/*
- * Maximize performance by inflating the configured ashift for top level
- * vdevs to be as close to the physical ashift as possible while maintaining
- * administrator defined limits and ensuring it doesn't go below the
- * logical ashift.
- */
-void
-vdev_ashift_optimize(vdev_t *vd)
-{
- if (vd == vd->vdev_top) {
- if (vd->vdev_ashift < vd->vdev_physical_ashift) {
- vd->vdev_ashift = MIN(
- MAX(zfs_max_auto_ashift, vd->vdev_ashift),
- MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
- } else {
- /*
- * Unusual case where logical ashift > physical ashift
- * so we can't cap the calculated ashift based on max
- * ashift as that would cause failures.
- * We still check if we need to increase it to match
- * the min ashift.
- */
- vd->vdev_ashift = MAX(zfs_min_auto_ashift,
- vd->vdev_ashift);
- }
- }
-}
-
-void
-vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
-{
- ASSERT(vd == vd->vdev_top);
- /* indirect vdevs don't have metaslabs or dtls */
- ASSERT(vdev_is_concrete(vd) || flags == 0);
- ASSERT(ISP2(flags));
- ASSERT(spa_writeable(vd->vdev_spa));
-
- if (flags & VDD_METASLAB)
- (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
-
- if (flags & VDD_DTL)
- (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
-
- (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
-}
-
-void
-vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
-{
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
-
- if (vd->vdev_ops->vdev_op_leaf)
- vdev_dirty(vd->vdev_top, flags, vd, txg);
-}
-
-/*
- * DTLs.
- *
- * A vdev's DTL (dirty time log) is the set of transaction groups for which
- * the vdev has less than perfect replication. There are four kinds of DTL:
- *
- * DTL_MISSING: txgs for which the vdev has no valid copies of the data
- *
- * DTL_PARTIAL: txgs for which data is available, but not fully replicated
- *
- * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
- * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
- * txgs that was scrubbed.
- *
- * DTL_OUTAGE: txgs which cannot currently be read, whether due to
- * persistent errors or just some device being offline.
- * Unlike the other three, the DTL_OUTAGE map is not generally
- * maintained; it's only computed when needed, typically to
- * determine whether a device can be detached.
- *
- * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
- * either has the data or it doesn't.
- *
- * For interior vdevs such as mirror and RAID-Z the picture is more complex.
- * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
- * if any child is less than fully replicated, then so is its parent.
- * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
- * comprising only those txgs which appear in 'maxfaults' or more children;
- * those are the txgs we don't have enough replication to read. For example,
- * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
- * thus, its DTL_MISSING consists of the set of txgs that appear in more than
- * two child DTL_MISSING maps.
- *
- * It should be clear from the above that to compute the DTLs and outage maps
- * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
- * Therefore, that is all we keep on disk. When loading the pool, or after
- * a configuration change, we generate all other DTLs from first principles.
- */
-void
-vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
-{
- range_tree_t *rt = vd->vdev_dtl[t];
-
- ASSERT(t < DTL_TYPES);
- ASSERT(vd != vd->vdev_spa->spa_root_vdev);
- ASSERT(spa_writeable(vd->vdev_spa));
-
- mutex_enter(&vd->vdev_dtl_lock);
- if (!range_tree_contains(rt, txg, size))
- range_tree_add(rt, txg, size);
- mutex_exit(&vd->vdev_dtl_lock);
-}
-
-boolean_t
-vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
-{
- range_tree_t *rt = vd->vdev_dtl[t];
- boolean_t dirty = B_FALSE;
-
- ASSERT(t < DTL_TYPES);
- ASSERT(vd != vd->vdev_spa->spa_root_vdev);
-
- /*
- * While we are loading the pool, the DTLs have not been loaded yet.
- * Ignore the DTLs and try all devices. This avoids a recursive
- * mutex enter on the vdev_dtl_lock, and also makes us try hard
- * when loading the pool (relying on the checksum to ensure that
- * we get the right data -- note that we while loading, we are
- * only reading the MOS, which is always checksummed).
- */
- if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
- return (B_FALSE);
-
- mutex_enter(&vd->vdev_dtl_lock);
- if (!range_tree_is_empty(rt))
- dirty = range_tree_contains(rt, txg, size);
- mutex_exit(&vd->vdev_dtl_lock);
-
- return (dirty);
-}
-
-boolean_t
-vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
-{
- range_tree_t *rt = vd->vdev_dtl[t];
- boolean_t empty;
-
- mutex_enter(&vd->vdev_dtl_lock);
- empty = range_tree_is_empty(rt);
- mutex_exit(&vd->vdev_dtl_lock);
-
- return (empty);
-}
-
-/*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
- */
-boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
-{
- ASSERT(vd != vd->vdev_spa->spa_root_vdev);
-
- if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
- vd->vdev_ops->vdev_op_leaf)
- return (B_TRUE);
-
- return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
-}
-
-/*
- * Returns the lowest txg in the DTL range.
- */
-static uint64_t
-vdev_dtl_min(vdev_t *vd)
-{
- range_seg_t *rs;
-
- ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
- ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
- ASSERT0(vd->vdev_children);
-
- rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
- return (rs->rs_start - 1);
-}
-
-/*
- * Returns the highest txg in the DTL.
- */
-static uint64_t
-vdev_dtl_max(vdev_t *vd)
-{
- range_seg_t *rs;
-
- ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
- ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
- ASSERT0(vd->vdev_children);
-
- rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
- return (rs->rs_end);
-}
-
-/*
- * Determine if a resilvering vdev should remove any DTL entries from
- * its range. If the vdev was resilvering for the entire duration of the
- * scan then it should excise that range from its DTLs. Otherwise, this
- * vdev is considered partially resilvered and should leave its DTL
- * entries intact. The comment in vdev_dtl_reassess() describes how we
- * excise the DTLs.
- */
-static boolean_t
-vdev_dtl_should_excise(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
-
- ASSERT0(scn->scn_phys.scn_errors);
- ASSERT0(vd->vdev_children);
-
- if (vd->vdev_state < VDEV_STATE_DEGRADED)
- return (B_FALSE);
-
- if (vd->vdev_resilver_txg == 0 ||
- range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
- return (B_TRUE);
-
- /*
- * When a resilver is initiated the scan will assign the scn_max_txg
- * value to the highest txg value that exists in all DTLs. If this
- * device's max DTL is not part of this scan (i.e. it is not in
- * the range (scn_min_txg, scn_max_txg] then it is not eligible
- * for excision.
- */
- if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
- ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
- ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
- ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Reassess DTLs after a config change or scrub completion.
- */
-void
-vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
-{
- spa_t *spa = vd->vdev_spa;
- avl_tree_t reftree;
- int minref;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_dtl_reassess(vd->vdev_child[c], txg,
- scrub_txg, scrub_done);
-
- if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
- return;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
-
- mutex_enter(&vd->vdev_dtl_lock);
-
- /*
- * If we've completed a scan cleanly then determine
- * if this vdev should remove any DTLs. We only want to
- * excise regions on vdevs that were available during
- * the entire duration of this scan.
- */
- if (scrub_txg != 0 &&
- (spa->spa_scrub_started ||
- (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
- vdev_dtl_should_excise(vd)) {
- /*
- * We completed a scrub up to scrub_txg. If we
- * did it without rebooting, then the scrub dtl
- * will be valid, so excise the old region and
- * fold in the scrub dtl. Otherwise, leave the
- * dtl as-is if there was an error.
- *
- * There's little trick here: to excise the beginning
- * of the DTL_MISSING map, we put it into a reference
- * tree and then add a segment with refcnt -1 that
- * covers the range [0, scrub_txg). This means
- * that each txg in that range has refcnt -1 or 0.
- * We then add DTL_SCRUB with a refcnt of 2, so that
- * entries in the range [0, scrub_txg) will have a
- * positive refcnt -- either 1 or 2. We then convert
- * the reference tree into the new DTL_MISSING map.
- */
- space_reftree_create(&reftree);
- space_reftree_add_map(&reftree,
- vd->vdev_dtl[DTL_MISSING], 1);
- space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
- space_reftree_add_map(&reftree,
- vd->vdev_dtl[DTL_SCRUB], 2);
- space_reftree_generate_map(&reftree,
- vd->vdev_dtl[DTL_MISSING], 1);
- space_reftree_destroy(&reftree);
- }
- range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
- range_tree_walk(vd->vdev_dtl[DTL_MISSING],
- range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
- if (scrub_done)
- range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
- range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
- if (!vdev_readable(vd))
- range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
- else
- range_tree_walk(vd->vdev_dtl[DTL_MISSING],
- range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
-
- /*
- * If the vdev was resilvering and no longer has any
- * DTLs then reset its resilvering flag and dirty
- * the top level so that we persist the change.
- */
- if (vd->vdev_resilver_txg != 0 &&
- range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
- range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
- vd->vdev_resilver_txg = 0;
- vdev_config_dirty(vd->vdev_top);
- }
-
- mutex_exit(&vd->vdev_dtl_lock);
-
- if (txg != 0)
- vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- return;
- }
-
- mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
- /* account for child's outage in parent's missing map */
- int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
- if (t == DTL_SCRUB)
- continue; /* leaf vdevs only */
- if (t == DTL_PARTIAL)
- minref = 1; /* i.e. non-zero */
- else if (vd->vdev_nparity != 0)
- minref = vd->vdev_nparity + 1; /* RAID-Z */
- else
- minref = vd->vdev_children; /* any kind of mirror */
- space_reftree_create(&reftree);
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- mutex_enter(&cvd->vdev_dtl_lock);
- space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
- mutex_exit(&cvd->vdev_dtl_lock);
- }
- space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
- space_reftree_destroy(&reftree);
- }
- mutex_exit(&vd->vdev_dtl_lock);
-}
-
-int
-vdev_dtl_load(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- int error = 0;
-
- if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
- ASSERT(vdev_is_concrete(vd));
-
- error = space_map_open(&vd->vdev_dtl_sm, mos,
- vd->vdev_dtl_object, 0, -1ULL, 0);
- if (error)
- return (error);
- ASSERT(vd->vdev_dtl_sm != NULL);
-
- mutex_enter(&vd->vdev_dtl_lock);
- error = space_map_load(vd->vdev_dtl_sm,
- vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
- mutex_exit(&vd->vdev_dtl_lock);
-
- return (error);
- }
-
- for (int c = 0; c < vd->vdev_children; c++) {
- error = vdev_dtl_load(vd->vdev_child[c]);
- if (error != 0)
- break;
- }
-
- return (error);
-}
-
-static void
-vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
-{
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
- const char *string;
-
- ASSERT(alloc_bias != VDEV_BIAS_NONE);
-
- string =
- (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
- (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
- (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
-
- ASSERT(string != NULL);
- VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
- 1, strlen(string) + 1, string, tx));
-
- if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
- spa_activate_allocation_classes(spa, tx);
- }
-}
-
-void
-vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
-{
- spa_t *spa = vd->vdev_spa;
-
- VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
- VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
- zapobj, tx));
-}
-
-uint64_t
-vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
-{
- spa_t *spa = vd->vdev_spa;
- uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
- DMU_OT_NONE, 0, tx);
-
- ASSERT(zap != 0);
- VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
- zap, tx));
-
- return (zap);
-}
-
-void
-vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
-{
- if (vd->vdev_ops != &vdev_hole_ops &&
- vd->vdev_ops != &vdev_missing_ops &&
- vd->vdev_ops != &vdev_root_ops &&
- !vd->vdev_top->vdev_removing) {
- if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
- vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
- }
- if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
- vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
- if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
- vdev_zap_allocation_data(vd, tx);
- }
- }
-
- for (uint64_t i = 0; i < vd->vdev_children; i++) {
- vdev_construct_zaps(vd->vdev_child[i], tx);
- }
-}
-
-void
-vdev_dtl_sync(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
- objset_t *mos = spa->spa_meta_objset;
- range_tree_t *rtsync;
- dmu_tx_t *tx;
- uint64_t object = space_map_object(vd->vdev_dtl_sm);
-
- ASSERT(vdev_is_concrete(vd));
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_free(vd->vdev_dtl_sm, tx);
- space_map_close(vd->vdev_dtl_sm);
- vd->vdev_dtl_sm = NULL;
- mutex_exit(&vd->vdev_dtl_lock);
-
- /*
- * We only destroy the leaf ZAP for detached leaves or for
- * removed log devices. Removed data devices handle leaf ZAP
- * cleanup later, once cancellation is no longer possible.
- */
- if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
- vd->vdev_top->vdev_islog)) {
- vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
- vd->vdev_leaf_zap = 0;
- }
-
- dmu_tx_commit(tx);
- return;
- }
-
- if (vd->vdev_dtl_sm == NULL) {
- uint64_t new_object;
-
- new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx);
- VERIFY3U(new_object, !=, 0);
-
- VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
- 0, -1ULL, 0));
- ASSERT(vd->vdev_dtl_sm != NULL);
- }
-
- rtsync = range_tree_create(NULL, NULL);
-
- mutex_enter(&vd->vdev_dtl_lock);
- range_tree_walk(rt, range_tree_add, rtsync);
- mutex_exit(&vd->vdev_dtl_lock);
-
- space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
- space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
- range_tree_vacate(rtsync, NULL, NULL);
-
- range_tree_destroy(rtsync);
-
- /*
- * If the object for the space map has changed then dirty
- * the top level so that we update the config.
- */
- if (object != space_map_object(vd->vdev_dtl_sm)) {
- vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
- "new object %llu", (u_longlong_t)txg, spa_name(spa),
- (u_longlong_t)object,
- (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
- vdev_config_dirty(vd->vdev_top);
- }
-
- dmu_tx_commit(tx);
-}
-
-/*
- * Determine whether the specified vdev can be offlined/detached/removed
- * without losing data.
- */
-boolean_t
-vdev_dtl_required(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *tvd = vd->vdev_top;
- uint8_t cant_read = vd->vdev_cant_read;
- boolean_t required;
-
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
- if (vd == spa->spa_root_vdev || vd == tvd)
- return (B_TRUE);
-
- /*
- * Temporarily mark the device as unreadable, and then determine
- * whether this results in any DTL outages in the top-level vdev.
- * If not, we can safely offline/detach/remove the device.
- */
- vd->vdev_cant_read = B_TRUE;
- vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
- required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
- vd->vdev_cant_read = cant_read;
- vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
-
- if (!required && zio_injection_enabled)
- required = !!zio_handle_device_injection(vd, NULL, ECHILD);
-
- return (required);
-}
-
-/*
- * Determine if resilver is needed, and if so the txg range.
- */
-boolean_t
-vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
-{
- boolean_t needed = B_FALSE;
- uint64_t thismin = UINT64_MAX;
- uint64_t thismax = 0;
-
- if (vd->vdev_children == 0) {
- mutex_enter(&vd->vdev_dtl_lock);
- if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
- vdev_writeable(vd)) {
-
- thismin = vdev_dtl_min(vd);
- thismax = vdev_dtl_max(vd);
- needed = B_TRUE;
- }
- mutex_exit(&vd->vdev_dtl_lock);
- } else {
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- uint64_t cmin, cmax;
-
- if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
- thismin = MIN(thismin, cmin);
- thismax = MAX(thismax, cmax);
- needed = B_TRUE;
- }
- }
- }
-
- if (needed && minp) {
- *minp = thismin;
- *maxp = thismax;
- }
- return (needed);
-}
-
-/*
- * Gets the checkpoint space map object from the vdev's ZAP.
- * Returns the spacemap object, or 0 if it wasn't in the ZAP
- * or the ZAP doesn't exist yet.
- */
-int
-vdev_checkpoint_sm_object(vdev_t *vd)
-{
- ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
- if (vd->vdev_top_zap == 0) {
- return (0);
- }
-
- uint64_t sm_obj = 0;
- int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
- VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj);
-
- ASSERT(err == 0 || err == ENOENT);
-
- return (sm_obj);
-}
-
-int
-vdev_load(vdev_t *vd)
-{
- int error = 0;
- /*
- * Recursively load all children.
- */
- for (int c = 0; c < vd->vdev_children; c++) {
- error = vdev_load(vd->vdev_child[c]);
- if (error != 0) {
- return (error);
- }
- }
-
- vdev_set_deflate_ratio(vd);
-
- /*
- * On spa_load path, grab the allocation bias from our zap
- */
- if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
- spa_t *spa = vd->vdev_spa;
- char bias_str[64];
-
- if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
- bias_str) == 0) {
- ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
- vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
- }
- }
-
- /*
- * If this is a top-level vdev, initialize its metaslabs.
- */
- if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
- vdev_metaslab_group_create(vd);
-
- if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
- "asize=%llu", (u_longlong_t)vd->vdev_ashift,
- (u_longlong_t)vd->vdev_asize);
- return (SET_ERROR(ENXIO));
- }
-
- error = vdev_metaslab_init(vd, 0);
- if (error != 0) {
- vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
- "[error=%d]", error);
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- return (error);
- }
-
- uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
- if (checkpoint_sm_obj != 0) {
- objset_t *mos = spa_meta_objset(vd->vdev_spa);
- ASSERT(vd->vdev_asize != 0);
- ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
-
- error = space_map_open(&vd->vdev_checkpoint_sm,
- mos, checkpoint_sm_obj, 0, vd->vdev_asize,
- vd->vdev_ashift);
- if (error != 0) {
- vdev_dbgmsg(vd, "vdev_load: space_map_open "
- "failed for checkpoint spacemap (obj %llu) "
- "[error=%d]",
- (u_longlong_t)checkpoint_sm_obj, error);
- return (error);
- }
- ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
-
- /*
- * Since the checkpoint_sm contains free entries
- * exclusively we can use space_map_allocated() to
- * indicate the cumulative checkpointed space that
- * has been freed.
- */
- vd->vdev_stat.vs_checkpoint_space =
- -space_map_allocated(vd->vdev_checkpoint_sm);
- vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
- vd->vdev_stat.vs_checkpoint_space;
- }
- }
-
- /*
- * If this is a leaf vdev, load its DTL.
- */
- if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
- "[error=%d]", error);
- return (error);
- }
-
- uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
- if (obsolete_sm_object != 0) {
- objset_t *mos = vd->vdev_spa->spa_meta_objset;
- ASSERT(vd->vdev_asize != 0);
- ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
-
- if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
- obsolete_sm_object, 0, vd->vdev_asize, 0))) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
- "obsolete spacemap (obj %llu) [error=%d]",
- (u_longlong_t)obsolete_sm_object, error);
- return (error);
- }
- }
-
- return (0);
-}
-
-/*
- * The special vdev case is used for hot spares and l2cache devices. Its
- * sole purpose it to set the vdev state for the associated vdev. To do this,
- * we make sure that we can open the underlying device, then try to read the
- * label, and make sure that the label is sane and that it hasn't been
- * repurposed to another pool.
- */
-int
-vdev_validate_aux(vdev_t *vd)
-{
- nvlist_t *label;
- uint64_t guid, version;
- uint64_t state;
-
- if (!vdev_readable(vd))
- return (0);
-
- if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- return (-1);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
- !SPA_VERSION_IS_SUPPORTED(version) ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
- guid != vd->vdev_guid ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (-1);
- }
-
- /*
- * We don't actually check the pool state here. If it's in fact in
- * use by another pool, we update this fact on the fly when requested.
- */
- nvlist_free(label);
- return (0);
-}
-
-/*
- * Free the objects used to store this vdev's spacemaps, and the array
- * that points to them.
- */
-void
-vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
-{
- if (vd->vdev_ms_array == 0)
- return;
-
- objset_t *mos = vd->vdev_spa->spa_meta_objset;
- uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
- size_t array_bytes = array_count * sizeof (uint64_t);
- uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
- VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
- array_bytes, smobj_array, 0));
-
- for (uint64_t i = 0; i < array_count; i++) {
- uint64_t smobj = smobj_array[i];
- if (smobj == 0)
- continue;
-
- space_map_free_obj(mos, smobj, tx);
- }
-
- kmem_free(smobj_array, array_bytes);
- VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
- vd->vdev_ms_array = 0;
-}
-
-static void
-vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(vd->vdev_islog);
- ASSERT(vd == vd->vdev_top);
- ASSERT3U(txg, ==, spa_syncing_txg(spa));
-
- dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
-
- vdev_destroy_spacemaps(vd, tx);
- if (vd->vdev_top_zap != 0) {
- vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
- vd->vdev_top_zap = 0;
- }
-
- dmu_tx_commit(tx);
-}
-
-void
-vdev_sync_done(vdev_t *vd, uint64_t txg)
-{
- metaslab_t *msp;
- boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
-
- ASSERT(vdev_is_concrete(vd));
-
- while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
- != NULL)
- metaslab_sync_done(msp, txg);
-
- if (reassess)
- metaslab_sync_reassess(vd->vdev_mg);
-}
-
-void
-vdev_sync(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *lvd;
- metaslab_t *msp;
-
- ASSERT3U(txg, ==, spa->spa_syncing_txg);
- dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
- ASSERT(vd->vdev_removing ||
- vd->vdev_ops == &vdev_indirect_ops);
-
- vdev_indirect_sync_obsolete(vd, tx);
-
- /*
- * If the vdev is indirect, it can't have dirty
- * metaslabs or DTLs.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
- ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
- dmu_tx_commit(tx);
- return;
- }
- }
-
- ASSERT(vdev_is_concrete(vd));
-
- if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
- !vd->vdev_removing) {
- ASSERT(vd == vd->vdev_top);
- ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
- ASSERT(vd->vdev_ms_array != 0);
- vdev_config_dirty(vd);
- }
-
- while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
- metaslab_sync(msp, txg);
- (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
- }
-
- while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
- vdev_dtl_sync(lvd, txg);
-
- /*
- * If this is an empty log device being removed, destroy the
- * metadata associated with it.
- */
- if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
- vdev_remove_empty_log(vd, txg);
-
- (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
- dmu_tx_commit(tx);
-}
-
-uint64_t
-vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
-{
- return (vd->vdev_ops->vdev_op_asize(vd, psize));
-}
-
-/*
- * Mark the given vdev faulted. A faulted vdev behaves as if the device could
- * not be opened, and no I/O is attempted.
- */
-int
-vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
-{
- vdev_t *vd, *tvd;
-
- spa_vdev_state_enter(spa, SCL_NONE);
-
- if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
- return (spa_vdev_state_exit(spa, NULL, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
-
- tvd = vd->vdev_top;
-
- /*
- * We don't directly use the aux state here, but if we do a
- * vdev_reopen(), we need this value to be present to remember why we
- * were faulted.
- */
- vd->vdev_label_aux = aux;
-
- /*
- * Faulted state takes precedence over degraded.
- */
- vd->vdev_delayed_close = B_FALSE;
- vd->vdev_faulted = 1ULL;
- vd->vdev_degraded = 0ULL;
- vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
-
- /*
- * If this device has the only valid copy of the data, then
- * back off and simply mark the vdev as degraded instead.
- */
- if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
- vd->vdev_degraded = 1ULL;
- vd->vdev_faulted = 0ULL;
-
- /*
- * If we reopen the device and it's not dead, only then do we
- * mark it degraded.
- */
- vdev_reopen(tvd);
-
- if (vdev_readable(vd))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
- }
-
- return (spa_vdev_state_exit(spa, vd, 0));
-}
-
-/*
- * Mark the given vdev degraded. A degraded vdev is purely an indication to the
- * user that something is wrong. The vdev continues to operate as normal as far
- * as I/O is concerned.
- */
-int
-vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
-{
- vdev_t *vd;
-
- spa_vdev_state_enter(spa, SCL_NONE);
-
- if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
- return (spa_vdev_state_exit(spa, NULL, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
-
- /*
- * If the vdev is already faulted, then don't do anything.
- */
- if (vd->vdev_faulted || vd->vdev_degraded)
- return (spa_vdev_state_exit(spa, NULL, 0));
-
- vd->vdev_degraded = 1ULL;
- if (!vdev_is_dead(vd))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- aux);
-
- return (spa_vdev_state_exit(spa, vd, 0));
-}
-
-/*
- * Online the given vdev.
- *
- * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
- * spare device should be detached when the device finishes resilvering.
- * Second, the online should be treated like a 'test' online case, so no FMA
- * events are generated if the device fails to open.
- */
-int
-vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
-{
- vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
- boolean_t wasoffline;
- vdev_state_t oldstate;
-
- spa_vdev_state_enter(spa, SCL_NONE);
-
- if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
- return (spa_vdev_state_exit(spa, NULL, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
-
- wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
- oldstate = vd->vdev_state;
-
- tvd = vd->vdev_top;
- vd->vdev_offline = B_FALSE;
- vd->vdev_tmpoffline = B_FALSE;
- vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
- vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
-
- /* XXX - L2ARC 1.0 does not support expansion */
- if (!vd->vdev_aux) {
- for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
- pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
- }
-
- vdev_reopen(tvd);
- vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
-
- if (!vd->vdev_aux) {
- for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
- pvd->vdev_expanding = B_FALSE;
- }
-
- if (newstate)
- *newstate = vd->vdev_state;
- if ((flags & ZFS_ONLINE_UNSPARE) &&
- !vdev_is_dead(vd) && vd->vdev_parent &&
- vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
- vd->vdev_parent->vdev_child[0] == vd)
- vd->vdev_unspare = B_TRUE;
-
- if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
-
- /* XXX - L2ARC 1.0 does not support expansion */
- if (vd->vdev_aux)
- return (spa_vdev_state_exit(spa, vd, ENOTSUP));
- spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
- }
-
- /* Restart initializing if necessary */
- mutex_enter(&vd->vdev_initialize_lock);
- if (vdev_writeable(vd) &&
- vd->vdev_initialize_thread == NULL &&
- vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
- (void) vdev_initialize(vd);
- }
- mutex_exit(&vd->vdev_initialize_lock);
-
- if (wasoffline ||
- (oldstate < VDEV_STATE_DEGRADED &&
- vd->vdev_state >= VDEV_STATE_DEGRADED))
- spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
-
- return (spa_vdev_state_exit(spa, vd, 0));
-}
-
-static int
-vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
-{
- vdev_t *vd, *tvd;
- int error = 0;
- uint64_t generation;
- metaslab_group_t *mg;
-
-top:
- spa_vdev_state_enter(spa, SCL_ALLOC);
-
- if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
- return (spa_vdev_state_exit(spa, NULL, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
-
- tvd = vd->vdev_top;
- mg = tvd->vdev_mg;
- generation = spa->spa_config_generation + 1;
-
- /*
- * If the device isn't already offline, try to offline it.
- */
- if (!vd->vdev_offline) {
- /*
- * If this device has the only valid copy of some data,
- * don't allow it to be offlined. Log devices are always
- * expendable.
- */
- if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
- vdev_dtl_required(vd))
- return (spa_vdev_state_exit(spa, NULL, EBUSY));
-
- /*
- * If the top-level is a slog and it has had allocations
- * then proceed. We check that the vdev's metaslab group
- * is not NULL since it's possible that we may have just
- * added this vdev but not yet initialized its metaslabs.
- */
- if (tvd->vdev_islog && mg != NULL) {
- /*
- * Prevent any future allocations.
- */
- metaslab_group_passivate(mg);
- (void) spa_vdev_state_exit(spa, vd, 0);
-
- error = spa_reset_logs(spa);
-
- /*
- * If the log device was successfully reset but has
- * checkpointed data, do not offline it.
- */
- if (error == 0 &&
- tvd->vdev_checkpoint_sm != NULL) {
- error = ZFS_ERR_CHECKPOINT_EXISTS;
- }
-
- spa_vdev_state_enter(spa, SCL_ALLOC);
-
- /*
- * Check to see if the config has changed.
- */
- if (error || generation != spa->spa_config_generation) {
- metaslab_group_activate(mg);
- if (error)
- return (spa_vdev_state_exit(spa,
- vd, error));
- (void) spa_vdev_state_exit(spa, vd, 0);
- goto top;
- }
- ASSERT0(tvd->vdev_stat.vs_alloc);
- }
-
- /*
- * Offline this device and reopen its top-level vdev.
- * If the top-level vdev is a log device then just offline
- * it. Otherwise, if this action results in the top-level
- * vdev becoming unusable, undo it and fail the request.
- */
- vd->vdev_offline = B_TRUE;
- vdev_reopen(tvd);
-
- if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
- vdev_is_dead(tvd)) {
- vd->vdev_offline = B_FALSE;
- vdev_reopen(tvd);
- return (spa_vdev_state_exit(spa, NULL, EBUSY));
- }
-
- /*
- * Add the device back into the metaslab rotor so that
- * once we online the device it's open for business.
- */
- if (tvd->vdev_islog && mg != NULL)
- metaslab_group_activate(mg);
- }
-
- vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
-
- return (spa_vdev_state_exit(spa, vd, 0));
-}
-
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
-{
- int error;
-
- mutex_enter(&spa->spa_vdev_top_lock);
- error = vdev_offline_locked(spa, guid, flags);
- mutex_exit(&spa->spa_vdev_top_lock);
-
- return (error);
-}
-
-/*
- * Clear the error counts associated with this vdev. Unlike vdev_online() and
- * vdev_offline(), we assume the spa config is locked. We also clear all
- * children. If 'vd' is NULL, then the user wants to clear all vdevs.
- */
-void
-vdev_clear(spa_t *spa, vdev_t *vd)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
- if (vd == NULL)
- vd = rvd;
-
- vd->vdev_stat.vs_read_errors = 0;
- vd->vdev_stat.vs_write_errors = 0;
- vd->vdev_stat.vs_checksum_errors = 0;
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_clear(spa, vd->vdev_child[c]);
-
- if (vd == rvd) {
- for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
- vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
-
- for (int c = 0; c < spa->spa_spares.sav_count; c++)
- vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
- }
-
- /*
- * It makes no sense to "clear" an indirect vdev.
- */
- if (!vdev_is_concrete(vd))
- return;
-
- /*
- * If we're in the FAULTED state or have experienced failed I/O, then
- * clear the persistent state and attempt to reopen the device. We
- * also mark the vdev config dirty, so that the new faulted state is
- * written out to disk.
- */
- if (vd->vdev_faulted || vd->vdev_degraded ||
- !vdev_readable(vd) || !vdev_writeable(vd)) {
-
- /*
- * When reopening in reponse to a clear event, it may be due to
- * a fmadm repair request. In this case, if the device is
- * still broken, we want to still post the ereport again.
- */
- vd->vdev_forcefault = B_TRUE;
-
- vd->vdev_faulted = vd->vdev_degraded = 0ULL;
- vd->vdev_cant_read = B_FALSE;
- vd->vdev_cant_write = B_FALSE;
-
- vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
-
- vd->vdev_forcefault = B_FALSE;
-
- if (vd != rvd && vdev_writeable(vd->vdev_top))
- vdev_state_dirty(vd->vdev_top);
-
- if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
-
- spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
- }
-
- /*
- * When clearing a FMA-diagnosed fault, we always want to
- * unspare the device, as we assume that the original spare was
- * done in response to the FMA fault.
- */
- if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
- vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
- vd->vdev_parent->vdev_child[0] == vd)
- vd->vdev_unspare = B_TRUE;
-}
-
-boolean_t
-vdev_is_dead(vdev_t *vd)
-{
- /*
- * Holes and missing devices are always considered "dead".
- * This simplifies the code since we don't have to check for
- * these types of devices in the various code paths.
- * Instead we rely on the fact that we skip over dead devices
- * before issuing I/O to them.
- */
- return (vd->vdev_state < VDEV_STATE_DEGRADED ||
- vd->vdev_ops == &vdev_hole_ops ||
- vd->vdev_ops == &vdev_missing_ops);
-}
-
-boolean_t
-vdev_readable(vdev_t *vd)
-{
- return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
-}
-
-boolean_t
-vdev_writeable(vdev_t *vd)
-{
- return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
- vdev_is_concrete(vd));
-}
-
-boolean_t
-vdev_allocatable(vdev_t *vd)
-{
- uint64_t state = vd->vdev_state;
-
- /*
- * We currently allow allocations from vdevs which may be in the
- * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
- * fails to reopen then we'll catch it later when we're holding
- * the proper locks. Note that we have to get the vdev state
- * in a local variable because although it changes atomically,
- * we're asking two separate questions about it.
- */
- return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write && vdev_is_concrete(vd) &&
- vd->vdev_mg->mg_initialized);
-}
-
-boolean_t
-vdev_accessible(vdev_t *vd, zio_t *zio)
-{
- ASSERT(zio->io_vd == vd);
-
- if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
- return (B_FALSE);
-
- if (zio->io_type == ZIO_TYPE_READ)
- return (!vd->vdev_cant_read);
-
- if (zio->io_type == ZIO_TYPE_WRITE)
- return (!vd->vdev_cant_write);
-
- return (B_TRUE);
-}
-
-boolean_t
-vdev_is_spacemap_addressable(vdev_t *vd)
-{
- if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
- return (B_TRUE);
-
- /*
- * If double-word space map entries are not enabled we assume
- * 47 bits of the space map entry are dedicated to the entry's
- * offset (see SM_OFFSET_BITS in space_map.h). We then use that
- * to calculate the maximum address that can be described by a
- * space map entry for the given device.
- */
- uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
-
- if (shift >= 63) /* detect potential overflow */
- return (B_TRUE);
-
- return (vd->vdev_asize < (1ULL << shift));
-}
-
-/*
- * Get statistics for the given vdev.
- */
-void
-vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *tvd = vd->vdev_top;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-
- mutex_enter(&vd->vdev_stat_lock);
- bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
- vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf) {
- vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
- /*
- * Report intializing progress. Since we don't have the
- * initializing locks held, this is only an estimate (although a
- * fairly accurate one).
- */
- vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
- vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
- vs->vs_initialize_state = vd->vdev_initialize_state;
- vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
- }
- /*
- * Report expandable space on top-level, non-auxillary devices only.
- * The expandable space is reported in terms of metaslab sized units
- * since that determines how much space the pool can expand.
- */
- if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
- vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
- spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
- }
- vs->vs_configured_ashift = vd->vdev_top != NULL
- ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
- vs->vs_logical_ashift = vd->vdev_logical_ashift;
- vs->vs_physical_ashift = vd->vdev_physical_ashift;
- if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
- vdev_is_concrete(vd)) {
- vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
- vd->vdev_mg->mg_fragmentation : 0;
- }
-
- /*
- * If we're getting stats on the root vdev, aggregate the I/O counts
- * over all top-level vdevs (i.e. the direct children of the root).
- */
- if (vd == rvd) {
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *cvd = rvd->vdev_child[c];
- vdev_stat_t *cvs = &cvd->vdev_stat;
-
- for (int t = 0; t < ZIO_TYPES; t++) {
- vs->vs_ops[t] += cvs->vs_ops[t];
- vs->vs_bytes[t] += cvs->vs_bytes[t];
- }
- cvs->vs_scan_removing = cvd->vdev_removing;
- }
- }
- mutex_exit(&vd->vdev_stat_lock);
-}
-
-void
-vdev_clear_stats(vdev_t *vd)
-{
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_space = 0;
- vd->vdev_stat.vs_dspace = 0;
- vd->vdev_stat.vs_alloc = 0;
- mutex_exit(&vd->vdev_stat_lock);
-}
-
-void
-vdev_scan_stat_init(vdev_t *vd)
-{
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_scan_stat_init(vd->vdev_child[c]);
-
- mutex_enter(&vd->vdev_stat_lock);
- vs->vs_scan_processed = 0;
- mutex_exit(&vd->vdev_stat_lock);
-}
-
-void
-vdev_stat_update(zio_t *zio, uint64_t psize)
-{
- spa_t *spa = zio->io_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
- vdev_t *pvd;
- uint64_t txg = zio->io_txg;
- vdev_stat_t *vs = &vd->vdev_stat;
- zio_type_t type = zio->io_type;
- int flags = zio->io_flags;
-
- /*
- * If this i/o is a gang leader, it didn't do any actual work.
- */
- if (zio->io_gang_tree)
- return;
-
- if (zio->io_error == 0) {
- /*
- * If this is a root i/o, don't count it -- we've already
- * counted the top-level vdevs, and vdev_get_stats() will
- * aggregate them when asked. This reduces contention on
- * the root vdev_stat_lock and implicitly handles blocks
- * that compress away to holes, for which there is no i/o.
- * (Holes never create vdev children, so all the counters
- * remain zero, which is what we want.)
- *
- * Note: this only applies to successful i/o (io_error == 0)
- * because unlike i/o counts, errors are not additive.
- * When reading a ditto block, for example, failure of
- * one top-level vdev does not imply a root-level error.
- */
- if (vd == rvd)
- return;
-
- ASSERT(vd == zio->io_vd);
-
- if (flags & ZIO_FLAG_IO_BYPASS)
- return;
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (flags & ZIO_FLAG_IO_REPAIR) {
- if (flags & ZIO_FLAG_SCAN_THREAD) {
- dsl_scan_phys_t *scn_phys =
- &spa->spa_dsl_pool->dp_scan->scn_phys;
- uint64_t *processed = &scn_phys->scn_processed;
-
- /* XXX cleanup? */
- if (vd->vdev_ops->vdev_op_leaf)
- atomic_add_64(processed, psize);
- vs->vs_scan_processed += psize;
- }
-
- if (flags & ZIO_FLAG_SELF_HEAL)
- vs->vs_self_healed += psize;
- }
-
- vs->vs_ops[type]++;
- vs->vs_bytes[type] += psize;
-
- mutex_exit(&vd->vdev_stat_lock);
- return;
- }
-
- if (flags & ZIO_FLAG_SPECULATIVE)
- return;
-
- /*
- * If this is an I/O error that is going to be retried, then ignore the
- * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
- * hard errors, when in reality they can happen for any number of
- * innocuous reasons (bus resets, MPxIO link failure, etc).
- */
- if (zio->io_error == EIO &&
- !(zio->io_flags & ZIO_FLAG_IO_RETRY))
- return;
-
- /*
- * Intent logs writes won't propagate their error to the root
- * I/O so don't mark these types of failures as pool-level
- * errors.
- */
- if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
- return;
-
- mutex_enter(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
- if (zio->io_error == ECKSUM)
- vs->vs_checksum_errors++;
- else
- vs->vs_read_errors++;
- }
- if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
- vs->vs_write_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- if (spa->spa_load_state == SPA_LOAD_NONE &&
- type == ZIO_TYPE_WRITE && txg != 0 &&
- (!(flags & ZIO_FLAG_IO_REPAIR) ||
- (flags & ZIO_FLAG_SCAN_THREAD) ||
- spa->spa_claiming)) {
- /*
- * This is either a normal write (not a repair), or it's
- * a repair induced by the scrub thread, or it's a repair
- * made by zil_claim() during spa_load() in the first txg.
- * In the normal case, we commit the DTL change in the same
- * txg as the block was born. In the scrub-induced repair
- * case, we know that scrubs run in first-pass syncing context,
- * so we commit the DTL change in spa_syncing_txg(spa).
- * In the zil_claim() case, we commit in spa_first_txg(spa).
- *
- * We currently do not make DTL entries for failed spontaneous
- * self-healing writes triggered by normal (non-scrubbing)
- * reads, because we have no transactional context in which to
- * do so -- and it's not clear that it'd be desirable anyway.
- */
- if (vd->vdev_ops->vdev_op_leaf) {
- uint64_t commit_txg = txg;
- if (flags & ZIO_FLAG_SCAN_THREAD) {
- ASSERT(flags & ZIO_FLAG_IO_REPAIR);
- ASSERT(spa_sync_pass(spa) == 1);
- vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
- commit_txg = spa_syncing_txg(spa);
- } else if (spa->spa_claiming) {
- ASSERT(flags & ZIO_FLAG_IO_REPAIR);
- commit_txg = spa_first_txg(spa);
- }
- ASSERT(commit_txg >= spa_syncing_txg(spa));
- if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
- return;
- for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
- vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
- vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
- }
- if (vd != rvd)
- vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
- }
-}
-
-int64_t
-vdev_deflated_space(vdev_t *vd, int64_t space)
-{
- ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
- ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
-
- return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
-}
-
-/*
- * Update the in-core space usage stats for this vdev and the root vdev.
- */
-void
-vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
- int64_t space_delta)
-{
- int64_t dspace_delta;
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
-
- ASSERT(vd == vd->vdev_top);
-
- /*
- * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
- * factor. We must calculate this here and not at the root vdev
- * because the root vdev's psize-to-asize is simply the max of its
- * childrens', thus not accurate enough for us.
- */
- dspace_delta = vdev_deflated_space(vd, space_delta);
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_alloc += alloc_delta;
- vd->vdev_stat.vs_space += space_delta;
- vd->vdev_stat.vs_dspace += dspace_delta;
- mutex_exit(&vd->vdev_stat_lock);
-
- /* every class but log contributes to root space stats */
- if (vd->vdev_mg != NULL && !vd->vdev_islog) {
- mutex_enter(&rvd->vdev_stat_lock);
- rvd->vdev_stat.vs_alloc += alloc_delta;
- rvd->vdev_stat.vs_space += space_delta;
- rvd->vdev_stat.vs_dspace += dspace_delta;
- mutex_exit(&rvd->vdev_stat_lock);
- }
- /* Note: metaslab_class_space_update moved to metaslab_space_update */
-}
-
-/*
- * Mark a top-level vdev's config as dirty, placing it on the dirty list
- * so that it will be written out next time the vdev configuration is synced.
- * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
- */
-void
-vdev_config_dirty(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- int c;
-
- ASSERT(spa_writeable(spa));
-
- /*
- * If this is an aux vdev (as with l2cache and spare devices), then we
- * update the vdev config manually and set the sync flag.
- */
- if (vd->vdev_aux != NULL) {
- spa_aux_vdev_t *sav = vd->vdev_aux;
- nvlist_t **aux;
- uint_t naux;
-
- for (c = 0; c < sav->sav_count; c++) {
- if (sav->sav_vdevs[c] == vd)
- break;
- }
-
- if (c == sav->sav_count) {
- /*
- * We're being removed. There's nothing more to do.
- */
- ASSERT(sav->sav_sync == B_TRUE);
- return;
- }
-
- sav->sav_sync = B_TRUE;
-
- if (nvlist_lookup_nvlist_array(sav->sav_config,
- ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
- VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
- ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
- }
-
- ASSERT(c < naux);
-
- /*
- * Setting the nvlist in the middle if the array is a little
- * sketchy, but it will work.
- */
- nvlist_free(aux[c]);
- aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
-
- return;
- }
-
- /*
- * The dirty list is protected by the SCL_CONFIG lock. The caller
- * must either hold SCL_CONFIG as writer, or must be the sync thread
- * (which holds SCL_CONFIG as reader). There's only one sync thread,
- * so this is sufficient to ensure mutual exclusion.
- */
- ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
- (dsl_pool_sync_context(spa_get_dsl(spa)) &&
- spa_config_held(spa, SCL_CONFIG, RW_READER)));
-
- if (vd == rvd) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_config_dirty(rvd->vdev_child[c]);
- } else {
- ASSERT(vd == vd->vdev_top);
-
- if (!list_link_active(&vd->vdev_config_dirty_node) &&
- vdev_is_concrete(vd)) {
- list_insert_head(&spa->spa_config_dirty_list, vd);
- }
- }
-}
-
-void
-vdev_config_clean(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
- (dsl_pool_sync_context(spa_get_dsl(spa)) &&
- spa_config_held(spa, SCL_CONFIG, RW_READER)));
-
- ASSERT(list_link_active(&vd->vdev_config_dirty_node));
- list_remove(&spa->spa_config_dirty_list, vd);
-}
-
-/*
- * Mark a top-level vdev's state as dirty, so that the next pass of
- * spa_sync() can convert this into vdev_config_dirty(). We distinguish
- * the state changes from larger config changes because they require
- * much less locking, and are often needed for administrative actions.
- */
-void
-vdev_state_dirty(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_writeable(spa));
- ASSERT(vd == vd->vdev_top);
-
- /*
- * The state list is protected by the SCL_STATE lock. The caller
- * must either hold SCL_STATE as writer, or must be the sync thread
- * (which holds SCL_STATE as reader). There's only one sync thread,
- * so this is sufficient to ensure mutual exclusion.
- */
- ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
- (dsl_pool_sync_context(spa_get_dsl(spa)) &&
- spa_config_held(spa, SCL_STATE, RW_READER)));
-
- if (!list_link_active(&vd->vdev_state_dirty_node) &&
- vdev_is_concrete(vd))
- list_insert_head(&spa->spa_state_dirty_list, vd);
-}
-
-void
-vdev_state_clean(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
- (dsl_pool_sync_context(spa_get_dsl(spa)) &&
- spa_config_held(spa, SCL_STATE, RW_READER)));
-
- ASSERT(list_link_active(&vd->vdev_state_dirty_node));
- list_remove(&spa->spa_state_dirty_list, vd);
-}
-
-/*
- * Propagate vdev state up from children to parent.
- */
-void
-vdev_propagate_state(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- int degraded = 0, faulted = 0;
- int corrupted = 0;
- vdev_t *child;
-
- if (vd->vdev_children > 0) {
- for (int c = 0; c < vd->vdev_children; c++) {
- child = vd->vdev_child[c];
-
- /*
- * Don't factor holes or indirect vdevs into the
- * decision.
- */
- if (!vdev_is_concrete(child))
- continue;
-
- if (!vdev_readable(child) ||
- (!vdev_writeable(child) && spa_writeable(spa))) {
- /*
- * Root special: if there is a top-level log
- * device, treat the root vdev as if it were
- * degraded.
- */
- if (child->vdev_islog && vd == rvd)
- degraded++;
- else
- faulted++;
- } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
- degraded++;
- }
-
- if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
- corrupted++;
- }
-
- vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
-
- /*
- * Root special: if there is a top-level vdev that cannot be
- * opened due to corrupted metadata, then propagate the root
- * vdev's aux state as 'corrupt' rather than 'insufficient
- * replicas'.
- */
- if (corrupted && vd == rvd &&
- rvd->vdev_state == VDEV_STATE_CANT_OPEN)
- vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- }
-
- if (vd->vdev_parent)
- vdev_propagate_state(vd->vdev_parent);
-}
-
-/*
- * Set a vdev's state. If this is during an open, we don't update the parent
- * state, because we're in the process of opening children depth-first.
- * Otherwise, we propagate the change to the parent.
- *
- * If this routine places a device in a faulted state, an appropriate ereport is
- * generated.
- */
-void
-vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
-{
- uint64_t save_state;
- spa_t *spa = vd->vdev_spa;
-
- if (state == vd->vdev_state) {
- vd->vdev_stat.vs_aux = aux;
- return;
- }
-
- save_state = vd->vdev_state;
-
- vd->vdev_state = state;
- vd->vdev_stat.vs_aux = aux;
-
- /*
- * If we are setting the vdev state to anything but an open state, then
- * always close the underlying device unless the device has requested
- * a delayed close (i.e. we're about to remove or fault the device).
- * Otherwise, we keep accessible but invalid devices open forever.
- * We don't call vdev_close() itself, because that implies some extra
- * checks (offline, etc) that we don't want here. This is limited to
- * leaf devices, because otherwise closing the device will affect other
- * children.
- */
- if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
- vd->vdev_ops->vdev_op_leaf)
- vd->vdev_ops->vdev_op_close(vd);
-
- if (vd->vdev_removed &&
- state == VDEV_STATE_CANT_OPEN &&
- (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
- /*
- * If the previous state is set to VDEV_STATE_REMOVED, then this
- * device was previously marked removed and someone attempted to
- * reopen it. If this failed due to a nonexistent device, then
- * keep the device in the REMOVED state. We also let this be if
- * it is one of our special test online cases, which is only
- * attempting to online the device and shouldn't generate an FMA
- * fault.
- */
- vd->vdev_state = VDEV_STATE_REMOVED;
- vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
- } else if (state == VDEV_STATE_REMOVED) {
- vd->vdev_removed = B_TRUE;
- } else if (state == VDEV_STATE_CANT_OPEN) {
- /*
- * If we fail to open a vdev during an import or recovery, we
- * mark it as "not available", which signifies that it was
- * never there to begin with. Failure to open such a device
- * is not considered an error.
- */
- if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
- spa_load_state(spa) == SPA_LOAD_RECOVER) &&
- vd->vdev_ops->vdev_op_leaf)
- vd->vdev_not_present = 1;
-
- /*
- * Post the appropriate ereport. If the 'prevstate' field is
- * set to something other than VDEV_STATE_UNKNOWN, it indicates
- * that this is part of a vdev_reopen(). In this case, we don't
- * want to post the ereport if the device was already in the
- * CANT_OPEN state beforehand.
- *
- * If the 'checkremove' flag is set, then this is an attempt to
- * online the device in response to an insertion event. If we
- * hit this case, then we have detected an insertion event for a
- * faulted or offline device that wasn't in the removed state.
- * In this scenario, we don't post an ereport because we are
- * about to replace the device, or attempt an online with
- * vdev_forcefault, which will generate the fault for us.
- */
- if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
- !vd->vdev_not_present && !vd->vdev_checkremove &&
- vd != spa->spa_root_vdev) {
- const char *class;
-
- switch (aux) {
- case VDEV_AUX_OPEN_FAILED:
- class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
- break;
- case VDEV_AUX_CORRUPT_DATA:
- class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
- break;
- case VDEV_AUX_NO_REPLICAS:
- class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
- break;
- case VDEV_AUX_BAD_GUID_SUM:
- class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
- break;
- case VDEV_AUX_TOO_SMALL:
- class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
- break;
- case VDEV_AUX_BAD_LABEL:
- class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
- break;
- default:
- class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
- }
-
- zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
- }
-
- /* Erase any notion of persistent removed state */
- vd->vdev_removed = B_FALSE;
- } else {
- vd->vdev_removed = B_FALSE;
- }
-
- /*
- * Notify the fmd of the state change. Be verbose and post
- * notifications even for stuff that's not important; the fmd agent can
- * sort it out. Don't emit state change events for non-leaf vdevs since
- * they can't change state on their own. The FMD can check their state
- * if it wants to when it sees that a leaf vdev had a state change.
- */
- if (vd->vdev_ops->vdev_op_leaf)
- zfs_post_state_change(spa, vd);
-
- if (!isopen && vd->vdev_parent)
- vdev_propagate_state(vd->vdev_parent);
-}
-
-boolean_t
-vdev_children_are_offline(vdev_t *vd)
-{
- ASSERT(!vd->vdev_ops->vdev_op_leaf);
-
- for (uint64_t i = 0; i < vd->vdev_children; i++) {
- if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-/*
- * Check the vdev configuration to ensure that it's capable of supporting
- * a root pool. We do not support partial configuration.
- * In addition, only a single top-level vdev is allowed.
- *
- * FreeBSD does not have above limitations.
- */
-boolean_t
-vdev_is_bootable(vdev_t *vd)
-{
-#ifdef illumos
- if (!vd->vdev_ops->vdev_op_leaf) {
- char *vdev_type = vd->vdev_ops->vdev_op_type;
-
- if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
- vd->vdev_children > 1) {
- return (B_FALSE);
- } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
- strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
- return (B_FALSE);
- }
- }
-
- for (int c = 0; c < vd->vdev_children; c++) {
- if (!vdev_is_bootable(vd->vdev_child[c]))
- return (B_FALSE);
- }
-#endif /* illumos */
- return (B_TRUE);
-}
-
-boolean_t
-vdev_is_concrete(vdev_t *vd)
-{
- vdev_ops_t *ops = vd->vdev_ops;
- if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
- ops == &vdev_missing_ops || ops == &vdev_root_ops) {
- return (B_FALSE);
- } else {
- return (B_TRUE);
- }
-}
-
-/*
- * Determine if a log device has valid content. If the vdev was
- * removed or faulted in the MOS config then we know that
- * the content on the log device has already been written to the pool.
- */
-boolean_t
-vdev_log_state_valid(vdev_t *vd)
-{
- if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
- !vd->vdev_removed)
- return (B_TRUE);
-
- for (int c = 0; c < vd->vdev_children; c++)
- if (vdev_log_state_valid(vd->vdev_child[c]))
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-/*
- * Expand a vdev if possible.
- */
-void
-vdev_expand(vdev_t *vd, uint64_t txg)
-{
- ASSERT(vd->vdev_top == vd);
- ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- ASSERT(vdev_is_concrete(vd));
-
- vdev_set_deflate_ratio(vd);
-
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
- vdev_is_concrete(vd)) {
- vdev_metaslab_group_create(vd);
- VERIFY(vdev_metaslab_init(vd, txg) == 0);
- vdev_config_dirty(vd);
- }
-}
-
-/*
- * Split a vdev.
- */
-void
-vdev_split(vdev_t *vd)
-{
- vdev_t *cvd, *pvd = vd->vdev_parent;
-
- vdev_remove_child(pvd, vd);
- vdev_compact_children(pvd);
-
- cvd = pvd->vdev_child[0];
- if (pvd->vdev_children == 1) {
- vdev_remove_parent(cvd);
- cvd->vdev_splitting = B_TRUE;
- }
- vdev_propagate_state(cvd);
-}
-
-void
-vdev_deadman(vdev_t *vd)
-{
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
-
- vdev_deadman(cvd);
- }
-
- if (vd->vdev_ops->vdev_op_leaf) {
- vdev_queue_t *vq = &vd->vdev_queue;
-
- mutex_enter(&vq->vq_lock);
- if (avl_numnodes(&vq->vq_active_tree) > 0) {
- spa_t *spa = vd->vdev_spa;
- zio_t *fio;
- uint64_t delta;
-
- /*
- * Look at the head of all the pending queues,
- * if any I/O has been outstanding for longer than
- * the spa_deadman_synctime we panic the system.
- */
- fio = avl_first(&vq->vq_active_tree);
- delta = gethrtime() - fio->io_timestamp;
- if (delta > spa_deadman_synctime(spa)) {
- vdev_dbgmsg(vd, "SLOW IO: zio timestamp "
- "%lluns, delta %lluns, last io %lluns",
- fio->io_timestamp, (u_longlong_t)delta,
- vq->vq_io_complete_ts);
- fm_panic("I/O to pool '%s' appears to be "
- "hung on vdev guid %llu at '%s'.",
- spa_name(spa),
- (long long unsigned int) vd->vdev_guid,
- vd->vdev_path);
- }
- }
- mutex_exit(&vq->vq_lock);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -1,434 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/kstat.h>
-#include <sys/abd.h>
-
-/*
- * Virtual device read-ahead caching.
- *
- * This file implements a simple LRU read-ahead cache. When the DMU reads
- * a given block, it will often want other, nearby blocks soon thereafter.
- * We take advantage of this by reading a larger disk region and caching
- * the result. In the best case, this can turn 128 back-to-back 512-byte
- * reads into a single 64k read followed by 127 cache hits; this reduces
- * latency dramatically. In the worst case, it can turn an isolated 512-byte
- * read into a 64k read, which doesn't affect latency all that much but is
- * terribly wasteful of bandwidth. A more intelligent version of the cache
- * could keep track of access patterns and not do read-ahead unless it sees
- * at least two temporally close I/Os to the same region. Currently, only
- * metadata I/O is inflated. A futher enhancement could take advantage of
- * more semantic information about the I/O. And it could use something
- * faster than an AVL tree; that was chosen solely for convenience.
- *
- * There are five cache operations: allocate, fill, read, write, evict.
- *
- * (1) Allocate. This reserves a cache entry for the specified region.
- * We separate the allocate and fill operations so that multiple threads
- * don't generate I/O for the same cache miss.
- *
- * (2) Fill. When the I/O for a cache miss completes, the fill routine
- * places the data in the previously allocated cache entry.
- *
- * (3) Read. Read data from the cache.
- *
- * (4) Write. Update cache contents after write completion.
- *
- * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
- * if the total cache size exceeds zfs_vdev_cache_size.
- */
-
-/*
- * These tunables are for performance analysis.
- */
-/*
- * All i/os smaller than zfs_vdev_cache_max will be turned into
- * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
- * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
- * vdev's vdev_cache.
- *
- * TODO: Note that with the current ZFS code, it turns out that the
- * vdev cache is not helpful, and in some cases actually harmful. It
- * is better if we disable this. Once some time has passed, we should
- * actually remove this to simplify the code. For now we just disable
- * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11
- * has made these same changes.
- */
-int zfs_vdev_cache_max = 1<<14; /* 16KB */
-int zfs_vdev_cache_size = 0;
-int zfs_vdev_cache_bshift = 16;
-
-#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
-
-SYSCTL_DECL(_vfs_zfs_vdev);
-SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS VDEV Cache");
-SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
- &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size");
-SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
- &zfs_vdev_cache_size, 0, "Size of VDEV cache");
-SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
- &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value");
-
-kstat_t *vdc_ksp = NULL;
-
-typedef struct vdc_stats {
- kstat_named_t vdc_stat_delegations;
- kstat_named_t vdc_stat_hits;
- kstat_named_t vdc_stat_misses;
-} vdc_stats_t;
-
-static vdc_stats_t vdc_stats = {
- { "delegations", KSTAT_DATA_UINT64 },
- { "hits", KSTAT_DATA_UINT64 },
- { "misses", KSTAT_DATA_UINT64 }
-};
-
-#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64);
-
-static inline int
-vdev_cache_offset_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
- const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
-
- return (AVL_CMP(ve1->ve_offset, ve2->ve_offset));
-}
-
-static int
-vdev_cache_lastused_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
- const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
-
- int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused);
- if (likely(cmp))
- return (cmp);
-
- /*
- * Among equally old entries, sort by offset to ensure uniqueness.
- */
- return (vdev_cache_offset_compare(a1, a2));
-}
-
-/*
- * Evict the specified entry from the cache.
- */
-static void
-vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
-{
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT3P(ve->ve_fill_io, ==, NULL);
- ASSERT3P(ve->ve_abd, !=, NULL);
-
- avl_remove(&vc->vc_lastused_tree, ve);
- avl_remove(&vc->vc_offset_tree, ve);
- abd_free(ve->ve_abd);
- kmem_free(ve, sizeof (vdev_cache_entry_t));
-}
-
-/*
- * Allocate an entry in the cache. At the point we don't have the data,
- * we're just creating a placeholder so that multiple threads don't all
- * go off and read the same blocks.
- */
-static vdev_cache_entry_t *
-vdev_cache_allocate(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
- vdev_cache_entry_t *ve;
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
-
- if (zfs_vdev_cache_size == 0)
- return (NULL);
-
- /*
- * If adding a new entry would exceed the cache size,
- * evict the oldest entry (LRU).
- */
- if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
- zfs_vdev_cache_size) {
- ve = avl_first(&vc->vc_lastused_tree);
- if (ve->ve_fill_io != NULL)
- return (NULL);
- ASSERT3U(ve->ve_hits, !=, 0);
- vdev_cache_evict(vc, ve);
- }
-
- ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
- ve->ve_offset = offset;
- ve->ve_lastused = ddi_get_lbolt();
- ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
-
- avl_add(&vc->vc_offset_tree, ve);
- avl_add(&vc->vc_lastused_tree, ve);
-
- return (ve);
-}
-
-static void
-vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
-{
- uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT3P(ve->ve_fill_io, ==, NULL);
-
- if (ve->ve_lastused != ddi_get_lbolt()) {
- avl_remove(&vc->vc_lastused_tree, ve);
- ve->ve_lastused = ddi_get_lbolt();
- avl_add(&vc->vc_lastused_tree, ve);
- }
-
- ve->ve_hits++;
- abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
-}
-
-/*
- * Fill a previously allocated cache entry with data.
- */
-static void
-vdev_cache_fill(zio_t *fio)
-{
- vdev_t *vd = fio->io_vd;
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve = fio->io_private;
- zio_t *pio;
-
- ASSERT3U(fio->io_size, ==, VCBS);
-
- /*
- * Add data to the cache.
- */
- mutex_enter(&vc->vc_lock);
-
- ASSERT3P(ve->ve_fill_io, ==, fio);
- ASSERT3U(ve->ve_offset, ==, fio->io_offset);
- ASSERT3P(ve->ve_abd, ==, fio->io_abd);
-
- ve->ve_fill_io = NULL;
-
- /*
- * Even if this cache line was invalidated by a missed write update,
- * any reads that were queued up before the missed update are still
- * valid, so we can satisfy them from this line before we evict it.
- */
- zio_link_t *zl = NULL;
- while ((pio = zio_walk_parents(fio, &zl)) != NULL)
- vdev_cache_hit(vc, ve, pio);
-
- if (fio->io_error || ve->ve_missed_update)
- vdev_cache_evict(vc, ve);
-
- mutex_exit(&vc->vc_lock);
-}
-
-/*
- * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss.
- */
-boolean_t
-vdev_cache_read(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, ve_search;
- uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
- uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
- zio_t *fio;
-
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
-
- if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
- return (B_FALSE);
-
- if (zio->io_size > zfs_vdev_cache_max)
- return (B_FALSE);
-
- /*
- * If the I/O straddles two or more cache blocks, don't cache it.
- */
- if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
- return (B_FALSE);
-
- ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search.ve_offset = cache_offset;
- ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
-
- if (ve != NULL) {
- if (ve->ve_missed_update) {
- mutex_exit(&vc->vc_lock);
- return (B_FALSE);
- }
-
- if ((fio = ve->ve_fill_io) != NULL) {
- zio_vdev_io_bypass(zio);
- zio_add_child(zio, fio);
- mutex_exit(&vc->vc_lock);
- VDCSTAT_BUMP(vdc_stat_delegations);
- return (B_TRUE);
- }
-
- vdev_cache_hit(vc, ve, zio);
- zio_vdev_io_bypass(zio);
-
- mutex_exit(&vc->vc_lock);
- VDCSTAT_BUMP(vdc_stat_hits);
- return (B_TRUE);
- }
-
- ve = vdev_cache_allocate(zio);
-
- if (ve == NULL) {
- mutex_exit(&vc->vc_lock);
- return (B_FALSE);
- }
-
- fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
- ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
- ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
-
- ve->ve_fill_io = fio;
- zio_vdev_io_bypass(zio);
- zio_add_child(zio, fio);
-
- mutex_exit(&vc->vc_lock);
- zio_nowait(fio);
- VDCSTAT_BUMP(vdc_stat_misses);
-
- return (B_TRUE);
-}
-
-/*
- * Update cache contents upon write completion.
- */
-void
-vdev_cache_write(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, ve_search;
- uint64_t io_start = zio->io_offset;
- uint64_t io_end = io_start + zio->io_size;
- uint64_t min_offset = P2ALIGN(io_start, VCBS);
- uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
- avl_index_t where;
-
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search.ve_offset = min_offset;
- ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
-
- if (ve == NULL)
- ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
-
- while (ve != NULL && ve->ve_offset < max_offset) {
- uint64_t start = MAX(ve->ve_offset, io_start);
- uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
-
- if (ve->ve_fill_io != NULL) {
- ve->ve_missed_update = 1;
- } else {
- abd_copy_off(ve->ve_abd, zio->io_abd,
- start - ve->ve_offset, start - io_start,
- end - start);
- }
- ve = AVL_NEXT(&vc->vc_offset_tree, ve);
- }
- mutex_exit(&vc->vc_lock);
-}
-
-void
-vdev_cache_purge(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve;
-
- mutex_enter(&vc->vc_lock);
- while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
- vdev_cache_evict(vc, ve);
- mutex_exit(&vc->vc_lock);
-}
-
-void
-vdev_cache_init(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
-
- mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
-
- avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_offset_node));
-
- avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_lastused_node));
-}
-
-void
-vdev_cache_fini(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
-
- vdev_cache_purge(vd);
-
- avl_destroy(&vc->vc_offset_tree);
- avl_destroy(&vc->vc_lastused_tree);
-
- mutex_destroy(&vc->vc_lock);
-}
-
-void
-vdev_cache_stat_init(void)
-{
- vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
- KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (vdc_ksp != NULL) {
- vdc_ksp->ks_data = &vdc_stats;
- kstat_install(vdc_ksp);
- }
-}
-
-void
-vdev_cache_stat_fini(void)
-{
- if (vdc_ksp != NULL) {
- kstat_delete(vdc_ksp);
- vdc_ksp = NULL;
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -1,971 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 Joyent, Inc. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/refcount.h>
-#include <sys/vdev_disk.h>
-#include <sys/vdev_impl.h>
-#include <sys/abd.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <sys/sunldi.h>
-#include <sys/efi_partition.h>
-#include <sys/fm/fs/zfs.h>
-
-/*
- * Virtual device vector for disks.
- */
-
-extern ldi_ident_t zfs_li;
-
-static void vdev_disk_close(vdev_t *);
-
-typedef struct vdev_disk_ldi_cb {
- list_node_t lcb_next;
- ldi_callback_id_t lcb_id;
-} vdev_disk_ldi_cb_t;
-
-/*
- * Bypass the devid when opening a disk vdev.
- * There have been issues where the devids of several devices were shuffled,
- * causing pool open failures. Note, that this flag is intended to be used
- * for pool recovery only.
- *
- * Note that if a pool is imported with the devids bypassed, all its vdevs will
- * cease storing devid information permanently. In practice, the devid is rarely
- * useful as vdev paths do not tend to change unless the hardware is
- * reconfigured. That said, if the paths do change and a pool fails to open
- * automatically at boot, a simple zpool import should re-scan the paths and fix
- * the issue.
- */
-boolean_t vdev_disk_bypass_devid = B_FALSE;
-
-static void
-vdev_disk_alloc(vdev_t *vd)
-{
- vdev_disk_t *dvd;
-
- dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
- /*
- * Create the LDI event callback list.
- */
- list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
- offsetof(vdev_disk_ldi_cb_t, lcb_next));
-}
-
-static void
-vdev_disk_free(vdev_t *vd)
-{
- vdev_disk_t *dvd = vd->vdev_tsd;
- vdev_disk_ldi_cb_t *lcb;
-
- if (dvd == NULL)
- return;
-
- /*
- * We have already closed the LDI handle. Clean up the LDI event
- * callbacks and free vd->vdev_tsd.
- */
- while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
- list_remove(&dvd->vd_ldi_cbs, lcb);
- (void) ldi_ev_remove_callbacks(lcb->lcb_id);
- kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
- }
- list_destroy(&dvd->vd_ldi_cbs);
- kmem_free(dvd, sizeof (vdev_disk_t));
- vd->vdev_tsd = NULL;
-}
-
-/* ARGSUSED */
-static int
-vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
- void *ev_data)
-{
- vdev_t *vd = (vdev_t *)arg;
- vdev_disk_t *dvd = vd->vdev_tsd;
-
- /*
- * Ignore events other than offline.
- */
- if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
- return (LDI_EV_SUCCESS);
-
- /*
- * All LDI handles must be closed for the state change to succeed, so
- * call on vdev_disk_close() to do this.
- *
- * We inform vdev_disk_close that it is being called from offline
- * notify context so it will defer cleanup of LDI event callbacks and
- * freeing of vd->vdev_tsd to the offline finalize or a reopen.
- */
- dvd->vd_ldi_offline = B_TRUE;
- vdev_disk_close(vd);
-
- /*
- * Now that the device is closed, request that the spa_async_thread
- * mark the device as REMOVED and notify FMA of the removal.
- */
- zfs_post_remove(vd->vdev_spa, vd);
- vd->vdev_remove_wanted = B_TRUE;
- spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
-
- return (LDI_EV_SUCCESS);
-}
-
-/* ARGSUSED */
-static void
-vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
- int ldi_result, void *arg, void *ev_data)
-{
- vdev_t *vd = (vdev_t *)arg;
-
- /*
- * Ignore events other than offline.
- */
- if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
- return;
-
- /*
- * We have already closed the LDI handle in notify.
- * Clean up the LDI event callbacks and free vd->vdev_tsd.
- */
- vdev_disk_free(vd);
-
- /*
- * Request that the vdev be reopened if the offline state change was
- * unsuccessful.
- */
- if (ldi_result != LDI_EV_SUCCESS) {
- vd->vdev_probe_wanted = B_TRUE;
- spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
- }
-}
-
-static ldi_ev_callback_t vdev_disk_off_callb = {
- .cb_vers = LDI_EV_CB_VERS,
- .cb_notify = vdev_disk_off_notify,
- .cb_finalize = vdev_disk_off_finalize
-};
-
-/* ARGSUSED */
-static void
-vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
- int ldi_result, void *arg, void *ev_data)
-{
- vdev_t *vd = (vdev_t *)arg;
-
- /*
- * Ignore events other than degrade.
- */
- if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
- return;
-
- /*
- * Degrade events always succeed. Mark the vdev as degraded.
- * This status is purely informative for the user.
- */
- (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
-}
-
-static ldi_ev_callback_t vdev_disk_dgrd_callb = {
- .cb_vers = LDI_EV_CB_VERS,
- .cb_notify = NULL,
- .cb_finalize = vdev_disk_dgrd_finalize
-};
-
-static void
-vdev_disk_hold(vdev_t *vd)
-{
- ddi_devid_t devid;
- char *minor;
-
- ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
- return;
-
- /*
- * Only prefetch path and devid info if the device has
- * never been opened.
- */
- if (vd->vdev_tsd != NULL)
- return;
-
- if (vd->vdev_wholedisk == -1ULL) {
- size_t len = strlen(vd->vdev_path) + 3;
- char *buf = kmem_alloc(len, KM_SLEEP);
-
- (void) snprintf(buf, len, "%ss0", vd->vdev_path);
-
- (void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
- kmem_free(buf, len);
- }
-
- if (vd->vdev_name_vp == NULL)
- (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
-
- if (vd->vdev_devid != NULL &&
- ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
- (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
- ddi_devid_str_free(minor);
- ddi_devid_free(devid);
- }
-}
-
-static void
-vdev_disk_rele(vdev_t *vd)
-{
- ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
-
- if (vd->vdev_name_vp) {
- VN_RELE_ASYNC(vd->vdev_name_vp,
- dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
- vd->vdev_name_vp = NULL;
- }
- if (vd->vdev_devid_vp) {
- VN_RELE_ASYNC(vd->vdev_devid_vp,
- dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
- vd->vdev_devid_vp = NULL;
- }
-}
-
-/*
- * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
- * even a fallback to DKIOCGMEDIAINFO fails.
- */
-#ifdef DEBUG
-#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__)
-#else
-#define VDEV_DEBUG(...) /* Nothing... */
-#endif
-
-static int
-vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_disk_t *dvd = vd->vdev_tsd;
- ldi_ev_cookie_t ecookie;
- vdev_disk_ldi_cb_t *lcb;
- union {
- struct dk_minfo_ext ude;
- struct dk_minfo ud;
- } dks;
- struct dk_minfo_ext *dkmext = &dks.ude;
- struct dk_minfo *dkm = &dks.ud;
- int error;
- dev_t dev;
- int otyp;
- boolean_t validate_devid = B_FALSE;
- ddi_devid_t devid;
- uint64_t capacity = 0, blksz = 0, pbsize;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Reopen the device if it's not currently open. Otherwise,
- * just update the physical size of the device.
- */
- if (dvd != NULL) {
- if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
- /*
- * If we are opening a device in its offline notify
- * context, the LDI handle was just closed. Clean
- * up the LDI event callbacks and free vd->vdev_tsd.
- */
- vdev_disk_free(vd);
- } else {
- ASSERT(vd->vdev_reopening);
- goto skip_open;
- }
- }
-
- /*
- * Create vd->vdev_tsd.
- */
- vdev_disk_alloc(vd);
- dvd = vd->vdev_tsd;
-
- /*
- * Allow bypassing the devid.
- */
- if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) {
- vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
- vd->vdev_devid);
- spa_strfree(vd->vdev_devid);
- vd->vdev_devid = NULL;
- }
-
- /*
- * When opening a disk device, we want to preserve the user's original
- * intent. We always want to open the device by the path the user gave
- * us, even if it is one of multiple paths to the save device. But we
- * also want to be able to survive disks being removed/recabled.
- * Therefore the sequence of opening devices is:
- *
- * 1. Try opening the device by path. For legacy pools without the
- * 'whole_disk' property, attempt to fix the path by appending 's0'.
- *
- * 2. If the devid of the device matches the stored value, return
- * success.
- *
- * 3. Otherwise, the device may have moved. Try opening the device
- * by the devid instead.
- */
- if (vd->vdev_devid != NULL) {
- if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
- &dvd->vd_minor) != 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- vdev_dbgmsg(vd, "vdev_disk_open: invalid "
- "vdev_devid '%s'", vd->vdev_devid);
- return (SET_ERROR(EINVAL));
- }
- }
-
- error = EINVAL; /* presume failure */
-
- if (vd->vdev_path != NULL) {
-
- if (vd->vdev_wholedisk == -1ULL) {
- size_t len = strlen(vd->vdev_path) + 3;
- char *buf = kmem_alloc(len, KM_SLEEP);
-
- (void) snprintf(buf, len, "%ss0", vd->vdev_path);
-
- error = ldi_open_by_name(buf, spa_mode(spa), kcred,
- &dvd->vd_lh, zfs_li);
- if (error == 0) {
- spa_strfree(vd->vdev_path);
- vd->vdev_path = buf;
- vd->vdev_wholedisk = 1ULL;
- } else {
- kmem_free(buf, len);
- }
- }
-
- /*
- * If we have not yet opened the device, try to open it by the
- * specified path.
- */
- if (error != 0) {
- error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
- kcred, &dvd->vd_lh, zfs_li);
- }
-
- /*
- * Compare the devid to the stored value.
- */
- if (error == 0 && vd->vdev_devid != NULL &&
- ldi_get_devid(dvd->vd_lh, &devid) == 0) {
- if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
- /*
- * A mismatch here is unexpected, log it.
- */
- char *devid_str = ddi_devid_str_encode(devid,
- dvd->vd_minor);
- vdev_dbgmsg(vd, "vdev_disk_open: devid "
- "mismatch: %s != %s", vd->vdev_devid,
- devid_str);
- cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
- "mismatch: %s != %s", vd->vdev_path,
- vd->vdev_devid, devid_str);
- ddi_devid_str_free(devid_str);
-
- error = SET_ERROR(EINVAL);
- (void) ldi_close(dvd->vd_lh, spa_mode(spa),
- kcred);
- dvd->vd_lh = NULL;
- }
- ddi_devid_free(devid);
- }
-
- /*
- * If we succeeded in opening the device, but 'vdev_wholedisk'
- * is not yet set, then this must be a slice.
- */
- if (error == 0 && vd->vdev_wholedisk == -1ULL)
- vd->vdev_wholedisk = 0;
- }
-
- /*
- * If we were unable to open by path, or the devid check fails, open by
- * devid instead.
- */
- if (error != 0 && vd->vdev_devid != NULL) {
- error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
- spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
- if (error != 0) {
- vdev_dbgmsg(vd, "Failed to open by devid (%s)",
- vd->vdev_devid);
- }
- }
-
- /*
- * If all else fails, then try opening by physical path (if available)
- * or the logical path (if we failed due to the devid check). While not
- * as reliable as the devid, this will give us something, and the higher
- * level vdev validation will prevent us from opening the wrong device.
- */
- if (error) {
- if (vd->vdev_devid != NULL)
- validate_devid = B_TRUE;
-
- if (vd->vdev_physpath != NULL &&
- (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
- error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
- kcred, &dvd->vd_lh, zfs_li);
-
- /*
- * Note that we don't support the legacy auto-wholedisk support
- * as above. This hasn't been used in a very long time and we
- * don't need to propagate its oddities to this edge condition.
- */
- if (error && vd->vdev_path != NULL)
- error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
- kcred, &dvd->vd_lh, zfs_li);
- }
-
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
- error);
- return (error);
- }
-
- /*
- * Now that the device has been successfully opened, update the devid
- * if necessary.
- */
- if (validate_devid && spa_writeable(spa) &&
- ldi_get_devid(dvd->vd_lh, &devid) == 0) {
- if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
- char *vd_devid;
-
- vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
- vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
- "'%s' to '%s'", vd->vdev_devid, vd_devid);
- cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
- "from '%s' to '%s'", vd->vdev_path != NULL ?
- vd->vdev_path : "?", vd->vdev_devid, vd_devid);
- spa_strfree(vd->vdev_devid);
- vd->vdev_devid = spa_strdup(vd_devid);
- ddi_devid_str_free(vd_devid);
- }
- ddi_devid_free(devid);
- }
-
- /*
- * Once a device is opened, verify that the physical device path (if
- * available) is up to date.
- */
- if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
- ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
- char *physpath, *minorname;
-
- physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- minorname = NULL;
- if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
- ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
- (vd->vdev_physpath == NULL ||
- strcmp(vd->vdev_physpath, physpath) != 0)) {
- if (vd->vdev_physpath)
- spa_strfree(vd->vdev_physpath);
- (void) strlcat(physpath, ":", MAXPATHLEN);
- (void) strlcat(physpath, minorname, MAXPATHLEN);
- vd->vdev_physpath = spa_strdup(physpath);
- }
- if (minorname)
- kmem_free(minorname, strlen(minorname) + 1);
- kmem_free(physpath, MAXPATHLEN);
- }
-
- /*
- * Register callbacks for the LDI offline event.
- */
- if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
- LDI_EV_SUCCESS) {
- lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
- list_insert_tail(&dvd->vd_ldi_cbs, lcb);
- (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
- &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
- }
-
- /*
- * Register callbacks for the LDI degrade event.
- */
- if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
- LDI_EV_SUCCESS) {
- lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
- list_insert_tail(&dvd->vd_ldi_cbs, lcb);
- (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
- &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
- }
-skip_open:
- /*
- * Determine the actual size of the device.
- */
- if (ldi_get_size(dvd->vd_lh, psize) != 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
- return (SET_ERROR(EINVAL));
- }
-
- *max_psize = *psize;
-
- /*
- * Determine the device's minimum transfer size.
- * If the ioctl isn't supported, assume DEV_BSIZE.
- */
- if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
- (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
- capacity = dkmext->dki_capacity - 1;
- blksz = dkmext->dki_lbsize;
- pbsize = dkmext->dki_pbsize;
- } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
- (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
- VDEV_DEBUG(
- "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
- vd->vdev_path);
- capacity = dkm->dki_capacity - 1;
- blksz = dkm->dki_lbsize;
- pbsize = blksz;
- } else {
- VDEV_DEBUG("vdev_disk_open(\"%s\"): "
- "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
- vd->vdev_path, error);
- pbsize = DEV_BSIZE;
- }
-
- *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
-
- if (vd->vdev_wholedisk == 1) {
- int wce = 1;
-
- if (error == 0) {
- /*
- * If we have the capability to expand, we'd have
- * found out via success from DKIOCGMEDIAINFO{,EXT}.
- * Adjust max_psize upward accordingly since we know
- * we own the whole disk now.
- */
- *max_psize = capacity * blksz;
- }
-
- /*
- * Since we own the whole disk, try to enable disk write
- * caching. We ignore errors because it's OK if we can't do it.
- */
- (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
- FKIOCTL, kcred, NULL);
- }
-
- /*
- * Clear the nowritecache bit, so that on a vdev_reopen() we will
- * try again.
- */
- vd->vdev_nowritecache = B_FALSE;
-
- return (0);
-}
-
-static void
-vdev_disk_close(vdev_t *vd)
-{
- vdev_disk_t *dvd = vd->vdev_tsd;
-
- if (vd->vdev_reopening || dvd == NULL)
- return;
-
- if (dvd->vd_minor != NULL) {
- ddi_devid_str_free(dvd->vd_minor);
- dvd->vd_minor = NULL;
- }
-
- if (dvd->vd_devid != NULL) {
- ddi_devid_free(dvd->vd_devid);
- dvd->vd_devid = NULL;
- }
-
- if (dvd->vd_lh != NULL) {
- (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
- dvd->vd_lh = NULL;
- }
-
- vd->vdev_delayed_close = B_FALSE;
- /*
- * If we closed the LDI handle due to an offline notify from LDI,
- * don't free vd->vdev_tsd or unregister the callbacks here;
- * the offline finalize callback or a reopen will take care of it.
- */
- if (dvd->vd_ldi_offline)
- return;
-
- vdev_disk_free(vd);
-}
-
-int
-vdev_disk_physio(vdev_t *vd, caddr_t data,
- size_t size, uint64_t offset, int flags, boolean_t isdump)
-{
- vdev_disk_t *dvd = vd->vdev_tsd;
-
- /*
- * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
- * Nothing to be done here but return failure.
- */
- if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
- return (EIO);
-
- ASSERT(vd->vdev_ops == &vdev_disk_ops);
-
- /*
- * If in the context of an active crash dump, use the ldi_dump(9F)
- * call instead of ldi_strategy(9F) as usual.
- */
- if (isdump) {
- ASSERT3P(dvd, !=, NULL);
- return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
- lbtodb(size)));
- }
-
- return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
-}
-
-int
-vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
- size_t size, uint64_t offset, int flags)
-{
- buf_t *bp;
- int error = 0;
-
- if (vd_lh == NULL)
- return (SET_ERROR(EINVAL));
-
- ASSERT(flags & B_READ || flags & B_WRITE);
-
- bp = getrbuf(KM_SLEEP);
- bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
- bp->b_bcount = size;
- bp->b_un.b_addr = (void *)data;
- bp->b_lblkno = lbtodb(offset);
- bp->b_bufsize = size;
-
- error = ldi_strategy(vd_lh, bp);
- ASSERT(error == 0);
- if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
- error = SET_ERROR(EIO);
- freerbuf(bp);
-
- return (error);
-}
-
-static void
-vdev_disk_io_intr(buf_t *bp)
-{
- vdev_buf_t *vb = (vdev_buf_t *)bp;
- zio_t *zio = vb->vb_io;
-
- /*
- * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
- * Rather than teach the rest of the stack about other error
- * possibilities (EFAULT, etc), we normalize the error value here.
- */
- zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
-
- if (zio->io_error == 0 && bp->b_resid != 0)
- zio->io_error = SET_ERROR(EIO);
-
- if (zio->io_type == ZIO_TYPE_READ) {
- abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
- } else {
- abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
- }
-
- kmem_free(vb, sizeof (vdev_buf_t));
-
- zio_delay_interrupt(zio);
-}
-
-static void
-vdev_disk_ioctl_free(zio_t *zio)
-{
- kmem_free(zio->io_vsd, sizeof (struct dk_callback));
-}
-
-static const zio_vsd_ops_t vdev_disk_vsd_ops = {
- vdev_disk_ioctl_free,
- zio_vsd_default_cksum_report
-};
-
-static void
-vdev_disk_ioctl_done(void *zio_arg, int error)
-{
- zio_t *zio = zio_arg;
-
- zio->io_error = error;
-
- zio_interrupt(zio);
-}
-
-static void
-vdev_disk_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_disk_t *dvd = vd->vdev_tsd;
- vdev_buf_t *vb;
- struct dk_callback *dkc;
- buf_t *bp;
- int error;
-
- /*
- * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
- * Nothing to be done here but return failure.
- */
- if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- /* XXPOLICY */
- if (!vdev_readable(vd)) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- }
-
- switch (zio->io_cmd) {
-
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- if (vd->vdev_nowritecache) {
- zio->io_error = SET_ERROR(ENOTSUP);
- break;
- }
-
- zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
- zio->io_vsd_ops = &vdev_disk_vsd_ops;
-
- dkc->dkc_callback = vdev_disk_ioctl_done;
- dkc->dkc_flag = FLUSH_VOLATILE;
- dkc->dkc_cookie = zio;
-
- error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
- (uintptr_t)dkc, FKIOCTL, kcred, NULL);
-
- if (error == 0) {
- /*
- * The ioctl will be done asychronously,
- * and will call vdev_disk_ioctl_done()
- * upon completion.
- */
- return;
- }
-
- zio->io_error = error;
-
- break;
-
- default:
- zio->io_error = SET_ERROR(ENOTSUP);
- }
-
- zio_execute(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
- zio->io_target_timestamp = zio_handle_io_delay(zio);
-
- vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
-
- vb->vb_io = zio;
- bp = &vb->vb_buf;
-
- bioinit(bp);
- bp->b_flags = B_BUSY | B_NOCACHE |
- (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
- if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
- bp->b_flags |= B_FAILFAST;
- bp->b_bcount = zio->io_size;
-
- if (zio->io_type == ZIO_TYPE_READ) {
- bp->b_un.b_addr =
- abd_borrow_buf(zio->io_abd, zio->io_size);
- } else {
- bp->b_un.b_addr =
- abd_borrow_buf_copy(zio->io_abd, zio->io_size);
- }
-
- bp->b_lblkno = lbtodb(zio->io_offset);
- bp->b_bufsize = zio->io_size;
- bp->b_iodone = (int (*)())vdev_disk_io_intr;
-
- /* ldi_strategy() will return non-zero only on programming errors */
- VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
-}
-
-static void
-vdev_disk_io_done(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
-
- /*
- * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
- * the device has been removed. If this is the case, then we trigger an
- * asynchronous removal of the device. Otherwise, probe the device and
- * make sure it's still accessible.
- */
- if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
- vdev_disk_t *dvd = vd->vdev_tsd;
- int state = DKIO_NONE;
-
- if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
- FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
- /*
- * We post the resource as soon as possible, instead of
- * when the async removal actually happens, because the
- * DE is using this information to discard previous I/O
- * errors.
- */
- zfs_post_remove(zio->io_spa, vd);
- vd->vdev_remove_wanted = B_TRUE;
- spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
- } else if (!vd->vdev_delayed_close) {
- vd->vdev_delayed_close = B_TRUE;
- }
- }
-}
-
-vdev_ops_t vdev_disk_ops = {
- vdev_disk_open,
- vdev_disk_close,
- vdev_default_asize,
- vdev_disk_io_start,
- vdev_disk_io_done,
- NULL,
- NULL,
- vdev_disk_hold,
- vdev_disk_rele,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-/*
- * Given the root disk device devid or pathname, read the label from
- * the device, and construct a configuration nvlist.
- */
-int
-vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
-{
- ldi_handle_t vd_lh;
- vdev_label_t *label;
- uint64_t s, size;
- int l;
- ddi_devid_t tmpdevid;
- int error = -1;
- char *minor_name;
-
- /*
- * Read the device label and build the nvlist.
- */
- if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
- &minor_name) == 0) {
- error = ldi_open_by_devid(tmpdevid, minor_name,
- FREAD, kcred, &vd_lh, zfs_li);
- ddi_devid_free(tmpdevid);
- ddi_devid_str_free(minor_name);
- }
-
- if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
- zfs_li)))
- return (error);
-
- if (ldi_get_size(vd_lh, &s)) {
- (void) ldi_close(vd_lh, FREAD, kcred);
- return (SET_ERROR(EIO));
- }
-
- size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
- label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
-
- *config = NULL;
- for (l = 0; l < VDEV_LABELS; l++) {
- uint64_t offset, state, txg = 0;
-
- /* read vdev label */
- offset = vdev_label_offset(size, l, 0);
- if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
- VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
- continue;
-
- if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
- sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
- *config = NULL;
- continue;
- }
-
- if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 || state >= POOL_STATE_DESTROYED) {
- nvlist_free(*config);
- *config = NULL;
- continue;
- }
-
- if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0 || txg == 0) {
- nvlist_free(*config);
- *config = NULL;
- continue;
- }
-
- break;
- }
-
- kmem_free(label, sizeof (vdev_label_t));
- (void) ldi_close(vd_lh, FREAD, kcred);
- if (*config == NULL)
- error = SET_ERROR(EIDRM);
-
- return (error);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -1,307 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_file.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/abd.h>
-
-/*
- * Virtual device vector for files.
- */
-
-static taskq_t *vdev_file_taskq;
-
-void
-vdev_file_init(void)
-{
- vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
- minclsyspri, max_ncpus, INT_MAX, 0);
-}
-
-void
-vdev_file_fini(void)
-{
- taskq_destroy(vdev_file_taskq);
-}
-
-static void
-vdev_file_hold(vdev_t *vd)
-{
- ASSERT(vd->vdev_path != NULL);
-}
-
-static void
-vdev_file_rele(vdev_t *vd)
-{
- ASSERT(vd->vdev_path != NULL);
-}
-
-static int
-vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *logical_ashift, uint64_t *physical_ashift)
-{
- vdev_file_t *vf;
- vnode_t *vp;
- vattr_t vattr;
- int error;
-
- /* Rotational optimizations only make sense on block devices */
- vd->vdev_nonrot = B_TRUE;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Reopen the device if it's not currently open. Otherwise,
- * just update the physical size of the device.
- */
- if (vd->vdev_tsd != NULL) {
- ASSERT(vd->vdev_reopening);
- vf = vd->vdev_tsd;
- vp = vf->vf_vnode;
- goto skip_open;
- }
-
- vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
-
- /*
- * We always open the files from the root of the global zone, even if
- * we're in a local zone. If the user has gotten to this point, the
- * administrator has already decided that the pool should be available
- * to local zone users, so the underlying devices should be as well.
- */
- ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
- error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
- spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
-
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
- vd->vdev_tsd = NULL;
- return (error);
- }
-
- vf->vf_vnode = vp;
-
-#ifdef _KERNEL
- /*
- * Make sure it's a regular file.
- */
- if (vp->v_type != VREG) {
-#ifdef __FreeBSD__
- (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
-#endif
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
-#ifdef __FreeBSD__
- kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
- vd->vdev_tsd = NULL;
-#endif
- return (SET_ERROR(ENODEV));
- }
-#endif /* _KERNEL */
-
-skip_open:
- /*
- * Determine the physical size of the file.
- */
- vattr.va_mask = AT_SIZE;
- vn_lock(vp, LK_SHARED | LK_RETRY);
- error = VOP_GETATTR(vp, &vattr, kcred);
- VOP_UNLOCK(vp);
- if (error) {
- (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
- vd->vdev_tsd = NULL;
- return (error);
- }
-
- vd->vdev_notrim = B_TRUE;
-
- *max_psize = *psize = vattr.va_size;
- *logical_ashift = SPA_MINBLOCKSHIFT;
- *physical_ashift = SPA_MINBLOCKSHIFT;
-
- return (0);
-}
-
-static void
-vdev_file_close(vdev_t *vd)
-{
- vdev_file_t *vf = vd->vdev_tsd;
-
- if (vd->vdev_reopening || vf == NULL)
- return;
-
- if (vf->vf_vnode != NULL) {
- (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
- kcred, NULL);
- }
-
- vd->vdev_delayed_close = B_FALSE;
- kmem_free(vf, sizeof (vdev_file_t));
- vd->vdev_tsd = NULL;
-}
-
-/*
- * Implements the interrupt side for file vdev types. This routine will be
- * called when the I/O completes allowing us to transfer the I/O to the
- * interrupt taskqs. For consistency, the code structure mimics disk vdev
- * types.
- */
-static void
-vdev_file_io_intr(zio_t *zio)
-{
- zio_delay_interrupt(zio);
-}
-
-static void
-vdev_file_io_strategy(void *arg)
-{
- zio_t *zio = arg;
- vdev_t *vd = zio->io_vd;
- vdev_file_t *vf;
- vnode_t *vp;
- void *addr;
- ssize_t resid;
-
- vf = vd->vdev_tsd;
- vp = vf->vf_vnode;
-
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
- if (zio->io_type == ZIO_TYPE_READ) {
- addr = abd_borrow_buf(zio->io_abd, zio->io_size);
- } else {
- addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
- }
-
- zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
- UIO_READ : UIO_WRITE, vp, addr, zio->io_size,
- zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
-
- if (zio->io_type == ZIO_TYPE_READ) {
- abd_return_buf_copy(zio->io_abd, addr, zio->io_size);
- } else {
- abd_return_buf(zio->io_abd, addr, zio->io_size);
- }
-
- if (resid != 0 && zio->io_error == 0)
- zio->io_error = ENOSPC;
-
- vdev_file_io_intr(zio);
-}
-
-static void
-vdev_file_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_file_t *vf = vd->vdev_tsd;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- /* XXPOLICY */
- if (!vdev_readable(vd)) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- }
-
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
- kcred, NULL);
- break;
- default:
- zio->io_error = SET_ERROR(ENOTSUP);
- }
-
- zio_execute(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
- zio->io_target_timestamp = zio_handle_io_delay(zio);
-
- VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
- TQ_SLEEP), !=, 0);
-}
-
-/* ARGSUSED */
-static void
-vdev_file_io_done(zio_t *zio)
-{
-}
-
-vdev_ops_t vdev_file_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- NULL,
- vdev_file_hold,
- vdev_file_rele,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_FILE, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-/*
- * From userland we access disks just like files.
- */
-#ifndef _KERNEL
-
-vdev_ops_t vdev_disk_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- NULL,
- vdev_file_hold,
- vdev_file_rele,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -1,1193 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
- */
-
-#include <sys/zfs_context.h>
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/bio.h>
-#include <sys/disk.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <geom/geom.h>
-#include <geom/geom_int.h>
-
-/*
- * Virtual device vector for GEOM.
- */
-
-static g_attrchanged_t vdev_geom_attrchanged;
-struct g_class zfs_vdev_class = {
- .name = "ZFS::VDEV",
- .version = G_VERSION,
- .attrchanged = vdev_geom_attrchanged,
-};
-
-struct consumer_vdev_elem {
- SLIST_ENTRY(consumer_vdev_elem) elems;
- vdev_t *vd;
-};
-
-SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
-_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
- == sizeof(struct consumer_priv_t*),
- "consumer_priv_t* can't be stored in g_consumer.private");
-
-DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
-
-SYSCTL_DECL(_vfs_zfs_vdev);
-/* Don't send BIO_FLUSH. */
-static int vdev_geom_bio_flush_disable;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
- &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
-/* Don't send BIO_DELETE. */
-static int vdev_geom_bio_delete_disable;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
- &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
-
-/* Declare local functions */
-static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
-
-/*
- * Thread local storage used to indicate when a thread is probing geoms
- * for their guids. If NULL, this thread is not tasting geoms. If non NULL,
- * it is looking for a replacement for the vdev_t* that is its value.
- */
-uint_t zfs_geom_probe_vdev_key;
-
-static void
-vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
-{
- int error;
- uint16_t rate;
-
- error = g_getattr("GEOM::rotation_rate", cp, &rate);
- if (error == 0 && rate == 1)
- vd->vdev_nonrot = B_TRUE;
- else
- vd->vdev_nonrot = B_FALSE;
-}
-
-static void
-vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
- boolean_t do_null_update)
-{
- boolean_t needs_update = B_FALSE;
- char *physpath;
- int error, physpath_len;
-
- physpath_len = MAXPATHLEN;
- physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
- error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
- if (error == 0) {
- char *old_physpath;
-
- /* g_topology lock ensures that vdev has not been closed */
- g_topology_assert();
- old_physpath = vd->vdev_physpath;
- vd->vdev_physpath = spa_strdup(physpath);
-
- if (old_physpath != NULL) {
- needs_update = (strcmp(old_physpath,
- vd->vdev_physpath) != 0);
- spa_strfree(old_physpath);
- } else
- needs_update = do_null_update;
- }
- g_free(physpath);
-
- /*
- * If the physical path changed, update the config.
- * Only request an update for previously unset physpaths if
- * requested by the caller.
- */
- if (needs_update)
- spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
-
-}
-
-static void
-vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
-{
- char *old_physpath;
- struct consumer_priv_t *priv;
- struct consumer_vdev_elem *elem;
- int error;
-
- priv = (struct consumer_priv_t*)&cp->private;
- if (SLIST_EMPTY(priv))
- return;
-
- SLIST_FOREACH(elem, priv, elems) {
- vdev_t *vd = elem->vd;
- if (strcmp(attr, "GEOM::rotation_rate") == 0) {
- vdev_geom_set_rotation_rate(vd, cp);
- return;
- }
- if (strcmp(attr, "GEOM::physpath") == 0) {
- vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
- return;
- }
- }
-}
-
-static void
-vdev_geom_resize(struct g_consumer *cp)
-{
- struct consumer_priv_t *priv;
- struct consumer_vdev_elem *elem;
- spa_t *spa;
- vdev_t *vd;
-
- priv = (struct consumer_priv_t *)&cp->private;
- if (SLIST_EMPTY(priv))
- return;
-
- SLIST_FOREACH(elem, priv, elems) {
- vd = elem->vd;
- if (vd->vdev_state != VDEV_STATE_HEALTHY)
- continue;
- spa = vd->vdev_spa;
- if (!spa->spa_autoexpand)
- continue;
- vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
- }
-}
-
-static void
-vdev_geom_orphan(struct g_consumer *cp)
-{
- struct consumer_priv_t *priv;
- struct consumer_vdev_elem *elem;
-
- g_topology_assert();
-
- priv = (struct consumer_priv_t*)&cp->private;
- if (SLIST_EMPTY(priv))
- /* Vdev close in progress. Ignore the event. */
- return;
-
- /*
- * Orphan callbacks occur from the GEOM event thread.
- * Concurrent with this call, new I/O requests may be
- * working their way through GEOM about to find out
- * (only once executed by the g_down thread) that we've
- * been orphaned from our disk provider. These I/Os
- * must be retired before we can detach our consumer.
- * This is most easily achieved by acquiring the
- * SPA ZIO configuration lock as a writer, but doing
- * so with the GEOM topology lock held would cause
- * a lock order reversal. Instead, rely on the SPA's
- * async removal support to invoke a close on this
- * vdev once it is safe to do so.
- */
- SLIST_FOREACH(elem, priv, elems) {
- vdev_t *vd = elem->vd;
-
- vd->vdev_remove_wanted = B_TRUE;
- spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
- }
-}
-
-static struct g_consumer *
-vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
-{
- struct g_geom *gp;
- struct g_consumer *cp;
- int error;
-
- g_topology_assert();
-
- ZFS_LOG(1, "Attaching to %s.", pp->name);
-
- if (sanity) {
- if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
- ZFS_LOG(1, "Failing attach of %s. "
- "Incompatible sectorsize %d\n",
- pp->name, pp->sectorsize);
- return (NULL);
- } else if (pp->mediasize < SPA_MINDEVSIZE) {
- ZFS_LOG(1, "Failing attach of %s. "
- "Incompatible mediasize %ju\n",
- pp->name, pp->mediasize);
- return (NULL);
- }
- }
-
- /* Do we have geom already? No? Create one. */
- LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
- if (gp->flags & G_GEOM_WITHER)
- continue;
- if (strcmp(gp->name, "zfs::vdev") != 0)
- continue;
- break;
- }
- if (gp == NULL) {
- gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
- gp->orphan = vdev_geom_orphan;
- gp->attrchanged = vdev_geom_attrchanged;
- gp->resize = vdev_geom_resize;
- cp = g_new_consumer(gp);
- error = g_attach(cp, pp);
- if (error != 0) {
- ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
- __LINE__, error);
- vdev_geom_detach(cp, B_FALSE);
- return (NULL);
- }
- error = g_access(cp, 1, 0, 1);
- if (error != 0) {
- ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
- __LINE__, error);
- vdev_geom_detach(cp, B_FALSE);
- return (NULL);
- }
- ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
- } else {
- /* Check if we are already connected to this provider. */
- LIST_FOREACH(cp, &gp->consumer, consumer) {
- if (cp->provider == pp) {
- ZFS_LOG(1, "Found consumer for %s.", pp->name);
- break;
- }
- }
- if (cp == NULL) {
- cp = g_new_consumer(gp);
- error = g_attach(cp, pp);
- if (error != 0) {
- ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
- __func__, __LINE__, error);
- vdev_geom_detach(cp, B_FALSE);
- return (NULL);
- }
- error = g_access(cp, 1, 0, 1);
- if (error != 0) {
- ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
- __func__, __LINE__, error);
- vdev_geom_detach(cp, B_FALSE);
- return (NULL);
- }
- ZFS_LOG(1, "Created consumer for %s.", pp->name);
- } else {
- error = g_access(cp, 1, 0, 1);
- if (error != 0) {
- ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
- __func__, __LINE__, error);
- return (NULL);
- }
- ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
- }
- }
-
- if (vd != NULL)
- vd->vdev_tsd = cp;
-
- cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
- return (cp);
-}
-
-static void
-vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
-{
- struct g_geom *gp;
-
- g_topology_assert();
-
- ZFS_LOG(1, "Detaching from %s.",
- cp->provider && cp->provider->name ? cp->provider->name : "NULL");
-
- gp = cp->geom;
- if (open_for_read)
- g_access(cp, -1, 0, -1);
- /* Destroy consumer on last close. */
- if (cp->acr == 0 && cp->ace == 0) {
- if (cp->acw > 0)
- g_access(cp, 0, -cp->acw, 0);
- if (cp->provider != NULL) {
- ZFS_LOG(1, "Destroying consumer for %s.",
- cp->provider->name ? cp->provider->name : "NULL");
- g_detach(cp);
- }
- g_destroy_consumer(cp);
- }
- /* Destroy geom if there are no consumers left. */
- if (LIST_EMPTY(&gp->consumer)) {
- ZFS_LOG(1, "Destroyed geom %s.", gp->name);
- g_wither_geom(gp, ENXIO);
- }
-}
-
-static void
-vdev_geom_close_locked(vdev_t *vd)
-{
- struct g_consumer *cp;
- struct consumer_priv_t *priv;
- struct consumer_vdev_elem *elem, *elem_temp;
-
- g_topology_assert();
-
- cp = vd->vdev_tsd;
- vd->vdev_delayed_close = B_FALSE;
- if (cp == NULL)
- return;
-
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
- priv = (struct consumer_priv_t*)&cp->private;
- vd->vdev_tsd = NULL;
- SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
- if (elem->vd == vd) {
- SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
- g_free(elem);
- }
- }
-
- vdev_geom_detach(cp, B_TRUE);
-}
-
-/*
- * Issue one or more bios to the vdev in parallel
- * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO
- * operation is described by parallel entries from each array. There may be
- * more bios actually issued than entries in the array
- */
-static void
-vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
- off_t *sizes, int *errors, int ncmds)
-{
- struct bio **bios;
- u_char *p;
- off_t off, maxio, s, end;
- int i, n_bios, j;
- size_t bios_size;
-
- maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
- n_bios = 0;
-
- /* How many bios are required for all commands ? */
- for (i = 0; i < ncmds; i++)
- n_bios += (sizes[i] + maxio - 1) / maxio;
-
- /* Allocate memory for the bios */
- bios_size = n_bios * sizeof(struct bio*);
- bios = kmem_zalloc(bios_size, KM_SLEEP);
-
- /* Prepare and issue all of the bios */
- for (i = j = 0; i < ncmds; i++) {
- off = offsets[i];
- p = datas[i];
- s = sizes[i];
- end = off + s;
- ASSERT((off % cp->provider->sectorsize) == 0);
- ASSERT((s % cp->provider->sectorsize) == 0);
-
- for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
- bios[j] = g_alloc_bio();
- bios[j]->bio_cmd = cmds[i];
- bios[j]->bio_done = NULL;
- bios[j]->bio_offset = off;
- bios[j]->bio_length = MIN(s, maxio);
- bios[j]->bio_data = p;
- g_io_request(bios[j], cp);
- }
- }
- ASSERT(j == n_bios);
-
- /* Wait for all of the bios to complete, and clean them up */
- for (i = j = 0; i < ncmds; i++) {
- off = offsets[i];
- s = sizes[i];
- end = off + s;
-
- for (; off < end; off += maxio, s -= maxio, j++) {
- errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
- g_destroy_bio(bios[j]);
- }
- }
- kmem_free(bios, bios_size);
-}
-
-/*
- * Read the vdev config from a device. Return the number of valid labels that
- * were found. The vdev config will be returned in config if and only if at
- * least one valid label was found.
- */
-static int
-vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
-{
- struct g_provider *pp;
- nvlist_t *config;
- vdev_phys_t *vdev_lists[VDEV_LABELS];
- char *buf;
- size_t buflen;
- uint64_t psize, state, txg;
- off_t offsets[VDEV_LABELS];
- off_t size;
- off_t sizes[VDEV_LABELS];
- int cmds[VDEV_LABELS];
- int errors[VDEV_LABELS];
- int l, nlabels;
-
- g_topology_assert_not();
-
- pp = cp->provider;
- ZFS_LOG(1, "Reading config from %s...", pp->name);
-
- psize = pp->mediasize;
- psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
-
- size = sizeof(*vdev_lists[0]) + pp->sectorsize -
- ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
-
- buflen = sizeof(vdev_lists[0]->vp_nvlist);
-
- /* Create all of the IO requests */
- for (l = 0; l < VDEV_LABELS; l++) {
- cmds[l] = BIO_READ;
- vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
- offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
- sizes[l] = size;
- errors[l] = 0;
- ASSERT(offsets[l] % pp->sectorsize == 0);
- }
-
- /* Issue the IO requests */
- vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
- VDEV_LABELS);
-
- /* Parse the labels */
- config = *configp = NULL;
- nlabels = 0;
- for (l = 0; l < VDEV_LABELS; l++) {
- if (errors[l] != 0)
- continue;
-
- buf = vdev_lists[l]->vp_nvlist;
-
- if (nvlist_unpack(buf, buflen, &config, 0) != 0)
- continue;
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 || state > POOL_STATE_L2CACHE) {
- nvlist_free(config);
- continue;
- }
-
- if (state != POOL_STATE_SPARE &&
- state != POOL_STATE_L2CACHE &&
- (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0 || txg == 0)) {
- nvlist_free(config);
- continue;
- }
-
- if (*configp != NULL)
- nvlist_free(*configp);
- *configp = config;
-
- nlabels++;
- }
-
- /* Free the label storage */
- for (l = 0; l < VDEV_LABELS; l++)
- kmem_free(vdev_lists[l], size);
-
- return (nlabels);
-}
-
-static void
-resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
-{
- nvlist_t **new_configs;
- uint64_t i;
-
- if (id < *count)
- return;
- new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
- KM_SLEEP);
- for (i = 0; i < *count; i++)
- new_configs[i] = (*configs)[i];
- if (*configs != NULL)
- kmem_free(*configs, *count * sizeof(void *));
- *configs = new_configs;
- *count = id + 1;
-}
-
-static void
-process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
- const char *name, uint64_t* known_pool_guid)
-{
- nvlist_t *vdev_tree;
- uint64_t pool_guid;
- uint64_t vdev_guid, known_guid;
- uint64_t id, txg, known_txg;
- char *pname;
- int i;
-
- if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
- strcmp(pname, name) != 0)
- goto ignore;
-
- if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
- goto ignore;
-
- if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
- goto ignore;
-
- if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
- goto ignore;
-
- if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
- goto ignore;
-
- VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
-
- if (*known_pool_guid != 0) {
- if (pool_guid != *known_pool_guid)
- goto ignore;
- } else
- *known_pool_guid = pool_guid;
-
- resize_configs(configs, count, id);
-
- if ((*configs)[id] != NULL) {
- VERIFY(nvlist_lookup_uint64((*configs)[id],
- ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
- if (txg <= known_txg)
- goto ignore;
- nvlist_free((*configs)[id]);
- }
-
- (*configs)[id] = cfg;
- return;
-
-ignore:
- nvlist_free(cfg);
-}
-
-int
-vdev_geom_read_pool_label(const char *name,
- nvlist_t ***configs, uint64_t *count)
-{
- struct g_class *mp;
- struct g_geom *gp;
- struct g_provider *pp;
- struct g_consumer *zcp;
- nvlist_t *vdev_cfg;
- uint64_t pool_guid;
- int error, nlabels;
-
- DROP_GIANT();
- g_topology_lock();
-
- *configs = NULL;
- *count = 0;
- pool_guid = 0;
- LIST_FOREACH(mp, &g_classes, class) {
- if (mp == &zfs_vdev_class)
- continue;
- LIST_FOREACH(gp, &mp->geom, geom) {
- if (gp->flags & G_GEOM_WITHER)
- continue;
- LIST_FOREACH(pp, &gp->provider, provider) {
- if (pp->flags & G_PF_WITHER)
- continue;
- zcp = vdev_geom_attach(pp, NULL, B_TRUE);
- if (zcp == NULL)
- continue;
- g_topology_unlock();
- nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
- g_topology_lock();
- vdev_geom_detach(zcp, B_TRUE);
- if (nlabels == 0)
- continue;
- ZFS_LOG(1, "successfully read vdev config");
-
- process_vdev_config(configs, count,
- vdev_cfg, name, &pool_guid);
- }
- }
- }
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (*count > 0 ? 0 : ENOENT);
-}
-
-enum match {
- NO_MATCH = 0, /* No matching labels found */
- TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid*/
- ZERO_MATCH = 1, /* Should never be returned */
- ONE_MATCH = 2, /* 1 label matching the vdev_guid */
- TWO_MATCH = 3, /* 2 label matching the vdev_guid */
- THREE_MATCH = 4, /* 3 label matching the vdev_guid */
- FULL_MATCH = 5 /* all labels match the vdev_guid */
-};
-
-static enum match
-vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
-{
- nvlist_t *config;
- uint64_t pool_guid, top_guid, vdev_guid;
- struct g_consumer *cp;
- int nlabels;
-
- cp = vdev_geom_attach(pp, NULL, B_TRUE);
- if (cp == NULL) {
- ZFS_LOG(1, "Unable to attach tasting instance to %s.",
- pp->name);
- return (NO_MATCH);
- }
- g_topology_unlock();
- nlabels = vdev_geom_read_config(cp, &config);
- g_topology_lock();
- vdev_geom_detach(cp, B_TRUE);
- if (nlabels == 0) {
- ZFS_LOG(1, "Unable to read config from %s.", pp->name);
- return (NO_MATCH);
- }
-
- pool_guid = 0;
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
- top_guid = 0;
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
- vdev_guid = 0;
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
- nvlist_free(config);
-
- /*
- * Check that the label's pool guid matches the desired guid.
- * Inactive spares and L2ARCs do not have any pool guid in the label.
- */
- if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
- ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
- pp->name,
- (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
- return (NO_MATCH);
- }
-
- /*
- * Check that the label's vdev guid matches the desired guid.
- * The second condition handles possible race on vdev detach, when
- * remaining vdev receives GUID of destroyed top level mirror vdev.
- */
- if (vdev_guid == vd->vdev_guid) {
- ZFS_LOG(1, "guids match for provider %s.", pp->name);
- return (ZERO_MATCH + nlabels);
- } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
- ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
- return (TOPGUID_MATCH);
- }
- ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
- pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
- return (NO_MATCH);
-}
-
-static struct g_consumer *
-vdev_geom_attach_by_guids(vdev_t *vd)
-{
- struct g_class *mp;
- struct g_geom *gp;
- struct g_provider *pp, *best_pp;
- struct g_consumer *cp;
- const char *vdpath;
- enum match match, best_match;
-
- g_topology_assert();
-
- vdpath = vd->vdev_path + sizeof("/dev/") - 1;
- cp = NULL;
- best_pp = NULL;
- best_match = NO_MATCH;
- LIST_FOREACH(mp, &g_classes, class) {
- if (mp == &zfs_vdev_class)
- continue;
- LIST_FOREACH(gp, &mp->geom, geom) {
- if (gp->flags & G_GEOM_WITHER)
- continue;
- LIST_FOREACH(pp, &gp->provider, provider) {
- match = vdev_attach_ok(vd, pp);
- if (match > best_match) {
- best_match = match;
- best_pp = pp;
- } else if (match == best_match) {
- if (strcmp(pp->name, vdpath) == 0) {
- best_pp = pp;
- }
- }
- if (match == FULL_MATCH)
- goto out;
- }
- }
- }
-
-out:
- if (best_pp) {
- cp = vdev_geom_attach(best_pp, vd, B_TRUE);
- if (cp == NULL) {
- printf("ZFS WARNING: Unable to attach to %s.\n",
- best_pp->name);
- }
- }
- return (cp);
-}
-
-static struct g_consumer *
-vdev_geom_open_by_guids(vdev_t *vd)
-{
- struct g_consumer *cp;
- char *buf;
- size_t len;
-
- g_topology_assert();
-
- ZFS_LOG(1, "Searching by guids [%ju:%ju].",
- (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
- cp = vdev_geom_attach_by_guids(vd);
- if (cp != NULL) {
- len = strlen(cp->provider->name) + strlen("/dev/") + 1;
- buf = kmem_alloc(len, KM_SLEEP);
-
- snprintf(buf, len, "/dev/%s", cp->provider->name);
- spa_strfree(vd->vdev_path);
- vd->vdev_path = buf;
-
- ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
- (uintmax_t)spa_guid(vd->vdev_spa),
- (uintmax_t)vd->vdev_guid, cp->provider->name);
- } else {
- ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
- (uintmax_t)spa_guid(vd->vdev_spa),
- (uintmax_t)vd->vdev_guid);
- }
-
- return (cp);
-}
-
-static struct g_consumer *
-vdev_geom_open_by_path(vdev_t *vd, int check_guid)
-{
- struct g_provider *pp;
- struct g_consumer *cp;
-
- g_topology_assert();
-
- cp = NULL;
- pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
- if (pp != NULL) {
- ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
- if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
- cp = vdev_geom_attach(pp, vd, B_FALSE);
- }
-
- return (cp);
-}
-
-static int
-vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *logical_ashift, uint64_t *physical_ashift)
-{
- struct g_provider *pp;
- struct g_consumer *cp;
- size_t bufsize;
- int error;
-
- /* Set the TLS to indicate downstack that we should not access zvols*/
- VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- /*
- * Reopen the device if it's not currently open. Otherwise,
- * just update the physical size of the device.
- */
- if ((cp = vd->vdev_tsd) != NULL) {
- ASSERT(vd->vdev_reopening);
- goto skip_open;
- }
-
- DROP_GIANT();
- g_topology_lock();
- error = 0;
-
- if (vd->vdev_spa->spa_splitting_newspa ||
- (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
- vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
- vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
- /*
- * We are dealing with a vdev that hasn't been previously
- * opened (since boot), and we are not loading an
- * existing pool configuration. This looks like a
- * vdev add operation to a new or existing pool.
- * Assume the user knows what he/she is doing and find
- * GEOM provider by its name, ignoring GUID mismatches.
- *
- * XXPOLICY: It would be safer to only allow a device
- * that is unlabeled or labeled but missing
- * GUID information to be opened in this fashion,
- * unless we are doing a split, in which case we
- * should allow any guid.
- */
- cp = vdev_geom_open_by_path(vd, 0);
- } else {
- /*
- * Try using the recorded path for this device, but only
- * accept it if its label data contains the expected GUIDs.
- */
- cp = vdev_geom_open_by_path(vd, 1);
- if (cp == NULL) {
- /*
- * The device at vd->vdev_path doesn't have the
- * expected GUIDs. The disks might have merely
- * moved around so try all other GEOM providers
- * to find one with the right GUIDs.
- */
- cp = vdev_geom_open_by_guids(vd);
- }
- }
-
- /* Clear the TLS now that tasting is done */
- VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
-
- if (cp == NULL) {
- ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
- error = ENOENT;
- } else {
- struct consumer_priv_t *priv;
- struct consumer_vdev_elem *elem;
- int spamode;
-
- priv = (struct consumer_priv_t*)&cp->private;
- if (cp->private == NULL)
- SLIST_INIT(priv);
- elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
- elem->vd = vd;
- SLIST_INSERT_HEAD(priv, elem, elems);
-
- spamode = spa_mode(vd->vdev_spa);
- if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
- !ISP2(cp->provider->sectorsize)) {
- ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
- cp->provider->name);
-
- vdev_geom_close_locked(vd);
- error = EINVAL;
- cp = NULL;
- } else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
- int i;
-
- for (i = 0; i < 5; i++) {
- error = g_access(cp, 0, 1, 0);
- if (error == 0)
- break;
- g_topology_unlock();
- tsleep(vd, 0, "vdev", hz / 2);
- g_topology_lock();
- }
- if (error != 0) {
- printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
- cp->provider->name, error);
- vdev_geom_close_locked(vd);
- cp = NULL;
- }
- }
- }
-
- /* Fetch initial physical path information for this device. */
- if (cp != NULL) {
- vdev_geom_attrchanged(cp, "GEOM::physpath");
-
- /* Set other GEOM characteristics */
- vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
- vdev_geom_set_rotation_rate(vd, cp);
- }
-
- g_topology_unlock();
- PICKUP_GIANT();
- if (cp == NULL) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
- error);
- return (error);
- }
-skip_open:
- pp = cp->provider;
-
- /*
- * Determine the actual size of the device.
- */
- *max_psize = *psize = pp->mediasize;
-
- /*
- * Determine the device's minimum transfer size and preferred
- * transfer size.
- */
- *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
- *physical_ashift = 0;
- if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
- pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
- *physical_ashift = highbit(pp->stripesize) - 1;
-
- /*
- * Clear the nowritecache settings, so that on a vdev_reopen()
- * we will try again.
- */
- vd->vdev_nowritecache = B_FALSE;
-
- return (0);
-}
-
-static void
-vdev_geom_close(vdev_t *vd)
-{
- struct g_consumer *cp;
- int locked;
-
- cp = vd->vdev_tsd;
-
- DROP_GIANT();
- locked = g_topology_locked();
- if (!locked)
- g_topology_lock();
-
- if (!vd->vdev_reopening ||
- (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
- (cp->provider != NULL && cp->provider->error != 0))))
- vdev_geom_close_locked(vd);
-
- if (!locked)
- g_topology_unlock();
- PICKUP_GIANT();
-}
-
-static void
-vdev_geom_io_intr(struct bio *bp)
-{
- vdev_t *vd;
- zio_t *zio;
-
- zio = bp->bio_caller1;
- vd = zio->io_vd;
- zio->io_error = bp->bio_error;
- if (zio->io_error == 0 && bp->bio_resid != 0)
- zio->io_error = SET_ERROR(EIO);
-
- switch(zio->io_error) {
- case ENOTSUP:
- /*
- * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
- * that future attempts will never succeed. In this case
- * we set a persistent flag so that we don't bother with
- * requests in the future.
- */
- switch(bp->bio_cmd) {
- case BIO_FLUSH:
- vd->vdev_nowritecache = B_TRUE;
- break;
- case BIO_DELETE:
- vd->vdev_notrim = B_TRUE;
- break;
- }
- break;
- case ENXIO:
- if (!vd->vdev_remove_wanted) {
- /*
- * If provider's error is set we assume it is being
- * removed.
- */
- if (bp->bio_to->error != 0) {
- vd->vdev_remove_wanted = B_TRUE;
- spa_async_request(zio->io_spa,
- SPA_ASYNC_REMOVE);
- } else if (!vd->vdev_delayed_close) {
- vd->vdev_delayed_close = B_TRUE;
- }
- }
- break;
- }
-
- /*
- * We have to split bio freeing into two parts, because the ABD code
- * cannot be called in this context and vdev_op_io_done is not called
- * for ZIO_TYPE_IOCTL zio-s.
- */
- if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
- g_destroy_bio(bp);
- zio->io_bio = NULL;
- }
- zio_delay_interrupt(zio);
-}
-
-static void
-vdev_geom_io_start(zio_t *zio)
-{
- vdev_t *vd;
- struct g_consumer *cp;
- struct bio *bp;
- int error;
-
- vd = zio->io_vd;
-
- switch (zio->io_type) {
- case ZIO_TYPE_IOCTL:
- /* XXPOLICY */
- if (!vdev_readable(vd)) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- } else {
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
- if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
- break;
- if (vd->vdev_nowritecache) {
- zio->io_error = SET_ERROR(ENOTSUP);
- break;
- }
- goto sendreq;
- default:
- zio->io_error = SET_ERROR(ENOTSUP);
- }
- }
-
- zio_execute(zio);
- return;
- case ZIO_TYPE_FREE:
- if (vd->vdev_notrim) {
- zio->io_error = SET_ERROR(ENOTSUP);
- } else if (!vdev_geom_bio_delete_disable) {
- goto sendreq;
- }
- zio_execute(zio);
- return;
- }
-sendreq:
- ASSERT(zio->io_type == ZIO_TYPE_READ ||
- zio->io_type == ZIO_TYPE_WRITE ||
- zio->io_type == ZIO_TYPE_FREE ||
- zio->io_type == ZIO_TYPE_IOCTL);
-
- cp = vd->vdev_tsd;
- if (cp == NULL) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- }
- bp = g_alloc_bio();
- bp->bio_caller1 = zio;
- switch (zio->io_type) {
- case ZIO_TYPE_READ:
- case ZIO_TYPE_WRITE:
- zio->io_target_timestamp = zio_handle_io_delay(zio);
- bp->bio_offset = zio->io_offset;
- bp->bio_length = zio->io_size;
- if (zio->io_type == ZIO_TYPE_READ) {
- bp->bio_cmd = BIO_READ;
- bp->bio_data =
- abd_borrow_buf(zio->io_abd, zio->io_size);
- } else {
- bp->bio_cmd = BIO_WRITE;
- bp->bio_data =
- abd_borrow_buf_copy(zio->io_abd, zio->io_size);
- }
- break;
- case ZIO_TYPE_FREE:
- bp->bio_cmd = BIO_DELETE;
- bp->bio_data = NULL;
- bp->bio_offset = zio->io_offset;
- bp->bio_length = zio->io_size;
- break;
- case ZIO_TYPE_IOCTL:
- bp->bio_cmd = BIO_FLUSH;
- bp->bio_data = NULL;
- bp->bio_offset = cp->provider->mediasize;
- bp->bio_length = 0;
- break;
- }
- bp->bio_done = vdev_geom_io_intr;
- zio->io_bio = bp;
-
- g_io_request(bp, cp);
-}
-
-static void
-vdev_geom_io_done(zio_t *zio)
-{
- struct bio *bp = zio->io_bio;
-
- if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
- ASSERT(bp == NULL);
- return;
- }
-
- if (bp == NULL) {
- ASSERT3S(zio->io_error, ==, ENXIO);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_READ)
- abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
- else
- abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
-
- g_destroy_bio(bp);
- zio->io_bio = NULL;
-}
-
-static void
-vdev_geom_hold(vdev_t *vd)
-{
-}
-
-static void
-vdev_geom_rele(vdev_t *vd)
-{
-}
-
-vdev_ops_t vdev_geom_ops = {
- vdev_geom_open,
- vdev_geom_close,
- vdev_default_asize,
- vdev_geom_io_start,
- vdev_geom_io_done,
- NULL,
- NULL,
- vdev_geom_hold,
- vdev_geom_rele,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -1,1849 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/metaslab.h>
-#include <sys/refcount.h>
-#include <sys/dmu.h>
-#include <sys/vdev_indirect_mapping.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zap.h>
-#include <sys/abd.h>
-#include <sys/zthr.h>
-
-/*
- * An indirect vdev corresponds to a vdev that has been removed. Since
- * we cannot rewrite block pointers of snapshots, etc., we keep a
- * mapping from old location on the removed device to the new location
- * on another device in the pool and use this mapping whenever we need
- * to access the DVA. Unfortunately, this mapping did not respect
- * logical block boundaries when it was first created, and so a DVA on
- * this indirect vdev may be "split" into multiple sections that each
- * map to a different location. As a consequence, not all DVAs can be
- * translated to an equivalent new DVA. Instead we must provide a
- * "vdev_remap" operation that executes a callback on each contiguous
- * segment of the new location. This function is used in multiple ways:
- *
- * - i/os to this vdev use the callback to determine where the
- * data is now located, and issue child i/os for each segment's new
- * location.
- *
- * - frees and claims to this vdev use the callback to free or claim
- * each mapped segment. (Note that we don't actually need to claim
- * log blocks on indirect vdevs, because we don't allocate to
- * removing vdevs. However, zdb uses zio_claim() for its leak
- * detection.)
- */
-
-/*
- * "Big theory statement" for how we mark blocks obsolete.
- *
- * When a block on an indirect vdev is freed or remapped, a section of
- * that vdev's mapping may no longer be referenced (aka "obsolete"). We
- * keep track of how much of each mapping entry is obsolete. When
- * an entry becomes completely obsolete, we can remove it, thus reducing
- * the memory used by the mapping. The complete picture of obsolescence
- * is given by the following data structures, described below:
- * - the entry-specific obsolete count
- * - the vdev-specific obsolete spacemap
- * - the pool-specific obsolete bpobj
- *
- * == On disk data structures used ==
- *
- * We track the obsolete space for the pool using several objects. Each
- * of these objects is created on demand and freed when no longer
- * needed, and is assumed to be empty if it does not exist.
- * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
- *
- * - Each vic_mapping_object (associated with an indirect vdev) can
- * have a vimp_counts_object. This is an array of uint32_t's
- * with the same number of entries as the vic_mapping_object. When
- * the mapping is condensed, entries from the vic_obsolete_sm_object
- * (see below) are folded into the counts. Therefore, each
- * obsolete_counts entry tells us the number of bytes in the
- * corresponding mapping entry that were not referenced when the
- * mapping was last condensed.
- *
- * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
- * This is a space map containing an alloc entry for every DVA that
- * has been obsoleted since the last time this indirect vdev was
- * condensed. We use this object in order to improve performance
- * when marking a DVA as obsolete. Instead of modifying an arbitrary
- * offset of the vimp_counts_object, we only need to append an entry
- * to the end of this object. When a DVA becomes obsolete, it is
- * added to the obsolete space map. This happens when the DVA is
- * freed, remapped and not referenced by a snapshot, or the last
- * snapshot referencing it is destroyed.
- *
- * - Each dataset can have a ds_remap_deadlist object. This is a
- * deadlist object containing all blocks that were remapped in this
- * dataset but referenced in a previous snapshot. Blocks can *only*
- * appear on this list if they were remapped (dsl_dataset_block_remapped);
- * blocks that were killed in a head dataset are put on the normal
- * ds_deadlist and marked obsolete when they are freed.
- *
- * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
- * in the pool that need to be marked obsolete. When a snapshot is
- * destroyed, we move some of the ds_remap_deadlist to the obsolete
- * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
- * asynchronously process the obsolete bpobj, moving its entries to
- * the specific vdevs' obsolete space maps.
- *
- * == Summary of how we mark blocks as obsolete ==
- *
- * - When freeing a block: if any DVA is on an indirect vdev, append to
- * vic_obsolete_sm_object.
- * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
- * references; otherwise append to vic_obsolete_sm_object).
- * - When freeing a snapshot: move parts of ds_remap_deadlist to
- * dp_obsolete_bpobj (same algorithm as ds_deadlist).
- * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
- * individual vdev's vic_obsolete_sm_object.
- */
-
-/*
- * "Big theory statement" for how we condense indirect vdevs.
- *
- * Condensing an indirect vdev's mapping is the process of determining
- * the precise counts of obsolete space for each mapping entry (by
- * integrating the obsolete spacemap into the obsolete counts) and
- * writing out a new mapping that contains only referenced entries.
- *
- * We condense a vdev when we expect the mapping to shrink (see
- * vdev_indirect_should_condense()), but only perform one condense at a
- * time to limit the memory usage. In addition, we use a separate
- * open-context thread (spa_condense_indirect_thread) to incrementally
- * create the new mapping object in a way that minimizes the impact on
- * the rest of the system.
- *
- * == Generating a new mapping ==
- *
- * To generate a new mapping, we follow these steps:
- *
- * 1. Save the old obsolete space map and create a new mapping object
- * (see spa_condense_indirect_start_sync()). This initializes the
- * spa_condensing_indirect_phys with the "previous obsolete space map",
- * which is now read only. Newly obsolete DVAs will be added to a
- * new (initially empty) obsolete space map, and will not be
- * considered as part of this condense operation.
- *
- * 2. Construct in memory the precise counts of obsolete space for each
- * mapping entry, by incorporating the obsolete space map into the
- * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
- *
- * 3. Iterate through each mapping entry, writing to the new mapping any
- * entries that are not completely obsolete (i.e. which don't have
- * obsolete count == mapping length). (See
- * spa_condense_indirect_generate_new_mapping().)
- *
- * 4. Destroy the old mapping object and switch over to the new one
- * (spa_condense_indirect_complete_sync).
- *
- * == Restarting from failure ==
- *
- * To restart the condense when we import/open the pool, we must start
- * at the 2nd step above: reconstruct the precise counts in memory,
- * based on the space map + counts. Then in the 3rd step, we start
- * iterating where we left off: at vimp_max_offset of the new mapping
- * object.
- */
-
-boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
-
-/*
- * Condense if at least this percent of the bytes in the mapping is
- * obsolete. With the default of 25%, the amount of space mapped
- * will be reduced to 1% of its original size after at most 16
- * condenses. Higher values will condense less often (causing less
- * i/o); lower values will reduce the mapping size more quickly.
- */
-int zfs_indirect_condense_obsolete_pct = 25;
-
-/*
- * Condense if the obsolete space map takes up more than this amount of
- * space on disk (logically). This limits the amount of disk space
- * consumed by the obsolete space map; the default of 1GB is small enough
- * that we typically don't mind "wasting" it.
- */
-uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
-
-/*
- * Don't bother condensing if the mapping uses less than this amount of
- * memory. The default of 128KB is considered a "trivial" amount of
- * memory and not worth reducing.
- */
-uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
-
-/*
- * This is used by the test suite so that it can ensure that certain
- * actions happen while in the middle of a condense (which might otherwise
- * complete too quickly). If used to reduce the performance impact of
- * condensing in production, a maximum value of 1 should be sufficient.
- */
-int zfs_condense_indirect_commit_entry_delay_ticks = 0;
-
-/*
- * If an indirect split block contains more than this many possible unique
- * combinations when being reconstructed, consider it too computationally
- * expensive to check them all. Instead, try at most 100 randomly-selected
- * combinations each time the block is accessed. This allows all segment
- * copies to participate fairly in the reconstruction when all combinations
- * cannot be checked and prevents repeated use of one bad copy.
- */
-int zfs_reconstruct_indirect_combinations_max = 256;
-
-
-/*
- * Enable to simulate damaged segments and validate reconstruction.
- * Used by ztest
- */
-unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
-
-/*
- * The indirect_child_t represents the vdev that we will read from, when we
- * need to read all copies of the data (e.g. for scrub or reconstruction).
- * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
- * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
- * ic_vdev is a child of the mirror.
- */
-typedef struct indirect_child {
- abd_t *ic_data;
- vdev_t *ic_vdev;
-
- /*
- * ic_duplicate is NULL when the ic_data contents are unique, when it
- * is determined to be a duplicate it references the primary child.
- */
- struct indirect_child *ic_duplicate;
- list_node_t ic_node; /* node on is_unique_child */
-} indirect_child_t;
-
-/*
- * The indirect_split_t represents one mapped segment of an i/o to the
- * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
- * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
- * For split blocks, there will be several of these.
- */
-typedef struct indirect_split {
- list_node_t is_node; /* link on iv_splits */
-
- /*
- * is_split_offset is the offset into the i/o.
- * This is the sum of the previous splits' is_size's.
- */
- uint64_t is_split_offset;
-
- vdev_t *is_vdev; /* top-level vdev */
- uint64_t is_target_offset; /* offset on is_vdev */
- uint64_t is_size;
- int is_children; /* number of entries in is_child[] */
- int is_unique_children; /* number of entries in is_unique_child */
- list_t is_unique_child;
-
- /*
- * is_good_child is the child that we are currently using to
- * attempt reconstruction.
- */
- indirect_child_t *is_good_child;
-
- indirect_child_t is_child[1]; /* variable-length */
-} indirect_split_t;
-
-/*
- * The indirect_vsd_t is associated with each i/o to the indirect vdev.
- * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
- */
-typedef struct indirect_vsd {
- boolean_t iv_split_block;
- boolean_t iv_reconstruct;
- uint64_t iv_unique_combinations;
- uint64_t iv_attempts;
- uint64_t iv_attempts_max;
-
- list_t iv_splits; /* list of indirect_split_t's */
-} indirect_vsd_t;
-
-static void
-vdev_indirect_map_free(zio_t *zio)
-{
- indirect_vsd_t *iv = zio->io_vsd;
-
- indirect_split_t *is;
- while ((is = list_head(&iv->iv_splits)) != NULL) {
- for (int c = 0; c < is->is_children; c++) {
- indirect_child_t *ic = &is->is_child[c];
- if (ic->ic_data != NULL)
- abd_free(ic->ic_data);
- }
- list_remove(&iv->iv_splits, is);
-
- indirect_child_t *ic;
- while ((ic = list_head(&is->is_unique_child)) != NULL)
- list_remove(&is->is_unique_child, ic);
-
- list_destroy(&is->is_unique_child);
-
- kmem_free(is,
- offsetof(indirect_split_t, is_child[is->is_children]));
- }
- kmem_free(iv, sizeof (*iv));
-}
-
-static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
- vdev_indirect_map_free,
- zio_vsd_default_cksum_report
-};
-/*
- * Mark the given offset and size as being obsolete.
- */
-void
-vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
- ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
- ASSERT(size > 0);
- VERIFY(vdev_indirect_mapping_entry_for_offset(
- vd->vdev_indirect_mapping, offset) != NULL);
-
- if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
- mutex_enter(&vd->vdev_obsolete_lock);
- range_tree_add(vd->vdev_obsolete_segments, offset, size);
- mutex_exit(&vd->vdev_obsolete_lock);
- vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
- }
-}
-
-/*
- * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
- * wrapper is provided because the DMU does not know about vdev_t's and
- * cannot directly call vdev_indirect_mark_obsolete.
- */
-void
-spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
- uint64_t size, dmu_tx_t *tx)
-{
- vdev_t *vd = vdev_lookup_top(spa, vdev_id);
- ASSERT(dmu_tx_is_syncing(tx));
-
- /* The DMU can only remap indirect vdevs. */
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
- vdev_indirect_mark_obsolete(vd, offset, size);
-}
-
-static spa_condensing_indirect_t *
-spa_condensing_indirect_create(spa_t *spa)
-{
- spa_condensing_indirect_phys_t *scip =
- &spa->spa_condensing_indirect_phys;
- spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
- objset_t *mos = spa->spa_meta_objset;
-
- for (int i = 0; i < TXG_SIZE; i++) {
- list_create(&sci->sci_new_mapping_entries[i],
- sizeof (vdev_indirect_mapping_entry_t),
- offsetof(vdev_indirect_mapping_entry_t, vime_node));
- }
-
- sci->sci_new_mapping =
- vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
-
- return (sci);
-}
-
-static void
-spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
-{
- for (int i = 0; i < TXG_SIZE; i++)
- list_destroy(&sci->sci_new_mapping_entries[i]);
-
- if (sci->sci_new_mapping != NULL)
- vdev_indirect_mapping_close(sci->sci_new_mapping);
-
- kmem_free(sci, sizeof (*sci));
-}
-
-boolean_t
-vdev_indirect_should_condense(vdev_t *vd)
-{
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
-
- if (!zfs_condense_indirect_vdevs_enable)
- return (B_FALSE);
-
- /*
- * We can only condense one indirect vdev at a time.
- */
- if (spa->spa_condensing_indirect != NULL)
- return (B_FALSE);
-
- if (spa_shutting_down(spa))
- return (B_FALSE);
-
- /*
- * The mapping object size must not change while we are
- * condensing, so we can only condense indirect vdevs
- * (not vdevs that are still in the middle of being removed).
- */
- if (vd->vdev_ops != &vdev_indirect_ops)
- return (B_FALSE);
-
- /*
- * If nothing new has been marked obsolete, there is no
- * point in condensing.
- */
- if (vd->vdev_obsolete_sm == NULL) {
- ASSERT0(vdev_obsolete_sm_object(vd));
- return (B_FALSE);
- }
-
- ASSERT(vd->vdev_obsolete_sm != NULL);
-
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
- space_map_object(vd->vdev_obsolete_sm));
-
- uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
- uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
- uint64_t mapping_size = vdev_indirect_mapping_size(vim);
- uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
-
- ASSERT3U(bytes_obsolete, <=, bytes_mapped);
-
- /*
- * If a high percentage of the bytes that are mapped have become
- * obsolete, condense (unless the mapping is already small enough).
- * This has a good chance of reducing the amount of memory used
- * by the mapping.
- */
- if (bytes_obsolete * 100 / bytes_mapped >=
- zfs_indirect_condense_obsolete_pct &&
- mapping_size > zfs_condense_min_mapping_bytes) {
- zfs_dbgmsg("should condense vdev %llu because obsolete "
- "spacemap covers %d%% of %lluMB mapping",
- (u_longlong_t)vd->vdev_id,
- (int)(bytes_obsolete * 100 / bytes_mapped),
- (u_longlong_t)bytes_mapped / 1024 / 1024);
- return (B_TRUE);
- }
-
- /*
- * If the obsolete space map takes up too much space on disk,
- * condense in order to free up this disk space.
- */
- if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
- zfs_dbgmsg("should condense vdev %llu because obsolete sm "
- "length %lluMB >= max size %lluMB",
- (u_longlong_t)vd->vdev_id,
- (u_longlong_t)obsolete_sm_size / 1024 / 1024,
- (u_longlong_t)zfs_condense_max_obsolete_bytes /
- 1024 / 1024);
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * This sync task completes (finishes) a condense, deleting the old
- * mapping and replacing it with the new one.
- */
-static void
-spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
-{
- spa_condensing_indirect_t *sci = arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- spa_condensing_indirect_phys_t *scip =
- &spa->spa_condensing_indirect_phys;
- vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- objset_t *mos = spa->spa_meta_objset;
- vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
- uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
- uint64_t new_count =
- vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
- ASSERT3P(sci, ==, spa->spa_condensing_indirect);
- for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
- }
- ASSERT(vic->vic_mapping_object != 0);
- ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
- ASSERT(scip->scip_next_mapping_object != 0);
- ASSERT(scip->scip_prev_obsolete_sm_object != 0);
-
- /*
- * Reset vdev_indirect_mapping to refer to the new object.
- */
- rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
- vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
- vd->vdev_indirect_mapping = sci->sci_new_mapping;
- rw_exit(&vd->vdev_indirect_rwlock);
-
- sci->sci_new_mapping = NULL;
- vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
- vic->vic_mapping_object = scip->scip_next_mapping_object;
- scip->scip_next_mapping_object = 0;
-
- space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
- spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- scip->scip_prev_obsolete_sm_object = 0;
-
- scip->scip_vdev = 0;
-
- VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_CONDENSING_INDIRECT, tx));
- spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
- spa->spa_condensing_indirect = NULL;
-
- zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
- "new mapping object %llu has %llu entries "
- "(was %llu entries)",
- vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
- new_count, old_count);
-
- vdev_config_dirty(spa->spa_root_vdev);
-}
-
-/*
- * This sync task appends entries to the new mapping object.
- */
-static void
-spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
-{
- spa_condensing_indirect_t *sci = arg;
- uint64_t txg = dmu_tx_get_txg(tx);
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT3P(sci, ==, spa->spa_condensing_indirect);
-
- vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
- &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
- ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
-}
-
-/*
- * Open-context function to add one entry to the new mapping. The new
- * entry will be remembered and written from syncing context.
- */
-static void
-spa_condense_indirect_commit_entry(spa_t *spa,
- vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
-{
- spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
-
- ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
-
- dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
-
- /*
- * If we are the first entry committed this txg, kick off the sync
- * task to write to the MOS on our behalf.
- */
- if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
- dsl_sync_task_nowait(dmu_tx_pool(tx),
- spa_condense_indirect_commit_sync, sci,
- 0, ZFS_SPACE_CHECK_NONE, tx);
- }
-
- vdev_indirect_mapping_entry_t *vime =
- kmem_alloc(sizeof (*vime), KM_SLEEP);
- vime->vime_mapping = *vimep;
- vime->vime_obsolete_count = count;
- list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
-
- dmu_tx_commit(tx);
-}
-
-static void
-spa_condense_indirect_generate_new_mapping(vdev_t *vd,
- uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
-{
- spa_t *spa = vd->vdev_spa;
- uint64_t mapi = start_index;
- vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
- uint64_t old_num_entries =
- vdev_indirect_mapping_num_entries(old_mapping);
-
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
- ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
-
- zfs_dbgmsg("starting condense of vdev %llu from index %llu",
- (u_longlong_t)vd->vdev_id,
- (u_longlong_t)mapi);
-
- while (mapi < old_num_entries) {
-
- if (zthr_iscancelled(zthr)) {
- zfs_dbgmsg("pausing condense of vdev %llu "
- "at index %llu", (u_longlong_t)vd->vdev_id,
- (u_longlong_t)mapi);
- break;
- }
-
- vdev_indirect_mapping_entry_phys_t *entry =
- &old_mapping->vim_entries[mapi];
- uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
- ASSERT3U(obsolete_counts[mapi], <=, entry_size);
- if (obsolete_counts[mapi] < entry_size) {
- spa_condense_indirect_commit_entry(spa, entry,
- obsolete_counts[mapi]);
-
- /*
- * This delay may be requested for testing, debugging,
- * or performance reasons.
- */
- delay(zfs_condense_indirect_commit_entry_delay_ticks);
- }
-
- mapi++;
- }
-}
-
-/* ARGSUSED */
-static boolean_t
-spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
-{
- spa_t *spa = arg;
-
- return (spa->spa_condensing_indirect != NULL);
-}
-
-/* ARGSUSED */
-static void
-spa_condense_indirect_thread(void *arg, zthr_t *zthr)
-{
- spa_t *spa = arg;
- vdev_t *vd;
-
- ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
- ASSERT3P(vd, !=, NULL);
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
- spa_condensing_indirect_phys_t *scip =
- &spa->spa_condensing_indirect_phys;
- uint32_t *counts;
- uint64_t start_index;
- vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
- space_map_t *prev_obsolete_sm = NULL;
-
- ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
- ASSERT(scip->scip_next_mapping_object != 0);
- ASSERT(scip->scip_prev_obsolete_sm_object != 0);
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
-
- for (int i = 0; i < TXG_SIZE; i++) {
- /*
- * The list must start out empty in order for the
- * _commit_sync() sync task to be properly registered
- * on the first call to _commit_entry(); so it's wise
- * to double check and ensure we actually are starting
- * with empty lists.
- */
- ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
- }
-
- VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
- scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
- counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
- if (prev_obsolete_sm != NULL) {
- vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
- counts, prev_obsolete_sm);
- }
- space_map_close(prev_obsolete_sm);
-
- /*
- * Generate new mapping. Determine what index to continue from
- * based on the max offset that we've already written in the
- * new mapping.
- */
- uint64_t max_offset =
- vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
- if (max_offset == 0) {
- /* We haven't written anything to the new mapping yet. */
- start_index = 0;
- } else {
- /*
- * Pick up from where we left off. _entry_for_offset()
- * returns a pointer into the vim_entries array. If
- * max_offset is greater than any of the mappings
- * contained in the table NULL will be returned and
- * that indicates we've exhausted our iteration of the
- * old_mapping.
- */
-
- vdev_indirect_mapping_entry_phys_t *entry =
- vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
- max_offset);
-
- if (entry == NULL) {
- /*
- * We've already written the whole new mapping.
- * This special value will cause us to skip the
- * generate_new_mapping step and just do the sync
- * task to complete the condense.
- */
- start_index = UINT64_MAX;
- } else {
- start_index = entry - old_mapping->vim_entries;
- ASSERT3U(start_index, <,
- vdev_indirect_mapping_num_entries(old_mapping));
- }
- }
-
- spa_condense_indirect_generate_new_mapping(vd, counts,
- start_index, zthr);
-
- vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
-
- /*
- * If the zthr has received a cancellation signal while running
- * in generate_new_mapping() or at any point after that, then bail
- * early. We don't want to complete the condense if the spa is
- * shutting down.
- */
- if (zthr_iscancelled(zthr))
- return;
-
- VERIFY0(dsl_sync_task(spa_name(spa), NULL,
- spa_condense_indirect_complete_sync, sci, 0,
- ZFS_SPACE_CHECK_EXTRA_RESERVED));
-}
-
-/*
- * Sync task to begin the condensing process.
- */
-void
-spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
-{
- spa_t *spa = vd->vdev_spa;
- spa_condensing_indirect_phys_t *scip =
- &spa->spa_condensing_indirect_phys;
-
- ASSERT0(scip->scip_next_mapping_object);
- ASSERT0(scip->scip_prev_obsolete_sm_object);
- ASSERT0(scip->scip_vdev);
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
- ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
- ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
-
- uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
- ASSERT(obsolete_sm_obj != 0);
-
- scip->scip_vdev = vd->vdev_id;
- scip->scip_next_mapping_object =
- vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
-
- scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
-
- /*
- * We don't need to allocate a new space map object, since
- * vdev_indirect_sync_obsolete will allocate one when needed.
- */
- space_map_close(vd->vdev_obsolete_sm);
- vd->vdev_obsolete_sm = NULL;
- VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
-
- VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
- sizeof (*scip) / sizeof (uint64_t), scip, tx));
-
- ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
- spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
-
- zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
- "posm=%llu nm=%llu",
- vd->vdev_id, dmu_tx_get_txg(tx),
- (u_longlong_t)scip->scip_prev_obsolete_sm_object,
- (u_longlong_t)scip->scip_next_mapping_object);
-
- zthr_wakeup(spa->spa_condense_zthr);
-}
-
-/*
- * Sync to the given vdev's obsolete space map any segments that are no longer
- * referenced as of the given txg.
- *
- * If the obsolete space map doesn't exist yet, create and open it.
- */
-void
-vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-
- ASSERT3U(vic->vic_mapping_object, !=, 0);
- ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
- ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
- ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
-
- if (vdev_obsolete_sm_object(vd) == 0) {
- uint64_t obsolete_sm_object =
- space_map_alloc(spa->spa_meta_objset,
- vdev_standard_sm_blksz, tx);
-
- ASSERT(vd->vdev_top_zap != 0);
- VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
- sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
- ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
-
- spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
- spa->spa_meta_objset, obsolete_sm_object,
- 0, vd->vdev_asize, 0));
- }
-
- ASSERT(vd->vdev_obsolete_sm != NULL);
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
- space_map_object(vd->vdev_obsolete_sm));
-
- space_map_write(vd->vdev_obsolete_sm,
- vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
- range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
-}
-
-int
-spa_condense_init(spa_t *spa)
-{
- int error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
- sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
- &spa->spa_condensing_indirect_phys);
- if (error == 0) {
- if (spa_writeable(spa)) {
- spa->spa_condensing_indirect =
- spa_condensing_indirect_create(spa);
- }
- return (0);
- } else if (error == ENOENT) {
- return (0);
- } else {
- return (error);
- }
-}
-
-void
-spa_condense_fini(spa_t *spa)
-{
- if (spa->spa_condensing_indirect != NULL) {
- spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
- spa->spa_condensing_indirect = NULL;
- }
-}
-
-void
-spa_start_indirect_condensing_thread(spa_t *spa)
-{
- ASSERT3P(spa->spa_condense_zthr, ==, NULL);
- spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
- spa_condense_indirect_thread, spa);
-}
-
-/*
- * Gets the obsolete spacemap object from the vdev's ZAP.
- * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
- * exist yet.
- */
-int
-vdev_obsolete_sm_object(vdev_t *vd)
-{
- ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
- if (vd->vdev_top_zap == 0) {
- return (0);
- }
-
- uint64_t sm_obj = 0;
- int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
-
- ASSERT(err == 0 || err == ENOENT);
-
- return (sm_obj);
-}
-
-boolean_t
-vdev_obsolete_counts_are_precise(vdev_t *vd)
-{
- ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
- if (vd->vdev_top_zap == 0) {
- return (B_FALSE);
- }
-
- uint64_t val = 0;
- int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
-
- ASSERT(err == 0 || err == ENOENT);
-
- return (val != 0);
-}
-
-/* ARGSUSED */
-static void
-vdev_indirect_close(vdev_t *vd)
-{
-}
-
-/* ARGSUSED */
-static int
-vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *logical_ashift, uint64_t *physical_ashift)
-{
- *psize = *max_psize = vd->vdev_asize +
- VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
- *logical_ashift = vd->vdev_ashift;
- *physical_ashift = vd->vdev_physical_ashift;
- return (0);
-}
-
-typedef struct remap_segment {
- vdev_t *rs_vd;
- uint64_t rs_offset;
- uint64_t rs_asize;
- uint64_t rs_split_offset;
- list_node_t rs_node;
-} remap_segment_t;
-
-remap_segment_t *
-rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
-{
- remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
- rs->rs_vd = vd;
- rs->rs_offset = offset;
- rs->rs_asize = asize;
- rs->rs_split_offset = split_offset;
- return (rs);
-}
-
-/*
- * Given an indirect vdev and an extent on that vdev, it duplicates the
- * physical entries of the indirect mapping that correspond to the extent
- * to a new array and returns a pointer to it. In addition, copied_entries
- * is populated with the number of mapping entries that were duplicated.
- *
- * Note that the function assumes that the caller holds vdev_indirect_rwlock.
- * This ensures that the mapping won't change due to condensing as we
- * copy over its contents.
- *
- * Finally, since we are doing an allocation, it is up to the caller to
- * free the array allocated in this function.
- */
-vdev_indirect_mapping_entry_phys_t *
-vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
- uint64_t asize, uint64_t *copied_entries)
-{
- vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- uint64_t entries = 0;
-
- ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
-
- vdev_indirect_mapping_entry_phys_t *first_mapping =
- vdev_indirect_mapping_entry_for_offset(vim, offset);
- ASSERT3P(first_mapping, !=, NULL);
-
- vdev_indirect_mapping_entry_phys_t *m = first_mapping;
- while (asize > 0) {
- uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
-
- ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
- ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
-
- uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
- uint64_t inner_size = MIN(asize, size - inner_offset);
-
- offset += inner_size;
- asize -= inner_size;
- entries++;
- m++;
- }
-
- size_t copy_length = entries * sizeof (*first_mapping);
- duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
- bcopy(first_mapping, duplicate_mappings, copy_length);
- *copied_entries = entries;
-
- return (duplicate_mappings);
-}
-
-/*
- * Goes through the relevant indirect mappings until it hits a concrete vdev
- * and issues the callback. On the way to the concrete vdev, if any other
- * indirect vdevs are encountered, then the callback will also be called on
- * each of those indirect vdevs. For example, if the segment is mapped to
- * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
- * mapped to segment B on concrete vdev 2, then the callback will be called on
- * both vdev 1 and vdev 2.
- *
- * While the callback passed to vdev_indirect_remap() is called on every vdev
- * the function encounters, certain callbacks only care about concrete vdevs.
- * These types of callbacks should return immediately and explicitly when they
- * are called on an indirect vdev.
- *
- * Because there is a possibility that a DVA section in the indirect device
- * has been split into multiple sections in our mapping, we keep track
- * of the relevant contiguous segments of the new location (remap_segment_t)
- * in a stack. This way we can call the callback for each of the new sections
- * created by a single section of the indirect device. Note though, that in
- * this scenario the callbacks in each split block won't occur in-order in
- * terms of offset, so callers should not make any assumptions about that.
- *
- * For callbacks that don't handle split blocks and immediately return when
- * they encounter them (as is the case for remap_blkptr_cb), the caller can
- * assume that its callback will be applied from the first indirect vdev
- * encountered to the last one and then the concrete vdev, in that order.
- */
-static void
-vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
- void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
-{
- list_t stack;
- spa_t *spa = vd->vdev_spa;
-
- list_create(&stack, sizeof (remap_segment_t),
- offsetof(remap_segment_t, rs_node));
-
- for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
- rs != NULL; rs = list_remove_head(&stack)) {
- vdev_t *v = rs->rs_vd;
- uint64_t num_entries = 0;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
- ASSERT(rs->rs_asize > 0);
-
- /*
- * Note: As this function can be called from open context
- * (e.g. zio_read()), we need the following rwlock to
- * prevent the mapping from being changed by condensing.
- *
- * So we grab the lock and we make a copy of the entries
- * that are relevant to the extent that we are working on.
- * Once that is done, we drop the lock and iterate over
- * our copy of the mapping. Once we are done with the with
- * the remap segment and we free it, we also free our copy
- * of the indirect mapping entries that are relevant to it.
- *
- * This way we don't need to wait until the function is
- * finished with a segment, to condense it. In addition, we
- * don't need a recursive rwlock for the case that a call to
- * vdev_indirect_remap() needs to call itself (through the
- * codepath of its callback) for the same vdev in the middle
- * of its execution.
- */
- rw_enter(&v->vdev_indirect_rwlock, RW_READER);
- vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
- ASSERT3P(vim, !=, NULL);
-
- vdev_indirect_mapping_entry_phys_t *mapping =
- vdev_indirect_mapping_duplicate_adjacent_entries(v,
- rs->rs_offset, rs->rs_asize, &num_entries);
- ASSERT3P(mapping, !=, NULL);
- ASSERT3U(num_entries, >, 0);
- rw_exit(&v->vdev_indirect_rwlock);
-
- for (uint64_t i = 0; i < num_entries; i++) {
- /*
- * Note: the vdev_indirect_mapping can not change
- * while we are running. It only changes while the
- * removal is in progress, and then only from syncing
- * context. While a removal is in progress, this
- * function is only called for frees, which also only
- * happen from syncing context.
- */
- vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
-
- ASSERT3P(m, !=, NULL);
- ASSERT3U(rs->rs_asize, >, 0);
-
- uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
- uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
- uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
-
- ASSERT3U(rs->rs_offset, >=,
- DVA_MAPPING_GET_SRC_OFFSET(m));
- ASSERT3U(rs->rs_offset, <,
- DVA_MAPPING_GET_SRC_OFFSET(m) + size);
- ASSERT3U(dst_vdev, !=, v->vdev_id);
-
- uint64_t inner_offset = rs->rs_offset -
- DVA_MAPPING_GET_SRC_OFFSET(m);
- uint64_t inner_size =
- MIN(rs->rs_asize, size - inner_offset);
-
- vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
- ASSERT3P(dst_v, !=, NULL);
-
- if (dst_v->vdev_ops == &vdev_indirect_ops) {
- list_insert_head(&stack,
- rs_alloc(dst_v, dst_offset + inner_offset,
- inner_size, rs->rs_split_offset));
-
- }
-
- if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
- IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
- /*
- * Note: This clause exists only solely for
- * testing purposes. We use it to ensure that
- * split blocks work and that the callbacks
- * using them yield the same result if issued
- * in reverse order.
- */
- uint64_t inner_half = inner_size / 2;
-
- func(rs->rs_split_offset + inner_half, dst_v,
- dst_offset + inner_offset + inner_half,
- inner_half, arg);
-
- func(rs->rs_split_offset, dst_v,
- dst_offset + inner_offset,
- inner_half, arg);
- } else {
- func(rs->rs_split_offset, dst_v,
- dst_offset + inner_offset,
- inner_size, arg);
- }
-
- rs->rs_offset += inner_size;
- rs->rs_asize -= inner_size;
- rs->rs_split_offset += inner_size;
- }
- VERIFY0(rs->rs_asize);
-
- kmem_free(mapping, num_entries * sizeof (*mapping));
- kmem_free(rs, sizeof (remap_segment_t));
- }
- list_destroy(&stack);
-}
-
-static void
-vdev_indirect_child_io_done(zio_t *zio)
-{
- zio_t *pio = zio->io_private;
-
- mutex_enter(&pio->io_lock);
- pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
- mutex_exit(&pio->io_lock);
-
-#ifdef __FreeBSD__
- if (zio->io_abd != NULL)
-#endif
- abd_put(zio->io_abd);
-}
-
-/*
- * This is a callback for vdev_indirect_remap() which allocates an
- * indirect_split_t for each split segment and adds it to iv_splits.
- */
-static void
-vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
- uint64_t size, void *arg)
-{
- zio_t *zio = arg;
- indirect_vsd_t *iv = zio->io_vsd;
-
- ASSERT3P(vd, !=, NULL);
-
- if (vd->vdev_ops == &vdev_indirect_ops)
- return;
-
- int n = 1;
- if (vd->vdev_ops == &vdev_mirror_ops)
- n = vd->vdev_children;
-
- indirect_split_t *is =
- kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
-
- is->is_children = n;
- is->is_size = size;
- is->is_split_offset = split_offset;
- is->is_target_offset = offset;
- is->is_vdev = vd;
- list_create(&is->is_unique_child, sizeof (indirect_child_t),
- offsetof(indirect_child_t, ic_node));
-
- /*
- * Note that we only consider multiple copies of the data for
- * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
- * though they use the same ops as mirror, because there's only one
- * "good" copy under the replacing/spare.
- */
- if (vd->vdev_ops == &vdev_mirror_ops) {
- for (int i = 0; i < n; i++) {
- is->is_child[i].ic_vdev = vd->vdev_child[i];
- list_link_init(&is->is_child[i].ic_node);
- }
- } else {
- is->is_child[0].ic_vdev = vd;
- }
-
- list_insert_tail(&iv->iv_splits, is);
-}
-
-static void
-vdev_indirect_read_split_done(zio_t *zio)
-{
- indirect_child_t *ic = zio->io_private;
-
- if (zio->io_error != 0) {
- /*
- * Clear ic_data to indicate that we do not have data for this
- * child.
- */
- abd_free(ic->ic_data);
- ic->ic_data = NULL;
- }
-}
-
-/*
- * Issue reads for all copies (mirror children) of all splits.
- */
-static void
-vdev_indirect_read_all(zio_t *zio)
-{
- indirect_vsd_t *iv = zio->io_vsd;
-
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- for (int i = 0; i < is->is_children; i++) {
- indirect_child_t *ic = &is->is_child[i];
-
- if (!vdev_readable(ic->ic_vdev))
- continue;
-
- /*
- * Note, we may read from a child whose DTL
- * indicates that the data may not be present here.
- * While this might result in a few i/os that will
- * likely return incorrect data, it simplifies the
- * code since we can treat scrub and resilver
- * identically. (The incorrect data will be
- * detected and ignored when we verify the
- * checksum.)
- */
-
- ic->ic_data = abd_alloc_sametype(zio->io_abd,
- is->is_size);
- ic->ic_duplicate = NULL;
-
- zio_nowait(zio_vdev_child_io(zio, NULL,
- ic->ic_vdev, is->is_target_offset, ic->ic_data,
- is->is_size, zio->io_type, zio->io_priority, 0,
- vdev_indirect_read_split_done, ic));
- }
- }
- iv->iv_reconstruct = B_TRUE;
-}
-
-static void
-vdev_indirect_io_start(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
- list_create(&iv->iv_splits,
- sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
-
- zio->io_vsd = iv;
- zio->io_vsd_ops = &vdev_indirect_vsd_ops;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-#ifdef __FreeBSD__
- if (zio->io_type == ZIO_TYPE_WRITE) {
-#else
- if (zio->io_type != ZIO_TYPE_READ) {
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
-#endif
- /*
- * Note: this code can handle other kinds of writes,
- * but we don't expect them.
- */
- ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
- ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
- }
-
- vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
- vdev_indirect_gather_splits, zio);
-
- indirect_split_t *first = list_head(&iv->iv_splits);
- if (first->is_size == zio->io_size) {
- /*
- * This is not a split block; we are pointing to the entire
- * data, which will checksum the same as the original data.
- * Pass the BP down so that the child i/o can verify the
- * checksum, and try a different location if available
- * (e.g. on a mirror).
- *
- * While this special case could be handled the same as the
- * general (split block) case, doing it this way ensures
- * that the vast majority of blocks on indirect vdevs
- * (which are not split) are handled identically to blocks
- * on non-indirect vdevs. This allows us to be less strict
- * about performance in the general (but rare) case.
- */
- ASSERT0(first->is_split_offset);
- ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- first->is_vdev, first->is_target_offset,
-#ifdef __FreeBSD__
- zio->io_abd == NULL ? NULL :
-#endif
- abd_get_offset(zio->io_abd, 0),
- zio->io_size, zio->io_type, zio->io_priority, 0,
- vdev_indirect_child_io_done, zio));
- } else {
- iv->iv_split_block = B_TRUE;
- if (zio->io_type == ZIO_TYPE_READ &&
- zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
- /*
- * Read all copies. Note that for simplicity,
- * we don't bother consulting the DTL in the
- * resilver case.
- */
- vdev_indirect_read_all(zio);
- } else {
- /*
- * If this is a read zio, we read one copy of each
- * split segment, from the top-level vdev. Since
- * we don't know the checksum of each split
- * individually, the child zio can't ensure that
- * we get the right data. E.g. if it's a mirror,
- * it will just read from a random (healthy) leaf
- * vdev. We have to verify the checksum in
- * vdev_indirect_io_done().
- *
- * For write zios, the vdev code will ensure we write
- * to all children.
- */
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- zio_nowait(zio_vdev_child_io(zio, NULL,
- is->is_vdev, is->is_target_offset,
-#ifdef __FreeBSD__
- zio->io_abd == NULL ? NULL :
-#endif
- abd_get_offset(zio->io_abd,
- is->is_split_offset),
- is->is_size, zio->io_type,
- zio->io_priority, 0,
- vdev_indirect_child_io_done, zio));
- }
- }
- }
-
- zio_execute(zio);
-}
-
-/*
- * Report a checksum error for a child.
- */
-static void
-vdev_indirect_checksum_error(zio_t *zio,
- indirect_split_t *is, indirect_child_t *ic)
-{
- vdev_t *vd = ic->ic_vdev;
-
- if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
- return;
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- zio_bad_cksum_t zbc = { 0 };
- void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
- abd_t *good_abd = is->is_good_child->ic_data;
- void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
- zfs_ereport_post_checksum(zio->io_spa, vd, zio,
- is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
- abd_return_buf(ic->ic_data, bad_buf, is->is_size);
- abd_return_buf(good_abd, good_buf, is->is_size);
-}
-
-/*
- * Issue repair i/os for any incorrect copies. We do this by comparing
- * each split segment's correct data (is_good_child's ic_data) with each
- * other copy of the data. If they differ, then we overwrite the bad data
- * with the good copy. Note that we do this without regard for the DTL's,
- * which simplifies this code and also issues the optimal number of writes
- * (based on which copies actually read bad data, as opposed to which we
- * think might be wrong). For the same reason, we always use
- * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
- */
-static void
-vdev_indirect_repair(zio_t *zio)
-{
- indirect_vsd_t *iv = zio->io_vsd;
-
- enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
-
- if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
- flags |= ZIO_FLAG_SELF_HEAL;
-
- if (!spa_writeable(zio->io_spa))
- return;
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- for (int c = 0; c < is->is_children; c++) {
- indirect_child_t *ic = &is->is_child[c];
- if (ic == is->is_good_child)
- continue;
- if (ic->ic_data == NULL)
- continue;
- if (ic->ic_duplicate == is->is_good_child)
- continue;
-
- zio_nowait(zio_vdev_child_io(zio, NULL,
- ic->ic_vdev, is->is_target_offset,
- is->is_good_child->ic_data, is->is_size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
- NULL, NULL));
-
- vdev_indirect_checksum_error(zio, is, ic);
- }
- }
-}
-
-/*
- * Report checksum errors on all children that we read from.
- */
-static void
-vdev_indirect_all_checksum_errors(zio_t *zio)
-{
- indirect_vsd_t *iv = zio->io_vsd;
-
- if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
- return;
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- for (int c = 0; c < is->is_children; c++) {
- indirect_child_t *ic = &is->is_child[c];
-
- if (ic->ic_data == NULL)
- continue;
-
- vdev_t *vd = ic->ic_vdev;
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- zfs_ereport_post_checksum(zio->io_spa, vd, zio,
- is->is_target_offset, is->is_size,
- NULL, NULL, NULL);
- }
- }
-}
-
-/*
- * Copy data from all the splits to a main zio then validate the checksum.
- * If then checksum is successfully validated return success.
- */
-static int
-vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
-{
- zio_bad_cksum_t zbc;
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
-
- ASSERT3P(is->is_good_child->ic_data, !=, NULL);
- ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
-
- abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
- is->is_split_offset, 0, is->is_size);
- }
-
- return (zio_checksum_error(zio, &zbc));
-}
-
-/*
- * There are relatively few possible combinations making it feasible to
- * deterministically check them all. We do this by setting the good_child
- * to the next unique split version. If we reach the end of the list then
- * "carry over" to the next unique split version (like counting in base
- * is_unique_children, but each digit can have a different base).
- */
-static int
-vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
-{
- boolean_t more = B_TRUE;
-
- iv->iv_attempts = 0;
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is))
- is->is_good_child = list_head(&is->is_unique_child);
-
- while (more == B_TRUE) {
- iv->iv_attempts++;
- more = B_FALSE;
-
- if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
- return (0);
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- is->is_good_child = list_next(&is->is_unique_child,
- is->is_good_child);
- if (is->is_good_child != NULL) {
- more = B_TRUE;
- break;
- }
-
- is->is_good_child = list_head(&is->is_unique_child);
- }
- }
-
- ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
-
- return (SET_ERROR(ECKSUM));
-}
-
-/*
- * There are too many combinations to try all of them in a reasonable amount
- * of time. So try a fixed number of random combinations from the unique
- * split versions, after which we'll consider the block unrecoverable.
- */
-static int
-vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
-{
- iv->iv_attempts = 0;
-
- while (iv->iv_attempts < iv->iv_attempts_max) {
- iv->iv_attempts++;
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- indirect_child_t *ic = list_head(&is->is_unique_child);
- int children = is->is_unique_children;
-
- for (int i = spa_get_random(children); i > 0; i--)
- ic = list_next(&is->is_unique_child, ic);
-
- ASSERT3P(ic, !=, NULL);
- is->is_good_child = ic;
- }
-
- if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
- return (0);
- }
-
- return (SET_ERROR(ECKSUM));
-}
-
-/*
- * This is a validation function for reconstruction. It randomly selects
- * a good combination, if one can be found, and then it intentionally
- * damages all other segment copes by zeroing them. This forces the
- * reconstruction algorithm to locate the one remaining known good copy.
- */
-static int
-vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
-{
- /* Presume all the copies are unique for initial selection. */
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- is->is_unique_children = 0;
-
- for (int i = 0; i < is->is_children; i++) {
- indirect_child_t *ic = &is->is_child[i];
- if (ic->ic_data != NULL) {
- is->is_unique_children++;
- list_insert_tail(&is->is_unique_child, ic);
- }
- }
- }
-
- /*
- * Set each is_good_child to a randomly-selected child which
- * is known to contain validated data.
- */
- int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
- if (error)
- goto out;
-
- /*
- * Damage all but the known good copy by zeroing it. This will
- * result in two or less unique copies per indirect_child_t.
- * Both may need to be checked in order to reconstruct the block.
- * Set iv->iv_attempts_max such that all unique combinations will
- * enumerated, but limit the damage to at most 16 indirect splits.
- */
- iv->iv_attempts_max = 1;
-
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- for (int c = 0; c < is->is_children; c++) {
- indirect_child_t *ic = &is->is_child[c];
-
- if (ic == is->is_good_child)
- continue;
- if (ic->ic_data == NULL)
- continue;
-
- abd_zero(ic->ic_data, ic->ic_data->abd_size);
- }
-
- iv->iv_attempts_max *= 2;
- if (iv->iv_attempts_max > (1ULL << 16)) {
- iv->iv_attempts_max = UINT64_MAX;
- break;
- }
- }
-
-out:
- /* Empty the unique children lists so they can be reconstructed. */
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- indirect_child_t *ic;
- while ((ic = list_head(&is->is_unique_child)) != NULL)
- list_remove(&is->is_unique_child, ic);
-
- is->is_unique_children = 0;
- }
-
- return (error);
-}
-
-/*
- * This function is called when we have read all copies of the data and need
- * to try to find a combination of copies that gives us the right checksum.
- *
- * If we pointed to any mirror vdevs, this effectively does the job of the
- * mirror. The mirror vdev code can't do its own job because we don't know
- * the checksum of each split segment individually.
- *
- * We have to try every unique combination of copies of split segments, until
- * we find one that checksums correctly. Duplicate segment copies are first
- * identified and latter skipped during reconstruction. This optimization
- * reduces the search space and ensures that of the remaining combinations
- * at most one is correct.
- *
- * When the total number of combinations is small they can all be checked.
- * For example, if we have 3 segments in the split, and each points to a
- * 2-way mirror with unique copies, we will have the following pieces of data:
- *
- * | mirror child
- * split | [0] [1]
- * ======|=====================
- * A | data_A_0 data_A_1
- * B | data_B_0 data_B_1
- * C | data_C_0 data_C_1
- *
- * We will try the following (mirror children)^(number of splits) (2^3=8)
- * combinations, which is similar to bitwise-little-endian counting in
- * binary. In general each "digit" corresponds to a split segment, and the
- * base of each digit is is_children, which can be different for each
- * digit.
- *
- * "low bit" "high bit"
- * v v
- * data_A_0 data_B_0 data_C_0
- * data_A_1 data_B_0 data_C_0
- * data_A_0 data_B_1 data_C_0
- * data_A_1 data_B_1 data_C_0
- * data_A_0 data_B_0 data_C_1
- * data_A_1 data_B_0 data_C_1
- * data_A_0 data_B_1 data_C_1
- * data_A_1 data_B_1 data_C_1
- *
- * Note that the split segments may be on the same or different top-level
- * vdevs. In either case, we may need to try lots of combinations (see
- * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror
- * has small silent errors on all of its children, we can still reconstruct
- * the correct data, as long as those errors are at sufficiently-separated
- * offsets (specifically, separated by the largest block size - default of
- * 128KB, but up to 16MB).
- */
-static void
-vdev_indirect_reconstruct_io_done(zio_t *zio)
-{
- indirect_vsd_t *iv = zio->io_vsd;
- boolean_t known_good = B_FALSE;
- int error;
-
- iv->iv_unique_combinations = 1;
- iv->iv_attempts_max = UINT64_MAX;
-
- if (zfs_reconstruct_indirect_combinations_max > 0)
- iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
-
- /*
- * If nonzero, every 1/x blocks will be damaged, in order to validate
- * reconstruction when there are split segments with damaged copies.
- * Known_good will TRUE when reconstruction is known to be possible.
- */
- if (zfs_reconstruct_indirect_damage_fraction != 0 &&
- spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
- known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
-
- /*
- * Determine the unique children for a split segment and add them
- * to the is_unique_child list. By restricting reconstruction
- * to these children, only unique combinations will be considered.
- * This can vastly reduce the search space when there are a large
- * number of indirect splits.
- */
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- is->is_unique_children = 0;
-
- for (int i = 0; i < is->is_children; i++) {
- indirect_child_t *ic_i = &is->is_child[i];
-
- if (ic_i->ic_data == NULL ||
- ic_i->ic_duplicate != NULL)
- continue;
-
- for (int j = i + 1; j < is->is_children; j++) {
- indirect_child_t *ic_j = &is->is_child[j];
-
- if (ic_j->ic_data == NULL ||
- ic_j->ic_duplicate != NULL)
- continue;
-
- if (abd_cmp(ic_i->ic_data, ic_j->ic_data,
- is->is_size) == 0) {
- ic_j->ic_duplicate = ic_i;
- }
- }
-
- is->is_unique_children++;
- list_insert_tail(&is->is_unique_child, ic_i);
- }
-
- /* Reconstruction is impossible, no valid children */
- EQUIV(list_is_empty(&is->is_unique_child),
- is->is_unique_children == 0);
- if (list_is_empty(&is->is_unique_child)) {
- zio->io_error = EIO;
- vdev_indirect_all_checksum_errors(zio);
- zio_checksum_verified(zio);
- return;
- }
-
- iv->iv_unique_combinations *= is->is_unique_children;
- }
-
- if (iv->iv_unique_combinations <= iv->iv_attempts_max)
- error = vdev_indirect_splits_enumerate_all(iv, zio);
- else
- error = vdev_indirect_splits_enumerate_randomly(iv, zio);
-
- if (error != 0) {
- /* All attempted combinations failed. */
- ASSERT3B(known_good, ==, B_FALSE);
- zio->io_error = error;
- vdev_indirect_all_checksum_errors(zio);
- } else {
- /*
- * The checksum has been successfully validated. Issue
- * repair I/Os to any copies of splits which don't match
- * the validated version.
- */
- ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
- vdev_indirect_repair(zio);
- zio_checksum_verified(zio);
- }
-}
-
-static void
-vdev_indirect_io_done(zio_t *zio)
-{
- indirect_vsd_t *iv = zio->io_vsd;
-
- if (iv->iv_reconstruct) {
- /*
- * We have read all copies of the data (e.g. from mirrors),
- * either because this was a scrub/resilver, or because the
- * one-copy read didn't checksum correctly.
- */
- vdev_indirect_reconstruct_io_done(zio);
- return;
- }
-
- if (!iv->iv_split_block) {
- /*
- * This was not a split block, so we passed the BP down,
- * and the checksum was handled by the (one) child zio.
- */
- return;
- }
-
- zio_bad_cksum_t zbc;
- int ret = zio_checksum_error(zio, &zbc);
- if (ret == 0) {
- zio_checksum_verified(zio);
- return;
- }
-
- /*
- * The checksum didn't match. Read all copies of all splits, and
- * then we will try to reconstruct. The next time
- * vdev_indirect_io_done() is called, iv_reconstruct will be set.
- */
- vdev_indirect_read_all(zio);
-
- zio_vdev_io_redone(zio);
-}
-
-vdev_ops_t vdev_indirect_ops = {
- vdev_indirect_open,
- vdev_indirect_close,
- vdev_default_asize,
- vdev_indirect_io_start,
- vdev_indirect_io_done,
- NULL,
- NULL,
- NULL,
- NULL,
- vdev_indirect_remap,
- NULL,
- VDEV_TYPE_INDIRECT, /* name of this vdev type */
- B_FALSE /* leaf vdev */
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
@@ -1,212 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2015 by Delphix. All rights reserved.
- */
-
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/vdev_indirect_births.h>
-
-static boolean_t
-vdev_indirect_births_verify(vdev_indirect_births_t *vib)
-{
- ASSERT(vib != NULL);
-
- ASSERT(vib->vib_object != 0);
- ASSERT(vib->vib_objset != NULL);
- ASSERT(vib->vib_phys != NULL);
- ASSERT(vib->vib_dbuf != NULL);
-
- EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL);
-
- return (B_TRUE);
-}
-
-uint64_t
-vdev_indirect_births_count(vdev_indirect_births_t *vib)
-{
- ASSERT(vdev_indirect_births_verify(vib));
-
- return (vib->vib_phys->vib_count);
-}
-
-uint64_t
-vdev_indirect_births_object(vdev_indirect_births_t *vib)
-{
- ASSERT(vdev_indirect_births_verify(vib));
-
- return (vib->vib_object);
-}
-
-static uint64_t
-vdev_indirect_births_size_impl(vdev_indirect_births_t *vib)
-{
- return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries));
-}
-
-void
-vdev_indirect_births_close(vdev_indirect_births_t *vib)
-{
- ASSERT(vdev_indirect_births_verify(vib));
-
- if (vib->vib_phys->vib_count > 0) {
- uint64_t births_size = vdev_indirect_births_size_impl(vib);
-
- kmem_free(vib->vib_entries, births_size);
- vib->vib_entries = NULL;
- }
-
- dmu_buf_rele(vib->vib_dbuf, vib);
-
- vib->vib_objset = NULL;
- vib->vib_object = 0;
- vib->vib_dbuf = NULL;
- vib->vib_phys = NULL;
-
- kmem_free(vib, sizeof (*vib));
-}
-
-uint64_t
-vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
-
- return (dmu_object_alloc(os,
- DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
- DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t),
- tx));
-}
-
-vdev_indirect_births_t *
-vdev_indirect_births_open(objset_t *os, uint64_t births_object)
-{
- vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
-
- vib->vib_objset = os;
- vib->vib_object = births_object;
-
- VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf));
- vib->vib_phys = vib->vib_dbuf->db_data;
-
- if (vib->vib_phys->vib_count > 0) {
- uint64_t births_size = vdev_indirect_births_size_impl(vib);
- vib->vib_entries = kmem_alloc(births_size, KM_SLEEP);
- VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
- births_size, vib->vib_entries, DMU_READ_PREFETCH));
- }
-
- ASSERT(vdev_indirect_births_verify(vib));
-
- return (vib);
-}
-
-void
-vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
-{
- VERIFY0(dmu_object_free(os, object, tx));
-}
-
-void
-vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
- uint64_t max_offset, uint64_t txg, dmu_tx_t *tx)
-{
- vdev_indirect_birth_entry_phys_t vibe;
- uint64_t old_size;
- uint64_t new_size;
- vdev_indirect_birth_entry_phys_t *new_entries;
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
- ASSERT(vdev_indirect_births_verify(vib));
-
- dmu_buf_will_dirty(vib->vib_dbuf, tx);
-
- vibe.vibe_offset = max_offset;
- vibe.vibe_phys_birth_txg = txg;
-
- old_size = vdev_indirect_births_size_impl(vib);
- dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe),
- &vibe, tx);
- vib->vib_phys->vib_count++;
- new_size = vdev_indirect_births_size_impl(vib);
-
- new_entries = kmem_alloc(new_size, KM_SLEEP);
- if (old_size > 0) {
- bcopy(vib->vib_entries, new_entries, old_size);
- kmem_free(vib->vib_entries, old_size);
- }
- new_entries[vib->vib_phys->vib_count - 1] = vibe;
- vib->vib_entries = new_entries;
-}
-
-uint64_t
-vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib)
-{
- ASSERT(vdev_indirect_births_verify(vib));
- ASSERT(vib->vib_phys->vib_count > 0);
-
- vdev_indirect_birth_entry_phys_t *last =
- &vib->vib_entries[vib->vib_phys->vib_count - 1];
- return (last->vibe_phys_birth_txg);
-}
-
-/*
- * Return the txg in which the given range was copied (i.e. its physical
- * birth txg). The specified offset+asize must be contiguously mapped
- * (i.e. not a split block).
- *
- * The entries are sorted by increasing phys_birth, and also by increasing
- * offset. We find the specified offset by binary search. Note that we
- * can not use bsearch() because looking at each entry independently is
- * insufficient to find the correct entry. Each entry implicitly relies
- * on the previous entry: an entry indicates that the offsets from the
- * end of the previous entry to the end of this entry were written in the
- * specified txg.
- */
-uint64_t
-vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset,
- uint64_t asize)
-{
- vdev_indirect_birth_entry_phys_t *base;
- vdev_indirect_birth_entry_phys_t *last;
-
- ASSERT(vdev_indirect_births_verify(vib));
- ASSERT(vib->vib_phys->vib_count > 0);
-
- base = vib->vib_entries;
- last = base + vib->vib_phys->vib_count - 1;
-
- ASSERT3U(offset, <, last->vibe_offset);
-
- while (last >= base) {
- vdev_indirect_birth_entry_phys_t *p =
- base + ((last - base) / 2);
- if (offset >= p->vibe_offset) {
- base = p + 1;
- } else if (p == vib->vib_entries ||
- offset >= (p - 1)->vibe_offset) {
- ASSERT3U(offset + asize, <=, p->vibe_offset);
- return (p->vibe_phys_birth_txg);
- } else {
- last = p - 1;
- }
- }
- ASSERT(!"offset not found");
- return (-1);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
@@ -1,593 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_indirect_mapping.h>
-#include <sys/zfeature.h>
-#include <sys/dmu_objset.h>
-
-static boolean_t
-vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
-{
- ASSERT(vim != NULL);
-
- ASSERT(vim->vim_object != 0);
- ASSERT(vim->vim_objset != NULL);
- ASSERT(vim->vim_phys != NULL);
- ASSERT(vim->vim_dbuf != NULL);
-
- EQUIV(vim->vim_phys->vimp_num_entries > 0,
- vim->vim_entries != NULL);
- if (vim->vim_phys->vimp_num_entries > 0) {
- vdev_indirect_mapping_entry_phys_t *last_entry =
- &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
- uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
- uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);
-
- ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
- }
- if (vim->vim_havecounts) {
- ASSERT(vim->vim_phys->vimp_counts_object != 0);
- }
-
- return (B_TRUE);
-}
-
-uint64_t
-vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- return (vim->vim_phys->vimp_num_entries);
-}
-
-uint64_t
-vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- return (vim->vim_phys->vimp_max_offset);
-}
-
-uint64_t
-vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- return (vim->vim_object);
-}
-
-uint64_t
-vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- return (vim->vim_phys->vimp_bytes_mapped);
-}
-
-/*
- * The length (in bytes) of the mapping object array in memory and
- * (logically) on disk.
- *
- * Note that unlike most of our accessor functions,
- * we don't assert that the struct is consistent; therefore it can be
- * called while there may be concurrent changes, if we don't care about
- * the value being immediately stale (e.g. from spa_removal_get_stats()).
- */
-uint64_t
-vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
-{
- return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
-}
-
-/*
- * Compare an offset with an indirect mapping entry; there are three
- * possible scenarios:
- *
- * 1. The offset is "less than" the mapping entry; meaning the
- * offset is less than the source offset of the mapping entry. In
- * this case, there is no overlap between the offset and the
- * mapping entry and -1 will be returned.
- *
- * 2. The offset is "greater than" the mapping entry; meaning the
- * offset is greater than the mapping entry's source offset plus
- * the entry's size. In this case, there is no overlap between
- * the offset and the mapping entry and 1 will be returned.
- *
- * NOTE: If the offset is actually equal to the entry's offset
- * plus size, this is considered to be "greater" than the entry,
- * and this case applies (i.e. 1 will be returned). Thus, the
- * entry's "range" can be considered to be inclusive at its
- * start, but exclusive at its end: e.g. [src, src + size).
- *
- * 3. The last case to consider is if the offset actually falls
- * within the mapping entry's range. If this is the case, the
- * offset is considered to be "equal to" the mapping entry and
- * 0 will be returned.
- *
- * NOTE: If the offset is equal to the entry's source offset,
- * this case applies and 0 will be returned. If the offset is
- * equal to the entry's source plus its size, this case does
- * *not* apply (see "NOTE" above for scenario 2), and 1 will be
- * returned.
- */
-static int
-dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
-{
- const uint64_t *key = v_key;
- const vdev_indirect_mapping_entry_phys_t *array_elem =
- v_array_elem;
- uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
-
- if (*key < src_offset) {
- return (-1);
- } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
- return (0);
- } else {
- return (1);
- }
-}
-
-/*
- * Returns the mapping entry for the given offset.
- *
- * It's possible that the given offset will not be in the mapping table
- * (i.e. no mapping entries contain this offset), in which case, the
- * return value value depends on the "next_if_missing" parameter.
- *
- * If the offset is not found in the table and "next_if_missing" is
- * B_FALSE, then NULL will always be returned. The behavior is intended
- * to allow consumers to get the entry corresponding to the offset
- * parameter, iff the offset overlaps with an entry in the table.
- *
- * If the offset is not found in the table and "next_if_missing" is
- * B_TRUE, then the entry nearest to the given offset will be returned,
- * such that the entry's source offset is greater than the offset
- * passed in (i.e. the "next" mapping entry in the table is returned, if
- * the offset is missing from the table). If there are no entries whose
- * source offset is greater than the passed in offset, NULL is returned.
- */
-static vdev_indirect_mapping_entry_phys_t *
-vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
- uint64_t offset, boolean_t next_if_missing)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
- ASSERT(vim->vim_phys->vimp_num_entries > 0);
-
- vdev_indirect_mapping_entry_phys_t *entry = NULL;
-
- uint64_t last = vim->vim_phys->vimp_num_entries - 1;
- uint64_t base = 0;
-
- /*
- * We don't define these inside of the while loop because we use
- * their value in the case that offset isn't in the mapping.
- */
- uint64_t mid;
- int result;
-
- while (last >= base) {
- mid = base + ((last - base) >> 1);
-
- result = dva_mapping_overlap_compare(&offset,
- &vim->vim_entries[mid]);
-
- if (result == 0) {
- entry = &vim->vim_entries[mid];
- break;
- } else if (result < 0) {
- last = mid - 1;
- } else {
- base = mid + 1;
- }
- }
-
- if (entry == NULL && next_if_missing) {
- ASSERT3U(base, ==, last + 1);
- ASSERT(mid == base || mid == last);
- ASSERT3S(result, !=, 0);
-
- /*
- * The offset we're looking for isn't actually contained
- * in the mapping table, thus we need to return the
- * closest mapping entry that is greater than the
- * offset. We reuse the result of the last comparison,
- * comparing the mapping entry at index "mid" and the
- * offset. The offset is guaranteed to lie between
- * indices one less than "mid", and one greater than
- * "mid"; we just need to determine if offset is greater
- * than, or less than the mapping entry contained at
- * index "mid".
- */
-
- uint64_t index;
- if (result < 0)
- index = mid;
- else
- index = mid + 1;
-
- ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
-
- if (index == vim->vim_phys->vimp_num_entries) {
- /*
- * If "index" is past the end of the entries
- * array, then not only is the offset not in the
- * mapping table, but it's actually greater than
- * all entries in the table. In this case, we
- * can't return a mapping entry greater than the
- * offset (since none exist), so we return NULL.
- */
-
- ASSERT3S(dva_mapping_overlap_compare(&offset,
- &vim->vim_entries[index - 1]), >, 0);
-
- return (NULL);
- } else {
- /*
- * Just to be safe, we verify the offset falls
- * in between the mapping entries at index and
- * one less than index. Since we know the offset
- * doesn't overlap an entry, and we're supposed
- * to return the entry just greater than the
- * offset, both of the following tests must be
- * true.
- */
- ASSERT3S(dva_mapping_overlap_compare(&offset,
- &vim->vim_entries[index]), <, 0);
- IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
- &vim->vim_entries[index - 1]) > 0);
-
- return (&vim->vim_entries[index]);
- }
- } else {
- return (entry);
- }
-}
-
-vdev_indirect_mapping_entry_phys_t *
-vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
- uint64_t offset)
-{
- return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
- B_FALSE));
-}
-
-vdev_indirect_mapping_entry_phys_t *
-vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
- uint64_t offset)
-{
- return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
- B_TRUE));
-}
-
-void
-vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- if (vim->vim_phys->vimp_num_entries > 0) {
- uint64_t map_size = vdev_indirect_mapping_size(vim);
- kmem_free(vim->vim_entries, map_size);
- vim->vim_entries = NULL;
- }
-
- dmu_buf_rele(vim->vim_dbuf, vim);
-
- vim->vim_objset = NULL;
- vim->vim_object = 0;
- vim->vim_dbuf = NULL;
- vim->vim_phys = NULL;
-
- kmem_free(vim, sizeof (*vim));
-}
-
-uint64_t
-vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
-{
- uint64_t object;
- ASSERT(dmu_tx_is_syncing(tx));
- uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
-
- if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
- bonus_size = sizeof (vdev_indirect_mapping_phys_t);
- }
-
- object = dmu_object_alloc(os,
- DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
- DMU_OTN_UINT64_METADATA, bonus_size,
- tx);
-
- if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
- dmu_buf_t *dbuf;
- vdev_indirect_mapping_phys_t *vimp;
-
- VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- vimp = dbuf->db_data;
- vimp->vimp_counts_object = dmu_object_alloc(os,
- DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
- DMU_OT_NONE, 0, tx);
- spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- dmu_buf_rele(dbuf, FTAG);
- }
-
- return (object);
-}
-
-
-vdev_indirect_mapping_t *
-vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
-{
- vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
- dmu_object_info_t doi;
- VERIFY0(dmu_object_info(os, mapping_object, &doi));
-
- vim->vim_objset = os;
- vim->vim_object = mapping_object;
-
- VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
- &vim->vim_dbuf));
- vim->vim_phys = vim->vim_dbuf->db_data;
-
- vim->vim_havecounts =
- (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
-
- if (vim->vim_phys->vimp_num_entries > 0) {
- uint64_t map_size = vdev_indirect_mapping_size(vim);
- vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
- VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
- vim->vim_entries, DMU_READ_PREFETCH));
- }
-
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- return (vim);
-}
-
-void
-vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
-{
- vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
- if (vim->vim_havecounts) {
- VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
- tx));
- spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- }
- vdev_indirect_mapping_close(vim);
-
- VERIFY0(dmu_object_free(os, object, tx));
-}
-
-/*
- * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
- * mapping object. Also remove the entries from the list and free them.
- * This also implicitly extends the max_offset of the mapping (to the end
- * of the last entry).
- */
-void
-vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
- list_t *list, dmu_tx_t *tx)
-{
- vdev_indirect_mapping_entry_phys_t *mapbuf;
- uint64_t old_size;
- uint32_t *countbuf = NULL;
- vdev_indirect_mapping_entry_phys_t *old_entries;
- uint64_t old_count;
- uint64_t entries_written = 0;
-
- ASSERT(vdev_indirect_mapping_verify(vim));
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
- ASSERT(!list_is_empty(list));
-
- old_size = vdev_indirect_mapping_size(vim);
- old_entries = vim->vim_entries;
- old_count = vim->vim_phys->vimp_num_entries;
-
- dmu_buf_will_dirty(vim->vim_dbuf, tx);
-
- mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
- if (vim->vim_havecounts) {
- countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
- ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
- SPA_FEATURE_OBSOLETE_COUNTS));
- }
- while (!list_is_empty(list)) {
- uint64_t i;
- /*
- * Write entries from the list to the
- * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
- */
- for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
- vdev_indirect_mapping_entry_t *entry =
- list_remove_head(list);
- if (entry == NULL)
- break;
-
- uint64_t size =
- DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
- uint64_t src_offset =
- DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
-
- /*
- * We shouldn't be adding an entry which is fully
- * obsolete.
- */
- ASSERT3U(entry->vime_obsolete_count, <, size);
- IMPLY(entry->vime_obsolete_count != 0,
- vim->vim_havecounts);
-
- mapbuf[i] = entry->vime_mapping;
- if (vim->vim_havecounts)
- countbuf[i] = entry->vime_obsolete_count;
-
- vim->vim_phys->vimp_bytes_mapped += size;
- ASSERT3U(src_offset, >=,
- vim->vim_phys->vimp_max_offset);
- vim->vim_phys->vimp_max_offset = src_offset + size;
-
- entries_written++;
-
- kmem_free(entry, sizeof (*entry));
- }
- dmu_write(vim->vim_objset, vim->vim_object,
- vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
- i * sizeof (*mapbuf),
- mapbuf, tx);
- if (vim->vim_havecounts) {
- dmu_write(vim->vim_objset,
- vim->vim_phys->vimp_counts_object,
- vim->vim_phys->vimp_num_entries *
- sizeof (*countbuf),
- i * sizeof (*countbuf), countbuf, tx);
- }
- vim->vim_phys->vimp_num_entries += i;
- }
- zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
- if (vim->vim_havecounts)
- zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
-
- /*
- * Update the entry array to reflect the new entries. First, copy
- * over any old entries then read back the new entries we just wrote.
- */
- uint64_t new_size = vdev_indirect_mapping_size(vim);
- ASSERT3U(new_size, >, old_size);
- ASSERT3U(new_size - old_size, ==,
- entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
- vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
- if (old_size > 0) {
- bcopy(old_entries, vim->vim_entries, old_size);
- kmem_free(old_entries, old_size);
- }
- VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
- new_size - old_size, &vim->vim_entries[old_count],
- DMU_READ_PREFETCH));
-
- zfs_dbgmsg("txg %llu: wrote %llu entries to "
- "indirect mapping obj %llu; max offset=0x%llx",
- (u_longlong_t)dmu_tx_get_txg(tx),
- (u_longlong_t)entries_written,
- (u_longlong_t)vim->vim_object,
- (u_longlong_t)vim->vim_phys->vimp_max_offset);
-}
-
-/*
- * Increment the relevant counts for the specified offset and length.
- * The counts array must be obtained from
- * vdev_indirect_mapping_load_obsolete_counts().
- */
-void
-vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
- uint64_t offset, uint64_t length, uint32_t *counts)
-{
- vdev_indirect_mapping_entry_phys_t *mapping;
- uint64_t index;
-
- mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
-
- ASSERT(length > 0);
- ASSERT3P(mapping, !=, NULL);
-
- index = mapping - vim->vim_entries;
-
- while (length > 0) {
- ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
-
- uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
- uint64_t inner_offset = offset -
- DVA_MAPPING_GET_SRC_OFFSET(mapping);
- VERIFY3U(inner_offset, <, size);
- uint64_t inner_size = MIN(length, size - inner_offset);
-
- VERIFY3U(counts[index] + inner_size, <=, size);
- counts[index] += inner_size;
-
- offset += inner_size;
- length -= inner_size;
- mapping++;
- index++;
- }
-}
-
-typedef struct load_obsolete_space_map_arg {
- vdev_indirect_mapping_t *losma_vim;
- uint32_t *losma_counts;
-} load_obsolete_space_map_arg_t;
-
-static int
-load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
-{
- load_obsolete_space_map_arg_t *losma = arg;
- ASSERT3S(sme->sme_type, ==, SM_ALLOC);
-
- vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
- sme->sme_offset, sme->sme_run, losma->losma_counts);
-
- return (0);
-}
-
-/*
- * Modify the counts (increment them) based on the spacemap.
- */
-void
-vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
- uint32_t *counts, space_map_t *obsolete_space_sm)
-{
- load_obsolete_space_map_arg_t losma;
- losma.losma_counts = counts;
- losma.losma_vim = vim;
- VERIFY0(space_map_iterate(obsolete_space_sm,
- space_map_length(obsolete_space_sm),
- load_obsolete_sm_callback, &losma));
-}
-
-/*
- * Read the obsolete counts from disk, returning them in an array.
- */
-uint32_t *
-vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- uint64_t counts_size =
- vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
- uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
- if (vim->vim_havecounts) {
- VERIFY0(dmu_read(vim->vim_objset,
- vim->vim_phys->vimp_counts_object,
- 0, counts_size,
- counts, DMU_READ_PREFETCH));
- } else {
- bzero(counts, counts_size);
- }
- return (counts);
-}
-
-extern void
-vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
- uint32_t *counts)
-{
- ASSERT(vdev_indirect_mapping_verify(vim));
-
- kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
@@ -1,782 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/txg.h>
-#include <sys/vdev_impl.h>
-#include <sys/refcount.h>
-#include <sys/metaslab_impl.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zap.h>
-#include <sys/dmu_tx.h>
-
-/*
- * Maximum number of metaslabs per group that can be initialized
- * simultaneously.
- */
-int max_initialize_ms = 3;
-
-/*
- * Value that is written to disk during initialization.
- */
-uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;
-
-/* maximum number of I/Os outstanding per leaf vdev */
-int zfs_initialize_limit = 1;
-
-/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
-uint64_t zfs_initialize_chunk_size = 1024 * 1024;
-
-static boolean_t
-vdev_initialize_should_stop(vdev_t *vd)
-{
- return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
- vd->vdev_detached || vd->vdev_top->vdev_removing);
-}
-
-static void
-vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
-{
- /*
- * We pass in the guid instead of the vdev_t since the vdev may
- * have been freed prior to the sync task being processed. This
- * happens when a vdev is detached as we call spa_config_vdev_exit(),
- * stop the intializing thread, schedule the sync task, and free
- * the vdev. Later when the scheduled sync task is invoked, it would
- * find that the vdev has been freed.
- */
- uint64_t guid = *(uint64_t *)arg;
- uint64_t txg = dmu_tx_get_txg(tx);
- kmem_free(arg, sizeof (uint64_t));
-
- vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
- if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
- return;
-
- uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
- vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
-
- VERIFY(vd->vdev_leaf_zap != 0);
-
- objset_t *mos = vd->vdev_spa->spa_meta_objset;
-
- if (last_offset > 0) {
- vd->vdev_initialize_last_offset = last_offset;
- VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
- VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
- sizeof (last_offset), 1, &last_offset, tx));
- }
- if (vd->vdev_initialize_action_time > 0) {
- uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
- VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
- VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
- 1, &val, tx));
- }
-
- uint64_t initialize_state = vd->vdev_initialize_state;
- VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
- VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
- &initialize_state, tx));
-}
-
-static void
-vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
-{
- ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
- spa_t *spa = vd->vdev_spa;
-
- if (new_state == vd->vdev_initialize_state)
- return;
-
- /*
- * Copy the vd's guid, this will be freed by the sync task.
- */
- uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
- *guid = vd->vdev_guid;
-
- /*
- * If we're suspending, then preserving the original start time.
- */
- if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
- vd->vdev_initialize_action_time = gethrestime_sec();
- }
- vd->vdev_initialize_state = new_state;
-
- dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
- guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
-
- switch (new_state) {
- case VDEV_INITIALIZE_ACTIVE:
- spa_history_log_internal(spa, "initialize", tx,
- "vdev=%s activated", vd->vdev_path);
- break;
- case VDEV_INITIALIZE_SUSPENDED:
- spa_history_log_internal(spa, "initialize", tx,
- "vdev=%s suspended", vd->vdev_path);
- break;
- case VDEV_INITIALIZE_CANCELED:
- spa_history_log_internal(spa, "initialize", tx,
- "vdev=%s canceled", vd->vdev_path);
- break;
- case VDEV_INITIALIZE_COMPLETE:
- spa_history_log_internal(spa, "initialize", tx,
- "vdev=%s complete", vd->vdev_path);
- break;
- default:
- panic("invalid state %llu", (unsigned long long)new_state);
- }
-
- dmu_tx_commit(tx);
-}
-
-static void
-vdev_initialize_cb(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- mutex_enter(&vd->vdev_initialize_io_lock);
- if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
- /*
- * The I/O failed because the vdev was unavailable; roll the
- * last offset back. (This works because spa_sync waits on
- * spa_txg_zio before it runs sync tasks.)
- */
- uint64_t *off =
- &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
- *off = MIN(*off, zio->io_offset);
- } else {
- /*
- * Since initializing is best-effort, we ignore I/O errors and
- * rely on vdev_probe to determine if the errors are more
- * critical.
- */
- if (zio->io_error != 0)
- vd->vdev_stat.vs_initialize_errors++;
-
- vd->vdev_initialize_bytes_done += zio->io_orig_size;
- }
- ASSERT3U(vd->vdev_initialize_inflight, >, 0);
- vd->vdev_initialize_inflight--;
- cv_broadcast(&vd->vdev_initialize_io_cv);
- mutex_exit(&vd->vdev_initialize_io_lock);
-
- spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
-}
-
-/* Takes care of physical writing and limiting # of concurrent ZIOs. */
-static int
-vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
-{
- spa_t *spa = vd->vdev_spa;
-
- /* Limit inflight initializing I/Os */
- mutex_enter(&vd->vdev_initialize_io_lock);
- while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
- cv_wait(&vd->vdev_initialize_io_cv,
- &vd->vdev_initialize_io_lock);
- }
- vd->vdev_initialize_inflight++;
- mutex_exit(&vd->vdev_initialize_io_lock);
-
- dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- uint64_t txg = dmu_tx_get_txg(tx);
-
- spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
- mutex_enter(&vd->vdev_initialize_lock);
-
- if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
- uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
- *guid = vd->vdev_guid;
-
- /* This is the first write of this txg. */
- dsl_sync_task_nowait(spa_get_dsl(spa),
- vdev_initialize_zap_update_sync, guid, 2,
- ZFS_SPACE_CHECK_RESERVED, tx);
- }
-
- /*
- * We know the vdev struct will still be around since all
- * consumers of vdev_free must stop the initialization first.
- */
- if (vdev_initialize_should_stop(vd)) {
- mutex_enter(&vd->vdev_initialize_io_lock);
- ASSERT3U(vd->vdev_initialize_inflight, >, 0);
- vd->vdev_initialize_inflight--;
- mutex_exit(&vd->vdev_initialize_io_lock);
- spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
- mutex_exit(&vd->vdev_initialize_lock);
- dmu_tx_commit(tx);
- return (SET_ERROR(EINTR));
- }
- mutex_exit(&vd->vdev_initialize_lock);
-
- vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
- zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
- size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
- ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
- /* vdev_initialize_cb releases SCL_STATE_ALL */
-
- dmu_tx_commit(tx);
-
- return (0);
-}
-
-/*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
- */
-void
-vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
-{
- /*
- * Walk up the vdev tree
- */
- if (vd != vd->vdev_top) {
- vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
- } else {
- /*
- * We've reached the top-level vdev, initialize the
- * physical range to the logical range and start to
- * unwind.
- */
- physical_rs->rs_start = logical_rs->rs_start;
- physical_rs->rs_end = logical_rs->rs_end;
- return;
- }
-
- vdev_t *pvd = vd->vdev_parent;
- ASSERT3P(pvd, !=, NULL);
- ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
-
- /*
- * As this recursive function unwinds, translate the logical
- * range into its physical components by calling the
- * vdev specific translate function.
- */
- range_seg_t intermediate = { 0 };
- pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
-
- physical_rs->rs_start = intermediate.rs_start;
- physical_rs->rs_end = intermediate.rs_end;
-}
-
-/*
- * Callback to fill each ABD chunk with zfs_initialize_value. len must be
- * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
- * allocation will guarantee these for us.
- */
-/* ARGSUSED */
-static int
-vdev_initialize_block_fill(void *buf, size_t len, void *unused)
-{
- ASSERT0(len % sizeof (uint64_t));
- for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
- *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
- }
- return (0);
-}
-
-static abd_t *
-vdev_initialize_block_alloc()
-{
- /* Allocate ABD for filler data */
- abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
-
- ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
- (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
- vdev_initialize_block_fill, NULL);
-
- return (data);
-}
-
-static void
-vdev_initialize_block_free(abd_t *data)
-{
- abd_free(data);
-}
-
-static int
-vdev_initialize_ranges(vdev_t *vd, abd_t *data)
-{
- avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
-
- for (range_seg_t *rs = avl_first(rt); rs != NULL;
- rs = AVL_NEXT(rt, rs)) {
- uint64_t size = rs->rs_end - rs->rs_start;
-
- /* Split range into legally-sized physical chunks */
- uint64_t writes_required =
- ((size - 1) / zfs_initialize_chunk_size) + 1;
-
- for (uint64_t w = 0; w < writes_required; w++) {
- int error;
-
- error = vdev_initialize_write(vd,
- VDEV_LABEL_START_SIZE + rs->rs_start +
- (w * zfs_initialize_chunk_size),
- MIN(size - (w * zfs_initialize_chunk_size),
- zfs_initialize_chunk_size), data);
- if (error != 0)
- return (error);
- }
- }
- return (0);
-}
-
-static void
-vdev_initialize_mg_wait(metaslab_group_t *mg)
-{
- ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
- while (mg->mg_initialize_updating) {
- cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
- }
-}
-
-static void
-vdev_initialize_mg_mark(metaslab_group_t *mg)
-{
- ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
- ASSERT(mg->mg_initialize_updating);
-
- while (mg->mg_ms_initializing >= max_initialize_ms) {
- cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
- }
- mg->mg_ms_initializing++;
- ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
-}
-
-/*
- * Mark the metaslab as being initialized to prevent any allocations
- * on this metaslab. We must also track how many metaslabs are currently
- * being initialized within a metaslab group and limit them to prevent
- * allocation failures from occurring because all metaslabs are being
- * initialized.
- */
-static void
-vdev_initialize_ms_mark(metaslab_t *msp)
-{
- ASSERT(!MUTEX_HELD(&msp->ms_lock));
- metaslab_group_t *mg = msp->ms_group;
-
- mutex_enter(&mg->mg_ms_initialize_lock);
-
- /*
- * To keep an accurate count of how many threads are initializing
- * a specific metaslab group, we only allow one thread to mark
- * the metaslab group at a time. This ensures that the value of
- * ms_initializing will be accurate when we decide to mark a metaslab
- * group as being initialized. To do this we force all other threads
- * to wait till the metaslab's mg_initialize_updating flag is no
- * longer set.
- */
- vdev_initialize_mg_wait(mg);
- mg->mg_initialize_updating = B_TRUE;
- if (msp->ms_initializing == 0) {
- vdev_initialize_mg_mark(mg);
- }
- mutex_enter(&msp->ms_lock);
- msp->ms_initializing++;
- mutex_exit(&msp->ms_lock);
-
- mg->mg_initialize_updating = B_FALSE;
- cv_broadcast(&mg->mg_ms_initialize_cv);
- mutex_exit(&mg->mg_ms_initialize_lock);
-}
-
-static void
-vdev_initialize_ms_unmark(metaslab_t *msp)
-{
- ASSERT(!MUTEX_HELD(&msp->ms_lock));
- metaslab_group_t *mg = msp->ms_group;
- mutex_enter(&mg->mg_ms_initialize_lock);
- mutex_enter(&msp->ms_lock);
- if (--msp->ms_initializing == 0) {
- mg->mg_ms_initializing--;
- cv_broadcast(&mg->mg_ms_initialize_cv);
- }
- mutex_exit(&msp->ms_lock);
- mutex_exit(&mg->mg_ms_initialize_lock);
-}
-
-static void
-vdev_initialize_calculate_progress(vdev_t *vd)
-{
- ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
- spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
- ASSERT(vd->vdev_leaf_zap != 0);
-
- vd->vdev_initialize_bytes_est = 0;
- vd->vdev_initialize_bytes_done = 0;
-
- for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
- metaslab_t *msp = vd->vdev_top->vdev_ms[i];
- mutex_enter(&msp->ms_lock);
-
- uint64_t ms_free = msp->ms_size -
- metaslab_allocated_space(msp);
-
- if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
- ms_free /= vd->vdev_top->vdev_children;
-
- /*
- * Convert the metaslab range to a physical range
- * on our vdev. We use this to determine if we are
- * in the middle of this metaslab range.
- */
- range_seg_t logical_rs, physical_rs;
- logical_rs.rs_start = msp->ms_start;
- logical_rs.rs_end = msp->ms_start + msp->ms_size;
- vdev_xlate(vd, &logical_rs, &physical_rs);
-
- if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
- vd->vdev_initialize_bytes_est += ms_free;
- mutex_exit(&msp->ms_lock);
- continue;
- } else if (vd->vdev_initialize_last_offset >
- physical_rs.rs_end) {
- vd->vdev_initialize_bytes_done += ms_free;
- vd->vdev_initialize_bytes_est += ms_free;
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- /*
- * If we get here, we're in the middle of initializing this
- * metaslab. Load it and walk the free tree for more accurate
- * progress estimation.
- */
- VERIFY0(metaslab_load(msp));
-
- for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
- rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
- logical_rs.rs_start = rs->rs_start;
- logical_rs.rs_end = rs->rs_end;
- vdev_xlate(vd, &logical_rs, &physical_rs);
-
- uint64_t size = physical_rs.rs_end -
- physical_rs.rs_start;
- vd->vdev_initialize_bytes_est += size;
- if (vd->vdev_initialize_last_offset >
- physical_rs.rs_end) {
- vd->vdev_initialize_bytes_done += size;
- } else if (vd->vdev_initialize_last_offset >
- physical_rs.rs_start &&
- vd->vdev_initialize_last_offset <
- physical_rs.rs_end) {
- vd->vdev_initialize_bytes_done +=
- vd->vdev_initialize_last_offset -
- physical_rs.rs_start;
- }
- }
- mutex_exit(&msp->ms_lock);
- }
-}
-
-static void
-vdev_initialize_load(vdev_t *vd)
-{
- ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
- spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
- ASSERT(vd->vdev_leaf_zap != 0);
-
- if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
- vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
- int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
- vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
- sizeof (vd->vdev_initialize_last_offset), 1,
- &vd->vdev_initialize_last_offset);
- ASSERT(err == 0 || err == ENOENT);
- }
-
- vdev_initialize_calculate_progress(vd);
-}
-
-
-/*
- * Convert the logical range into a physcial range and add it to our
- * avl tree.
- */
-void
-vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
-{
- vdev_t *vd = arg;
- range_seg_t logical_rs, physical_rs;
- logical_rs.rs_start = start;
- logical_rs.rs_end = start + size;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
- vdev_xlate(vd, &logical_rs, &physical_rs);
-
- IMPLY(vd->vdev_top == vd,
- logical_rs.rs_start == physical_rs.rs_start);
- IMPLY(vd->vdev_top == vd,
- logical_rs.rs_end == physical_rs.rs_end);
-
- /* Only add segments that we have not visited yet */
- if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
- return;
-
- /* Pick up where we left off mid-range. */
- if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
- zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
- "(%llu, %llu)", vd->vdev_path,
- (u_longlong_t)physical_rs.rs_start,
- (u_longlong_t)physical_rs.rs_end,
- (u_longlong_t)vd->vdev_initialize_last_offset,
- (u_longlong_t)physical_rs.rs_end);
- ASSERT3U(physical_rs.rs_end, >,
- vd->vdev_initialize_last_offset);
- physical_rs.rs_start = vd->vdev_initialize_last_offset;
- }
- ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
-
- /*
- * With raidz, it's possible that the logical range does not live on
- * this leaf vdev. We only add the physical range to this vdev's if it
- * has a length greater than 0.
- */
- if (physical_rs.rs_end > physical_rs.rs_start) {
- range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
- physical_rs.rs_end - physical_rs.rs_start);
- } else {
- ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
- }
-}
-
-static void
-vdev_initialize_thread(void *arg)
-{
- vdev_t *vd = arg;
- spa_t *spa = vd->vdev_spa;
- int error = 0;
- uint64_t ms_count = 0;
-
- ASSERT(vdev_is_concrete(vd));
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- vd->vdev_initialize_last_offset = 0;
- vdev_initialize_load(vd);
-
- abd_t *deadbeef = vdev_initialize_block_alloc();
-
- vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
-
- for (uint64_t i = 0; !vd->vdev_detached &&
- i < vd->vdev_top->vdev_ms_count; i++) {
- metaslab_t *msp = vd->vdev_top->vdev_ms[i];
-
- /*
- * If we've expanded the top-level vdev or it's our
- * first pass, calculate our progress.
- */
- if (vd->vdev_top->vdev_ms_count != ms_count) {
- vdev_initialize_calculate_progress(vd);
- ms_count = vd->vdev_top->vdev_ms_count;
- }
-
- vdev_initialize_ms_mark(msp);
- mutex_enter(&msp->ms_lock);
- VERIFY0(metaslab_load(msp));
-
- range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
- vd);
- mutex_exit(&msp->ms_lock);
-
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- error = vdev_initialize_ranges(vd, deadbeef);
- vdev_initialize_ms_unmark(msp);
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
- if (error != 0)
- break;
- }
-
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- mutex_enter(&vd->vdev_initialize_io_lock);
- while (vd->vdev_initialize_inflight > 0) {
- cv_wait(&vd->vdev_initialize_io_cv,
- &vd->vdev_initialize_io_lock);
- }
- mutex_exit(&vd->vdev_initialize_io_lock);
-
- range_tree_destroy(vd->vdev_initialize_tree);
- vdev_initialize_block_free(deadbeef);
- vd->vdev_initialize_tree = NULL;
-
- mutex_enter(&vd->vdev_initialize_lock);
- if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
- vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
- }
- ASSERT(vd->vdev_initialize_thread != NULL ||
- vd->vdev_initialize_inflight == 0);
-
- /*
- * Drop the vdev_initialize_lock while we sync out the
- * txg since it's possible that a device might be trying to
- * come online and must check to see if it needs to restart an
- * initialization. That thread will be holding the spa_config_lock
- * which would prevent the txg_wait_synced from completing.
- */
- mutex_exit(&vd->vdev_initialize_lock);
- txg_wait_synced(spa_get_dsl(spa), 0);
- mutex_enter(&vd->vdev_initialize_lock);
-
- vd->vdev_initialize_thread = NULL;
- cv_broadcast(&vd->vdev_initialize_cv);
- mutex_exit(&vd->vdev_initialize_lock);
- thread_exit();
-}
-
-/*
- * Initiates a device. Caller must hold vdev_initialize_lock.
- * Device must be a leaf and not already be initializing.
- */
-void
-vdev_initialize(vdev_t *vd)
-{
- ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
- ASSERT(vd->vdev_ops->vdev_op_leaf);
- ASSERT(vdev_is_concrete(vd));
- ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
- ASSERT(!vd->vdev_detached);
- ASSERT(!vd->vdev_initialize_exit_wanted);
- ASSERT(!vd->vdev_top->vdev_removing);
-
- vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
- vd->vdev_initialize_thread = thread_create(NULL, 0,
- vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
-}
-
-/*
- * Stop initializng a device, with the resultant initialing state being
- * tgt_state. Blocks until the initializing thread has exited.
- * Caller must hold vdev_initialize_lock and must not be writing to the spa
- * config, as the initializing thread may try to enter the config as a reader
- * before exiting.
- */
-void
-vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
-{
- spa_t *spa = vd->vdev_spa;
- ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
-
- ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
- ASSERT(vd->vdev_ops->vdev_op_leaf);
- ASSERT(vdev_is_concrete(vd));
-
- /*
- * Allow cancel requests to proceed even if the initialize thread
- * has stopped.
- */
- if (vd->vdev_initialize_thread == NULL &&
- tgt_state != VDEV_INITIALIZE_CANCELED) {
- return;
- }
-
- vdev_initialize_change_state(vd, tgt_state);
- vd->vdev_initialize_exit_wanted = B_TRUE;
- while (vd->vdev_initialize_thread != NULL)
- cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
-
- ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
- vd->vdev_initialize_exit_wanted = B_FALSE;
-}
-
-static void
-vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
-{
- if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
- mutex_enter(&vd->vdev_initialize_lock);
- vdev_initialize_stop(vd, tgt_state);
- mutex_exit(&vd->vdev_initialize_lock);
- return;
- }
-
- for (uint64_t i = 0; i < vd->vdev_children; i++) {
- vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
- }
-}
-
-/*
- * Convenience function to stop initializing of a vdev tree and set all
- * initialize thread pointers to NULL.
- */
-void
-vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
-{
- vdev_initialize_stop_all_impl(vd, tgt_state);
-
- if (vd->vdev_spa->spa_sync_on) {
- /* Make sure that our state has been synced to disk */
- txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
- }
-}
-
-void
-vdev_initialize_restart(vdev_t *vd)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
-
- if (vd->vdev_leaf_zap != 0) {
- mutex_enter(&vd->vdev_initialize_lock);
- uint64_t initialize_state = VDEV_INITIALIZE_NONE;
- int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
- vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
- sizeof (initialize_state), 1, &initialize_state);
- ASSERT(err == 0 || err == ENOENT);
- vd->vdev_initialize_state = initialize_state;
-
- uint64_t timestamp = 0;
- err = zap_lookup(vd->vdev_spa->spa_meta_objset,
- vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
- sizeof (timestamp), 1, &timestamp);
- ASSERT(err == 0 || err == ENOENT);
- vd->vdev_initialize_action_time = (time_t)timestamp;
-
- if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
- vd->vdev_offline) {
- /* load progress for reporting, but don't resume */
- vdev_initialize_load(vd);
- } else if (vd->vdev_initialize_state ==
- VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
- vdev_initialize(vd);
- }
-
- mutex_exit(&vd->vdev_initialize_lock);
- }
-
- for (uint64_t i = 0; i < vd->vdev_children; i++) {
- vdev_initialize_restart(vd->vdev_child[i]);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -1,1701 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2017, Intel Corporation.
- * Copyright 2019 Joyent, Inc.
- */
-
-/*
- * Virtual Device Labels
- * ---------------------
- *
- * The vdev label serves several distinct purposes:
- *
- * 1. Uniquely identify this device as part of a ZFS pool and confirm its
- * identity within the pool.
- *
- * 2. Verify that all the devices given in a configuration are present
- * within the pool.
- *
- * 3. Determine the uberblock for the pool.
- *
- * 4. In case of an import operation, determine the configuration of the
- * toplevel vdev of which it is a part.
- *
- * 5. If an import operation cannot find all the devices in the pool,
- * provide enough information to the administrator to determine which
- * devices are missing.
- *
- * It is important to note that while the kernel is responsible for writing the
- * label, it only consumes the information in the first three cases. The
- * latter information is only consumed in userland when determining the
- * configuration to import a pool.
- *
- *
- * Label Organization
- * ------------------
- *
- * Before describing the contents of the label, it's important to understand how
- * the labels are written and updated with respect to the uberblock.
- *
- * When the pool configuration is altered, either because it was newly created
- * or a device was added, we want to update all the labels such that we can deal
- * with fatal failure at any point. To this end, each disk has two labels which
- * are updated before and after the uberblock is synced. Assuming we have
- * labels and an uberblock with the following transaction groups:
- *
- * L1 UB L2
- * +------+ +------+ +------+
- * | | | | | |
- * | t10 | | t10 | | t10 |
- * | | | | | |
- * +------+ +------+ +------+
- *
- * In this stable state, the labels and the uberblock were all updated within
- * the same transaction group (10). Each label is mirrored and checksummed, so
- * that we can detect when we fail partway through writing the label.
- *
- * In order to identify which labels are valid, the labels are written in the
- * following manner:
- *
- * 1. For each vdev, update 'L1' to the new label
- * 2. Update the uberblock
- * 3. For each vdev, update 'L2' to the new label
- *
- * Given arbitrary failure, we can determine the correct label to use based on
- * the transaction group. If we fail after updating L1 but before updating the
- * UB, we will notice that L1's transaction group is greater than the uberblock,
- * so L2 must be valid. If we fail after writing the uberblock but before
- * writing L2, we will notice that L2's transaction group is less than L1, and
- * therefore L1 is valid.
- *
- * Another added complexity is that not every label is updated when the config
- * is synced. If we add a single device, we do not want to have to re-write
- * every label for every device in the pool. This means that both L1 and L2 may
- * be older than the pool uberblock, because the necessary information is stored
- * on another vdev.
- *
- *
- * On-disk Format
- * --------------
- *
- * The vdev label consists of two distinct parts, and is wrapped within the
- * vdev_label_t structure. The label includes 8k of padding to permit legacy
- * VTOC disk labels, but is otherwise ignored.
- *
- * The first half of the label is a packed nvlist which contains pool wide
- * properties, per-vdev properties, and configuration information. It is
- * described in more detail below.
- *
- * The latter half of the label consists of a redundant array of uberblocks.
- * These uberblocks are updated whenever a transaction group is committed,
- * or when the configuration is updated. When a pool is loaded, we scan each
- * vdev for the 'best' uberblock.
- *
- *
- * Configuration Information
- * -------------------------
- *
- * The nvlist describing the pool and vdev contains the following elements:
- *
- * version ZFS on-disk version
- * name Pool name
- * state Pool state
- * txg Transaction group in which this label was written
- * pool_guid Unique identifier for this pool
- * vdev_tree An nvlist describing vdev tree.
- * features_for_read
- * An nvlist of the features necessary for reading the MOS.
- *
- * Each leaf device label also contains the following:
- *
- * top_guid Unique ID for top-level vdev in which this is contained
- * guid Unique ID for the leaf vdev
- *
- * The 'vs' configuration follows the format described in 'spa_config.c'.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/uberblock_impl.h>
-#include <sys/metaslab.h>
-#include <sys/metaslab_impl.h>
-#include <sys/zio.h>
-#include <sys/dsl_scan.h>
-#include <sys/abd.h>
-#include <sys/fs/zfs.h>
-#include <sys/trim_map.h>
-
-static boolean_t vdev_trim_on_init = B_TRUE;
-SYSCTL_DECL(_vfs_zfs_vdev);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RWTUN,
- &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
-
-/*
- * Basic routines to read and write from a vdev label.
- * Used throughout the rest of this file.
- */
-uint64_t
-vdev_label_offset(uint64_t psize, int l, uint64_t offset)
-{
- ASSERT(offset < sizeof (vdev_label_t));
- ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
-
- return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
- 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
-}
-
-/*
- * Returns back the vdev label associated with the passed in offset.
- */
-int
-vdev_label_number(uint64_t psize, uint64_t offset)
-{
- int l;
-
- if (offset >= psize - VDEV_LABEL_END_SIZE) {
- offset -= psize - VDEV_LABEL_END_SIZE;
- offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
- }
- l = offset / sizeof (vdev_label_t);
- return (l < VDEV_LABELS ? l : -1);
-}
-
-static void
-vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private, int flags)
-{
- ASSERT(
- spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
- spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
- ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
-
- zio_nowait(zio_read_phys(zio, vd,
- vdev_label_offset(vd->vdev_psize, l, offset),
- size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
-}
-
-void
-vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private, int flags)
-{
- ASSERT(
- spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
- spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
- ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
-
- zio_nowait(zio_write_phys(zio, vd,
- vdev_label_offset(vd->vdev_psize, l, offset),
- size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
-}
-
-static void
-root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
-{
- spa_t *spa = vd->vdev_spa;
-
- if (vd != spa->spa_root_vdev)
- return;
-
- /* provide either current or previous scan information */
- pool_scan_stat_t ps;
- if (spa_scan_get_stats(spa, &ps) == 0) {
- fnvlist_add_uint64_array(nvl,
- ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
- sizeof (pool_scan_stat_t) / sizeof (uint64_t));
- }
-
- pool_removal_stat_t prs;
- if (spa_removal_get_stats(spa, &prs) == 0) {
- fnvlist_add_uint64_array(nvl,
- ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
- sizeof (prs) / sizeof (uint64_t));
- }
-
- pool_checkpoint_stat_t pcs;
- if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
- fnvlist_add_uint64_array(nvl,
- ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
- sizeof (pcs) / sizeof (uint64_t));
- }
-}
-
-/*
- * Generate the nvlist representing this vdev's config.
- */
-nvlist_t *
-vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- vdev_config_flag_t flags)
-{
- nvlist_t *nv = NULL;
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-
- nv = fnvlist_alloc();
-
- fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
- if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
-
- if (vd->vdev_path != NULL)
- fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
-
- if (vd->vdev_devid != NULL)
- fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
-
- if (vd->vdev_physpath != NULL)
- fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- vd->vdev_physpath);
-
- if (vd->vdev_fru != NULL)
- fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
-
- if (vd->vdev_nparity != 0) {
- ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
- VDEV_TYPE_RAIDZ) == 0);
-
- /*
- * Make sure someone hasn't managed to sneak a fancy new vdev
- * into a crufty old storage pool.
- */
- ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity <= 2 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
- (vd->vdev_nparity <= 3 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ3));
-
- /*
- * Note that we'll add the nparity tag even on storage pools
- * that only support a single parity device -- older software
- * will just ignore it.
- */
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
- }
-
- if (vd->vdev_wholedisk != -1ULL)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- vd->vdev_wholedisk);
-
- if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
-
- if (vd->vdev_isspare)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
-
- if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
- vd == vd->vdev_top) {
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- vd->vdev_ms_array);
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- vd->vdev_ms_shift);
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
- vd->vdev_asize);
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
- if (vd->vdev_removing) {
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
- vd->vdev_removing);
- }
-
- /* zpool command expects alloc class data */
- if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
- const char *bias = NULL;
-
- switch (vd->vdev_alloc_bias) {
- case VDEV_BIAS_LOG:
- bias = VDEV_ALLOC_BIAS_LOG;
- break;
- case VDEV_BIAS_SPECIAL:
- bias = VDEV_ALLOC_BIAS_SPECIAL;
- break;
- case VDEV_BIAS_DEDUP:
- bias = VDEV_ALLOC_BIAS_DEDUP;
- break;
- default:
- ASSERT3U(vd->vdev_alloc_bias, ==,
- VDEV_BIAS_NONE);
- }
- fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
- bias);
- }
- }
-
- if (vd->vdev_dtl_sm != NULL) {
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
- space_map_object(vd->vdev_dtl_sm));
- }
-
- if (vic->vic_mapping_object != 0) {
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
- vic->vic_mapping_object);
- }
-
- if (vic->vic_births_object != 0) {
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
- vic->vic_births_object);
- }
-
- if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
- vic->vic_prev_indirect_vdev);
- }
-
- if (vd->vdev_crtxg)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
-
- if (flags & VDEV_CONFIG_MOS) {
- if (vd->vdev_leaf_zap != 0) {
- ASSERT(vd->vdev_ops->vdev_op_leaf);
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
- vd->vdev_leaf_zap);
- }
-
- if (vd->vdev_top_zap != 0) {
- ASSERT(vd == vd->vdev_top);
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
- vd->vdev_top_zap);
- }
- }
-
- if (getstats) {
- vdev_stat_t vs;
-
- vdev_get_stats(vd, &vs);
- fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
-
- root_vdev_actions_getprogress(vd, nv);
-
- /*
- * Note: this can be called from open context
- * (spa_get_stats()), so we need the rwlock to prevent
- * the mapping from being changed by condensing.
- */
- rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
- if (vd->vdev_indirect_mapping != NULL) {
- ASSERT(vd->vdev_indirect_births != NULL);
- vdev_indirect_mapping_t *vim =
- vd->vdev_indirect_mapping;
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
- vdev_indirect_mapping_size(vim));
- }
- rw_exit(&vd->vdev_indirect_rwlock);
- if (vd->vdev_mg != NULL &&
- vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
- /*
- * Compute approximately how much memory would be used
- * for the indirect mapping if this device were to
- * be removed.
- *
- * Note: If the frag metric is invalid, then not
- * enough metaslabs have been converted to have
- * histograms.
- */
- uint64_t seg_count = 0;
- uint64_t to_alloc = vd->vdev_stat.vs_alloc;
-
- /*
- * There are the same number of allocated segments
- * as free segments, so we will have at least one
- * entry per free segment. However, small free
- * segments (smaller than vdev_removal_max_span)
- * will be combined with adjacent allocated segments
- * as a single mapping.
- */
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
- if (1ULL << (i + 1) < vdev_removal_max_span) {
- to_alloc +=
- vd->vdev_mg->mg_histogram[i] <<
- i + 1;
- } else {
- seg_count +=
- vd->vdev_mg->mg_histogram[i];
- }
- }
-
- /*
- * The maximum length of a mapping is
- * zfs_remove_max_segment, so we need at least one entry
- * per zfs_remove_max_segment of allocated data.
- */
- seg_count += to_alloc / zfs_remove_max_segment;
-
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
- seg_count *
- sizeof (vdev_indirect_mapping_entry_phys_t));
- }
- }
-
- if (!vd->vdev_ops->vdev_op_leaf) {
- nvlist_t **child;
- int c, idx;
-
- ASSERT(!vd->vdev_ishole);
-
- child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
- KM_SLEEP);
-
- for (c = 0, idx = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
-
- /*
- * If we're generating an nvlist of removing
- * vdevs then skip over any device which is
- * not being removed.
- */
- if ((flags & VDEV_CONFIG_REMOVING) &&
- !cvd->vdev_removing)
- continue;
-
- child[idx++] = vdev_config_generate(spa, cvd,
- getstats, flags);
- }
-
- if (idx) {
- fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, idx);
- }
-
- for (c = 0; c < idx; c++)
- nvlist_free(child[c]);
-
- kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
-
- } else {
- const char *aux = NULL;
-
- if (vd->vdev_offline && !vd->vdev_tmpoffline)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
- if (vd->vdev_resilver_txg != 0)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
- vd->vdev_resilver_txg);
- if (vd->vdev_faulted)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
- if (vd->vdev_degraded)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
- if (vd->vdev_removed)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
- if (vd->vdev_unspare)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
- if (vd->vdev_ishole)
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
-
- switch (vd->vdev_stat.vs_aux) {
- case VDEV_AUX_ERR_EXCEEDED:
- aux = "err_exceeded";
- break;
-
- case VDEV_AUX_EXTERNAL:
- aux = "external";
- break;
- }
-
- if (aux != NULL)
- fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
-
- if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
- vd->vdev_orig_guid);
- }
- }
-
- return (nv);
-}
-
-/*
- * Generate a view of the top-level vdevs. If we currently have holes
- * in the namespace, then generate an array which contains a list of holey
- * vdevs. Additionally, add the number of top-level children that currently
- * exist.
- */
-void
-vdev_top_config_generate(spa_t *spa, nvlist_t *config)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t *array;
- uint_t c, idx;
-
- array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
-
- for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
-
- if (tvd->vdev_ishole) {
- array[idx++] = c;
- }
- }
-
- if (idx) {
- VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
- array, idx) == 0);
- }
-
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
- rvd->vdev_children) == 0);
-
- kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
-}
-
-/*
- * Returns the configuration from the label of the given vdev. For vdevs
- * which don't have a txg value stored on their label (i.e. spares/cache)
- * or have not been completely initialized (txg = 0) just return
- * the configuration from the first valid label we find. Otherwise,
- * find the most up-to-date label that does not exceed the specified
- * 'txg' value.
- */
-nvlist_t *
-vdev_label_read_config(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- nvlist_t *config = NULL;
- vdev_phys_t *vp;
- abd_t *vp_abd;
- zio_t *zio;
- uint64_t best_txg = 0;
- uint64_t label_txg = 0;
- int error = 0;
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE;
-
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
- if (!vdev_readable(vd))
- return (NULL);
-
- vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
- vp = abd_to_buf(vp_abd);
-
-retry:
- for (int l = 0; l < VDEV_LABELS; l++) {
- nvlist_t *label = NULL;
-
- zio = zio_root(spa, NULL, NULL, flags);
-
- vdev_label_read(zio, vd, l, vp_abd,
- offsetof(vdev_label_t, vl_vdev_phys),
- sizeof (vdev_phys_t), NULL, NULL, flags);
-
- if (zio_wait(zio) == 0 &&
- nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
- &label, 0) == 0) {
- /*
- * Auxiliary vdevs won't have txg values in their
- * labels and newly added vdevs may not have been
- * completely initialized so just return the
- * configuration from the first valid label we
- * encounter.
- */
- error = nvlist_lookup_uint64(label,
- ZPOOL_CONFIG_POOL_TXG, &label_txg);
- if ((error || label_txg == 0) && !config) {
- config = label;
- break;
- } else if (label_txg <= txg && label_txg > best_txg) {
- best_txg = label_txg;
- nvlist_free(config);
- config = fnvlist_dup(label);
- }
- }
-
- if (label != NULL) {
- nvlist_free(label);
- label = NULL;
- }
- }
-
- if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
- flags |= ZIO_FLAG_TRYHARD;
- goto retry;
- }
-
- /*
- * We found a valid label but it didn't pass txg restrictions.
- */
- if (config == NULL && label_txg != 0) {
- vdev_dbgmsg(vd, "label discarded as txg is too large "
- "(%llu > %llu)", (u_longlong_t)label_txg,
- (u_longlong_t)txg);
- }
-
- abd_free(vp_abd);
-
- return (config);
-}
-
-/*
- * Determine if a device is in use. The 'spare_guid' parameter will be filled
- * in with the device guid if this spare is active elsewhere on the system.
- */
-static boolean_t
-vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
- uint64_t *spare_guid, uint64_t *l2cache_guid)
-{
- spa_t *spa = vd->vdev_spa;
- uint64_t state, pool_guid, device_guid, txg, spare_pool;
- uint64_t vdtxg = 0;
- nvlist_t *label;
-
- if (spare_guid)
- *spare_guid = 0ULL;
- if (l2cache_guid)
- *l2cache_guid = 0ULL;
-
- /*
- * Read the label, if any, and perform some basic sanity checks.
- */
- if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
- return (B_FALSE);
-
- (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
- &vdtxg);
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
- &device_guid) != 0) {
- nvlist_free(label);
- return (B_FALSE);
- }
-
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
- (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
- &pool_guid) != 0 ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0)) {
- nvlist_free(label);
- return (B_FALSE);
- }
-
- nvlist_free(label);
-
- /*
- * Check to see if this device indeed belongs to the pool it claims to
- * be a part of. The only way this is allowed is if the device is a hot
- * spare (which we check for later on).
- */
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
- !spa_guid_exists(pool_guid, device_guid) &&
- !spa_spare_exists(device_guid, NULL, NULL) &&
- !spa_l2cache_exists(device_guid, NULL))
- return (B_FALSE);
-
- /*
- * If the transaction group is zero, then this an initialized (but
- * unused) label. This is only an error if the create transaction
- * on-disk is the same as the one we're using now, in which case the
- * user has attempted to add the same vdev multiple times in the same
- * transaction.
- */
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
- txg == 0 && vdtxg == crtxg)
- return (B_TRUE);
-
- /*
- * Check to see if this is a spare device. We do an explicit check for
- * spa_has_spare() here because it may be on our pending list of spares
- * to add. We also check if it is an l2cache device.
- */
- if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
- spa_has_spare(spa, device_guid)) {
- if (spare_guid)
- *spare_guid = device_guid;
-
- switch (reason) {
- case VDEV_LABEL_CREATE:
- case VDEV_LABEL_L2CACHE:
- return (B_TRUE);
-
- case VDEV_LABEL_REPLACE:
- return (!spa_has_spare(spa, device_guid) ||
- spare_pool != 0ULL);
-
- case VDEV_LABEL_SPARE:
- return (spa_has_spare(spa, device_guid));
- }
- }
-
- /*
- * Check to see if this is an l2cache device.
- */
- if (spa_l2cache_exists(device_guid, NULL))
- return (B_TRUE);
-
- /*
- * We can't rely on a pool's state if it's been imported
- * read-only. Instead we look to see if the pools is marked
- * read-only in the namespace and set the state to active.
- */
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
- (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
- spa_mode(spa) == FREAD)
- state = POOL_STATE_ACTIVE;
-
- /*
- * If the device is marked ACTIVE, then this device is in use by another
- * pool on the system.
- */
- return (state == POOL_STATE_ACTIVE);
-}
-
-/*
- * Initialize a vdev label. We check to make sure each leaf device is not in
- * use, and writable. We put down an initial label which we will later
- * overwrite with a complete label. Note that it's important to do this
- * sequentially, not in parallel, so that we catch cases of multiple use of the
- * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
- * itself.
- */
-int
-vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
-{
- spa_t *spa = vd->vdev_spa;
- nvlist_t *label;
- vdev_phys_t *vp;
- abd_t *vp_abd;
- abd_t *bootenv;
- uberblock_t *ub;
- abd_t *ub_abd;
- zio_t *zio;
- char *buf;
- size_t buflen;
- int error;
- uint64_t spare_guid, l2cache_guid;
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- for (int c = 0; c < vd->vdev_children; c++)
- if ((error = vdev_label_init(vd->vdev_child[c],
- crtxg, reason)) != 0)
- return (error);
-
- /* Track the creation time for this vdev */
- vd->vdev_crtxg = crtxg;
-
- if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
- return (0);
-
- /*
- * Dead vdevs cannot be initialized.
- */
- if (vdev_is_dead(vd))
- return (SET_ERROR(EIO));
-
- /*
- * Determine if the vdev is in use.
- */
- if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
- vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
- return (SET_ERROR(EBUSY));
-
- /*
- * If this is a request to add or replace a spare or l2cache device
- * that is in use elsewhere on the system, then we must update the
- * guid (which was initialized to a random value) to reflect the
- * actual GUID (which is shared between multiple pools).
- */
- if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
- spare_guid != 0ULL) {
- uint64_t guid_delta = spare_guid - vd->vdev_guid;
-
- vd->vdev_guid += guid_delta;
-
- for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum += guid_delta;
-
- /*
- * If this is a replacement, then we want to fallthrough to the
- * rest of the code. If we're adding a spare, then it's already
- * labeled appropriately and we can just return.
- */
- if (reason == VDEV_LABEL_SPARE)
- return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE ||
- reason == VDEV_LABEL_SPLIT);
- }
-
- if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
- l2cache_guid != 0ULL) {
- uint64_t guid_delta = l2cache_guid - vd->vdev_guid;
-
- vd->vdev_guid += guid_delta;
-
- for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum += guid_delta;
-
- /*
- * If this is a replacement, then we want to fallthrough to the
- * rest of the code. If we're adding an l2cache, then it's
- * already labeled appropriately and we can just return.
- */
- if (reason == VDEV_LABEL_L2CACHE)
- return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE);
- }
-
- /*
- * TRIM the whole thing, excluding the blank space and boot header
- * as specified by ZFS On-Disk Specification (section 1.3), so that
- * we start with a clean slate.
- * It's just an optimization, so we don't care if it fails.
- * Don't TRIM if removing so that we don't interfere with zpool
- * disaster recovery.
- */
- if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim &&
- (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE ||
- reason == VDEV_LABEL_L2CACHE))
- zio_wait(zio_trim(NULL, spa, vd, VDEV_SKIP_SIZE,
- vd->vdev_psize - VDEV_SKIP_SIZE));
-
- /*
- * Initialize its label.
- */
- vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
- abd_zero(vp_abd, sizeof (vdev_phys_t));
- vp = abd_to_buf(vp_abd);
-
- /*
- * Generate a label describing the pool and our top-level vdev.
- * We mark it as being from txg 0 to indicate that it's not
- * really part of an active pool just yet. The labels will
- * be written again with a meaningful txg by spa_sync().
- */
- if (reason == VDEV_LABEL_SPARE ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
- /*
- * For inactive hot spares, we generate a special label that
- * identifies as a mutually shared hot spare. We write the
- * label if we are adding a hot spare, or if we are removing an
- * active hot spare (in which case we want to revert the
- * labels).
- */
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- POOL_STATE_SPARE) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- } else if (reason == VDEV_LABEL_L2CACHE ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
- /*
- * For level 2 ARC devices, add a special label.
- */
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- POOL_STATE_L2CACHE) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- } else {
- uint64_t txg = 0ULL;
-
- if (reason == VDEV_LABEL_SPLIT)
- txg = spa->spa_uberblock.ub_txg;
- label = spa_config_generate(spa, vd, txg, B_FALSE);
-
- /*
- * Add our creation time. This allows us to detect multiple
- * vdev uses as described above, and automatically expires if we
- * fail.
- */
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
- crtxg) == 0);
- }
-
- buf = vp->vp_nvlist;
- buflen = sizeof (vp->vp_nvlist);
-
- error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
- if (error != 0) {
- nvlist_free(label);
- abd_free(vp_abd);
- /* EFAULT means nvlist_pack ran out of room */
- return (error == EFAULT ? ENAMETOOLONG : EINVAL);
- }
-
- /*
- * Initialize uberblock template.
- */
- ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
- abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
- abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
- ub = abd_to_buf(ub_abd);
- ub->ub_txg = 0;
-
- /* Initialize the 2nd padding area. */
- bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
- abd_zero(bootenv, VDEV_PAD_SIZE);
-
- /*
- * Write everything in parallel.
- */
-retry:
- zio = zio_root(spa, NULL, NULL, flags);
-
- for (int l = 0; l < VDEV_LABELS; l++) {
-
- vdev_label_write(zio, vd, l, vp_abd,
- offsetof(vdev_label_t, vl_vdev_phys),
- sizeof (vdev_phys_t), NULL, NULL, flags);
-
- /*
- * Skip the 1st padding area.
- * Zero out the 2nd padding area where it might have
- * left over data from previous filesystem format.
- */
- vdev_label_write(zio, vd, l, bootenv,
- offsetof(vdev_label_t, vl_be),
- VDEV_PAD_SIZE, NULL, NULL, flags);
-
- vdev_label_write(zio, vd, l, ub_abd,
- offsetof(vdev_label_t, vl_uberblock),
- VDEV_UBERBLOCK_RING, NULL, NULL, flags);
- }
-
- error = zio_wait(zio);
-
- if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
- flags |= ZIO_FLAG_TRYHARD;
- goto retry;
- }
-
- nvlist_free(label);
- abd_free(bootenv);
- abd_free(ub_abd);
- abd_free(vp_abd);
-
- /*
- * If this vdev hasn't been previously identified as a spare, then we
- * mark it as such only if a) we are labeling it as a spare, or b) it
- * exists as a spare elsewhere in the system. Do the same for
- * level 2 ARC devices.
- */
- if (error == 0 && !vd->vdev_isspare &&
- (reason == VDEV_LABEL_SPARE ||
- spa_spare_exists(vd->vdev_guid, NULL, NULL)))
- spa_spare_add(vd);
-
- if (error == 0 && !vd->vdev_isl2cache &&
- (reason == VDEV_LABEL_L2CACHE ||
- spa_l2cache_exists(vd->vdev_guid, NULL)))
- spa_l2cache_add(vd);
-
- return (error);
-}
-
-/*
- * Done callback for vdev_label_read_bootenv_impl. If this is the first
- * callback to finish, store our abd in the callback pointer. Otherwise, we
- * just free our abd and return.
- */
-static void
-vdev_label_read_bootenv_done(zio_t *zio)
-{
- zio_t *rio = zio->io_private;
- abd_t **cbp = rio->io_private;
-
- ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);
-
- if (zio->io_error == 0) {
- mutex_enter(&rio->io_lock);
- if (*cbp == NULL) {
- /* Will free this buffer in vdev_label_read_bootenv. */
- *cbp = zio->io_abd;
- } else {
- abd_free(zio->io_abd);
- }
- mutex_exit(&rio->io_lock);
- } else {
- abd_free(zio->io_abd);
- }
-}
-
-static void
-vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
-{
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags);
-
- /*
- * We just use the first label that has a correct checksum; the
- * bootloader should have rewritten them all to be the same on boot,
- * and any changes we made since boot have been the same across all
- * labels.
- *
- * While grub supports writing to all four labels, other bootloaders
- * don't, so we only use the first two labels to store boot
- * information.
- */
- if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
- for (int l = 0; l < VDEV_LABELS / 2; l++) {
- vdev_label_read(zio, vd, l,
- abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
- offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
- vdev_label_read_bootenv_done, zio, flags);
- }
- }
-}
-
-int
-vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command)
-{
- spa_t *spa = rvd->vdev_spa;
- abd_t *abd = NULL;
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
-
- ASSERT(command);
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- zio_t *zio = zio_root(spa, NULL, &abd, flags);
- vdev_label_read_bootenv_impl(zio, rvd, flags);
- int err = zio_wait(zio);
-
- if (abd != NULL) {
- vdev_boot_envblock_t *vbe = abd_to_buf(abd);
- if (vbe->vbe_version != VB_RAW) {
- abd_free(abd);
- return (SET_ERROR(ENOTSUP));
- }
- vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
- fnvlist_add_string(command, "envmap", vbe->vbe_bootenv);
- /* abd was allocated in vdev_label_read_bootenv_impl() */
- abd_free(abd);
- /* If we managed to read any successfully, return success. */
- return (0);
- }
- return (err);
-}
-
-int
-vdev_label_write_bootenv(vdev_t *vd, char *envmap)
-{
- zio_t *zio;
- spa_t *spa = vd->vdev_spa;
- vdev_boot_envblock_t *bootenv;
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
- int error = ENXIO;
-
- if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) {
- return (SET_ERROR(E2BIG));
- }
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- for (int c = 0; c < vd->vdev_children; c++) {
- int child_err = vdev_label_write_bootenv(vd->vdev_child[c],
- envmap);
- /*
- * As long as any of the disks managed to write all of their
- * labels successfully, return success.
- */
- if (child_err == 0)
- error = child_err;
- }
-
- if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) ||
- !vdev_writeable(vd)) {
- return (error);
- }
- ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
- abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
- abd_zero(abd, VDEV_PAD_SIZE);
- bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
-
- char *buf = bootenv->vbe_bootenv;
- (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv));
- bootenv->vbe_version = VB_RAW;
- abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
-
-retry:
- zio = zio_root(spa, NULL, NULL, flags);
- for (int l = 0; l < VDEV_LABELS / 2; l++) {
- vdev_label_write(zio, vd, l, abd,
- offsetof(vdev_label_t, vl_be),
- VDEV_PAD_SIZE, NULL, NULL, flags);
- }
-
- error = zio_wait(zio);
- if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
- flags |= ZIO_FLAG_TRYHARD;
- goto retry;
- }
-
- abd_free(abd);
- return (error);
-}
-
-int
-vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
-{
- spa_t *spa = vd->vdev_spa;
- zio_t *zio;
- abd_t *pad2;
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
- int error;
-
- if (size > VDEV_PAD_SIZE)
- return (EINVAL);
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (ENODEV);
- if (vdev_is_dead(vd))
- return (ENXIO);
-
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
- abd_zero(pad2, VDEV_PAD_SIZE);
- abd_copy_from_buf(pad2, buf, size);
-
-retry:
- zio = zio_root(spa, NULL, NULL, flags);
- vdev_label_write(zio, vd, 0, pad2,
- offsetof(vdev_label_t, vl_be),
- VDEV_PAD_SIZE, NULL, NULL, flags);
- error = zio_wait(zio);
- if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
- flags |= ZIO_FLAG_TRYHARD;
- goto retry;
- }
-
- abd_free(pad2);
- return (error);
-}
-
-/*
- * ==========================================================================
- * uberblock load/sync
- * ==========================================================================
- */
-
-/*
- * Consider the following situation: txg is safely synced to disk. We've
- * written the first uberblock for txg + 1, and then we lose power. When we
- * come back up, we fail to see the uberblock for txg + 1 because, say,
- * it was on a mirrored device and the replica to which we wrote txg + 1
- * is now offline. If we then make some changes and sync txg + 1, and then
- * the missing replica comes back, then for a few seconds we'll have two
- * conflicting uberblocks on disk with the same txg. The solution is simple:
- * among uberblocks with equal txg, choose the one with the latest timestamp.
- */
-static int
-vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
-{
- int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
-
- if (likely(cmp))
- return (cmp);
-
- cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
- if (likely(cmp))
- return (cmp);
-
- /*
- * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware
- * ZFS, e.g. zfsonlinux >= 0.7.
- *
- * If one ub has MMP and the other does not, they were written by
- * different hosts, which matters for MMP. So we treat no MMP/no SEQ as
- * a 0 value.
- *
- * Since timestamp and txg are the same if we get this far, either is
- * acceptable for importing the pool.
- */
- unsigned int seq1 = 0;
- unsigned int seq2 = 0;
-
- if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
- seq1 = MMP_SEQ(ub1);
-
- if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
- seq2 = MMP_SEQ(ub2);
-
- return (AVL_CMP(seq1, seq2));
-}
-
-struct ubl_cbdata {
- uberblock_t *ubl_ubbest; /* Best uberblock */
- vdev_t *ubl_vd; /* vdev associated with the above */
-};
-
-static void
-vdev_uberblock_load_done(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- spa_t *spa = zio->io_spa;
- zio_t *rio = zio->io_private;
- uberblock_t *ub = abd_to_buf(zio->io_abd);
- struct ubl_cbdata *cbp = rio->io_private;
-
- ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
-
- if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
- mutex_enter(&rio->io_lock);
- if (ub->ub_txg <= spa->spa_load_max_txg &&
- vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
- /*
- * Keep track of the vdev in which this uberblock
- * was found. We will use this information later
- * to obtain the config nvlist associated with
- * this uberblock.
- */
- *cbp->ubl_ubbest = *ub;
- cbp->ubl_vd = vd;
- }
- mutex_exit(&rio->io_lock);
- }
-
- abd_free(zio->io_abd);
-}
-
-static void
-vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
- struct ubl_cbdata *cbp)
-{
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
-
- if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
- for (int l = 0; l < VDEV_LABELS; l++) {
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_read(zio, vd, l,
- abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
- B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_load_done, zio, flags);
- }
- }
- }
-}
-
-/*
- * Reads the 'best' uberblock from disk along with its associated
- * configuration. First, we read the uberblock array of each label of each
- * vdev, keeping track of the uberblock with the highest txg in each array.
- * Then, we read the configuration from the same vdev as the best uberblock.
- */
-void
-vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
-{
- zio_t *zio;
- spa_t *spa = rvd->vdev_spa;
- struct ubl_cbdata cb;
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
-
- ASSERT(ub);
- ASSERT(config);
-
- bzero(ub, sizeof (uberblock_t));
- *config = NULL;
-
- cb.ubl_ubbest = ub;
- cb.ubl_vd = NULL;
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- zio = zio_root(spa, NULL, &cb, flags);
- vdev_uberblock_load_impl(zio, rvd, flags, &cb);
- (void) zio_wait(zio);
-
- /*
- * It's possible that the best uberblock was discovered on a label
- * that has a configuration which was written in a future txg.
- * Search all labels on this vdev to find the configuration that
- * matches the txg for our uberblock.
- */
- if (cb.ubl_vd != NULL) {
- vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
- "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
-
- *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
- if (*config == NULL && spa->spa_extreme_rewind) {
- vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
- "Trying again without txg restrictions.");
- *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
- }
- if (*config == NULL) {
- vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
- }
- }
- spa_config_exit(spa, SCL_ALL, FTAG);
-}
-
-/*
- * On success, increment root zio's count of good writes.
- * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
- */
-static void
-vdev_uberblock_sync_done(zio_t *zio)
-{
- uint64_t *good_writes = zio->io_private;
-
- if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
- atomic_inc_64(good_writes);
-}
-
-/*
- * Write the uberblock to all labels of all leaves of the specified vdev.
- */
-static void
-vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
- uberblock_t *ub, vdev_t *vd, int flags)
-{
- for (uint64_t c = 0; c < vd->vdev_children; c++) {
- vdev_uberblock_sync(zio, good_writes,
- ub, vd->vdev_child[c], flags);
- }
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return;
-
- if (!vdev_writeable(vd))
- return;
-
- int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
- int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m);
-
- /* Copy the uberblock_t into the ABD */
- abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
- abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
- abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
-
- for (int l = 0; l < VDEV_LABELS; l++)
- vdev_label_write(zio, vd, l, ub_abd,
- VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_sync_done, good_writes,
- flags | ZIO_FLAG_DONT_PROPAGATE);
-
- abd_free(ub_abd);
-}
-
-/* Sync the uberblocks to all vdevs in svd[] */
-int
-vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
-{
- spa_t *spa = svd[0]->vdev_spa;
- zio_t *zio;
- uint64_t good_writes = 0;
-
- zio = zio_root(spa, NULL, NULL, flags);
-
- for (int v = 0; v < svdcount; v++)
- vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
-
- (void) zio_wait(zio);
-
- /*
- * Flush the uberblocks to disk. This ensures that the odd labels
- * are no longer needed (because the new uberblocks and the even
- * labels are safely on disk), so it is safe to overwrite them.
- */
- zio = zio_root(spa, NULL, NULL, flags);
-
- for (int v = 0; v < svdcount; v++) {
- if (vdev_writeable(svd[v])) {
- zio_flush(zio, svd[v]);
- }
- }
-
- (void) zio_wait(zio);
-
- return (good_writes >= 1 ? 0 : EIO);
-}
-
-/*
- * On success, increment the count of good writes for our top-level vdev.
- */
-static void
-vdev_label_sync_done(zio_t *zio)
-{
- uint64_t *good_writes = zio->io_private;
-
- if (zio->io_error == 0)
- atomic_inc_64(good_writes);
-}
-
-/*
- * If there weren't enough good writes, indicate failure to the parent.
- */
-static void
-vdev_label_sync_top_done(zio_t *zio)
-{
- uint64_t *good_writes = zio->io_private;
-
- if (*good_writes == 0)
- zio->io_error = SET_ERROR(EIO);
-
- kmem_free(good_writes, sizeof (uint64_t));
-}
-
-/*
- * We ignore errors for log and cache devices, simply free the private data.
- */
-static void
-vdev_label_sync_ignore_done(zio_t *zio)
-{
- kmem_free(zio->io_private, sizeof (uint64_t));
-}
-
-/*
- * Write all even or odd labels to all leaves of the specified vdev.
- */
-static void
-vdev_label_sync(zio_t *zio, uint64_t *good_writes,
- vdev_t *vd, int l, uint64_t txg, int flags)
-{
- nvlist_t *label;
- vdev_phys_t *vp;
- abd_t *vp_abd;
- char *buf;
- size_t buflen;
-
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_label_sync(zio, good_writes,
- vd->vdev_child[c], l, txg, flags);
- }
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return;
-
- if (!vdev_writeable(vd))
- return;
-
- /*
- * Generate a label describing the top-level config to which we belong.
- */
- label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
-
- vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
- abd_zero(vp_abd, sizeof (vdev_phys_t));
- vp = abd_to_buf(vp_abd);
-
- buf = vp->vp_nvlist;
- buflen = sizeof (vp->vp_nvlist);
-
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
- for (; l < VDEV_LABELS; l += 2) {
- vdev_label_write(zio, vd, l, vp_abd,
- offsetof(vdev_label_t, vl_vdev_phys),
- sizeof (vdev_phys_t),
- vdev_label_sync_done, good_writes,
- flags | ZIO_FLAG_DONT_PROPAGATE);
- }
- }
-
- abd_free(vp_abd);
- nvlist_free(label);
-}
-
-int
-vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
-{
- list_t *dl = &spa->spa_config_dirty_list;
- vdev_t *vd;
- zio_t *zio;
- int error;
-
- /*
- * Write the new labels to disk.
- */
- zio = zio_root(spa, NULL, NULL, flags);
-
- for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
- uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
- KM_SLEEP);
-
- ASSERT(!vd->vdev_ishole);
-
- zio_t *vio = zio_null(zio, spa, NULL,
- (vd->vdev_islog || vd->vdev_aux != NULL) ?
- vdev_label_sync_ignore_done : vdev_label_sync_top_done,
- good_writes, flags);
- vdev_label_sync(vio, good_writes, vd, l, txg, flags);
- zio_nowait(vio);
- }
-
- error = zio_wait(zio);
-
- /*
- * Flush the new labels to disk.
- */
- zio = zio_root(spa, NULL, NULL, flags);
-
- for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
- zio_flush(zio, vd);
-
- (void) zio_wait(zio);
-
- return (error);
-}
-
-/*
- * Sync the uberblock and any changes to the vdev configuration.
- *
- * The order of operations is carefully crafted to ensure that
- * if the system panics or loses power at any time, the state on disk
- * is still transactionally consistent. The in-line comments below
- * describe the failure semantics at each stage.
- *
- * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
- * at any time, you can just call it again, and it will resume its work.
- */
-int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
-{
- spa_t *spa = svd[0]->vdev_spa;
- uberblock_t *ub = &spa->spa_uberblock;
- int error = 0;
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
-
- ASSERT(svdcount != 0);
-retry:
- /*
- * Normally, we don't want to try too hard to write every label and
- * uberblock. If there is a flaky disk, we don't want the rest of the
- * sync process to block while we retry. But if we can't write a
- * single label out, we should retry with ZIO_FLAG_TRYHARD before
- * bailing out and declaring the pool faulted.
- */
- if (error != 0) {
- if ((flags & ZIO_FLAG_TRYHARD) != 0)
- return (error);
- flags |= ZIO_FLAG_TRYHARD;
- }
-
- ASSERT(ub->ub_txg <= txg);
-
- /*
- * If this isn't a resync due to I/O errors,
- * and nothing changed in this transaction group,
- * and the vdev configuration hasn't changed,
- * then there's nothing to do.
- */
- if (ub->ub_txg < txg) {
- boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
- txg, spa->spa_mmp.mmp_delay);
-
- if (!changed && list_is_empty(&spa->spa_config_dirty_list))
- return (0);
- }
-
- if (txg > spa_freeze_txg(spa))
- return (0);
-
- ASSERT(txg <= spa->spa_final_txg);
-
- /*
- * Flush the write cache of every disk that's been written to
- * in this transaction group. This ensures that all blocks
- * written in this txg will be committed to stable storage
- * before any uberblock that references them.
- */
- zio_t *zio = zio_root(spa, NULL, NULL, flags);
-
- for (vdev_t *vd =
- txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
- vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
- zio_flush(zio, vd);
-
- (void) zio_wait(zio);
-
- /*
- * Sync out the even labels (L0, L2) for every dirty vdev. If the
- * system dies in the middle of this process, that's OK: all of the
- * even labels that made it to disk will be newer than any uberblock,
- * and will therefore be considered invalid. The odd labels (L1, L3),
- * which have not yet been touched, will still be valid. We flush
- * the new labels to disk to ensure that all even-label updates
- * are committed to stable storage before the uberblock update.
- */
- if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
- if ((flags & ZIO_FLAG_TRYHARD) != 0) {
- zfs_dbgmsg("vdev_label_sync_list() returned error %d "
- "for pool '%s' when syncing out the even labels "
- "of dirty vdevs", error, spa_name(spa));
- }
- goto retry;
- }
-
- /*
- * Sync the uberblocks to all vdevs in svd[].
- * If the system dies in the middle of this step, there are two cases
- * to consider, and the on-disk state is consistent either way:
- *
- * (1) If none of the new uberblocks made it to disk, then the
- * previous uberblock will be the newest, and the odd labels
- * (which had not yet been touched) will be valid with respect
- * to that uberblock.
- *
- * (2) If one or more new uberblocks made it to disk, then they
- * will be the newest, and the even labels (which had all
- * been successfully committed) will be valid with respect
- * to the new uberblocks.
- */
- if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
- if ((flags & ZIO_FLAG_TRYHARD) != 0) {
- zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
- "%d for pool '%s'", error, spa_name(spa));
- }
- goto retry;
- }
-
- if (spa_multihost(spa))
- mmp_update_uberblock(spa, ub);
-
- /*
- * Sync out odd labels for every dirty vdev. If the system dies
- * in the middle of this process, the even labels and the new
- * uberblocks will suffice to open the pool. The next time
- * the pool is opened, the first thing we'll do -- before any
- * user data is modified -- is mark every vdev dirty so that
- * all labels will be brought up to date. We flush the new labels
- * to disk to ensure that all odd-label updates are committed to
- * stable storage before the next transaction group begins.
- */
- if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
- if ((flags & ZIO_FLAG_TRYHARD) != 0) {
- zfs_dbgmsg("vdev_label_sync_list() returned error %d "
- "for pool '%s' when syncing out the odd labels of "
- "dirty vdevs", error, spa_name(spa));
- }
- goto retry;;
- }
-
- trim_thread_wakeup(spa);
-
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -1,779 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_scan.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/abd.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for mirroring.
- */
-
-typedef struct mirror_child {
- vdev_t *mc_vd;
- uint64_t mc_offset;
- int mc_error;
- int mc_load;
- uint8_t mc_tried;
- uint8_t mc_skipped;
- uint8_t mc_speculative;
-} mirror_child_t;
-
-typedef struct mirror_map {
- int *mm_preferred;
- int mm_preferred_cnt;
- int mm_children;
- boolean_t mm_resilvering;
- boolean_t mm_root;
- mirror_child_t mm_child[];
-} mirror_map_t;
-
-static int vdev_mirror_shift = 21;
-
-#ifdef _KERNEL
-SYSCTL_DECL(_vfs_zfs_vdev);
-static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror,
- CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "ZFS VDEV Mirror");
-#endif
-
-/*
- * The load configuration settings below are tuned by default for
- * the case where all devices are of the same rotational type.
- *
- * If there is a mixture of rotating and non-rotating media, setting
- * non_rotating_seek_inc to 0 may well provide better results as it
- * will direct more reads to the non-rotating vdevs which are more
- * likely to have a higher performance.
- */
-
-/* Rotating media load calculation configuration. */
-static int rotating_inc = 0;
-#ifdef _KERNEL
-SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN,
- &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
-#endif
-
-static int rotating_seek_inc = 5;
-#ifdef _KERNEL
-SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN,
- &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
-#endif
-
-static int rotating_seek_offset = 1 * 1024 * 1024;
-#ifdef _KERNEL
-SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN,
- &rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
- "triggers a reduced rotating media seek increment");
-#endif
-
-/* Non-rotating media load calculation configuration. */
-static int non_rotating_inc = 0;
-#ifdef _KERNEL
-SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN,
- &non_rotating_inc, 0,
- "Non-rotating media load increment for non-seeking I/O's");
-#endif
-
-static int non_rotating_seek_inc = 1;
-#ifdef _KERNEL
-SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN,
- &non_rotating_seek_inc, 0,
- "Non-rotating media load increment for seeking I/O's");
-#endif
-
-
-static inline size_t
-vdev_mirror_map_size(int children)
-{
- return (offsetof(mirror_map_t, mm_child[children]) +
- sizeof(int) * children);
-}
-
-static inline mirror_map_t *
-vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
-{
- mirror_map_t *mm;
-
- mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
- mm->mm_children = children;
- mm->mm_resilvering = resilvering;
- mm->mm_root = root;
- mm->mm_preferred = (int *)((uintptr_t)mm +
- offsetof(mirror_map_t, mm_child[children]));
-
- return mm;
-}
-
-static void
-vdev_mirror_map_free(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
-
- kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
-}
-
-static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
- vdev_mirror_map_free,
- zio_vsd_default_cksum_report
-};
-
-static int
-vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
-{
- uint64_t lastoffset;
- int load;
-
- /* All DVAs have equal weight at the root. */
- if (mm->mm_root)
- return (INT_MAX);
-
- /*
- * We don't return INT_MAX if the device is resilvering i.e.
- * vdev_resilver_txg != 0 as when tested performance was slightly
- * worse overall when resilvering with compared to without.
- */
-
- /* Standard load based on pending queue length. */
- load = vdev_queue_length(vd);
- lastoffset = vdev_queue_lastoffset(vd);
-
- if (vd->vdev_nonrot) {
- /* Non-rotating media. */
- if (lastoffset == zio_offset)
- return (load + non_rotating_inc);
-
- /*
- * Apply a seek penalty even for non-rotating devices as
- * sequential I/O'a can be aggregated into fewer operations
- * on the device, thus avoiding unnecessary per-command
- * overhead and boosting performance.
- */
- return (load + non_rotating_seek_inc);
- }
-
- /* Rotating media I/O's which directly follow the last I/O. */
- if (lastoffset == zio_offset)
- return (load + rotating_inc);
-
- /*
- * Apply half the seek increment to I/O's within seek offset
- * of the last I/O queued to this vdev as they should incure less
- * of a seek increment.
- */
- if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
- return (load + (rotating_seek_inc / 2));
-
- /* Apply the full seek increment to all other I/O's. */
- return (load + rotating_seek_inc);
-}
-
-
-static mirror_map_t *
-vdev_mirror_map_init(zio_t *zio)
-{
- mirror_map_t *mm = NULL;
- mirror_child_t *mc;
- vdev_t *vd = zio->io_vd;
- int c;
-
- if (vd == NULL) {
- dva_t *dva = zio->io_bp->blk_dva;
- spa_t *spa = zio->io_spa;
- dva_t dva_copy[SPA_DVAS_PER_BP];
-
- c = BP_GET_NDVAS(zio->io_bp);
-
- /*
- * If we do not trust the pool config, some DVAs might be
- * invalid or point to vdevs that do not exist. We skip them.
- */
- if (!spa_trust_config(spa)) {
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
- int j = 0;
- for (int i = 0; i < c; i++) {
- if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
- dva_copy[j++] = dva[i];
- }
- if (j == 0) {
- zio->io_vsd = NULL;
- zio->io_error = ENXIO;
- return (NULL);
- }
- if (j < c) {
- dva = dva_copy;
- c = j;
- }
- }
-
- mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
-
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
- mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
- mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
- }
- } else {
- /*
- * If we are resilvering, then we should handle scrub reads
- * differently; we shouldn't issue them to the resilvering
- * device because it might not have those blocks.
- *
- * We are resilvering iff:
- * 1) We are a replacing vdev (ie our name is "replacing-1" or
- * "spare-1" or something like that), and
- * 2) The pool is currently being resilvered.
- *
- * We cannot simply check vd->vdev_resilver_txg, because it's
- * not set in this path.
- *
- * Nor can we just check our vdev_ops; there are cases (such as
- * when a user types "zpool replace pool odev spare_dev" and
- * spare_dev is in the spare list, or when a spare device is
- * automatically used to replace a DEGRADED device) when
- * resilvering is complete but both the original vdev and the
- * spare vdev remain in the pool. That behavior is intentional.
- * It helps implement the policy that a spare should be
- * automatically removed from the pool after the user replaces
- * the device that originally failed.
- *
- * If a spa load is in progress, then spa_dsl_pool may be
- * uninitialized. But we shouldn't be resilvering during a spa
- * load anyway.
- */
- boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops) &&
- spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
- dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
- mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
- B_FALSE);
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
- mc->mc_vd = vd->vdev_child[c];
- mc->mc_offset = zio->io_offset;
- }
- }
-
- zio->io_vsd = mm;
- zio->io_vsd_ops = &vdev_mirror_vsd_ops;
- return (mm);
-}
-
-static int
-vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *logical_ashift, uint64_t *physical_ashift)
-{
- int numerrors = 0;
- int lasterror = 0;
-
- if (vd->vdev_children == 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (SET_ERROR(EINVAL));
- }
-
- vdev_open_children(vd);
-
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
-
- if (cvd->vdev_open_error) {
- lasterror = cvd->vdev_open_error;
- numerrors++;
- continue;
- }
-
- *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
- *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
- *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
- *physical_ashift = MAX(*physical_ashift,
- cvd->vdev_physical_ashift);
- }
-
- if (numerrors == vd->vdev_children) {
- if (vdev_children_are_offline(vd))
- vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
- else
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- return (0);
-}
-
-static void
-vdev_mirror_close(vdev_t *vd)
-{
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-static void
-vdev_mirror_child_done(zio_t *zio)
-{
- mirror_child_t *mc = zio->io_private;
-
- mc->mc_error = zio->io_error;
- mc->mc_tried = 1;
- mc->mc_skipped = 0;
-}
-
-static void
-vdev_mirror_scrub_done(zio_t *zio)
-{
- mirror_child_t *mc = zio->io_private;
-
- if (zio->io_error == 0) {
- zio_t *pio;
- zio_link_t *zl = NULL;
-
- mutex_enter(&zio->io_lock);
- while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
- mutex_enter(&pio->io_lock);
- ASSERT3U(zio->io_size, >=, pio->io_size);
- abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
- mutex_exit(&pio->io_lock);
- }
- mutex_exit(&zio->io_lock);
- }
- abd_free(zio->io_abd);
-
- mc->mc_error = zio->io_error;
- mc->mc_tried = 1;
- mc->mc_skipped = 0;
-}
-
-/*
- * Check the other, lower-index DVAs to see if they're on the same
- * vdev as the child we picked. If they are, use them since they
- * are likely to have been allocated from the primary metaslab in
- * use at the time, and hence are more likely to have locality with
- * single-copy data.
- */
-static int
-vdev_mirror_dva_select(zio_t *zio, int p)
-{
- dva_t *dva = zio->io_bp->blk_dva;
- mirror_map_t *mm = zio->io_vsd;
- int preferred;
- int c;
-
- preferred = mm->mm_preferred[p];
- for (p-- ; p >= 0; p--) {
- c = mm->mm_preferred[p];
- if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
- preferred = c;
- }
- return (preferred);
-}
-
-static int
-vdev_mirror_preferred_child_randomize(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
- int p;
-
- if (mm->mm_root) {
- p = spa_get_random(mm->mm_preferred_cnt);
- return (vdev_mirror_dva_select(zio, p));
- }
-
- /*
- * To ensure we don't always favour the first matching vdev,
- * which could lead to wear leveling issues on SSD's, we
- * use the I/O offset as a pseudo random seed into the vdevs
- * which have the lowest load.
- */
- p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
- return (mm->mm_preferred[p]);
-}
-
-/*
- * Try to find a vdev whose DTL doesn't contain the block we want to read
- * prefering vdevs based on determined load.
- *
- * If we can't, try the read on any vdev we haven't already tried.
- */
-static int
-vdev_mirror_child_select(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
- uint64_t txg = zio->io_txg;
- int c, lowest_load;
-
- ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
-
- lowest_load = INT_MAX;
- mm->mm_preferred_cnt = 0;
- for (c = 0; c < mm->mm_children; c++) {
- mirror_child_t *mc;
-
- mc = &mm->mm_child[c];
- if (mc->mc_tried || mc->mc_skipped)
- continue;
-
- if (!vdev_readable(mc->mc_vd)) {
- mc->mc_error = SET_ERROR(ENXIO);
- mc->mc_tried = 1; /* don't even try */
- mc->mc_skipped = 1;
- continue;
- }
-
- if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
- mc->mc_error = SET_ERROR(ESTALE);
- mc->mc_skipped = 1;
- mc->mc_speculative = 1;
- continue;
- }
-
- mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
- if (mc->mc_load > lowest_load)
- continue;
-
- if (mc->mc_load < lowest_load) {
- lowest_load = mc->mc_load;
- mm->mm_preferred_cnt = 0;
- }
- mm->mm_preferred[mm->mm_preferred_cnt] = c;
- mm->mm_preferred_cnt++;
- }
-
- if (mm->mm_preferred_cnt == 1) {
- vdev_queue_register_lastoffset(
- mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
- return (mm->mm_preferred[0]);
- }
-
- if (mm->mm_preferred_cnt > 1) {
- int c = vdev_mirror_preferred_child_randomize(zio);
-
- vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
- return (c);
- }
-
- /*
- * Every device is either missing or has this txg in its DTL.
- * Look for any child we haven't already tried before giving up.
- */
- for (c = 0; c < mm->mm_children; c++) {
- if (!mm->mm_child[c].mc_tried) {
- vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
- zio);
- return (c);
- }
- }
-
- /*
- * Every child failed. There's no place left to look.
- */
- return (-1);
-}
-
-static void
-vdev_mirror_io_start(zio_t *zio)
-{
- mirror_map_t *mm;
- mirror_child_t *mc;
- int c, children;
-
- mm = vdev_mirror_map_init(zio);
-
- if (mm == NULL) {
- ASSERT(!spa_trust_config(zio->io_spa));
- ASSERT(zio->io_type == ZIO_TYPE_READ);
- zio_execute(zio);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_READ) {
- if (zio->io_bp != NULL &&
- (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
- mm->mm_children > 1) {
- /*
- * For scrubbing reads (if we can verify the
- * checksum here, as indicated by io_bp being
- * non-NULL) we need to allocate a read buffer for
- * each child and issue reads to all children. If
- * any child succeeds, it will copy its data into
- * zio->io_data in vdev_mirror_scrub_done.
- */
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
- abd_alloc_sametype(zio->io_abd,
- zio->io_size), zio->io_size,
- zio->io_type, zio->io_priority, 0,
- vdev_mirror_scrub_done, mc));
- }
- zio_execute(zio);
- return;
- }
- /*
- * For normal reads just pick one child.
- */
- c = vdev_mirror_child_select(zio);
- children = (c >= 0);
- } else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
- zio->io_type == ZIO_TYPE_FREE);
-
- /*
- * Writes and frees go to all children.
- */
- c = 0;
- children = mm->mm_children;
- }
-
- while (children--) {
- mc = &mm->mm_child[c];
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
- zio->io_type, zio->io_priority, 0,
- vdev_mirror_child_done, mc));
- c++;
- }
-
- zio_execute(zio);
-}
-
-static int
-vdev_mirror_worst_error(mirror_map_t *mm)
-{
- int error[2] = { 0, 0 };
-
- for (int c = 0; c < mm->mm_children; c++) {
- mirror_child_t *mc = &mm->mm_child[c];
- int s = mc->mc_speculative;
- error[s] = zio_worst_error(error[s], mc->mc_error);
- }
-
- return (error[0] ? error[0] : error[1]);
-}
-
-static void
-vdev_mirror_io_done(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
- int c;
- int good_copies = 0;
- int unexpected_errors = 0;
-
- if (mm == NULL)
- return;
-
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
-
- if (mc->mc_error) {
- if (!mc->mc_skipped)
- unexpected_errors++;
- } else if (mc->mc_tried) {
- good_copies++;
- }
- }
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * XXX -- for now, treat partial writes as success.
- *
- * Now that we support write reallocation, it would be better
- * to treat partial failure as real failure unless there are
- * no non-degraded top-level vdevs left, and not update DTLs
- * if we intend to reallocate.
- */
- /* XXPOLICY */
- if (good_copies != mm->mm_children) {
- /*
- * Always require at least one good copy.
- *
- * For ditto blocks (io_vd == NULL), require
- * all copies to be good.
- *
- * XXX -- for replacing vdevs, there's no great answer.
- * If the old device is really dead, we may not even
- * be able to access it -- so we only want to
- * require good writes to the new device. But if
- * the new device turns out to be flaky, we want
- * to be able to detach it -- which requires all
- * writes to the old device to have succeeded.
- */
- if (good_copies == 0 || zio->io_vd == NULL)
- zio->io_error = vdev_mirror_worst_error(mm);
- }
- return;
- } else if (zio->io_type == ZIO_TYPE_FREE) {
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
-
- /*
- * If we don't have a good copy yet, keep trying other children.
- */
- /* XXPOLICY */
- if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
- ASSERT(c >= 0 && c < mm->mm_children);
- mc = &mm->mm_child[c];
- zio_vdev_io_redone(zio);
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
- ZIO_TYPE_READ, zio->io_priority, 0,
- vdev_mirror_child_done, mc));
- return;
- }
-
- /* XXPOLICY */
- if (good_copies == 0) {
- zio->io_error = vdev_mirror_worst_error(mm);
- ASSERT(zio->io_error != 0);
- }
-
- if (good_copies && spa_writeable(zio->io_spa) &&
- (unexpected_errors ||
- (zio->io_flags & ZIO_FLAG_RESILVER) ||
- ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
- /*
- * Use the good data we have in hand to repair damaged children.
- */
- for (c = 0; c < mm->mm_children; c++) {
- /*
- * Don't rewrite known good children.
- * Not only is it unnecessary, it could
- * actually be harmful: if the system lost
- * power while rewriting the only good copy,
- * there would be no good copies left!
- */
- mc = &mm->mm_child[c];
-
- if (mc->mc_error == 0) {
- if (mc->mc_tried)
- continue;
- /*
- * We didn't try this child. We need to
- * repair it if:
- * 1. it's a scrub (in which case we have
- * tried everything that was healthy)
- * - or -
- * 2. it's an indirect vdev (in which case
- * it could point to any other vdev, which
- * might have a bad DTL)
- * - or -
- * 3. the DTL indicates that this data is
- * missing from this vdev
- */
- if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
- mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
- !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
- zio->io_txg, 1))
- continue;
- mc->mc_error = SET_ERROR(ESTALE);
- }
-
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
- zio->io_abd, zio->io_size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
- }
- }
-}
-
-static void
-vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (faulted == vd->vdev_children) {
- if (vdev_children_are_offline(vd)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
- VDEV_AUX_CHILDREN_OFFLINE);
- } else {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- }
- } else if (degraded + faulted != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- } else {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
- }
-}
-
-vdev_ops_t vdev_mirror_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- NULL,
- NULL,
- NULL,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_MIRROR, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
-
-vdev_ops_t vdev_replacing_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- NULL,
- NULL,
- NULL,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_REPLACING, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
-
-vdev_ops_t vdev_spare_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- NULL,
- NULL,
- NULL,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_SPARE, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -1,113 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- */
-
-/*
- * The 'missing' vdev is a special vdev type used only during import. It
- * signifies a placeholder in the root vdev for some vdev that we know is
- * missing. We pass it down to the kernel to allow the rest of the
- * configuration to parsed and an attempt made to open all available devices.
- * Because its GUID is always 0, we know that the guid sum will mismatch and we
- * won't be able to open the pool anyway.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-
-/* ARGSUSED */
-static int
-vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *logical_ashift, uint64_t *physical_ashift)
-{
- /*
- * Really this should just fail. But then the root vdev will be in the
- * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
- * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
- * will fail the GUID sum check before ever trying to open the pool.
- */
- *psize = 0;
- *max_psize = 0;
- *logical_ashift = 0;
- *physical_ashift = 0;
- return (0);
-}
-
-/* ARGSUSED */
-static void
-vdev_missing_close(vdev_t *vd)
-{
-}
-
-/* ARGSUSED */
-static void
-vdev_missing_io_start(zio_t *zio)
-{
- zio->io_error = SET_ERROR(ENOTSUP);
- zio_execute(zio);
-}
-
-/* ARGSUSED */
-static void
-vdev_missing_io_done(zio_t *zio)
-{
-}
-
-vdev_ops_t vdev_missing_ops = {
- vdev_missing_open,
- vdev_missing_close,
- vdev_default_asize,
- vdev_missing_io_start,
- vdev_missing_io_done,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- VDEV_TYPE_MISSING, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-vdev_ops_t vdev_hole_ops = {
- vdev_missing_open,
- vdev_missing_close,
- vdev_default_asize,
- vdev_missing_io_start,
- vdev_missing_io_done,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- VDEV_TYPE_HOLE, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -1,1047 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/zfs_context.h>
-#include <sys/vdev_impl.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/avl.h>
-#include <sys/dsl_pool.h>
-#include <sys/metaslab_impl.h>
-#include <sys/abd.h>
-
-/*
- * ZFS I/O Scheduler
- * ---------------
- *
- * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
- * I/O scheduler determines when and in what order those operations are
- * issued. The I/O scheduler divides operations into six I/O classes
- * prioritized in the following order: sync read, sync write, async read,
- * async write, scrub/resilver and trim. Each queue defines the minimum and
- * maximum number of concurrent operations that may be issued to the device.
- * In addition, the device has an aggregate maximum. Note that the sum of the
- * per-queue minimums must not exceed the aggregate maximum, and if the
- * aggregate maximum is equal to or greater than the sum of the per-queue
- * maximums, the per-queue minimum has no effect.
- *
- * For many physical devices, throughput increases with the number of
- * concurrent operations, but latency typically suffers. Further, physical
- * devices typically have a limit at which more concurrent operations have no
- * effect on throughput or can actually cause it to decrease.
- *
- * The scheduler selects the next operation to issue by first looking for an
- * I/O class whose minimum has not been satisfied. Once all are satisfied and
- * the aggregate maximum has not been hit, the scheduler looks for classes
- * whose maximum has not been satisfied. Iteration through the I/O classes is
- * done in the order specified above. No further operations are issued if the
- * aggregate maximum number of concurrent operations has been hit or if there
- * are no operations queued for an I/O class that has not hit its maximum.
- * Every time an I/O is queued or an operation completes, the I/O scheduler
- * looks for new operations to issue.
- *
- * All I/O classes have a fixed maximum number of outstanding operations
- * except for the async write class. Asynchronous writes represent the data
- * that is committed to stable storage during the syncing stage for
- * transaction groups (see txg.c). Transaction groups enter the syncing state
- * periodically so the number of queued async writes will quickly burst up and
- * then bleed down to zero. Rather than servicing them as quickly as possible,
- * the I/O scheduler changes the maximum number of active async write I/Os
- * according to the amount of dirty data in the pool (see dsl_pool.c). Since
- * both throughput and latency typically increase with the number of
- * concurrent operations issued to physical devices, reducing the burstiness
- * in the number of concurrent operations also stabilizes the response time of
- * operations from other -- and in particular synchronous -- queues. In broad
- * strokes, the I/O scheduler will issue more concurrent operations from the
- * async write queue as there's more dirty data in the pool.
- *
- * Async Writes
- *
- * The number of concurrent operations issued for the async write I/O class
- * follows a piece-wise linear function defined by a few adjustable points.
- *
- * | o---------| <-- zfs_vdev_async_write_max_active
- * ^ | /^ |
- * | | / | |
- * active | / | |
- * I/O | / | |
- * count | / | |
- * | / | |
- * |------------o | | <-- zfs_vdev_async_write_min_active
- * 0|____________^______|_________|
- * 0% | | 100% of zfs_dirty_data_max
- * | |
- * | `-- zfs_vdev_async_write_active_max_dirty_percent
- * `--------- zfs_vdev_async_write_active_min_dirty_percent
- *
- * Until the amount of dirty data exceeds a minimum percentage of the dirty
- * data allowed in the pool, the I/O scheduler will limit the number of
- * concurrent operations to the minimum. As that threshold is crossed, the
- * number of concurrent operations issued increases linearly to the maximum at
- * the specified maximum percentage of the dirty data allowed in the pool.
- *
- * Ideally, the amount of dirty data on a busy pool will stay in the sloped
- * part of the function between zfs_vdev_async_write_active_min_dirty_percent
- * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
- * maximum percentage, this indicates that the rate of incoming data is
- * greater than the rate that the backend storage can handle. In this case, we
- * must further throttle incoming writes (see dmu_tx_delay() for details).
- */
-
-/*
- * The maximum number of I/Os active to each device. Ideally, this will be >=
- * the sum of each queue's max_active. It must be at least the sum of each
- * queue's min_active.
- */
-uint32_t zfs_vdev_max_active = 1000;
-
-/*
- * Per-queue limits on the number of I/Os active to each device. If the
- * sum of the queue's max_active is < zfs_vdev_max_active, then the
- * min_active comes into play. We will send min_active from each queue,
- * and then select from queues in the order defined by zio_priority_t.
- *
- * In general, smaller max_active's will lead to lower latency of synchronous
- * operations. Larger max_active's may lead to higher overall throughput,
- * depending on underlying storage.
- *
- * The ratio of the queues' max_actives determines the balance of performance
- * between reads, writes, and scrubs. E.g., increasing
- * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
- * more quickly, but reads and writes to have higher latency and lower
- * throughput.
- */
-uint32_t zfs_vdev_sync_read_min_active = 10;
-uint32_t zfs_vdev_sync_read_max_active = 10;
-uint32_t zfs_vdev_sync_write_min_active = 10;
-uint32_t zfs_vdev_sync_write_max_active = 10;
-uint32_t zfs_vdev_async_read_min_active = 1;
-uint32_t zfs_vdev_async_read_max_active = 3;
-uint32_t zfs_vdev_async_write_min_active = 1;
-uint32_t zfs_vdev_async_write_max_active = 10;
-uint32_t zfs_vdev_scrub_min_active = 1;
-uint32_t zfs_vdev_scrub_max_active = 2;
-uint32_t zfs_vdev_trim_min_active = 1;
-/*
- * TRIM max active is large in comparison to the other values due to the fact
- * that TRIM IOs are coalesced at the device layer. This value is set such
- * that a typical SSD can process the queued IOs in a single request.
- */
-uint32_t zfs_vdev_trim_max_active = 64;
-uint32_t zfs_vdev_removal_min_active = 1;
-uint32_t zfs_vdev_removal_max_active = 2;
-uint32_t zfs_vdev_initializing_min_active = 1;
-uint32_t zfs_vdev_initializing_max_active = 1;
-
-
-/*
- * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
- * dirty data, use zfs_vdev_async_write_min_active. When it has more than
- * zfs_vdev_async_write_active_max_dirty_percent, use
- * zfs_vdev_async_write_max_active. The value is linearly interpolated
- * between min and max.
- */
-int zfs_vdev_async_write_active_min_dirty_percent = 30;
-int zfs_vdev_async_write_active_max_dirty_percent = 60;
-
-/*
- * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
- * For read I/Os, we also aggregate across small adjacency gaps; for writes
- * we include spans of optional I/Os to aid aggregation at the disk even when
- * they aren't able to help us aggregate at this level.
- */
-int zfs_vdev_aggregation_limit = 1 << 20;
-int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
-int zfs_vdev_read_gap_limit = 32 << 10;
-int zfs_vdev_write_gap_limit = 4 << 10;
-
-/*
- * Define the queue depth percentage for each top-level. This percentage is
- * used in conjunction with zfs_vdev_async_max_active to determine how many
- * allocations a specific top-level vdev should handle. Once the queue depth
- * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100
- * then allocator will stop allocating blocks on that top-level device.
- * The default kernel setting is 1000% which will yield 100 allocations per
- * device. For userland testing, the default setting is 300% which equates
- * to 30 allocations per device.
- */
-#ifdef _KERNEL
-int zfs_vdev_queue_depth_pct = 1000;
-#else
-int zfs_vdev_queue_depth_pct = 300;
-#endif
-
-/*
- * When performing allocations for a given metaslab, we want to make sure that
- * there are enough IOs to aggregate together to improve throughput. We want to
- * ensure that there are at least 128k worth of IOs that can be aggregated, and
- * we assume that the average allocation size is 4k, so we need the queue depth
- * to be 32 per allocator to get good aggregation of sequential writes.
- */
-int zfs_vdev_def_queue_depth = 32;
-
-#ifdef __FreeBSD__
-#ifdef _KERNEL
-SYSCTL_DECL(_vfs_zfs_vdev);
-
-static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
-SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
- CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
- sysctl_zfs_async_write_active_min_dirty_percent, "I",
- "Percentage of async write dirty data below which "
- "async_write_min_active is used.");
-
-static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
-SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
- CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
- sysctl_zfs_async_write_active_max_dirty_percent, "I",
- "Percentage of async write dirty data above which "
- "async_write_max_active is used.");
-
-SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
- &zfs_vdev_max_active, 0,
- "The maximum number of I/Os of all types active for each device.");
-
-#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
-SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
- &zfs_vdev_ ## name ## _min_active, 0, \
- "Initial number of I/O requests of type " #name \
- " active for each device");
-
-#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
-SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
- &zfs_vdev_ ## name ## _max_active, 0, \
- "Maximum number of I/O requests of type " #name \
- " active for each device");
-
-ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
-ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
-ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
-ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
-ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
-ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
-ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
-ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
-ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
-ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
-ZFS_VDEV_QUEUE_KNOB_MIN(trim);
-ZFS_VDEV_QUEUE_KNOB_MAX(trim);
-ZFS_VDEV_QUEUE_KNOB_MIN(removal);
-ZFS_VDEV_QUEUE_KNOB_MAX(removal);
-ZFS_VDEV_QUEUE_KNOB_MIN(initializing);
-ZFS_VDEV_QUEUE_KNOB_MAX(initializing);
-
-#undef ZFS_VDEV_QUEUE_KNOB
-
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
- &zfs_vdev_aggregation_limit, 0,
- "I/O requests are aggregated up to this size");
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit_non_rotating, CTLFLAG_RWTUN,
- &zfs_vdev_aggregation_limit_non_rotating, 0,
- "I/O requests are aggregated up to this size for non-rotating media");
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
- &zfs_vdev_read_gap_limit, 0,
- "Acceptable gap between two reads being aggregated");
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
- &zfs_vdev_write_gap_limit, 0,
- "Acceptable gap between two writes being aggregated");
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
- &zfs_vdev_queue_depth_pct, 0,
- "Queue depth percentage for each top-level");
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN,
- &zfs_vdev_def_queue_depth, 0,
- "Default queue depth for each allocator");
-
-static int
-sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
-{
- int val, err;
-
- val = zfs_vdev_async_write_active_min_dirty_percent;
- err = sysctl_handle_int(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val < 0 || val > 100 ||
- val >= zfs_vdev_async_write_active_max_dirty_percent)
- return (EINVAL);
-
- zfs_vdev_async_write_active_min_dirty_percent = val;
-
- return (0);
-}
-
-static int
-sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
-{
- int val, err;
-
- val = zfs_vdev_async_write_active_max_dirty_percent;
- err = sysctl_handle_int(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val < 0 || val > 100 ||
- val <= zfs_vdev_async_write_active_min_dirty_percent)
- return (EINVAL);
-
- zfs_vdev_async_write_active_max_dirty_percent = val;
-
- return (0);
-}
-#endif
-#endif
-
-int
-vdev_queue_offset_compare(const void *x1, const void *x2)
-{
- const zio_t *z1 = (const zio_t *)x1;
- const zio_t *z2 = (const zio_t *)x2;
-
- int cmp = AVL_CMP(z1->io_offset, z2->io_offset);
-
- if (likely(cmp))
- return (cmp);
-
- return (AVL_PCMP(z1, z2));
-}
-
-static inline avl_tree_t *
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
-{
- return (&vq->vq_class[p].vqc_queued_tree);
-}
-
-static inline avl_tree_t *
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
-{
- if (t == ZIO_TYPE_READ)
- return (&vq->vq_read_offset_tree);
- else if (t == ZIO_TYPE_WRITE)
- return (&vq->vq_write_offset_tree);
- else
- return (NULL);
-}
-
-int
-vdev_queue_timestamp_compare(const void *x1, const void *x2)
-{
- const zio_t *z1 = (const zio_t *)x1;
- const zio_t *z2 = (const zio_t *)x2;
-
- int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp);
-
- if (likely(cmp))
- return (cmp);
-
- return (AVL_PCMP(z1, z2));
-}
-
-void
-vdev_queue_init(vdev_t *vd)
-{
- vdev_queue_t *vq = &vd->vdev_queue;
-
- mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
- vq->vq_vdev = vd;
-
- avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
-
- for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- int (*compfn) (const void *, const void *);
-
- /*
- * The synchronous i/o queues are dispatched in FIFO rather
- * than LBA order. This provides more consistent latency for
- * these i/os.
- */
- if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
- compfn = vdev_queue_timestamp_compare;
- else
- compfn = vdev_queue_offset_compare;
-
- avl_create(vdev_queue_class_tree(vq, p), compfn,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
- }
-
- vq->vq_lastoffset = 0;
-}
-
-void
-vdev_queue_fini(vdev_t *vd)
-{
- vdev_queue_t *vq = &vd->vdev_queue;
-
- for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
- avl_destroy(vdev_queue_class_tree(vq, p));
- avl_destroy(&vq->vq_active_tree);
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
-
- mutex_destroy(&vq->vq_lock);
-}
-
-static void
-vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- avl_tree_t *qtt;
-
- ASSERT(MUTEX_HELD(&vq->vq_lock));
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- qtt = vdev_queue_type_tree(vq, zio->io_type);
- if (qtt)
- avl_add(qtt, zio);
-
-#ifdef illumos
- mutex_enter(&spa->spa_iokstat_lock);
- spa->spa_queue_stats[zio->io_priority].spa_queued++;
- if (spa->spa_iokstat != NULL)
- kstat_waitq_enter(spa->spa_iokstat->ks_data);
- mutex_exit(&spa->spa_iokstat_lock);
-#endif
-}
-
-static void
-vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- avl_tree_t *qtt;
-
- ASSERT(MUTEX_HELD(&vq->vq_lock));
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
- qtt = vdev_queue_type_tree(vq, zio->io_type);
- if (qtt)
- avl_remove(qtt, zio);
-
-#ifdef illumos
- mutex_enter(&spa->spa_iokstat_lock);
- ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
- spa->spa_queue_stats[zio->io_priority].spa_queued--;
- if (spa->spa_iokstat != NULL)
- kstat_waitq_exit(spa->spa_iokstat->ks_data);
- mutex_exit(&spa->spa_iokstat_lock);
-#endif
-}
-
-static void
-vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- ASSERT(MUTEX_HELD(&vq->vq_lock));
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active++;
- avl_add(&vq->vq_active_tree, zio);
-
-#ifdef illumos
- mutex_enter(&spa->spa_iokstat_lock);
- spa->spa_queue_stats[zio->io_priority].spa_active++;
- if (spa->spa_iokstat != NULL)
- kstat_runq_enter(spa->spa_iokstat->ks_data);
- mutex_exit(&spa->spa_iokstat_lock);
-#endif
-}
-
-static void
-vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- ASSERT(MUTEX_HELD(&vq->vq_lock));
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active--;
- avl_remove(&vq->vq_active_tree, zio);
-
-#ifdef illumos
- mutex_enter(&spa->spa_iokstat_lock);
- ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
- spa->spa_queue_stats[zio->io_priority].spa_active--;
- if (spa->spa_iokstat != NULL) {
- kstat_io_t *ksio = spa->spa_iokstat->ks_data;
-
- kstat_runq_exit(spa->spa_iokstat->ks_data);
- if (zio->io_type == ZIO_TYPE_READ) {
- ksio->reads++;
- ksio->nread += zio->io_size;
- } else if (zio->io_type == ZIO_TYPE_WRITE) {
- ksio->writes++;
- ksio->nwritten += zio->io_size;
- }
- }
- mutex_exit(&spa->spa_iokstat_lock);
-#endif
-}
-
-static void
-vdev_queue_agg_io_done(zio_t *aio)
-{
- if (aio->io_type == ZIO_TYPE_READ) {
- zio_t *pio;
- zio_link_t *zl = NULL;
- while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
- abd_copy_off(pio->io_abd, aio->io_abd,
- 0, pio->io_offset - aio->io_offset, pio->io_size);
- }
- }
-
- abd_free(aio->io_abd);
-}
-
-static int
-vdev_queue_class_min_active(zio_priority_t p)
-{
- switch (p) {
- case ZIO_PRIORITY_SYNC_READ:
- return (zfs_vdev_sync_read_min_active);
- case ZIO_PRIORITY_SYNC_WRITE:
- return (zfs_vdev_sync_write_min_active);
- case ZIO_PRIORITY_ASYNC_READ:
- return (zfs_vdev_async_read_min_active);
- case ZIO_PRIORITY_ASYNC_WRITE:
- return (zfs_vdev_async_write_min_active);
- case ZIO_PRIORITY_SCRUB:
- return (zfs_vdev_scrub_min_active);
- case ZIO_PRIORITY_TRIM:
- return (zfs_vdev_trim_min_active);
- case ZIO_PRIORITY_REMOVAL:
- return (zfs_vdev_removal_min_active);
- case ZIO_PRIORITY_INITIALIZING:
- return (zfs_vdev_initializing_min_active);
- default:
- panic("invalid priority %u", p);
- return (0);
- }
-}
-
-static __noinline int
-vdev_queue_max_async_writes(spa_t *spa)
-{
- int writes;
- uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
- uint64_t min_bytes = zfs_dirty_data_max *
- zfs_vdev_async_write_active_min_dirty_percent / 100;
- uint64_t max_bytes = zfs_dirty_data_max *
- zfs_vdev_async_write_active_max_dirty_percent / 100;
-
- /*
- * Sync tasks correspond to interactive user actions. To reduce the
- * execution time of those actions we push data out as fast as possible.
- */
- if (spa_has_pending_synctask(spa)) {
- return (zfs_vdev_async_write_max_active);
- }
-
- if (dirty < min_bytes)
- return (zfs_vdev_async_write_min_active);
- if (dirty > max_bytes)
- return (zfs_vdev_async_write_max_active);
-
- /*
- * linear interpolation:
- * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
- * move right by min_bytes
- * move up by min_writes
- */
- writes = (dirty - min_bytes) *
- (zfs_vdev_async_write_max_active -
- zfs_vdev_async_write_min_active) /
- (max_bytes - min_bytes) +
- zfs_vdev_async_write_min_active;
- ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
- ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
- return (writes);
-}
-
-static int
-vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
-{
- switch (p) {
- case ZIO_PRIORITY_SYNC_READ:
- return (zfs_vdev_sync_read_max_active);
- case ZIO_PRIORITY_SYNC_WRITE:
- return (zfs_vdev_sync_write_max_active);
- case ZIO_PRIORITY_ASYNC_READ:
- return (zfs_vdev_async_read_max_active);
- case ZIO_PRIORITY_ASYNC_WRITE:
- return (vdev_queue_max_async_writes(spa));
- case ZIO_PRIORITY_SCRUB:
- return (zfs_vdev_scrub_max_active);
- case ZIO_PRIORITY_TRIM:
- return (zfs_vdev_trim_max_active);
- case ZIO_PRIORITY_REMOVAL:
- return (zfs_vdev_removal_max_active);
- case ZIO_PRIORITY_INITIALIZING:
- return (zfs_vdev_initializing_max_active);
- default:
- panic("invalid priority %u", p);
- return (0);
- }
-}
-
-/*
- * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if
- * there is no eligible class.
- */
-static zio_priority_t
-vdev_queue_class_to_issue(vdev_queue_t *vq)
-{
- spa_t *spa = vq->vq_vdev->vdev_spa;
- zio_priority_t p;
-
- ASSERT(MUTEX_HELD(&vq->vq_lock));
-
- if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
- return (ZIO_PRIORITY_NUM_QUEUEABLE);
-
- /* find a queue that has not reached its minimum # outstanding i/os */
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_min_active(p))
- return (p);
- }
-
- /*
- * If we haven't found a queue, look for one that hasn't reached its
- * maximum # outstanding i/os.
- */
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_max_active(spa, p))
- return (p);
- }
-
- /* No eligible queued i/os */
- return (ZIO_PRIORITY_NUM_QUEUEABLE);
-}
-
-/*
- * Compute the range spanned by two i/os, which is the endpoint of the last
- * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
- * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
- * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
- */
-#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
-#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
-
-static zio_t *
-vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
-{
- zio_t *first, *last, *aio, *dio, *mandatory, *nio;
- zio_link_t *zl = NULL;
- uint64_t maxgap = 0;
- uint64_t size;
- uint64_t limit;
- int maxblocksize;
- boolean_t stretch;
- avl_tree_t *t;
- enum zio_flag flags;
-
- ASSERT(MUTEX_HELD(&vq->vq_lock));
-
- maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
- if (vq->vq_vdev->vdev_nonrot)
- limit = zfs_vdev_aggregation_limit_non_rotating;
- else
- limit = zfs_vdev_aggregation_limit;
- limit = MAX(MIN(limit, maxblocksize), 0);
-
- if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
- return (NULL);
-
- first = last = zio;
-
- if (zio->io_type == ZIO_TYPE_READ)
- maxgap = zfs_vdev_read_gap_limit;
-
- /*
- * We can aggregate I/Os that are sufficiently adjacent and of
- * the same flavor, as expressed by the AGG_INHERIT flags.
- * The latter requirement is necessary so that certain
- * attributes of the I/O, such as whether it's a normal I/O
- * or a scrub/resilver, can be preserved in the aggregate.
- * We can include optional I/Os, but don't allow them
- * to begin a range as they add no benefit in that situation.
- */
-
- /*
- * We keep track of the last non-optional I/O.
- */
- mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
-
- /*
- * Walk backwards through sufficiently contiguous I/Os
- * recording the last non-optional I/O.
- */
- flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
- t = vdev_queue_type_tree(vq, zio->io_type);
- while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(dio, last) <= limit &&
- IO_GAP(dio, first) <= maxgap &&
- dio->io_type == zio->io_type) {
- first = dio;
- if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
- mandatory = first;
- }
-
- /*
- * Skip any initial optional I/Os.
- */
- while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
- first = AVL_NEXT(t, first);
- ASSERT(first != NULL);
- }
-
- /*
- * Walk forward through sufficiently contiguous I/Os.
- * The aggregation limit does not apply to optional i/os, so that
- * we can issue contiguous writes even if they are larger than the
- * aggregation limit.
- */
- while ((dio = AVL_NEXT(t, last)) != NULL &&
- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- (IO_SPAN(first, dio) <= limit ||
- (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
- IO_SPAN(first, dio) <= maxblocksize &&
- IO_GAP(last, dio) <= maxgap &&
- dio->io_type == zio->io_type) {
- last = dio;
- if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
- mandatory = last;
- }
-
- /*
- * Now that we've established the range of the I/O aggregation
- * we must decide what to do with trailing optional I/Os.
- * For reads, there's nothing to do. While we are unable to
- * aggregate further, it's possible that a trailing optional
- * I/O would allow the underlying device to aggregate with
- * subsequent I/Os. We must therefore determine if the next
- * non-optional I/O is close enough to make aggregation
- * worthwhile.
- */
- stretch = B_FALSE;
- if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
- zio_t *nio = last;
- while ((dio = AVL_NEXT(t, nio)) != NULL &&
- IO_GAP(nio, dio) == 0 &&
- IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
- nio = dio;
- if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
- stretch = B_TRUE;
- break;
- }
- }
- }
-
- if (stretch) {
- /*
- * We are going to include an optional io in our aggregated
- * span, thus closing the write gap. Only mandatory i/os can
- * start aggregated spans, so make sure that the next i/o
- * after our span is mandatory.
- */
- dio = AVL_NEXT(t, last);
- dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
- } else {
- /* do not include the optional i/o */
- while (last != mandatory && last != first) {
- ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
- last = AVL_PREV(t, last);
- ASSERT(last != NULL);
- }
- }
-
- if (first == last)
- return (NULL);
-
- size = IO_SPAN(first, last);
- ASSERT3U(size, <=, maxblocksize);
-
- aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
- abd_alloc_for_io(size, B_TRUE), size, first->io_type,
- zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
- vdev_queue_agg_io_done, NULL);
- aio->io_timestamp = first->io_timestamp;
-
- nio = first;
- do {
- dio = nio;
- nio = AVL_NEXT(t, dio);
- zio_add_child(dio, aio);
- vdev_queue_io_remove(vq, dio);
- } while (dio != last);
-
- /*
- * We need to drop the vdev queue's lock during zio_execute() to
- * avoid a deadlock that we could encounter due to lock order
- * reversal between vq_lock and io_lock in zio_change_priority().
- * Use the dropped lock to do memory copy without congestion.
- */
- mutex_exit(&vq->vq_lock);
- while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
- ASSERT3U(dio->io_type, ==, aio->io_type);
-
- if (dio->io_flags & ZIO_FLAG_NODATA) {
- ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
- abd_zero_off(aio->io_abd,
- dio->io_offset - aio->io_offset, dio->io_size);
- } else if (dio->io_type == ZIO_TYPE_WRITE) {
- abd_copy_off(aio->io_abd, dio->io_abd,
- dio->io_offset - aio->io_offset, 0, dio->io_size);
- }
-
- zio_vdev_io_bypass(dio);
- zio_execute(dio);
- }
- mutex_enter(&vq->vq_lock);
-
- return (aio);
-}
-
-static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq)
-{
- zio_t *zio, *aio;
- zio_priority_t p;
- avl_index_t idx;
- avl_tree_t *tree;
- zio_t search;
-
-again:
- ASSERT(MUTEX_HELD(&vq->vq_lock));
-
- p = vdev_queue_class_to_issue(vq);
-
- if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
- /* No eligible queued i/os */
- return (NULL);
- }
-
- /*
- * For LBA-ordered queues (async / scrub / initializing), issue the
- * i/o which follows the most recently issued i/o in LBA (offset) order.
- *
- * For FIFO queues (sync), issue the i/o with the lowest timestamp.
- */
- tree = vdev_queue_class_tree(vq, p);
- search.io_timestamp = 0;
- search.io_offset = vq->vq_last_offset + 1;
- VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
- zio = avl_nearest(tree, idx, AVL_AFTER);
- if (zio == NULL)
- zio = avl_first(tree);
- ASSERT3U(zio->io_priority, ==, p);
-
- aio = vdev_queue_aggregate(vq, zio);
- if (aio != NULL)
- zio = aio;
- else
- vdev_queue_io_remove(vq, zio);
-
- /*
- * If the I/O is or was optional and therefore has no data, we need to
- * simply discard it. We need to drop the vdev queue's lock to avoid a
- * deadlock that we could encounter since this I/O will complete
- * immediately.
- */
- if (zio->io_flags & ZIO_FLAG_NODATA) {
- mutex_exit(&vq->vq_lock);
- zio_vdev_io_bypass(zio);
- zio_execute(zio);
- mutex_enter(&vq->vq_lock);
- goto again;
- }
-
- vdev_queue_pending_add(vq, zio);
- vq->vq_last_offset = zio->io_offset;
-
- return (zio);
-}
-
-zio_t *
-vdev_queue_io(zio_t *zio)
-{
- vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- zio_t *nio;
-
- if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
- return (zio);
-
- /*
- * Children i/os inherent their parent's priority, which might
- * not match the child's i/o type. Fix it up here.
- */
- if (zio->io_type == ZIO_TYPE_READ) {
- if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
- zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
- zio->io_priority != ZIO_PRIORITY_SCRUB &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL &&
- zio->io_priority != ZIO_PRIORITY_INITIALIZING)
- zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
- } else if (zio->io_type == ZIO_TYPE_WRITE) {
- if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
- zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL &&
- zio->io_priority != ZIO_PRIORITY_INITIALIZING)
- zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
- } else {
- ASSERT(zio->io_type == ZIO_TYPE_FREE);
- zio->io_priority = ZIO_PRIORITY_TRIM;
- }
-
- zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
-
- mutex_enter(&vq->vq_lock);
- zio->io_timestamp = gethrtime();
- vdev_queue_io_add(vq, zio);
- nio = vdev_queue_io_to_issue(vq);
- mutex_exit(&vq->vq_lock);
-
- if (nio == NULL)
- return (NULL);
-
- if (nio->io_done == vdev_queue_agg_io_done) {
- zio_nowait(nio);
- return (NULL);
- }
-
- return (nio);
-}
-
-void
-vdev_queue_io_done(zio_t *zio)
-{
- vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- zio_t *nio;
-
- mutex_enter(&vq->vq_lock);
-
- vdev_queue_pending_remove(vq, zio);
-
- vq->vq_io_complete_ts = gethrtime();
-
- while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
- mutex_exit(&vq->vq_lock);
- if (nio->io_done == vdev_queue_agg_io_done) {
- zio_nowait(nio);
- } else {
- zio_vdev_io_reissue(nio);
- zio_execute(nio);
- }
- mutex_enter(&vq->vq_lock);
- }
-
- mutex_exit(&vq->vq_lock);
-}
-
-void
-vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
-{
- vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- avl_tree_t *tree;
-
- /*
- * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
- * code to issue IOs without adding them to the vdev queue. In this
- * case, the zio is already going to be issued as quickly as possible
- * and so it doesn't need any reprioitization to help.
- */
- if (zio->io_priority == ZIO_PRIORITY_NOW)
- return;
-
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-
- if (zio->io_type == ZIO_TYPE_READ) {
- if (priority != ZIO_PRIORITY_SYNC_READ &&
- priority != ZIO_PRIORITY_ASYNC_READ &&
- priority != ZIO_PRIORITY_SCRUB)
- priority = ZIO_PRIORITY_ASYNC_READ;
- } else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- if (priority != ZIO_PRIORITY_SYNC_WRITE &&
- priority != ZIO_PRIORITY_ASYNC_WRITE)
- priority = ZIO_PRIORITY_ASYNC_WRITE;
- }
-
- mutex_enter(&vq->vq_lock);
-
- /*
- * If the zio is in none of the queues we can simply change
- * the priority. If the zio is waiting to be submitted we must
- * remove it from the queue and re-insert it with the new priority.
- * Otherwise, the zio is currently active and we cannot change its
- * priority.
- */
- tree = vdev_queue_class_tree(vq, zio->io_priority);
- if (avl_find(tree, zio, NULL) == zio) {
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
- zio->io_priority = priority;
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
- zio->io_priority = priority;
- }
-
- mutex_exit(&vq->vq_lock);
-}
-
-/*
- * As these three methods are only used for load calculations we're not concerned
- * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex
- * use here, instead we prefer to keep it lock free for performance.
- */
-int
-vdev_queue_length(vdev_t *vd)
-{
- return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
-}
-
-uint64_t
-vdev_queue_lastoffset(vdev_t *vd)
-{
- return (vd->vdev_queue.vq_lastoffset);
-}
-
-void
-vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
-{
- vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -1,2707 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#ifdef illumos
-#include <sys/vdev_disk.h>
-#endif
-#include <sys/vdev_file.h>
-#include <sys/vdev_raidz.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/abd.h>
-#include <sys/fs/zfs.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/bio.h>
-
-#ifdef ZFS_DEBUG
-#include <sys/vdev_initialize.h> /* vdev_xlate testing */
-#endif
-
-/*
- * Virtual device vector for RAID-Z.
- *
- * This vdev supports single, double, and triple parity. For single parity,
- * we use a simple XOR of all the data columns. For double or triple parity,
- * we use a special case of Reed-Solomon coding. This extends the
- * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
- * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
- * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
- * former is also based. The latter is designed to provide higher performance
- * for writes.
- *
- * Note that the Plank paper claimed to support arbitrary N+M, but was then
- * amended six years later identifying a critical flaw that invalidates its
- * claims. Nevertheless, the technique can be adapted to work for up to
- * triple parity. For additional parity, the amendment "Note: Correction to
- * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
- * is viable, but the additional complexity means that write performance will
- * suffer.
- *
- * All of the methods above operate on a Galois field, defined over the
- * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
- * can be expressed with a single byte. Briefly, the operations on the
- * field are defined as follows:
- *
- * o addition (+) is represented by a bitwise XOR
- * o subtraction (-) is therefore identical to addition: A + B = A - B
- * o multiplication of A by 2 is defined by the following bitwise expression:
- *
- * (A * 2)_7 = A_6
- * (A * 2)_6 = A_5
- * (A * 2)_5 = A_4
- * (A * 2)_4 = A_3 + A_7
- * (A * 2)_3 = A_2 + A_7
- * (A * 2)_2 = A_1 + A_7
- * (A * 2)_1 = A_0
- * (A * 2)_0 = A_7
- *
- * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
- * As an aside, this multiplication is derived from the error correcting
- * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
- *
- * Observe that any number in the field (except for 0) can be expressed as a
- * power of 2 -- a generator for the field. We store a table of the powers of
- * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
- * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is therefore
- * A ^ (255 - 1) = A^254.
- *
- * The up-to-three parity columns, P, Q, R over several data columns,
- * D_0, ... D_n-1, can be expressed by field operations:
- *
- * P = D_0 + D_1 + ... + D_n-2 + D_n-1
- * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
- * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
- * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
- * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
- *
- * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
- * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
- * independent coefficients. (There are no additional coefficients that have
- * this property which is why the uncorrected Plank method breaks down.)
- *
- * See the reconstruction code below for how P, Q and R can used individually
- * or in concert to recover missing data columns.
- */
-
-typedef struct raidz_col {
- uint64_t rc_devidx; /* child device index for I/O */
- uint64_t rc_offset; /* device offset */
- uint64_t rc_size; /* I/O size */
- abd_t *rc_abd; /* I/O data */
- void *rc_gdata; /* used to store the "good" version */
- int rc_error; /* I/O error for this device */
- uint8_t rc_tried; /* Did we attempt this I/O column? */
- uint8_t rc_skipped; /* Did we skip this I/O column? */
-} raidz_col_t;
-
-typedef struct raidz_map {
- uint64_t rm_cols; /* Regular column count */
- uint64_t rm_scols; /* Count including skipped columns */
- uint64_t rm_bigcols; /* Number of oversized columns */
- uint64_t rm_asize; /* Actual total I/O size */
- uint64_t rm_missingdata; /* Count of missing data devices */
- uint64_t rm_missingparity; /* Count of missing parity devices */
- uint64_t rm_firstdatacol; /* First data column/parity count */
- uint64_t rm_nskip; /* Skipped sectors for padding */
- uint64_t rm_skipstart; /* Column index of padding start */
- abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
- uintptr_t rm_reports; /* # of referencing checksum reports */
- uint8_t rm_freed; /* map no longer has referencing ZIO */
- uint8_t rm_ecksuminjected; /* checksum error was injected */
- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
-} raidz_map_t;
-
-#define VDEV_RAIDZ_P 0
-#define VDEV_RAIDZ_Q 1
-#define VDEV_RAIDZ_R 2
-
-#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
-#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
-
-/*
- * We provide a mechanism to perform the field multiplication operation on a
- * 64-bit value all at once rather than a byte at a time. This works by
- * creating a mask from the top bit in each byte and using that to
- * conditionally apply the XOR of 0x1d.
- */
-#define VDEV_RAIDZ_64MUL_2(x, mask) \
-{ \
- (mask) = (x) & 0x8080808080808080ULL; \
- (mask) = ((mask) << 1) - ((mask) >> 7); \
- (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
- ((mask) & 0x1d1d1d1d1d1d1d1d); \
-}
-
-#define VDEV_RAIDZ_64MUL_4(x, mask) \
-{ \
- VDEV_RAIDZ_64MUL_2((x), mask); \
- VDEV_RAIDZ_64MUL_2((x), mask); \
-}
-
-#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
-
-/*
- * Force reconstruction to use the general purpose method.
- */
-int vdev_raidz_default_to_general;
-
-/* Powers of 2 in the Galois field defined above. */
-static const uint8_t vdev_raidz_pow2[256] = {
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
- 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
- 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
- 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
- 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
- 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
- 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
- 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
- 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
- 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
- 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
- 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
- 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
- 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
- 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
- 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
- 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
- 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
- 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
- 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
- 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
- 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
- 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
- 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
- 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
- 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
- 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
- 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
- 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
- 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
- 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
- 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
-};
-/* Logs of 2 in the Galois field defined above. */
-static const uint8_t vdev_raidz_log2[256] = {
- 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
- 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
- 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
- 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
- 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
- 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
- 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
- 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
- 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
- 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
- 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
- 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
- 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
- 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
- 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
- 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
- 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
- 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
- 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
- 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
- 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
- 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
- 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
- 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
- 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
- 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
- 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
- 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
- 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
- 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
- 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
- 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
-};
-
-static void vdev_raidz_generate_parity(raidz_map_t *rm);
-
-/*
- * Multiply a given number by 2 raised to the given power.
- */
-static uint8_t
-vdev_raidz_exp2(uint_t a, int exp)
-{
- if (a == 0)
- return (0);
-
- ASSERT(exp >= 0);
- ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
-
- exp += vdev_raidz_log2[a];
- if (exp > 255)
- exp -= 255;
-
- return (vdev_raidz_pow2[exp]);
-}
-
-static void
-vdev_raidz_map_free(raidz_map_t *rm)
-{
- int c;
-
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- if (rm->rm_col[c].rc_abd != NULL)
- abd_free(rm->rm_col[c].rc_abd);
-
- if (rm->rm_col[c].rc_gdata != NULL)
- zio_buf_free(rm->rm_col[c].rc_gdata,
- rm->rm_col[c].rc_size);
- }
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_abd != NULL)
- abd_put(rm->rm_col[c].rc_abd);
- }
-
- if (rm->rm_abd_copy != NULL)
- abd_free(rm->rm_abd_copy);
-
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
-}
-
-static void
-vdev_raidz_map_free_vsd(zio_t *zio)
-{
- raidz_map_t *rm = zio->io_vsd;
-
- ASSERT0(rm->rm_freed);
- rm->rm_freed = 1;
-
- if (rm->rm_reports == 0)
- vdev_raidz_map_free(rm);
-}
-
-/*ARGSUSED*/
-static void
-vdev_raidz_cksum_free(void *arg, size_t ignored)
-{
- raidz_map_t *rm = arg;
-
- ASSERT3U(rm->rm_reports, >, 0);
-
- if (--rm->rm_reports == 0 && rm->rm_freed != 0)
- vdev_raidz_map_free(rm);
-}
-
-static void
-vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
-{
- raidz_map_t *rm = zcr->zcr_cbdata;
- size_t c = zcr->zcr_cbinfo;
- size_t x;
-
- const char *good = NULL;
- char *bad;
-
- if (good_data == NULL) {
- zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
- return;
- }
-
- if (c < rm->rm_firstdatacol) {
- /*
- * The first time through, calculate the parity blocks for
- * the good data (this relies on the fact that the good
- * data never changes for a given logical ZIO)
- */
- if (rm->rm_col[0].rc_gdata == NULL) {
- abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
- char *buf;
- int offset;
-
- /*
- * Set up the rm_col[]s to generate the parity for
- * good_data, first saving the parity bufs and
- * replacing them with buffers to hold the result.
- */
- for (x = 0; x < rm->rm_firstdatacol; x++) {
- bad_parity[x] = rm->rm_col[x].rc_abd;
- rm->rm_col[x].rc_gdata =
- zio_buf_alloc(rm->rm_col[x].rc_size);
- rm->rm_col[x].rc_abd =
- abd_get_from_buf(rm->rm_col[x].rc_gdata,
- rm->rm_col[x].rc_size);
- }
-
- /* fill in the data columns from good_data */
- buf = (char *)good_data;
- for (; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
- rm->rm_col[x].rc_size);
- buf += rm->rm_col[x].rc_size;
- }
-
- /*
- * Construct the parity from the good data.
- */
- vdev_raidz_generate_parity(rm);
-
- /* restore everything back to its original state */
- for (x = 0; x < rm->rm_firstdatacol; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = bad_parity[x];
- }
-
- offset = 0;
- for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = abd_get_offset(
- rm->rm_abd_copy, offset);
- offset += rm->rm_col[x].rc_size;
- }
- }
-
- ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
- good = rm->rm_col[c].rc_gdata;
- } else {
- /* adjust good_data to point at the start of our column */
- good = good_data;
-
- for (x = rm->rm_firstdatacol; x < c; x++)
- good += rm->rm_col[x].rc_size;
- }
-
- bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
- /* we drop the ereport if it ends up that the data was good */
- zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
- abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
-}
-
-/*
- * Invoked indirectly by zfs_ereport_start_checksum(), called
- * below when our read operation fails completely. The main point
- * is to keep a copy of everything we read from disk, so that at
- * vdev_raidz_cksum_finish() time we can compare it with the good data.
- */
-static void
-vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
-{
- size_t c = (size_t)(uintptr_t)arg;
- size_t offset;
-
- raidz_map_t *rm = zio->io_vsd;
- size_t size;
-
- /* set up the report and bump the refcount */
- zcr->zcr_cbdata = rm;
- zcr->zcr_cbinfo = c;
- zcr->zcr_finish = vdev_raidz_cksum_finish;
- zcr->zcr_free = vdev_raidz_cksum_free;
-
- rm->rm_reports++;
- ASSERT3U(rm->rm_reports, >, 0);
-
- if (rm->rm_abd_copy != NULL)
- return;
-
- /*
- * It's the first time we're called for this raidz_map_t, so we need
- * to copy the data aside; there's no guarantee that our zio's buffer
- * won't be re-used for something else.
- *
- * Our parity data is already in separate buffers, so there's no need
- * to copy them.
- */
-
- size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
- size += rm->rm_col[c].rc_size;
-
- rm->rm_abd_copy =
- abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
-
- for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
- abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
-
- abd_copy(tmp, col->rc_abd, col->rc_size);
- abd_put(col->rc_abd);
- col->rc_abd = tmp;
-
- offset += col->rc_size;
- }
- ASSERT3U(offset, ==, size);
-}
-
-static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
- vdev_raidz_map_free_vsd,
- vdev_raidz_cksum_report
-};
-
-/*
- * Divides the IO evenly across all child vdevs; usually, dcols is
- * the number of children in the target vdev.
- */
-static raidz_map_t *
-vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
- uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
-{
- raidz_map_t *rm;
- /* The starting RAIDZ (parent) vdev sector of the block. */
- uint64_t b = offset >> unit_shift;
- /* The zio's size in units of the vdev's minimum sector size. */
- uint64_t s = size >> unit_shift;
- /* The first column for this stripe. */
- uint64_t f = b % dcols;
- /* The starting byte offset on each child vdev. */
- uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
- uint64_t off = 0;
-
- /*
- * "Quotient": The number of data sectors for this stripe on all but
- * the "big column" child vdevs that also contain "remainder" data.
- */
- q = s / (dcols - nparity);
-
- /*
- * "Remainder": The number of partial stripe data sectors in this I/O.
- * This will add a sector to some, but not all, child vdevs.
- */
- r = s - q * (dcols - nparity);
-
- /* The number of "big columns" - those which contain remainder data. */
- bc = (r == 0 ? 0 : r + nparity);
-
- /*
- * The total number of data and parity sectors associated with
- * this I/O.
- */
- tot = s + nparity * (q + (r == 0 ? 0 : 1));
-
- /* acols: The columns that will be accessed. */
- /* scols: The columns that will be accessed or skipped. */
- if (q == 0) {
- /* Our I/O request doesn't span all child vdevs. */
- acols = bc;
- scols = MIN(dcols, roundup(bc, nparity + 1));
- } else {
- acols = dcols;
- scols = dcols;
- }
-
- ASSERT3U(acols, <=, scols);
-
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
-
- rm->rm_cols = acols;
- rm->rm_scols = scols;
- rm->rm_bigcols = bc;
- rm->rm_skipstart = bc;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
- rm->rm_firstdatacol = nparity;
- rm->rm_abd_copy = NULL;
- rm->rm_reports = 0;
- rm->rm_freed = 0;
- rm->rm_ecksuminjected = 0;
-
- asize = 0;
-
- for (c = 0; c < scols; c++) {
- col = f + c;
- coff = o;
- if (col >= dcols) {
- col -= dcols;
- coff += 1ULL << unit_shift;
- }
- rm->rm_col[c].rc_devidx = col;
- rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_abd = NULL;
- rm->rm_col[c].rc_gdata = NULL;
- rm->rm_col[c].rc_error = 0;
- rm->rm_col[c].rc_tried = 0;
- rm->rm_col[c].rc_skipped = 0;
-
- if (c >= acols)
- rm->rm_col[c].rc_size = 0;
- else if (c < bc)
- rm->rm_col[c].rc_size = (q + 1) << unit_shift;
- else
- rm->rm_col[c].rc_size = q << unit_shift;
-
- asize += rm->rm_col[c].rc_size;
- }
-
- ASSERT3U(asize, ==, tot << unit_shift);
- rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
- rm->rm_nskip = roundup(tot, nparity + 1) - tot;
- ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
- ASSERT3U(rm->rm_nskip, <=, nparity);
-
- if (!dofree) {
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rm->rm_col[c].rc_abd =
- abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
- }
-
- for (off = 0, c = rm->rm_firstdatacol; c < acols; c++) {
- rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
- off += rm->rm_col[c].rc_size;
- }
- }
-
- /*
- * If all data stored spans all columns, there's a danger that parity
- * will always be on the same device and, since parity isn't read
- * during normal operation, that that device's I/O bandwidth won't be
- * used effectively. We therefore switch the parity every 1MB.
- *
- * ... at least that was, ostensibly, the theory. As a practical
- * matter unless we juggle the parity between all devices evenly, we
- * won't see any benefit. Further, occasional writes that aren't a
- * multiple of the LCM of the number of children and the minimum
- * stripe width are sufficient to avoid pessimal behavior.
- * Unfortunately, this decision created an implicit on-disk format
- * requirement that we need to support for all eternity, but only
- * for single-parity RAID-Z.
- *
- * If we intend to skip a sector in the zeroth column for padding
- * we must make sure to note this swap. We will never intend to
- * skip the first column since at least one data and one parity
- * column must appear in each row.
- */
- ASSERT(rm->rm_cols >= 2);
- ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
-
- if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
- devidx = rm->rm_col[0].rc_devidx;
- o = rm->rm_col[0].rc_offset;
- rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
- rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
- rm->rm_col[1].rc_devidx = devidx;
- rm->rm_col[1].rc_offset = o;
-
- if (rm->rm_skipstart == 0)
- rm->rm_skipstart = 1;
- }
-
- return (rm);
-}
-
-struct pqr_struct {
- uint64_t *p;
- uint64_t *q;
- uint64_t *r;
-};
-
-static int
-vdev_raidz_p_func(void *buf, size_t size, void *private)
-{
- struct pqr_struct *pqr = private;
- const uint64_t *src = buf;
- int i, cnt = size / sizeof (src[0]);
-
- ASSERT(pqr->p && !pqr->q && !pqr->r);
-
- for (i = 0; i < cnt; i++, src++, pqr->p++)
- *pqr->p ^= *src;
-
- return (0);
-}
-
-static int
-vdev_raidz_pq_func(void *buf, size_t size, void *private)
-{
- struct pqr_struct *pqr = private;
- const uint64_t *src = buf;
- uint64_t mask;
- int i, cnt = size / sizeof (src[0]);
-
- ASSERT(pqr->p && pqr->q && !pqr->r);
-
- for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
- *pqr->p ^= *src;
- VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
- *pqr->q ^= *src;
- }
-
- return (0);
-}
-
-static int
-vdev_raidz_pqr_func(void *buf, size_t size, void *private)
-{
- struct pqr_struct *pqr = private;
- const uint64_t *src = buf;
- uint64_t mask;
- int i, cnt = size / sizeof (src[0]);
-
- ASSERT(pqr->p && pqr->q && pqr->r);
-
- for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
- *pqr->p ^= *src;
- VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
- *pqr->q ^= *src;
- VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
- *pqr->r ^= *src;
- }
-
- return (0);
-}
-
-static void
-vdev_raidz_generate_parity_p(raidz_map_t *rm)
-{
- uint64_t *p;
- int c;
- abd_t *src;
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
-
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- } else {
- struct pqr_struct pqr = { p, NULL, NULL };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
- vdev_raidz_p_func, &pqr);
- }
- }
-}
-
-static void
-vdev_raidz_generate_parity_pq(raidz_map_t *rm)
-{
- uint64_t *p, *q, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
-
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
-
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
-
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
- } else {
- struct pqr_struct pqr = { p, q, NULL };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
- vdev_raidz_pq_func, &pqr);
- }
-
- if (c == rm->rm_firstdatacol) {
- for (i = ccnt; i < pcnt; i++) {
- p[i] = 0;
- q[i] = 0;
- }
- } else {
- /*
- * Treat short columns as though they are full of 0s.
- * Note that there's therefore nothing needed for P.
- */
- for (i = ccnt; i < pcnt; i++) {
- VDEV_RAIDZ_64MUL_2(q[i], mask);
- }
- }
- }
-}
-
-static void
-vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
-{
- uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
-
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_R].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
-
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
-
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
- (void) memcpy(r, p, rm->rm_col[c].rc_size);
- } else {
- struct pqr_struct pqr = { p, q, r };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
- vdev_raidz_pqr_func, &pqr);
- }
-
- if (c == rm->rm_firstdatacol) {
- for (i = ccnt; i < pcnt; i++) {
- p[i] = 0;
- q[i] = 0;
- r[i] = 0;
- }
- } else {
- /*
- * Treat short columns as though they are full of 0s.
- * Note that there's therefore nothing needed for P.
- */
- for (i = ccnt; i < pcnt; i++) {
- VDEV_RAIDZ_64MUL_2(q[i], mask);
- VDEV_RAIDZ_64MUL_4(r[i], mask);
- }
- }
- }
-}
-
-/*
- * Generate RAID parity in the first virtual columns according to the number of
- * parity columns available.
- */
-static void
-vdev_raidz_generate_parity(raidz_map_t *rm)
-{
- switch (rm->rm_firstdatacol) {
- case 1:
- vdev_raidz_generate_parity_p(rm);
- break;
- case 2:
- vdev_raidz_generate_parity_pq(rm);
- break;
- case 3:
- vdev_raidz_generate_parity_pqr(rm);
- break;
- default:
- cmn_err(CE_PANIC, "invalid RAID-Z configuration");
- }
-}
-
-/* ARGSUSED */
-static int
-vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
-{
- uint64_t *dst = dbuf;
- uint64_t *src = sbuf;
- int cnt = size / sizeof (src[0]);
-
- for (int i = 0; i < cnt; i++) {
- dst[i] ^= src[i];
- }
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
- void *private)
-{
- uint64_t *dst = dbuf;
- uint64_t *src = sbuf;
- uint64_t mask;
- int cnt = size / sizeof (dst[0]);
-
- for (int i = 0; i < cnt; i++, dst++, src++) {
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- *dst ^= *src;
- }
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
-{
- uint64_t *dst = buf;
- uint64_t mask;
- int cnt = size / sizeof (dst[0]);
-
- for (int i = 0; i < cnt; i++, dst++) {
- /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- }
-
- return (0);
-}
-
-struct reconst_q_struct {
- uint64_t *q;
- int exp;
-};
-
-static int
-vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
-{
- struct reconst_q_struct *rq = private;
- uint64_t *dst = buf;
- int cnt = size / sizeof (dst[0]);
-
- for (int i = 0; i < cnt; i++, dst++, rq->q++) {
- *dst ^= *rq->q;
-
- int j;
- uint8_t *b;
- for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
- *b = vdev_raidz_exp2(*b, rq->exp);
- }
- }
-
- return (0);
-}
-
-struct reconst_pq_struct {
- uint8_t *p;
- uint8_t *q;
- uint8_t *pxy;
- uint8_t *qxy;
- int aexp;
- int bexp;
-};
-
-static int
-vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
-{
- struct reconst_pq_struct *rpq = private;
- uint8_t *xd = xbuf;
- uint8_t *yd = ybuf;
-
- for (int i = 0; i < size;
- i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
- *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
- vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
- *yd = *rpq->p ^ *rpq->pxy ^ *xd;
- }
-
- return (0);
-}
-
-static int
-vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
-{
- struct reconst_pq_struct *rpq = private;
- uint8_t *xd = xbuf;
-
- for (int i = 0; i < size;
- i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
- /* same operation as vdev_raidz_reconst_pq_func() on xd */
- *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
- vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
- }
-
- return (0);
-}
-
-static int
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
-{
- int x = tgts[0];
- int c;
- abd_t *dst, *src;
-
- ASSERT(ntgts == 1);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(x < rm->rm_cols);
-
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
- ASSERT(rm->rm_col[x].rc_size > 0);
-
- src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- dst = rm->rm_col[x].rc_abd;
-
- abd_copy(dst, src, rm->rm_col[x].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
-
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
-
- if (c == x)
- continue;
-
- (void) abd_iterate_func2(dst, src, 0, 0, size,
- vdev_raidz_reconst_p_func, NULL);
- }
-
- return (1 << VDEV_RAIDZ_P);
-}
-
-static int
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
-{
- int x = tgts[0];
- int c, exp;
- abd_t *dst, *src;
-
- ASSERT(ntgts == 1);
-
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
-
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
-
- if (c == rm->rm_firstdatacol) {
- abd_copy(dst, src, size);
- if (rm->rm_col[x].rc_size > size)
- abd_zero_off(dst, size,
- rm->rm_col[x].rc_size - size);
- } else {
- ASSERT3U(size, <=, rm->rm_col[x].rc_size);
- (void) abd_iterate_func2(dst, src, 0, 0, size,
- vdev_raidz_reconst_q_pre_func, NULL);
- (void) abd_iterate_func(dst,
- size, rm->rm_col[x].rc_size - size,
- vdev_raidz_reconst_q_pre_tail_func, NULL);
- }
- }
-
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- dst = rm->rm_col[x].rc_abd;
- exp = 255 - (rm->rm_cols - 1 - x);
-
- struct reconst_q_struct rq = { abd_to_buf(src), exp };
- (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
- vdev_raidz_reconst_q_post_func, &rq);
-
- return (1 << VDEV_RAIDZ_Q);
-}
-
-static int
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
-{
- uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
- abd_t *pdata, *qdata;
- uint64_t xsize, ysize;
- int x = tgts[0];
- int y = tgts[1];
- abd_t *xd, *yd;
-
- ASSERT(ntgts == 2);
- ASSERT(x < y);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(y < rm->rm_cols);
-
- ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
-
- /*
- * Move the parity data aside -- we're going to compute parity as
- * though columns x and y were full of zeros -- Pxy and Qxy. We want to
- * reuse the parity generation mechanism without trashing the actual
- * parity so we make those columns appear to be full of zeros by
- * setting their lengths to zero.
- */
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- xsize = rm->rm_col[x].rc_size;
- ysize = rm->rm_col[y].rc_size;
-
- rm->rm_col[VDEV_RAIDZ_P].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
- rm->rm_col[x].rc_size = 0;
- rm->rm_col[y].rc_size = 0;
-
- vdev_raidz_generate_parity_pq(rm);
-
- rm->rm_col[x].rc_size = xsize;
- rm->rm_col[y].rc_size = ysize;
-
- p = abd_to_buf(pdata);
- q = abd_to_buf(qdata);
- pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- xd = rm->rm_col[x].rc_abd;
- yd = rm->rm_col[y].rc_abd;
-
- /*
- * We now have:
- * Pxy = P + D_x + D_y
- * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
- *
- * We can then solve for D_x:
- * D_x = A * (P + Pxy) + B * (Q + Qxy)
- * where
- * A = 2^(x - y) * (2^(x - y) + 1)^-1
- * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
- *
- * With D_x in hand, we can easily solve for D_y:
- * D_y = P + Pxy + D_x
- */
-
- a = vdev_raidz_pow2[255 + x - y];
- b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
- tmp = 255 - vdev_raidz_log2[a ^ 1];
-
- aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
- bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
-
- ASSERT3U(xsize, >=, ysize);
- struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
- (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
- vdev_raidz_reconst_pq_func, &rpq);
- (void) abd_iterate_func(xd, ysize, xsize - ysize,
- vdev_raidz_reconst_pq_tail_func, &rpq);
-
- abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
-
- /*
- * Restore the saved parity data.
- */
- rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
-
- return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
-}
-
-/* BEGIN CSTYLED */
-/*
- * In the general case of reconstruction, we must solve the system of linear
- * equations defined by the coeffecients used to generate parity as well as
- * the contents of the data and parity disks. This can be expressed with
- * vectors for the original data (D) and the actual data (d) and parity (p)
- * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
- *
- * __ __ __ __
- * | | __ __ | p_0 |
- * | V | | D_0 | | p_m-1 |
- * | | x | : | = | d_0 |
- * | I | | D_n-1 | | : |
- * | | ~~ ~~ | d_n-1 |
- * ~~ ~~ ~~ ~~
- *
- * I is simply a square identity matrix of size n, and V is a vandermonde
- * matrix defined by the coeffecients we chose for the various parity columns
- * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
- * computation as well as linear separability.
- *
- * __ __ __ __
- * | 1 .. 1 1 1 | | p_0 |
- * | 2^n-1 .. 4 2 1 | __ __ | : |
- * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
- * | 1 .. 0 0 0 | | D_1 | | d_0 |
- * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
- * | : : : : | | : | | d_2 |
- * | 0 .. 1 0 0 | | D_n-1 | | : |
- * | 0 .. 0 1 0 | ~~ ~~ | : |
- * | 0 .. 0 0 1 | | d_n-1 |
- * ~~ ~~ ~~ ~~
- *
- * Note that I, V, d, and p are known. To compute D, we must invert the
- * matrix and use the known data and parity values to reconstruct the unknown
- * data values. We begin by removing the rows in V|I and d|p that correspond
- * to failed or missing columns; we then make V|I square (n x n) and d|p
- * sized n by removing rows corresponding to unused parity from the bottom up
- * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
- * using Gauss-Jordan elimination. In the example below we use m=3 parity
- * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
- * __ __
- * | 1 1 1 1 1 1 1 1 |
- * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
- * | 19 205 116 29 64 16 4 1 | / /
- * | 1 0 0 0 0 0 0 0 | / /
- * | 0 1 0 0 0 0 0 0 | <--' /
- * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
- * | 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- * __ __
- * | 1 1 1 1 1 1 1 1 |
- * | 19 205 116 29 64 16 4 1 |
- * | 1 0 0 0 0 0 0 0 |
- * (V|I)' = | 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- *
- * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
- * have carefully chosen the seed values 1, 2, and 4 to ensure that this
- * matrix is not singular.
- * __ __
- * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
- * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
- * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
- * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- * __ __
- * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
- * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
- * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
- * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- * __ __
- * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
- * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
- * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
- * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- * __ __
- * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
- * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
- * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
- * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- * __ __
- * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
- * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
- * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
- * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- * __ __
- * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
- * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
- * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
- * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- * __ __
- * | 0 0 1 0 0 0 0 0 |
- * | 167 100 5 41 159 169 217 208 |
- * | 166 100 4 40 158 168 216 209 |
- * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
- * | 0 0 0 0 1 0 0 0 |
- * | 0 0 0 0 0 1 0 0 |
- * | 0 0 0 0 0 0 1 0 |
- * | 0 0 0 0 0 0 0 1 |
- * ~~ ~~
- *
- * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
- * of the missing data.
- *
- * As is apparent from the example above, the only non-trivial rows in the
- * inverse matrix correspond to the data disks that we're trying to
- * reconstruct. Indeed, those are the only rows we need as the others would
- * only be useful for reconstructing data known or assumed to be valid. For
- * that reason, we only build the coefficients in the rows that correspond to
- * targeted columns.
- */
-/* END CSTYLED */
-
-static void
-vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
- uint8_t **rows)
-{
- int i, j;
- int pow;
-
- ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
-
- /*
- * Fill in the missing rows of interest.
- */
- for (i = 0; i < nmap; i++) {
- ASSERT3S(0, <=, map[i]);
- ASSERT3S(map[i], <=, 2);
-
- pow = map[i] * n;
- if (pow > 255)
- pow -= 255;
- ASSERT(pow <= 255);
-
- for (j = 0; j < n; j++) {
- pow -= map[i];
- if (pow < 0)
- pow += 255;
- rows[i][j] = vdev_raidz_pow2[pow];
- }
- }
-}
-
-static void
-vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
- uint8_t **rows, uint8_t **invrows, const uint8_t *used)
-{
- int i, j, ii, jj;
- uint8_t log;
-
- /*
- * Assert that the first nmissing entries from the array of used
- * columns correspond to parity columns and that subsequent entries
- * correspond to data columns.
- */
- for (i = 0; i < nmissing; i++) {
- ASSERT3S(used[i], <, rm->rm_firstdatacol);
- }
- for (; i < n; i++) {
- ASSERT3S(used[i], >=, rm->rm_firstdatacol);
- }
-
- /*
- * First initialize the storage where we'll compute the inverse rows.
- */
- for (i = 0; i < nmissing; i++) {
- for (j = 0; j < n; j++) {
- invrows[i][j] = (i == j) ? 1 : 0;
- }
- }
-
- /*
- * Subtract all trivial rows from the rows of consequence.
- */
- for (i = 0; i < nmissing; i++) {
- for (j = nmissing; j < n; j++) {
- ASSERT3U(used[j], >=, rm->rm_firstdatacol);
- jj = used[j] - rm->rm_firstdatacol;
- ASSERT3S(jj, <, n);
- invrows[i][j] = rows[i][jj];
- rows[i][jj] = 0;
- }
- }
-
- /*
- * For each of the rows of interest, we must normalize it and subtract
- * a multiple of it from the other rows.
- */
- for (i = 0; i < nmissing; i++) {
- for (j = 0; j < missing[i]; j++) {
- ASSERT0(rows[i][j]);
- }
- ASSERT3U(rows[i][missing[i]], !=, 0);
-
- /*
- * Compute the inverse of the first element and multiply each
- * element in the row by that value.
- */
- log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
-
- for (j = 0; j < n; j++) {
- rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
- invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
- }
-
- for (ii = 0; ii < nmissing; ii++) {
- if (i == ii)
- continue;
-
- ASSERT3U(rows[ii][missing[i]], !=, 0);
-
- log = vdev_raidz_log2[rows[ii][missing[i]]];
-
- for (j = 0; j < n; j++) {
- rows[ii][j] ^=
- vdev_raidz_exp2(rows[i][j], log);
- invrows[ii][j] ^=
- vdev_raidz_exp2(invrows[i][j], log);
- }
- }
- }
-
- /*
- * Verify that the data that is left in the rows are properly part of
- * an identity matrix.
- */
- for (i = 0; i < nmissing; i++) {
- for (j = 0; j < n; j++) {
- if (j == missing[i]) {
- ASSERT3U(rows[i][j], ==, 1);
- } else {
- ASSERT0(rows[i][j]);
- }
- }
- }
-}
-
-static void
-vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
- int *missing, uint8_t **invrows, const uint8_t *used)
-{
- int i, j, x, cc, c;
- uint8_t *src;
- uint64_t ccount;
- uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
- uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
- uint8_t log = 0;
- uint8_t val;
- int ll;
- uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
- uint8_t *p, *pp;
- size_t psize;
-
- psize = sizeof (invlog[0][0]) * n * nmissing;
- p = kmem_alloc(psize, KM_SLEEP);
-
- for (pp = p, i = 0; i < nmissing; i++) {
- invlog[i] = pp;
- pp += n;
- }
-
- for (i = 0; i < nmissing; i++) {
- for (j = 0; j < n; j++) {
- ASSERT3U(invrows[i][j], !=, 0);
- invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
- }
- }
-
- for (i = 0; i < n; i++) {
- c = used[i];
- ASSERT3U(c, <, rm->rm_cols);
-
- src = abd_to_buf(rm->rm_col[c].rc_abd);
- ccount = rm->rm_col[c].rc_size;
- for (j = 0; j < nmissing; j++) {
- cc = missing[j] + rm->rm_firstdatacol;
- ASSERT3U(cc, >=, rm->rm_firstdatacol);
- ASSERT3U(cc, <, rm->rm_cols);
- ASSERT3U(cc, !=, c);
-
- dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
- dcount[j] = rm->rm_col[cc].rc_size;
- }
-
- ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
-
- for (x = 0; x < ccount; x++, src++) {
- if (*src != 0)
- log = vdev_raidz_log2[*src];
-
- for (cc = 0; cc < nmissing; cc++) {
- if (x >= dcount[cc])
- continue;
-
- if (*src == 0) {
- val = 0;
- } else {
- if ((ll = log + invlog[cc][i]) >= 255)
- ll -= 255;
- val = vdev_raidz_pow2[ll];
- }
-
- if (i == 0)
- dst[cc][x] = val;
- else
- dst[cc][x] ^= val;
- }
- }
- }
-
- kmem_free(p, psize);
-}
-
-static int
-vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
-{
- int n, i, c, t, tt;
- int nmissing_rows;
- int missing_rows[VDEV_RAIDZ_MAXPARITY];
- int parity_map[VDEV_RAIDZ_MAXPARITY];
-
- uint8_t *p, *pp;
- size_t psize;
-
- uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
- uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
- uint8_t *used;
-
- abd_t **bufs = NULL;
-
- int code = 0;
-
- /*
- * Matrix reconstruction can't use scatter ABDs yet, so we allocate
- * temporary linear ABDs.
- */
- if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
- bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
-
- bufs[c] = col->rc_abd;
- col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
- abd_copy(col->rc_abd, bufs[c], col->rc_size);
- }
- }
-
- n = rm->rm_cols - rm->rm_firstdatacol;
-
- /*
- * Figure out which data columns are missing.
- */
- nmissing_rows = 0;
- for (t = 0; t < ntgts; t++) {
- if (tgts[t] >= rm->rm_firstdatacol) {
- missing_rows[nmissing_rows++] =
- tgts[t] - rm->rm_firstdatacol;
- }
- }
-
- /*
- * Figure out which parity columns to use to help generate the missing
- * data columns.
- */
- for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
- ASSERT(tt < ntgts);
- ASSERT(c < rm->rm_firstdatacol);
-
- /*
- * Skip any targeted parity columns.
- */
- if (c == tgts[tt]) {
- tt++;
- continue;
- }
-
- code |= 1 << c;
-
- parity_map[i] = c;
- i++;
- }
-
- ASSERT(code != 0);
- ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
-
- psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
- nmissing_rows * n + sizeof (used[0]) * n;
- p = kmem_alloc(psize, KM_SLEEP);
-
- for (pp = p, i = 0; i < nmissing_rows; i++) {
- rows[i] = pp;
- pp += n;
- invrows[i] = pp;
- pp += n;
- }
- used = pp;
-
- for (i = 0; i < nmissing_rows; i++) {
- used[i] = parity_map[i];
- }
-
- for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- if (tt < nmissing_rows &&
- c == missing_rows[tt] + rm->rm_firstdatacol) {
- tt++;
- continue;
- }
-
- ASSERT3S(i, <, n);
- used[i] = c;
- i++;
- }
-
- /*
- * Initialize the interesting rows of the matrix.
- */
- vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
-
- /*
- * Invert the matrix.
- */
- vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
- invrows, used);
-
- /*
- * Reconstruct the missing data using the generated matrix.
- */
- vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
- invrows, used);
-
- kmem_free(p, psize);
-
- /*
- * copy back from temporary linear abds and free them
- */
- if (bufs) {
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
-
- abd_copy(bufs[c], col->rc_abd, col->rc_size);
- abd_free(col->rc_abd);
- col->rc_abd = bufs[c];
- }
- kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
- }
-
- return (code);
-}
-
-static int
-vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
-{
- int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
- int ntgts;
- int i, c;
- int code;
- int nbadparity, nbaddata;
- int parity_valid[VDEV_RAIDZ_MAXPARITY];
-
- /*
- * The tgts list must already be sorted.
- */
- for (i = 1; i < nt; i++) {
- ASSERT(t[i] > t[i - 1]);
- }
-
- nbadparity = rm->rm_firstdatacol;
- nbaddata = rm->rm_cols - nbadparity;
- ntgts = 0;
- for (i = 0, c = 0; c < rm->rm_cols; c++) {
- if (c < rm->rm_firstdatacol)
- parity_valid[c] = B_FALSE;
-
- if (i < nt && c == t[i]) {
- tgts[ntgts++] = c;
- i++;
- } else if (rm->rm_col[c].rc_error != 0) {
- tgts[ntgts++] = c;
- } else if (c >= rm->rm_firstdatacol) {
- nbaddata--;
- } else {
- parity_valid[c] = B_TRUE;
- nbadparity--;
- }
- }
-
- ASSERT(ntgts >= nt);
- ASSERT(nbaddata >= 0);
- ASSERT(nbaddata + nbadparity == ntgts);
-
- dt = &tgts[nbadparity];
-
- /*
- * See if we can use any of our optimized reconstruction routines.
- */
- if (!vdev_raidz_default_to_general) {
- switch (nbaddata) {
- case 1:
- if (parity_valid[VDEV_RAIDZ_P])
- return (vdev_raidz_reconstruct_p(rm, dt, 1));
-
- ASSERT(rm->rm_firstdatacol > 1);
-
- if (parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_q(rm, dt, 1));
-
- ASSERT(rm->rm_firstdatacol > 2);
- break;
-
- case 2:
- ASSERT(rm->rm_firstdatacol > 1);
-
- if (parity_valid[VDEV_RAIDZ_P] &&
- parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_pq(rm, dt, 2));
-
- ASSERT(rm->rm_firstdatacol > 2);
-
- break;
- }
- }
-
- code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
- ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
- ASSERT(code > 0);
- return (code);
-}
-
-static int
-vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *logical_ashift, uint64_t *physical_ashift)
-{
- vdev_t *cvd;
- uint64_t nparity = vd->vdev_nparity;
- int c;
- int lasterror = 0;
- int numerrors = 0;
-
- ASSERT(nparity > 0);
-
- if (nparity > VDEV_RAIDZ_MAXPARITY ||
- vd->vdev_children < nparity + 1) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (SET_ERROR(EINVAL));
- }
-
- vdev_open_children(vd);
-
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
-
- if (cvd->vdev_open_error != 0) {
- lasterror = cvd->vdev_open_error;
- numerrors++;
- continue;
- }
-
- *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
- *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
- *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
- *physical_ashift = MAX(*physical_ashift,
- cvd->vdev_physical_ashift);
- }
-
- *asize *= vd->vdev_children;
- *max_asize *= vd->vdev_children;
-
- if (numerrors > nparity) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- return (0);
-}
-
-static void
-vdev_raidz_close(vdev_t *vd)
-{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-#ifdef illumos
-/*
- * Handle a read or write I/O to a RAID-Z dump device.
- *
- * The dump device is in a unique situation compared to other ZFS datasets:
- * writing to this device should be as simple and fast as possible. In
- * addition, durability matters much less since the dump will be extracted
- * once the machine reboots. For that reason, this function eschews parity for
- * performance and simplicity. The dump device uses the checksum setting
- * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
- * dataset.
- *
- * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
- * 128 KB will not fill an entire block; in addition, they may not be properly
- * aligned. In that case, this function uses the preallocated 128 KB block and
- * omits reading or writing any "empty" portions of that block, as opposed to
- * allocating a fresh appropriately-sized block.
- *
- * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
- *
- * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
- *
- * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
- * allocated which spans all five child vdevs. 8 KB of data would be written to
- * each of four vdevs, with the fifth containing the parity bits.
- *
- * parity data data data data
- * | PP | XX | XX | XX | XX |
- * ^ ^ ^ ^ ^
- * | | | | |
- * 8 KB parity ------8 KB data blocks------
- *
- * However, when writing to the dump device, the behavior is different:
- *
- * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
- *
- * Unlike the normal RAID-Z case in which the block is allocated based on the
- * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
- * I/O size is less than 128 KB, only the actual portions of data are written.
- * In this example the data is written to the third data vdev since that vdev
- * contains the offset [64 KB, 96 KB).
- *
- * parity data data data data
- * | | | | XX | |
- * ^
- * |
- * 32 KB data block
- *
- * As a result, an individual I/O may not span all child vdevs; moreover, a
- * small I/O may only operate on a single child vdev.
- *
- * Note that since there are no parity bits calculated or written, this format
- * remains the same no matter how many parity bits are used in a normal RAID-Z
- * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
- * would look like:
- *
- * parity parity parity data data data data
- * | | | | | | XX | |
- * ^
- * |
- * 32 KB data block
- */
-int
-vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
- uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
-{
- vdev_t *tvd = vd->vdev_top;
- vdev_t *cvd;
- raidz_map_t *rm;
- raidz_col_t *rc;
- int c, err = 0;
-
- uint64_t start, end, colstart, colend;
- uint64_t coloffset, colsize, colskip;
-
- int flags = doread ? BIO_READ : BIO_WRITE;
-
-#ifdef _KERNEL
-
- /*
- * Don't write past the end of the block
- */
- VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
-
- start = offset;
- end = start + size;
-
- /*
- * Allocate a RAID-Z map for this block. Note that this block starts
- * from the "original" offset, this is, the offset of the extent which
- * contains the requisite offset of the data being read or written.
- *
- * Even if this I/O operation doesn't span the full block size, let's
- * treat the on-disk format as if the only blocks are the complete 128
- * KB size.
- */
- abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
- SPA_OLD_MAXBLOCKSIZE);
- rm = vdev_raidz_map_alloc(abd,
- SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
- vd->vdev_children, vd->vdev_nparity);
-
- coloffset = origoffset;
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols;
- c++, coloffset += rc->rc_size) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- /*
- * Find the start and end of this column in the RAID-Z map,
- * keeping in mind that the stated size and offset of the
- * operation may not fill the entire column for this vdev.
- *
- * If any portion of the data spans this column, issue the
- * appropriate operation to the vdev.
- */
- if (coloffset + rc->rc_size <= start)
- continue;
- if (coloffset >= end)
- continue;
-
- colstart = MAX(coloffset, start);
- colend = MIN(end, coloffset + rc->rc_size);
- colsize = colend - colstart;
- colskip = colstart - coloffset;
-
- VERIFY3U(colsize, <=, rc->rc_size);
- VERIFY3U(colskip, <=, rc->rc_size);
-
- /*
- * Note that the child vdev will have a vdev label at the start
- * of its range of offsets, hence the need for
- * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
- * example of why this calculation is needed.
- */
- if ((err = vdev_disk_physio(cvd,
- ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
- VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
- flags, isdump)) != 0)
- break;
- }
-
- vdev_raidz_map_free(rm);
- abd_put(abd);
-#endif /* KERNEL */
-
- return (err);
-}
-#endif
-
-static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize)
-{
- uint64_t asize;
- uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
-
- asize = ((psize - 1) >> ashift) + 1;
- asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
- asize = roundup(asize, nparity + 1) << ashift;
-
- return (asize);
-}
-
-static void
-vdev_raidz_child_done(zio_t *zio)
-{
- raidz_col_t *rc = zio->io_private;
-
- rc->rc_error = zio->io_error;
- rc->rc_tried = 1;
- rc->rc_skipped = 0;
-}
-
-static void
-vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
-{
-#ifdef ZFS_DEBUG
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd->vdev_top;
-
- range_seg_t logical_rs, physical_rs;
- logical_rs.rs_start = zio->io_offset;
- logical_rs.rs_end = logical_rs.rs_start +
- vdev_raidz_asize(zio->io_vd, zio->io_size);
-
- raidz_col_t *rc = &rm->rm_col[col];
- vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
-
- vdev_xlate(cvd, &logical_rs, &physical_rs);
- ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
- ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
- /*
- * It would be nice to assert that rs_end is equal
- * to rc_offset + rc_size but there might be an
- * optional I/O at the end that is not accounted in
- * rc_size.
- */
- if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
- ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
- rc->rc_size + (1 << tvd->vdev_ashift));
- } else {
- ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
- }
-#endif
-}
-
-/*
- * Start an IO operation on a RAIDZ VDev
- *
- * Outline:
- * - For write operations:
- * 1. Generate the parity data
- * 2. Create child zio write operations to each column's vdev, for both
- * data and parity.
- * 3. If the column skips any sectors for padding, create optional dummy
- * write zio children for those areas to improve aggregation continuity.
- * - For read operations:
- * 1. Create child zio read operations to each data column's vdev to read
- * the range of data required for zio.
- * 2. If this is a scrub or resilver operation, or if any of the data
- * vdevs have had errors, then create zio read operations to the parity
- * columns' VDevs as well.
- */
-static void
-vdev_raidz_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd->vdev_top;
- vdev_t *cvd;
- raidz_map_t *rm;
- raidz_col_t *rc;
- int c, i;
-
- rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
- zio->io_type == ZIO_TYPE_FREE,
- tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
-
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
-
- ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
-
- if (zio->io_type == ZIO_TYPE_FREE) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
-
- zio_execute(zio);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- vdev_raidz_generate_parity(rm);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- /*
- * Verify physical to logical translation.
- */
- vdev_raidz_io_verify(zio, rm, c);
-
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
-
- /*
- * Generate optional I/Os for any skipped sectors to improve
- * aggregation contiguity.
- */
- for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
- ASSERT(c <= rm->rm_scols);
- if (c == rm->rm_scols)
- c = 0;
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset + rc->rc_size, NULL,
- 1 << tvd->vdev_ashift,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
- }
-
- zio_execute(zio);
- return;
- }
-
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
-
- /*
- * Iterate over the columns in reverse order so that we hit the parity
- * last -- any errors along the way will force us to read the parity.
- */
- for (c = rm->rm_cols - 1; c >= 0; c--) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- if (!vdev_readable(cvd)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
- else
- rm->rm_missingparity++;
- rc->rc_error = SET_ERROR(ENXIO);
- rc->rc_tried = 1; /* don't even try */
- rc->rc_skipped = 1;
- continue;
- }
- if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
- else
- rm->rm_missingparity++;
- rc->rc_error = SET_ERROR(ESTALE);
- rc->rc_skipped = 1;
- continue;
- }
- if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
- (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
- }
-
- zio_execute(zio);
-}
-
-
-/*
- * Report a checksum error for a child of a RAID-Z device.
- */
-static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
-{
- void *buf;
- vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- zio_bad_cksum_t zbc;
- raidz_map_t *rm = zio->io_vsd;
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected = rm->rm_ecksuminjected;
-
- buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
- zfs_ereport_post_checksum(zio->io_spa, vd, zio,
- rc->rc_offset, rc->rc_size, buf, bad_data,
- &zbc);
- abd_return_buf(rc->rc_abd, buf, rc->rc_size);
- }
-}
-
-/*
- * We keep track of whether or not there were any injected errors, so that
- * any ereports we generate can note it.
- */
-static int
-raidz_checksum_verify(zio_t *zio)
-{
- zio_bad_cksum_t zbc;
- raidz_map_t *rm = zio->io_vsd;
-
- int ret = zio_checksum_error(zio, &zbc);
- if (ret != 0 && zbc.zbc_injected != 0)
- rm->rm_ecksuminjected = 1;
-
- return (ret);
-}
-
-/*
- * Generate the parity from the data columns. If we tried and were able to
- * read the parity without error, verify that the generated parity matches the
- * data we read. If it doesn't, we fire off a checksum error. Return the
- * number such failures.
- */
-static int
-raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
-{
- void *orig[VDEV_RAIDZ_MAXPARITY];
- int c, ret = 0;
- raidz_col_t *rc;
-
- blkptr_t *bp = zio->io_bp;
- enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
- (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
-
- if (checksum == ZIO_CHECKSUM_NOPARITY)
- return (ret);
-
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
- if (!rc->rc_tried || rc->rc_error != 0)
- continue;
- orig[c] = zio_buf_alloc(rc->rc_size);
- abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
- }
-
- vdev_raidz_generate_parity(rm);
-
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
- if (!rc->rc_tried || rc->rc_error != 0)
- continue;
- if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
- raidz_checksum_error(zio, rc, orig[c]);
- rc->rc_error = SET_ERROR(ECKSUM);
- ret++;
- }
- zio_buf_free(orig[c], rc->rc_size);
- }
-
- return (ret);
-}
-
-/*
- * Keep statistics on all the ways that we used parity to correct data.
- */
-static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
-
-static int
-vdev_raidz_worst_error(raidz_map_t *rm)
-{
- int error = 0;
-
- for (int c = 0; c < rm->rm_cols; c++)
- error = zio_worst_error(error, rm->rm_col[c].rc_error);
-
- return (error);
-}
-
-/*
- * Iterate over all combinations of bad data and attempt a reconstruction.
- * Note that the algorithm below is non-optimal because it doesn't take into
- * account how reconstruction is actually performed. For example, with
- * triple-parity RAID-Z the reconstruction procedure is the same if column 4
- * is targeted as invalid as if columns 1 and 4 are targeted since in both
- * cases we'd only use parity information in column 0.
- */
-static int
-vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
-{
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc;
- void *orig[VDEV_RAIDZ_MAXPARITY];
- int tstore[VDEV_RAIDZ_MAXPARITY + 2];
- int *tgts = &tstore[1];
- int current, next, i, c, n;
- int code, ret = 0;
-
- ASSERT(total_errors < rm->rm_firstdatacol);
-
- /*
- * This simplifies one edge condition.
- */
- tgts[-1] = -1;
-
- for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
- /*
- * Initialize the targets array by finding the first n columns
- * that contain no error.
- *
- * If there were no data errors, we need to ensure that we're
- * always explicitly attempting to reconstruct at least one
- * data column. To do this, we simply push the highest target
- * up into the data columns.
- */
- for (c = 0, i = 0; i < n; i++) {
- if (i == n - 1 && data_errors == 0 &&
- c < rm->rm_firstdatacol) {
- c = rm->rm_firstdatacol;
- }
-
- while (rm->rm_col[c].rc_error != 0) {
- c++;
- ASSERT3S(c, <, rm->rm_cols);
- }
-
- tgts[i] = c++;
- }
-
- /*
- * Setting tgts[n] simplifies the other edge condition.
- */
- tgts[n] = rm->rm_cols;
-
- /*
- * These buffers were allocated in previous iterations.
- */
- for (i = 0; i < n - 1; i++) {
- ASSERT(orig[i] != NULL);
- }
-
- orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
-
- current = 0;
- next = tgts[current];
-
- while (current != n) {
- tgts[current] = next;
- current = 0;
-
- /*
- * Save off the original data that we're going to
- * attempt to reconstruct.
- */
- for (i = 0; i < n; i++) {
- ASSERT(orig[i] != NULL);
- c = tgts[i];
- ASSERT3S(c, >=, 0);
- ASSERT3S(c, <, rm->rm_cols);
- rc = &rm->rm_col[c];
- abd_copy_to_buf(orig[i], rc->rc_abd,
- rc->rc_size);
- }
-
- /*
- * Attempt a reconstruction and exit the outer loop on
- * success.
- */
- code = vdev_raidz_reconstruct(rm, tgts, n);
- if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
-
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- ASSERT(rc->rc_error == 0);
- if (rc->rc_tried)
- raidz_checksum_error(zio, rc,
- orig[i]);
- rc->rc_error = SET_ERROR(ECKSUM);
- }
-
- ret = code;
- goto done;
- }
-
- /*
- * Restore the original data.
- */
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- abd_copy_from_buf(rc->rc_abd, orig[i],
- rc->rc_size);
- }
-
- do {
- /*
- * Find the next valid column after the current
- * position..
- */
- for (next = tgts[current] + 1;
- next < rm->rm_cols &&
- rm->rm_col[next].rc_error != 0; next++)
- continue;
-
- ASSERT(next <= tgts[current + 1]);
-
- /*
- * If that spot is available, we're done here.
- */
- if (next != tgts[current + 1])
- break;
-
- /*
- * Otherwise, find the next valid column after
- * the previous position.
- */
- for (c = tgts[current - 1] + 1;
- rm->rm_col[c].rc_error != 0; c++)
- continue;
-
- tgts[current] = c;
- current++;
-
- } while (current != n);
- }
- }
- n--;
-done:
- for (i = 0; i < n; i++) {
- zio_buf_free(orig[i], rm->rm_col[0].rc_size);
- }
-
- return (ret);
-}
-
-/*
- * Complete an IO operation on a RAIDZ VDev
- *
- * Outline:
- * - For write operations:
- * 1. Check for errors on the child IOs.
- * 2. Return, setting an error code if too few child VDevs were written
- * to reconstruct the data later. Note that partial writes are
- * considered successful if they can be reconstructed at all.
- * - For read operations:
- * 1. Check for errors on the child IOs.
- * 2. If data errors occurred:
- * a. Try to reassemble the data from the parity available.
- * b. If we haven't yet read the parity drives, read them now.
- * c. If all parity drives have been read but the data still doesn't
- * reassemble with a correct checksum, then try combinatorial
- * reconstruction.
- * d. If that doesn't work, return an error.
- * 3. If there were unexpected errors or this is a resilver operation,
- * rewrite the vdevs that had errors.
- */
-static void
-vdev_raidz_io_done(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc;
- int unexpected_errors = 0;
- int parity_errors = 0;
- int parity_untried = 0;
- int data_errors = 0;
- int total_errors = 0;
- int n, c;
- int tgts[VDEV_RAIDZ_MAXPARITY];
- int code;
-
- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
-
- ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
- ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
-
- if (rc->rc_error) {
- ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
-
- if (c < rm->rm_firstdatacol)
- parity_errors++;
- else
- data_errors++;
-
- if (!rc->rc_skipped)
- unexpected_errors++;
-
- total_errors++;
- } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
- parity_untried++;
- }
- }
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * XXX -- for now, treat partial writes as a success.
- * (If we couldn't write enough columns to reconstruct
- * the data, the I/O failed. Otherwise, good enough.)
- *
- * Now that we support write reallocation, it would be better
- * to treat partial failure as real failure unless there are
- * no non-degraded top-level vdevs left, and not update DTLs
- * if we intend to reallocate.
- */
- /* XXPOLICY */
- if (total_errors > rm->rm_firstdatacol)
- zio->io_error = vdev_raidz_worst_error(rm);
-
- return;
- } else if (zio->io_type == ZIO_TYPE_FREE) {
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
- /*
- * There are three potential phases for a read:
- * 1. produce valid data from the columns read
- * 2. read all disks and try again
- * 3. perform combinatorial reconstruction
- *
- * Each phase is progressively both more expensive and less likely to
- * occur. If we encounter more errors than we can repair or all phases
- * fail, we have no choice but to return an error.
- */
-
- /*
- * If the number of errors we saw was correctable -- less than or equal
- * to the number of parity disks read -- attempt to produce data that
- * has a valid checksum. Naturally, this case applies in the absence of
- * any errors.
- */
- if (total_errors <= rm->rm_firstdatacol - parity_untried) {
- if (data_errors == 0) {
- if (raidz_checksum_verify(zio) == 0) {
- /*
- * If we read parity information (unnecessarily
- * as it happens since no reconstruction was
- * needed) regenerate and verify the parity.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors + parity_untried <
- rm->rm_firstdatacol ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
- goto done;
- }
- } else {
- /*
- * We either attempt to read all the parity columns or
- * none of them. If we didn't try to read parity, we
- * wouldn't be here in the correctable case. There must
- * also have been fewer parity errors than parity
- * columns or, again, we wouldn't be in this code path.
- */
- ASSERT(parity_untried == 0);
- ASSERT(parity_errors < rm->rm_firstdatacol);
-
- /*
- * Identify the data columns that reported an error.
- */
- n = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0) {
- ASSERT(n < VDEV_RAIDZ_MAXPARITY);
- tgts[n++] = c;
- }
- }
-
- ASSERT(rm->rm_firstdatacol >= n);
-
- code = vdev_raidz_reconstruct(rm, tgts, n);
-
- if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
-
- /*
- * If we read more parity disks than were used
- * for reconstruction, confirm that the other
- * parity disks produced correct data. This
- * routine is suboptimal in that it regenerates
- * the parity that we already used in addition
- * to the parity that we're attempting to
- * verify, but this should be a relatively
- * uncommon case, and can be optimized if it
- * becomes a problem. Note that we regenerate
- * parity when resilvering so we can write it
- * out to failed devices later.
- */
- if (parity_errors < rm->rm_firstdatacol - n ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
-
- goto done;
- }
- }
- }
-
- /*
- * This isn't a typical situation -- either we got a read error or
- * a child silently returned bad data. Read every block so we can
- * try again with as much data and parity as we can track down. If
- * we've already been through once before, all children will be marked
- * as tried so we'll proceed to combinatorial reconstruction.
- */
- unexpected_errors = 1;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
-
- for (c = 0; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_tried)
- continue;
-
- zio_vdev_io_redone(zio);
- do {
- rc = &rm->rm_col[c];
- if (rc->rc_tried)
- continue;
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- } while (++c < rm->rm_cols);
-
- return;
- }
-
- /*
- * At this point we've attempted to reconstruct the data given the
- * errors we detected, and we've attempted to read all columns. There
- * must, therefore, be one or more additional problems -- silent errors
- * resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. We check if there is enough additional data to
- * possibly reconstruct the data and then perform combinatorial
- * reconstruction over all possible combinations. If that fails,
- * we're cooked.
- */
- if (total_errors > rm->rm_firstdatacol) {
- zio->io_error = vdev_raidz_worst_error(rm);
-
- } else if (total_errors < rm->rm_firstdatacol &&
- (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
- /*
- * If we didn't use all the available parity for the
- * combinatorial reconstruction, verify that the remaining
- * parity is correct.
- */
- if (code != (1 << rm->rm_firstdatacol) - 1)
- (void) raidz_parity_verify(zio, rm);
- } else {
- /*
- * We're here because either:
- *
- * total_errors == rm_firstdatacol, or
- * vdev_raidz_combrec() failed
- *
- * In either case, there is enough bad data to prevent
- * reconstruction.
- *
- * Start checksum ereports for all children which haven't
- * failed, and the IO wasn't speculative.
- */
- zio->io_error = SET_ERROR(ECKSUM);
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error == 0) {
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected =
- rm->rm_ecksuminjected;
-
- zfs_ereport_start_checksum(
- zio->io_spa,
- vd->vdev_child[rc->rc_devidx],
- zio, rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
- }
- }
- }
- }
-
-done:
- zio_checksum_verified(zio);
-
- if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
- (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
- /*
- * Use the good data we have in hand to repair damaged children.
- */
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- if (rc->rc_error == 0)
- continue;
-
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
- }
- }
-}
-
-static void
-vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (faulted > vd->vdev_nparity)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded + faulted != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
-}
-
-/*
- * Determine if any portion of the provided block resides on a child vdev
- * with a dirty DTL and therefore needs to be resilvered. The function
- * assumes that at least one DTL is dirty which imples that full stripe
- * width blocks must be resilvered.
- */
-static boolean_t
-vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
-{
- uint64_t dcols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
- uint64_t ashift = vd->vdev_top->vdev_ashift;
- /* The starting RAIDZ (parent) vdev sector of the block. */
- uint64_t b = offset >> ashift;
- /* The zio's size in units of the vdev's minimum sector size. */
- uint64_t s = ((psize - 1) >> ashift) + 1;
- /* The first column for this stripe. */
- uint64_t f = b % dcols;
-
- if (s + nparity >= dcols)
- return (B_TRUE);
-
- for (uint64_t c = 0; c < s + nparity; c++) {
- uint64_t devidx = (f + c) % dcols;
- vdev_t *cvd = vd->vdev_child[devidx];
-
- /*
- * dsl_scan_need_resilver() already checked vd with
- * vdev_dtl_contains(). So here just check cvd with
- * vdev_dtl_empty(), cheaper and a good approximation.
- */
- if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-static void
-vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
-{
- vdev_t *raidvd = cvd->vdev_parent;
- ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
-
- uint64_t width = raidvd->vdev_children;
- uint64_t tgt_col = cvd->vdev_id;
- uint64_t ashift = raidvd->vdev_top->vdev_ashift;
-
- /* make sure the offsets are block-aligned */
- ASSERT0(in->rs_start % (1 << ashift));
- ASSERT0(in->rs_end % (1 << ashift));
- uint64_t b_start = in->rs_start >> ashift;
- uint64_t b_end = in->rs_end >> ashift;
-
- uint64_t start_row = 0;
- if (b_start > tgt_col) /* avoid underflow */
- start_row = ((b_start - tgt_col - 1) / width) + 1;
-
- uint64_t end_row = 0;
- if (b_end > tgt_col)
- end_row = ((b_end - tgt_col - 1) / width) + 1;
-
- res->rs_start = start_row << ashift;
- res->rs_end = end_row << ashift;
-
- ASSERT3U(res->rs_start, <=, in->rs_start);
- ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
-}
-
-vdev_ops_t vdev_raidz_ops = {
- vdev_raidz_open,
- vdev_raidz_close,
- vdev_raidz_asize,
- vdev_raidz_io_start,
- vdev_raidz_io_done,
- vdev_raidz_state_change,
- vdev_raidz_need_resilver,
- NULL,
- NULL,
- NULL,
- vdev_raidz_xlate,
- VDEV_TYPE_RAIDZ, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
@@ -1,2156 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/vdev_impl.h>
-#include <sys/metaslab.h>
-#include <sys/metaslab_impl.h>
-#include <sys/uberblock_impl.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/bpobj.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_dir.h>
-#include <sys/arc.h>
-#include <sys/zfeature.h>
-#include <sys/vdev_indirect_births.h>
-#include <sys/vdev_indirect_mapping.h>
-#include <sys/abd.h>
-#include <sys/vdev_initialize.h>
-
-/*
- * This file contains the necessary logic to remove vdevs from a
- * storage pool. Currently, the only devices that can be removed
- * are log, cache, and spare devices; and top level vdevs from a pool
- * w/o raidz. (Note that members of a mirror can also be removed
- * by the detach operation.)
- *
- * Log vdevs are removed by evacuating them and then turning the vdev
- * into a hole vdev while holding spa config locks.
- *
- * Top level vdevs are removed and converted into an indirect vdev via
- * a multi-step process:
- *
- * - Disable allocations from this device (spa_vdev_remove_top).
- *
- * - From a new thread (spa_vdev_remove_thread), copy data from
- * the removing vdev to a different vdev. The copy happens in open
- * context (spa_vdev_copy_impl) and issues a sync task
- * (vdev_mapping_sync) so the sync thread can update the partial
- * indirect mappings in core and on disk.
- *
- * - If a free happens during a removal, it is freed from the
- * removing vdev, and if it has already been copied, from the new
- * location as well (free_from_removing_vdev).
- *
- * - After the removal is completed, the copy thread converts the vdev
- * into an indirect vdev (vdev_remove_complete) before instructing
- * the sync thread to destroy the space maps and finish the removal
- * (spa_finish_removal).
- */
-
-typedef struct vdev_copy_arg {
- metaslab_t *vca_msp;
- uint64_t vca_outstanding_bytes;
- kcondvar_t vca_cv;
- kmutex_t vca_lock;
-} vdev_copy_arg_t;
-
-/*
- * The maximum amount of memory we can use for outstanding i/o while
- * doing a device removal. This determines how much i/o we can have
- * in flight concurrently.
- */
-int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
-
-/*
- * The largest contiguous segment that we will attempt to allocate when
- * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
- * there is a performance problem with attempting to allocate large blocks,
- * consider decreasing this.
- *
- * Note: we will issue I/Os of up to this size. The mpt driver does not
- * respond well to I/Os larger than 1MB, so we set this to 1MB. (When
- * mpt processes an I/O larger than 1MB, it needs to do an allocation of
- * 2 physically contiguous pages; if this allocation fails, mpt will drop
- * the I/O and hang the device.)
- */
-int zfs_remove_max_segment = 1024 * 1024;
-
-/*
- * Allow a remap segment to span free chunks of at most this size. The main
- * impact of a larger span is that we will read and write larger, more
- * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
- * for iops. The value here was chosen to align with
- * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
- * reads (but there's no reason it has to be the same).
- *
- * Additionally, a higher span will have the following relatively minor
- * effects:
- * - the mapping will be smaller, since one entry can cover more allocated
- * segments
- * - more of the fragmentation in the removing device will be preserved
- * - we'll do larger allocations, which may fail and fall back on smaller
- * allocations
- */
-int vdev_removal_max_span = 32 * 1024;
-
-/*
- * This is used by the test suite so that it can ensure that certain
- * actions happen while in the middle of a removal.
- */
-uint64_t zfs_remove_max_bytes_pause = UINT64_MAX;
-
-#define VDEV_REMOVAL_ZAP_OBJS "lzap"
-
-static void spa_vdev_remove_thread(void *arg);
-
-static void
-spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
-{
- VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_REMOVING, sizeof (uint64_t),
- sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
- &spa->spa_removing_phys, tx));
-}
-
-static nvlist_t *
-spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
-{
- for (int i = 0; i < count; i++) {
- uint64_t guid =
- fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);
-
- if (guid == target_guid)
- return (nvpp[i]);
- }
-
- return (NULL);
-}
-
-static void
-spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
- nvlist_t *dev_to_remove)
-{
- nvlist_t **newdev = NULL;
-
- if (count > 1)
- newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
-
- for (int i = 0, j = 0; i < count; i++) {
- if (dev[i] == dev_to_remove)
- continue;
- VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
- }
-
- VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
- VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
-
- for (int i = 0; i < count - 1; i++)
- nvlist_free(newdev[i]);
-
- if (count > 1)
- kmem_free(newdev, (count - 1) * sizeof (void *));
-}
-
-static spa_vdev_removal_t *
-spa_vdev_removal_create(vdev_t *vd)
-{
- spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
- mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
- svr->svr_allocd_segs = range_tree_create(NULL, NULL);
- svr->svr_vdev_id = vd->vdev_id;
-
- for (int i = 0; i < TXG_SIZE; i++) {
- svr->svr_frees[i] = range_tree_create(NULL, NULL);
- list_create(&svr->svr_new_segments[i],
- sizeof (vdev_indirect_mapping_entry_t),
- offsetof(vdev_indirect_mapping_entry_t, vime_node));
- }
-
- return (svr);
-}
-
-void
-spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
-{
- for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT0(svr->svr_bytes_done[i]);
- ASSERT0(svr->svr_max_offset_to_sync[i]);
- range_tree_destroy(svr->svr_frees[i]);
- list_destroy(&svr->svr_new_segments[i]);
- }
-
- range_tree_destroy(svr->svr_allocd_segs);
- mutex_destroy(&svr->svr_lock);
- cv_destroy(&svr->svr_cv);
- kmem_free(svr, sizeof (*svr));
-}
-
-/*
- * This is called as a synctask in the txg in which we will mark this vdev
- * as removing (in the config stored in the MOS).
- *
- * It begins the evacuation of a toplevel vdev by:
- * - initializing the spa_removing_phys which tracks this removal
- * - computing the amount of space to remove for accounting purposes
- * - dirtying all dbufs in the spa_config_object
- * - creating the spa_vdev_removal
- * - starting the spa_vdev_remove_thread
- */
-static void
-vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
-{
- int vdev_id = (uintptr_t)arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = vdev_lookup_top(spa, vdev_id);
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
- spa_vdev_removal_t *svr = NULL;
- uint64_t txg = dmu_tx_get_txg(tx);
-
- ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
- svr = spa_vdev_removal_create(vd);
-
- ASSERT(vd->vdev_removing);
- ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
-
- spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
- if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
- /*
- * By activating the OBSOLETE_COUNTS feature, we prevent
- * the pool from being downgraded and ensure that the
- * refcounts are precise.
- */
- spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- uint64_t one = 1;
- VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
- &one, tx));
- ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
- }
-
- vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
- vd->vdev_indirect_mapping =
- vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
- vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
- vd->vdev_indirect_births =
- vdev_indirect_births_open(mos, vic->vic_births_object);
- spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
- spa->spa_removing_phys.sr_start_time = gethrestime_sec();
- spa->spa_removing_phys.sr_end_time = 0;
- spa->spa_removing_phys.sr_state = DSS_SCANNING;
- spa->spa_removing_phys.sr_to_copy = 0;
- spa->spa_removing_phys.sr_copied = 0;
-
- /*
- * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
- * there may be space in the defer tree, which is free, but still
- * counted in vs_alloc.
- */
- for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
- metaslab_t *ms = vd->vdev_ms[i];
- if (ms->ms_sm == NULL)
- continue;
-
- spa->spa_removing_phys.sr_to_copy +=
- metaslab_allocated_space(ms);
-
- /*
- * Space which we are freeing this txg does not need to
- * be copied.
- */
- spa->spa_removing_phys.sr_to_copy -=
- range_tree_space(ms->ms_freeing);
-
- ASSERT0(range_tree_space(ms->ms_freed));
- for (int t = 0; t < TXG_SIZE; t++)
- ASSERT0(range_tree_space(ms->ms_allocating[t]));
- }
-
- /*
- * Sync tasks are called before metaslab_sync(), so there should
- * be no already-synced metaslabs in the TXG_CLEAN list.
- */
- ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
-
- spa_sync_removing_state(spa, tx);
-
- /*
- * All blocks that we need to read the most recent mapping must be
- * stored on concrete vdevs. Therefore, we must dirty anything that
- * is read before spa_remove_init(). Specifically, the
- * spa_config_object. (Note that although we already modified the
- * spa_config_object in spa_sync_removing_state, that may not have
- * modified all blocks of the object.)
- */
- dmu_object_info_t doi;
- VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
- for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
- dmu_buf_t *dbuf;
- VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
- offset, FTAG, &dbuf, 0));
- dmu_buf_will_dirty(dbuf, tx);
- offset += dbuf->db_size;
- dmu_buf_rele(dbuf, FTAG);
- }
-
- /*
- * Now that we've allocated the im_object, dirty the vdev to ensure
- * that the object gets written to the config on disk.
- */
- vdev_config_dirty(vd);
-
- zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu "
- "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
- vic->vic_mapping_object);
-
- spa_history_log_internal(spa, "vdev remove started", tx,
- "%s vdev %llu %s", spa_name(spa), vd->vdev_id,
- (vd->vdev_path != NULL) ? vd->vdev_path : "-");
- /*
- * Setting spa_vdev_removal causes subsequent frees to call
- * free_from_removing_vdev(). Note that we don't need any locking
- * because we are the sync thread, and metaslab_free_impl() is only
- * called from syncing context (potentially from a zio taskq thread,
- * but in any case only when there are outstanding free i/os, which
- * there are not).
- */
- ASSERT3P(spa->spa_vdev_removal, ==, NULL);
- spa->spa_vdev_removal = svr;
- svr->svr_thread = thread_create(NULL, 0,
- spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
-}
-
-/*
- * When we are opening a pool, we must read the mapping for each
- * indirect vdev in order from most recently removed to least
- * recently removed. We do this because the blocks for the mapping
- * of older indirect vdevs may be stored on more recently removed vdevs.
- * In order to read each indirect mapping object, we must have
- * initialized all more recently removed vdevs.
- */
-int
-spa_remove_init(spa_t *spa)
-{
- int error;
-
- error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_REMOVING, sizeof (uint64_t),
- sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
- &spa->spa_removing_phys);
-
- if (error == ENOENT) {
- spa->spa_removing_phys.sr_state = DSS_NONE;
- spa->spa_removing_phys.sr_removing_vdev = -1;
- spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
- spa->spa_indirect_vdevs_loaded = B_TRUE;
- return (0);
- } else if (error != 0) {
- return (error);
- }
-
- if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
- /*
- * We are currently removing a vdev. Create and
- * initialize a spa_vdev_removal_t from the bonus
- * buffer of the removing vdevs vdev_im_object, and
- * initialize its partial mapping.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- vdev_t *vd = vdev_lookup_top(spa,
- spa->spa_removing_phys.sr_removing_vdev);
-
- if (vd == NULL) {
- spa_config_exit(spa, SCL_STATE, FTAG);
- return (EINVAL);
- }
-
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-
- ASSERT(vdev_is_concrete(vd));
- spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
- ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
- ASSERT(vd->vdev_removing);
-
- vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
- spa->spa_meta_objset, vic->vic_mapping_object);
- vd->vdev_indirect_births = vdev_indirect_births_open(
- spa->spa_meta_objset, vic->vic_births_object);
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- spa->spa_vdev_removal = svr;
- }
-
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- uint64_t indirect_vdev_id =
- spa->spa_removing_phys.sr_prev_indirect_vdev;
- while (indirect_vdev_id != UINT64_MAX) {
- vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
- vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
- spa->spa_meta_objset, vic->vic_mapping_object);
- vd->vdev_indirect_births = vdev_indirect_births_open(
- spa->spa_meta_objset, vic->vic_births_object);
-
- indirect_vdev_id = vic->vic_prev_indirect_vdev;
- }
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- /*
- * Now that we've loaded all the indirect mappings, we can allow
- * reads from other blocks (e.g. via predictive prefetch).
- */
- spa->spa_indirect_vdevs_loaded = B_TRUE;
- return (0);
-}
-
-void
-spa_restart_removal(spa_t *spa)
-{
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
-
- if (svr == NULL)
- return;
-
- /*
- * In general when this function is called there is no
- * removal thread running. The only scenario where this
- * is not true is during spa_import() where this function
- * is called twice [once from spa_import_impl() and
- * spa_async_resume()]. Thus, in the scenario where we
- * import a pool that has an ongoing removal we don't
- * want to spawn a second thread.
- */
- if (svr->svr_thread != NULL)
- return;
-
- if (!spa_writeable(spa))
- return;
-
- zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
- svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
- 0, &p0, TS_RUN, minclsyspri);
-}
-
-/*
- * Process freeing from a device which is in the middle of being removed.
- * We must handle this carefully so that we attempt to copy freed data,
- * and we correctly free already-copied data.
- */
-void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
-{
- spa_t *spa = vd->vdev_spa;
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- uint64_t txg = spa_syncing_txg(spa);
- uint64_t max_offset_yet = 0;
-
- ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
- vdev_indirect_mapping_object(vim));
- ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
-
- mutex_enter(&svr->svr_lock);
-
- /*
- * Remove the segment from the removing vdev's spacemap. This
- * ensures that we will not attempt to copy this space (if the
- * removal thread has not yet visited it), and also ensures
- * that we know what is actually allocated on the new vdevs
- * (needed if we cancel the removal).
- *
- * Note: we must do the metaslab_free_concrete() with the svr_lock
- * held, so that the remove_thread can not load this metaslab and then
- * visit this offset between the time that we metaslab_free_concrete()
- * and when we check to see if it has been visited.
- *
- * Note: The checkpoint flag is set to false as having/taking
- * a checkpoint and removing a device can't happen at the same
- * time.
- */
- ASSERT(!spa_has_checkpoint(spa));
- metaslab_free_concrete(vd, offset, size, B_FALSE);
-
- uint64_t synced_size = 0;
- uint64_t synced_offset = 0;
- uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
- if (offset < max_offset_synced) {
- /*
- * The mapping for this offset is already on disk.
- * Free from the new location.
- *
- * Note that we use svr_max_synced_offset because it is
- * updated atomically with respect to the in-core mapping.
- * By contrast, vim_max_offset is not.
- *
- * This block may be split between a synced entry and an
- * in-flight or unvisited entry. Only process the synced
- * portion of it here.
- */
- synced_size = MIN(size, max_offset_synced - offset);
- synced_offset = offset;
-
- ASSERT3U(max_offset_yet, <=, max_offset_synced);
- max_offset_yet = max_offset_synced;
-
- DTRACE_PROBE3(remove__free__synced,
- spa_t *, spa,
- uint64_t, offset,
- uint64_t, synced_size);
-
- size -= synced_size;
- offset += synced_size;
- }
-
- /*
- * Look at all in-flight txgs starting from the currently syncing one
- * and see if a section of this free is being copied. By starting from
- * this txg and iterating forward, we might find that this region
- * was copied in two different txgs and handle it appropriately.
- */
- for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
- int txgoff = (txg + i) & TXG_MASK;
- if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
- /*
- * The mapping for this offset is in flight, and
- * will be synced in txg+i.
- */
- uint64_t inflight_size = MIN(size,
- svr->svr_max_offset_to_sync[txgoff] - offset);
-
- DTRACE_PROBE4(remove__free__inflight,
- spa_t *, spa,
- uint64_t, offset,
- uint64_t, inflight_size,
- uint64_t, txg + i);
-
- /*
- * We copy data in order of increasing offset.
- * Therefore the max_offset_to_sync[] must increase
- * (or be zero, indicating that nothing is being
- * copied in that txg).
- */
- if (svr->svr_max_offset_to_sync[txgoff] != 0) {
- ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
- >=, max_offset_yet);
- max_offset_yet =
- svr->svr_max_offset_to_sync[txgoff];
- }
-
- /*
- * We've already committed to copying this segment:
- * we have allocated space elsewhere in the pool for
- * it and have an IO outstanding to copy the data. We
- * cannot free the space before the copy has
- * completed, or else the copy IO might overwrite any
- * new data. To free that space, we record the
- * segment in the appropriate svr_frees tree and free
- * the mapped space later, in the txg where we have
- * completed the copy and synced the mapping (see
- * vdev_mapping_sync).
- */
- range_tree_add(svr->svr_frees[txgoff],
- offset, inflight_size);
- size -= inflight_size;
- offset += inflight_size;
-
- /*
- * This space is already accounted for as being
- * done, because it is being copied in txg+i.
- * However, if i!=0, then it is being copied in
- * a future txg. If we crash after this txg
- * syncs but before txg+i syncs, then the space
- * will be free. Therefore we must account
- * for the space being done in *this* txg
- * (when it is freed) rather than the future txg
- * (when it will be copied).
- */
- ASSERT3U(svr->svr_bytes_done[txgoff], >=,
- inflight_size);
- svr->svr_bytes_done[txgoff] -= inflight_size;
- svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
- }
- }
- ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
-
- if (size > 0) {
- /*
- * The copy thread has not yet visited this offset. Ensure
- * that it doesn't.
- */
-
- DTRACE_PROBE3(remove__free__unvisited,
- spa_t *, spa,
- uint64_t, offset,
- uint64_t, size);
-
- if (svr->svr_allocd_segs != NULL)
- range_tree_clear(svr->svr_allocd_segs, offset, size);
-
- /*
- * Since we now do not need to copy this data, for
- * accounting purposes we have done our job and can count
- * it as completed.
- */
- svr->svr_bytes_done[txg & TXG_MASK] += size;
- }
- mutex_exit(&svr->svr_lock);
-
- /*
- * Now that we have dropped svr_lock, process the synced portion
- * of this free.
- */
- if (synced_size > 0) {
- vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
-
- /*
- * Note: this can only be called from syncing context,
- * and the vdev_indirect_mapping is only changed from the
- * sync thread, so we don't need svr_lock while doing
- * metaslab_free_impl_cb.
- */
- boolean_t checkpoint = B_FALSE;
- vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
- metaslab_free_impl_cb, &checkpoint);
- }
-}
-
-/*
- * Stop an active removal and update the spa_removing phys.
- */
-static void
-spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
-{
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
-
- /* Ensure the removal thread has completed before we free the svr. */
- spa_vdev_remove_suspend(spa);
-
- ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
-
- if (state == DSS_FINISHED) {
- spa_removing_phys_t *srp = &spa->spa_removing_phys;
- vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-
- if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
- vdev_t *pvd = vdev_lookup_top(spa,
- srp->sr_prev_indirect_vdev);
- ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
- }
-
- vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
- srp->sr_prev_indirect_vdev = vd->vdev_id;
- }
- spa->spa_removing_phys.sr_state = state;
- spa->spa_removing_phys.sr_end_time = gethrestime_sec();
-
- spa->spa_vdev_removal = NULL;
- spa_vdev_removal_destroy(svr);
-
- spa_sync_removing_state(spa, tx);
-
- vdev_config_dirty(spa->spa_root_vdev);
-}
-
-static void
-free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
-{
- vdev_t *vd = arg;
- vdev_indirect_mark_obsolete(vd, offset, size);
- boolean_t checkpoint = B_FALSE;
- vdev_indirect_ops.vdev_op_remap(vd, offset, size,
- metaslab_free_impl_cb, &checkpoint);
-}
-
-/*
- * On behalf of the removal thread, syncs an incremental bit more of
- * the indirect mapping to disk and updates the in-memory mapping.
- * Called as a sync task in every txg that the removal thread makes progress.
- */
-static void
-vdev_mapping_sync(void *arg, dmu_tx_t *tx)
-{
- spa_vdev_removal_t *svr = arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- uint64_t txg = dmu_tx_get_txg(tx);
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- ASSERT(vic->vic_mapping_object != 0);
- ASSERT3U(txg, ==, spa_syncing_txg(spa));
-
- vdev_indirect_mapping_add_entries(vim,
- &svr->svr_new_segments[txg & TXG_MASK], tx);
- vdev_indirect_births_add_entry(vd->vdev_indirect_births,
- vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);
-
- /*
- * Free the copied data for anything that was freed while the
- * mapping entries were in flight.
- */
- mutex_enter(&svr->svr_lock);
- range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
- free_mapped_segment_cb, vd);
- ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
- vdev_indirect_mapping_max_offset(vim));
- svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
- mutex_exit(&svr->svr_lock);
-
- spa_sync_removing_state(spa, tx);
-}
-
-typedef struct vdev_copy_segment_arg {
- spa_t *vcsa_spa;
- dva_t *vcsa_dest_dva;
- uint64_t vcsa_txg;
- range_tree_t *vcsa_obsolete_segs;
-} vdev_copy_segment_arg_t;
-
-static void
-unalloc_seg(void *arg, uint64_t start, uint64_t size)
-{
- vdev_copy_segment_arg_t *vcsa = arg;
- spa_t *spa = vcsa->vcsa_spa;
- blkptr_t bp = { 0 };
-
- BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
- BP_SET_LSIZE(&bp, size);
- BP_SET_PSIZE(&bp, size);
- BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(&bp, DMU_OT_NONE);
- BP_SET_LEVEL(&bp, 0);
- BP_SET_DEDUP(&bp, 0);
- BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
-
- DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
- DVA_SET_OFFSET(&bp.blk_dva[0],
- DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
- DVA_SET_ASIZE(&bp.blk_dva[0], size);
-
- zio_free(spa, vcsa->vcsa_txg, &bp);
-}
-
-/*
- * All reads and writes associated with a call to spa_vdev_copy_segment()
- * are done.
- */
-static void
-spa_vdev_copy_segment_done(zio_t *zio)
-{
- vdev_copy_segment_arg_t *vcsa = zio->io_private;
-
- range_tree_vacate(vcsa->vcsa_obsolete_segs,
- unalloc_seg, vcsa);
- range_tree_destroy(vcsa->vcsa_obsolete_segs);
- kmem_free(vcsa, sizeof (*vcsa));
-
- spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
-}
-
-/*
- * The write of the new location is done.
- */
-static void
-spa_vdev_copy_segment_write_done(zio_t *zio)
-{
- vdev_copy_arg_t *vca = zio->io_private;
-
- abd_free(zio->io_abd);
-
- mutex_enter(&vca->vca_lock);
- vca->vca_outstanding_bytes -= zio->io_size;
- cv_signal(&vca->vca_cv);
- mutex_exit(&vca->vca_lock);
-}
-
-/*
- * The read of the old location is done. The parent zio is the write to
- * the new location. Allow it to start.
- */
-static void
-spa_vdev_copy_segment_read_done(zio_t *zio)
-{
- zio_nowait(zio_unique_parent(zio));
-}
-
-/*
- * If the old and new vdevs are mirrors, we will read both sides of the old
- * mirror, and write each copy to the corresponding side of the new mirror.
- * If the old and new vdevs have a different number of children, we will do
- * this as best as possible. Since we aren't verifying checksums, this
- * ensures that as long as there's a good copy of the data, we'll have a
- * good copy after the removal, even if there's silent damage to one side
- * of the mirror. If we're removing a mirror that has some silent damage,
- * we'll have exactly the same damage in the new location (assuming that
- * the new location is also a mirror).
- *
- * We accomplish this by creating a tree of zio_t's, with as many writes as
- * there are "children" of the new vdev (a non-redundant vdev counts as one
- * child, a 2-way mirror has 2 children, etc). Each write has an associated
- * read from a child of the old vdev. Typically there will be the same
- * number of children of the old and new vdevs. However, if there are more
- * children of the new vdev, some child(ren) of the old vdev will be issued
- * multiple reads. If there are more children of the old vdev, some copies
- * will be dropped.
- *
- * For example, the tree of zio_t's for a 2-way mirror is:
- *
- * null
- * / \
- * write(new vdev, child 0) write(new vdev, child 1)
- * | |
- * read(old vdev, child 0) read(old vdev, child 1)
- *
- * Child zio's complete before their parents complete. However, zio's
- * created with zio_vdev_child_io() may be issued before their children
- * complete. In this case we need to make sure that the children (reads)
- * complete before the parents (writes) are *issued*. We do this by not
- * calling zio_nowait() on each write until its corresponding read has
- * completed.
- *
- * The spa_config_lock must be held while zio's created by
- * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
- * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
- * zio is needed to release the spa_config_lock after all the reads and
- * writes complete. (Note that we can't grab the config lock for each read,
- * because it is not reentrant - we could deadlock with a thread waiting
- * for a write lock.)
- */
-static void
-spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
- vdev_t *source_vd, uint64_t source_offset,
- vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
-{
- ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
-
- mutex_enter(&vca->vca_lock);
- vca->vca_outstanding_bytes += size;
- mutex_exit(&vca->vca_lock);
-
- abd_t *abd = abd_alloc_for_io(size, B_FALSE);
-
- vdev_t *source_child_vd;
- if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
- /*
- * Source and dest are both mirrors. Copy from the same
- * child id as we are copying to (wrapping around if there
- * are more dest children than source children).
- */
- source_child_vd =
- source_vd->vdev_child[dest_id % source_vd->vdev_children];
- } else {
- source_child_vd = source_vd;
- }
-
- zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
- dest_child_vd, dest_offset, abd, size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
- ZIO_FLAG_CANFAIL,
- spa_vdev_copy_segment_write_done, vca);
-
- zio_nowait(zio_vdev_child_io(write_zio, NULL,
- source_child_vd, source_offset, abd, size,
- ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
- ZIO_FLAG_CANFAIL,
- spa_vdev_copy_segment_read_done, vca));
-}
-
-/*
- * Allocate a new location for this segment, and create the zio_t's to
- * read from the old location and write to the new location.
- */
-static int
-spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
- uint64_t maxalloc, uint64_t txg,
- vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
-{
- metaslab_group_t *mg = vd->vdev_mg;
- spa_t *spa = vd->vdev_spa;
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_indirect_mapping_entry_t *entry;
- dva_t dst = { 0 };
- uint64_t start = range_tree_min(segs);
-
- ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
-
- uint64_t size = range_tree_span(segs);
- if (range_tree_span(segs) > maxalloc) {
- /*
- * We can't allocate all the segments. Prefer to end
- * the allocation at the end of a segment, thus avoiding
- * additional split blocks.
- */
- range_seg_t search;
- avl_index_t where;
- search.rs_start = start + maxalloc;
- search.rs_end = search.rs_start;
- range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
- if (rs == NULL) {
- rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
- } else {
- rs = AVL_PREV(&segs->rt_root, rs);
- }
- if (rs != NULL) {
- size = rs->rs_end - start;
- } else {
- /*
- * There are no segments that end before maxalloc.
- * I.e. the first segment is larger than maxalloc,
- * so we must split it.
- */
- size = maxalloc;
- }
- }
- ASSERT3U(size, <=, maxalloc);
-
- /*
- * An allocation class might not have any remaining vdevs or space
- */
- metaslab_class_t *mc = mg->mg_class;
- if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
- mc = spa_normal_class(spa);
- int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
- zal, 0);
- if (error == ENOSPC && mc != spa_normal_class(spa)) {
- error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
- &dst, 0, NULL, txg, 0, zal, 0);
- }
- if (error != 0)
- return (error);
-
- /*
- * Determine the ranges that are not actually needed. Offsets are
- * relative to the start of the range to be copied (i.e. relative to the
- * local variable "start").
- */
- range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
-
- range_seg_t *rs = avl_first(&segs->rt_root);
- ASSERT3U(rs->rs_start, ==, start);
- uint64_t prev_seg_end = rs->rs_end;
- while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
- if (rs->rs_start >= start + size) {
- break;
- } else {
- range_tree_add(obsolete_segs,
- prev_seg_end - start,
- rs->rs_start - prev_seg_end);
- }
- prev_seg_end = rs->rs_end;
- }
- /* We don't end in the middle of an obsolete range */
- ASSERT3U(start + size, <=, prev_seg_end);
-
- range_tree_clear(segs, start, size);
-
- /*
- * We can't have any padding of the allocated size, otherwise we will
- * misunderstand what's allocated, and the size of the mapping.
- * The caller ensures this will be true by passing in a size that is
- * aligned to the worst (highest) ashift in the pool.
- */
- ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
-
- entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
- DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
- entry->vime_mapping.vimep_dst = dst;
- if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
- entry->vime_obsolete_count = range_tree_space(obsolete_segs);
- }
-
- vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
- vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
- vcsa->vcsa_obsolete_segs = obsolete_segs;
- vcsa->vcsa_spa = spa;
- vcsa->vcsa_txg = txg;
-
- /*
- * See comment before spa_vdev_copy_one_child().
- */
- spa_config_enter(spa, SCL_STATE, spa, RW_READER);
- zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
- spa_vdev_copy_segment_done, vcsa, 0);
- vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
- if (dest_vd->vdev_ops == &vdev_mirror_ops) {
- for (int i = 0; i < dest_vd->vdev_children; i++) {
- vdev_t *child = dest_vd->vdev_child[i];
- spa_vdev_copy_one_child(vca, nzio, vd, start,
- child, DVA_GET_OFFSET(&dst), i, size);
- }
- } else {
- spa_vdev_copy_one_child(vca, nzio, vd, start,
- dest_vd, DVA_GET_OFFSET(&dst), -1, size);
- }
- zio_nowait(nzio);
-
- list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
- ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
- vdev_dirty(vd, 0, NULL, txg);
-
- return (0);
-}
-
-/*
- * Complete the removal of a toplevel vdev. This is called as a
- * synctask in the same txg that we will sync out the new config (to the
- * MOS object) which indicates that this vdev is indirect.
- */
-static void
-vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
-{
- spa_vdev_removal_t *svr = arg;
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
-
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
-
- for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT0(svr->svr_bytes_done[i]);
- }
-
- ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
- spa->spa_removing_phys.sr_to_copy);
-
- vdev_destroy_spacemaps(vd, tx);
-
- /* destroy leaf zaps, if any */
- ASSERT3P(svr->svr_zaplist, !=, NULL);
- for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
- pair != NULL;
- pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
- vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
- }
- fnvlist_free(svr->svr_zaplist);
-
- spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
- /* vd->vdev_path is not available here */
- spa_history_log_internal(spa, "vdev remove completed", tx,
- "%s vdev %llu", spa_name(spa), vd->vdev_id);
-}
-
-static void
-vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
-{
- ASSERT3P(zlist, !=, NULL);
- ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
-
- if (vd->vdev_leaf_zap != 0) {
- char zkey[32];
- (void) snprintf(zkey, sizeof (zkey), "%s-%ju",
- VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap);
- fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
- }
-
- for (uint64_t id = 0; id < vd->vdev_children; id++) {
- vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
- }
-}
-
-static void
-vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
-{
- vdev_t *ivd;
- dmu_tx_t *tx;
- spa_t *spa = vd->vdev_spa;
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
-
- /*
- * First, build a list of leaf zaps to be destroyed.
- * This is passed to the sync context thread,
- * which does the actual unlinking.
- */
- svr->svr_zaplist = fnvlist_alloc();
- vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
-
- ivd = vdev_add_parent(vd, &vdev_indirect_ops);
- ivd->vdev_removing = 0;
-
- vd->vdev_leaf_zap = 0;
-
- vdev_remove_child(ivd, vd);
- vdev_compact_children(ivd);
-
- ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
- 0, ZFS_SPACE_CHECK_NONE, tx);
- dmu_tx_commit(tx);
-
- /*
- * Indicate that this thread has exited.
- * After this, we can not use svr.
- */
- mutex_enter(&svr->svr_lock);
- svr->svr_thread = NULL;
- cv_broadcast(&svr->svr_cv);
- mutex_exit(&svr->svr_lock);
-}
-
-/*
- * Complete the removal of a toplevel vdev. This is called in open
- * context by the removal thread after we have copied all vdev's data.
- */
-static void
-vdev_remove_complete(spa_t *spa)
-{
- uint64_t txg;
-
- /*
- * Wait for any deferred frees to be synced before we call
- * vdev_metaslab_fini()
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- txg = spa_vdev_enter(spa);
- vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
- ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
-
- sysevent_t *ev = spa_event_create(spa, vd, NULL,
- ESC_ZFS_VDEV_REMOVE_DEV);
-
- zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
- vd->vdev_id, txg);
-
- /*
- * Discard allocation state.
- */
- if (vd->vdev_mg != NULL) {
- vdev_metaslab_fini(vd);
- metaslab_group_destroy(vd->vdev_mg);
- vd->vdev_mg = NULL;
- }
- ASSERT0(vd->vdev_stat.vs_space);
- ASSERT0(vd->vdev_stat.vs_dspace);
-
- vdev_remove_replace_with_indirect(vd, txg);
-
- /*
- * We now release the locks, allowing spa_sync to run and finish the
- * removal via vdev_remove_complete_sync in syncing context.
- *
- * Note that we hold on to the vdev_t that has been replaced. Since
- * it isn't part of the vdev tree any longer, it can't be concurrently
- * manipulated, even while we don't have the config lock.
- */
- (void) spa_vdev_exit(spa, NULL, txg, 0);
-
- /*
- * Top ZAP should have been transferred to the indirect vdev in
- * vdev_remove_replace_with_indirect.
- */
- ASSERT0(vd->vdev_top_zap);
-
- /*
- * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
- */
- ASSERT0(vd->vdev_leaf_zap);
-
- txg = spa_vdev_enter(spa);
- (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
- /*
- * Request to update the config and the config cachefile.
- */
- vdev_config_dirty(spa->spa_root_vdev);
- (void) spa_vdev_exit(spa, vd, txg, 0);
-
- spa_event_post(ev);
-}
-
-/*
- * Evacuates a segment of size at most max_alloc from the vdev
- * via repeated calls to spa_vdev_copy_segment. If an allocation
- * fails, the pool is probably too fragmented to handle such a
- * large size, so decrease max_alloc so that the caller will not try
- * this size again this txg.
- */
-static void
-spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
- uint64_t *max_alloc, dmu_tx_t *tx)
-{
- uint64_t txg = dmu_tx_get_txg(tx);
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- mutex_enter(&svr->svr_lock);
-
- /*
- * Determine how big of a chunk to copy. We can allocate up
- * to max_alloc bytes, and we can span up to vdev_removal_max_span
- * bytes of unallocated space at a time. "segs" will track the
- * allocated segments that we are copying. We may also be copying
- * free segments (of up to vdev_removal_max_span bytes).
- */
- range_tree_t *segs = range_tree_create(NULL, NULL);
- for (;;) {
- range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
- if (rs == NULL)
- break;
-
- uint64_t seg_length;
-
- if (range_tree_is_empty(segs)) {
- /* need to truncate the first seg based on max_alloc */
- seg_length =
- MIN(rs->rs_end - rs->rs_start, *max_alloc);
- } else {
- if (rs->rs_start - range_tree_max(segs) >
- vdev_removal_max_span) {
- /*
- * Including this segment would cause us to
- * copy a larger unneeded chunk than is allowed.
- */
- break;
- } else if (rs->rs_end - range_tree_min(segs) >
- *max_alloc) {
- /*
- * This additional segment would extend past
- * max_alloc. Rather than splitting this
- * segment, leave it for the next mapping.
- */
- break;
- } else {
- seg_length = rs->rs_end - rs->rs_start;
- }
- }
-
- range_tree_add(segs, rs->rs_start, seg_length);
- range_tree_remove(svr->svr_allocd_segs,
- rs->rs_start, seg_length);
- }
-
- if (range_tree_is_empty(segs)) {
- mutex_exit(&svr->svr_lock);
- range_tree_destroy(segs);
- return;
- }
-
- if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
- dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
- svr, 0, ZFS_SPACE_CHECK_NONE, tx);
- }
-
- svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
-
- /*
- * Note: this is the amount of *allocated* space
- * that we are taking care of each txg.
- */
- svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
-
- mutex_exit(&svr->svr_lock);
-
- zio_alloc_list_t zal;
- metaslab_trace_init(&zal);
- uint64_t thismax = SPA_MAXBLOCKSIZE;
- while (!range_tree_is_empty(segs)) {
- int error = spa_vdev_copy_segment(vd,
- segs, thismax, txg, vca, &zal);
-
- if (error == ENOSPC) {
- /*
- * Cut our segment in half, and don't try this
- * segment size again this txg. Note that the
- * allocation size must be aligned to the highest
- * ashift in the pool, so that the allocation will
- * not be padded out to a multiple of the ashift,
- * which could cause us to think that this mapping
- * is larger than we intended.
- */
- ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
- ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
- uint64_t attempted =
- MIN(range_tree_span(segs), thismax);
- thismax = P2ROUNDUP(attempted / 2,
- 1 << spa->spa_max_ashift);
- /*
- * The minimum-size allocation can not fail.
- */
- ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
- *max_alloc = attempted - (1 << spa->spa_max_ashift);
- } else {
- ASSERT0(error);
-
- /*
- * We've performed an allocation, so reset the
- * alloc trace list.
- */
- metaslab_trace_fini(&zal);
- metaslab_trace_init(&zal);
- }
- }
- metaslab_trace_fini(&zal);
- range_tree_destroy(segs);
-}
-
-/*
- * The removal thread operates in open context. It iterates over all
- * allocated space in the vdev, by loading each metaslab's spacemap.
- * For each contiguous segment of allocated space (capping the segment
- * size at SPA_MAXBLOCKSIZE), we:
- * - Allocate space for it on another vdev.
- * - Create a new mapping from the old location to the new location
- * (as a record in svr_new_segments).
- * - Initiate a logical read zio to get the data off the removing disk.
- * - In the read zio's done callback, initiate a logical write zio to
- * write it to the new vdev.
- * Note that all of this will take effect when a particular TXG syncs.
- * The sync thread ensures that all the phys reads and writes for the syncing
- * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
- * (see vdev_mapping_sync()).
- */
-static void
-spa_vdev_remove_thread(void *arg)
-{
- spa_t *spa = arg;
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_copy_arg_t vca;
- uint64_t max_alloc = zfs_remove_max_segment;
- uint64_t last_txg = 0;
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
-
- ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
- ASSERT(vdev_is_concrete(vd));
- ASSERT(vd->vdev_removing);
- ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT(vim != NULL);
-
- mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
- vca.vca_outstanding_bytes = 0;
-
- mutex_enter(&svr->svr_lock);
-
- /*
- * Start from vim_max_offset so we pick up where we left off
- * if we are restarting the removal after opening the pool.
- */
- uint64_t msi;
- for (msi = start_offset >> vd->vdev_ms_shift;
- msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
- metaslab_t *msp = vd->vdev_ms[msi];
- ASSERT3U(msi, <=, vd->vdev_ms_count);
-
- ASSERT0(range_tree_space(svr->svr_allocd_segs));
-
- mutex_enter(&msp->ms_sync_lock);
- mutex_enter(&msp->ms_lock);
-
- /*
- * Assert nothing in flight -- ms_*tree is empty.
- */
- for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT0(range_tree_space(msp->ms_allocating[i]));
- }
-
- /*
- * If the metaslab has ever been allocated from (ms_sm!=NULL),
- * read the allocated segments from the space map object
- * into svr_allocd_segs. Since we do this while holding
- * svr_lock and ms_sync_lock, concurrent frees (which
- * would have modified the space map) will wait for us
- * to finish loading the spacemap, and then take the
- * appropriate action (see free_from_removing_vdev()).
- */
- if (msp->ms_sm != NULL) {
- VERIFY0(space_map_load(msp->ms_sm,
- svr->svr_allocd_segs, SM_ALLOC));
-
- range_tree_walk(msp->ms_freeing,
- range_tree_remove, svr->svr_allocd_segs);
-
- /*
- * When we are resuming from a paused removal (i.e.
- * when importing a pool with a removal in progress),
- * discard any state that we have already processed.
- */
- range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
- }
- mutex_exit(&msp->ms_lock);
- mutex_exit(&msp->ms_sync_lock);
-
- vca.vca_msp = msp;
- zfs_dbgmsg("copying %llu segments for metaslab %llu",
- avl_numnodes(&svr->svr_allocd_segs->rt_root),
- msp->ms_id);
-
- while (!svr->svr_thread_exit &&
- !range_tree_is_empty(svr->svr_allocd_segs)) {
-
- mutex_exit(&svr->svr_lock);
-
- /*
- * We need to periodically drop the config lock so that
- * writers can get in. Additionally, we can't wait
- * for a txg to sync while holding a config lock
- * (since a waiting writer could cause a 3-way deadlock
- * with the sync thread, which also gets a config
- * lock for reader). So we can't hold the config lock
- * while calling dmu_tx_assign().
- */
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-
- /*
- * This delay will pause the removal around the point
- * specified by zfs_remove_max_bytes_pause. We do this
- * solely from the test suite or during debugging.
- */
- uint64_t bytes_copied =
- spa->spa_removing_phys.sr_copied;
- for (int i = 0; i < TXG_SIZE; i++)
- bytes_copied += svr->svr_bytes_done[i];
- while (zfs_remove_max_bytes_pause <= bytes_copied &&
- !svr->svr_thread_exit)
- delay(hz);
-
- mutex_enter(&vca.vca_lock);
- while (vca.vca_outstanding_bytes >
- zfs_remove_max_copy_bytes) {
- cv_wait(&vca.vca_cv, &vca.vca_lock);
- }
- mutex_exit(&vca.vca_lock);
-
- dmu_tx_t *tx =
- dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
-
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- uint64_t txg = dmu_tx_get_txg(tx);
-
- /*
- * Reacquire the vdev_config lock. The vdev_t
- * that we're removing may have changed, e.g. due
- * to a vdev_attach or vdev_detach.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- vd = vdev_lookup_top(spa, svr->svr_vdev_id);
-
- if (txg != last_txg)
- max_alloc = zfs_remove_max_segment;
- last_txg = txg;
-
- spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
-
- dmu_tx_commit(tx);
- mutex_enter(&svr->svr_lock);
- }
- }
-
- mutex_exit(&svr->svr_lock);
-
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-
- /*
- * Wait for all copies to finish before cleaning up the vca.
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- ASSERT0(vca.vca_outstanding_bytes);
-
- mutex_destroy(&vca.vca_lock);
- cv_destroy(&vca.vca_cv);
-
- if (svr->svr_thread_exit) {
- mutex_enter(&svr->svr_lock);
- range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
- svr->svr_thread = NULL;
- cv_broadcast(&svr->svr_cv);
- mutex_exit(&svr->svr_lock);
- } else {
- ASSERT0(range_tree_space(svr->svr_allocd_segs));
- vdev_remove_complete(spa);
- }
- thread_exit();
-}
-
-void
-spa_vdev_remove_suspend(spa_t *spa)
-{
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
-
- if (svr == NULL)
- return;
-
- mutex_enter(&svr->svr_lock);
- svr->svr_thread_exit = B_TRUE;
- while (svr->svr_thread != NULL)
- cv_wait(&svr->svr_cv, &svr->svr_lock);
- svr->svr_thread_exit = B_FALSE;
- mutex_exit(&svr->svr_lock);
-}
-
-/* ARGSUSED */
-static int
-spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- if (spa->spa_vdev_removal == NULL)
- return (ESRCH);
- return (0);
-}
-
-/*
- * Cancel a removal by freeing all entries from the partial mapping
- * and marking the vdev as no longer being removing.
- */
-/* ARGSUSED */
-static void
-spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- objset_t *mos = spa->spa_meta_objset;
-
- ASSERT3P(svr->svr_thread, ==, NULL);
-
- spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
- if (vdev_obsolete_counts_are_precise(vd)) {
- spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
- }
-
- if (vdev_obsolete_sm_object(vd) != 0) {
- ASSERT(vd->vdev_obsolete_sm != NULL);
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
- space_map_object(vd->vdev_obsolete_sm));
-
- space_map_free(vd->vdev_obsolete_sm, tx);
- VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
- VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
- space_map_close(vd->vdev_obsolete_sm);
- vd->vdev_obsolete_sm = NULL;
- spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
- }
- for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_is_empty(&svr->svr_new_segments[i]));
- ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
- vdev_indirect_mapping_max_offset(vim));
- }
-
- for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
- metaslab_t *msp = vd->vdev_ms[msi];
-
- if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
- break;
-
- ASSERT0(range_tree_space(svr->svr_allocd_segs));
-
- mutex_enter(&msp->ms_lock);
-
- /*
- * Assert nothing in flight -- ms_*tree is empty.
- */
- for (int i = 0; i < TXG_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_allocating[i]));
- for (int i = 0; i < TXG_DEFER_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_defer[i]));
- ASSERT0(range_tree_space(msp->ms_freed));
-
- if (msp->ms_sm != NULL) {
- mutex_enter(&svr->svr_lock);
- VERIFY0(space_map_load(msp->ms_sm,
- svr->svr_allocd_segs, SM_ALLOC));
- range_tree_walk(msp->ms_freeing,
- range_tree_remove, svr->svr_allocd_segs);
-
- /*
- * Clear everything past what has been synced,
- * because we have not allocated mappings for it yet.
- */
- uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
- uint64_t sm_end = msp->ms_sm->sm_start +
- msp->ms_sm->sm_size;
- if (sm_end > syncd)
- range_tree_clear(svr->svr_allocd_segs,
- syncd, sm_end - syncd);
-
- mutex_exit(&svr->svr_lock);
- }
- mutex_exit(&msp->ms_lock);
-
- mutex_enter(&svr->svr_lock);
- range_tree_vacate(svr->svr_allocd_segs,
- free_mapped_segment_cb, vd);
- mutex_exit(&svr->svr_lock);
- }
-
- /*
- * Note: this must happen after we invoke free_mapped_segment_cb,
- * because it adds to the obsolete_segments.
- */
- range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
-
- ASSERT3U(vic->vic_mapping_object, ==,
- vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
- vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
- vd->vdev_indirect_mapping = NULL;
- vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
- vic->vic_mapping_object = 0;
-
- ASSERT3U(vic->vic_births_object, ==,
- vdev_indirect_births_object(vd->vdev_indirect_births));
- vdev_indirect_births_close(vd->vdev_indirect_births);
- vd->vdev_indirect_births = NULL;
- vdev_indirect_births_free(mos, vic->vic_births_object, tx);
- vic->vic_births_object = 0;
-
- /*
- * We may have processed some frees from the removing vdev in this
- * txg, thus increasing svr_bytes_done; discard that here to
- * satisfy the assertions in spa_vdev_removal_destroy().
- * Note that future txg's can not have any bytes_done, because
- * future TXG's are only modified from open context, and we have
- * already shut down the copying thread.
- */
- svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
- spa_finish_removal(spa, DSS_CANCELED, tx);
-
- vd->vdev_removing = B_FALSE;
- vdev_config_dirty(vd);
-
- zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
- vd->vdev_id, dmu_tx_get_txg(tx));
- spa_history_log_internal(spa, "vdev remove canceled", tx,
- "%s vdev %llu %s", spa_name(spa),
- vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
-}
-
-int
-spa_vdev_remove_cancel(spa_t *spa)
-{
- spa_vdev_remove_suspend(spa);
-
- if (spa->spa_vdev_removal == NULL)
- return (ESRCH);
-
- uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
-
- int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
- spa_vdev_remove_cancel_sync, NULL, 0,
- ZFS_SPACE_CHECK_EXTRA_RESERVED);
-
- if (error == 0) {
- spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
- vdev_t *vd = vdev_lookup_top(spa, vdid);
- metaslab_group_activate(vd->vdev_mg);
- spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
- }
-
- return (error);
-}
-
-void
-svr_sync(spa_t *spa, dmu_tx_t *tx)
-{
- spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
-
- /*
- * This check is necessary so that we do not dirty the
- * DIRECTORY_OBJECT via spa_sync_removing_state() when there
- * is nothing to do. Dirtying it every time would prevent us
- * from syncing-to-convergence.
- */
- if (svr->svr_bytes_done[txgoff] == 0)
- return;
-
- /*
- * Update progress accounting.
- */
- spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
- svr->svr_bytes_done[txgoff] = 0;
-
- spa_sync_removing_state(spa, tx);
-}
-
-static void
-vdev_remove_make_hole_and_free(vdev_t *vd)
-{
- uint64_t id = vd->vdev_id;
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- boolean_t last_vdev = (id == (rvd->vdev_children - 1));
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- vdev_free(vd);
-
- if (last_vdev) {
- vdev_compact_children(rvd);
- } else {
- vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
- vdev_add_child(rvd, vd);
- }
- vdev_config_dirty(rvd);
-
- /*
- * Reassess the health of our root vdev.
- */
- vdev_reopen(rvd);
-}
-
-/*
- * Remove a log device. The config lock is held for the specified TXG.
- */
-static int
-spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
-{
- metaslab_group_t *mg = vd->vdev_mg;
- spa_t *spa = vd->vdev_spa;
- int error = 0;
-
- ASSERT(vd->vdev_islog);
- ASSERT(vd == vd->vdev_top);
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- /*
- * Stop allocating from this vdev.
- */
- metaslab_group_passivate(mg);
-
- /*
- * Wait for the youngest allocations and frees to sync,
- * and then wait for the deferral of those frees to finish.
- */
- spa_vdev_config_exit(spa, NULL,
- *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
-
- /*
- * Evacuate the device. We don't hold the config lock as
- * writer since we need to do I/O but we do keep the
- * spa_namespace_lock held. Once this completes the device
- * should no longer have any blocks allocated on it.
- */
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- if (vd->vdev_stat.vs_alloc != 0)
- error = spa_reset_logs(spa);
-
- *txg = spa_vdev_config_enter(spa);
-
- if (error != 0) {
- metaslab_group_activate(mg);
- return (error);
- }
- ASSERT0(vd->vdev_stat.vs_alloc);
-
- /*
- * The evacuation succeeded. Remove any remaining MOS metadata
- * associated with this vdev, and wait for these changes to sync.
- */
- vd->vdev_removing = B_TRUE;
-
- vdev_dirty_leaves(vd, VDD_DTL, *txg);
- vdev_config_dirty(vd);
-
- vdev_metaslab_fini(vd);
-
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
- (vd->vdev_path != NULL) ? vd->vdev_path : "-");
-
- /* Make sure these changes are sync'ed */
- spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
-
- /* Stop initializing */
- (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
-
- *txg = spa_vdev_config_enter(spa);
-
- sysevent_t *ev = spa_event_create(spa, vd, NULL,
- ESC_ZFS_VDEV_REMOVE_DEV);
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-
- /* The top ZAP should have been destroyed by vdev_remove_empty. */
- ASSERT0(vd->vdev_top_zap);
- /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
- ASSERT0(vd->vdev_leaf_zap);
-
- (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
-
- if (list_link_active(&vd->vdev_state_dirty_node))
- vdev_state_clean(vd);
- if (list_link_active(&vd->vdev_config_dirty_node))
- vdev_config_clean(vd);
-
- ASSERT0(vd->vdev_stat.vs_alloc);
-
- /*
- * Clean up the vdev namespace.
- */
- vdev_remove_make_hole_and_free(vd);
-
- if (ev != NULL)
- spa_event_post(ev);
-
- return (0);
-}
-
-static int
-spa_vdev_remove_top_check(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- if (vd != vd->vdev_top)
- return (SET_ERROR(ENOTSUP));
-
- if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
- return (SET_ERROR(ENOTSUP));
-
- /* available space in the pool's normal class */
- uint64_t available = dsl_dir_space_available(
- spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
-
- metaslab_class_t *mc = vd->vdev_mg->mg_class;
-
- /*
- * When removing a vdev from an allocation class that has
- * remaining vdevs, include available space from the class.
- */
- if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
- uint64_t class_avail = metaslab_class_get_space(mc) -
- metaslab_class_get_alloc(mc);
-
- /* add class space, adjusted for overhead */
- available += (class_avail * 94) / 100;
- }
-
- /*
- * There has to be enough free space to remove the
- * device and leave double the "slop" space (i.e. we
- * must leave at least 3% of the pool free, in addition to
- * the normal slop space).
- */
- if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
- return (SET_ERROR(ENOSPC));
- }
-
- /*
- * There can not be a removal in progress.
- */
- if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
- return (SET_ERROR(EBUSY));
-
- /*
- * The device must have all its data.
- */
- if (!vdev_dtl_empty(vd, DTL_MISSING) ||
- !vdev_dtl_empty(vd, DTL_OUTAGE))
- return (SET_ERROR(EBUSY));
-
- /*
- * The device must be healthy.
- */
- if (!vdev_readable(vd))
- return (SET_ERROR(EIO));
-
- /*
- * All vdevs in normal class must have the same ashift.
- */
- if (spa->spa_max_ashift != spa->spa_min_ashift) {
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * All vdevs in normal class must have the same ashift
- * and not be raidz.
- */
- vdev_t *rvd = spa->spa_root_vdev;
- int num_indirect = 0;
- for (uint64_t id = 0; id < rvd->vdev_children; id++) {
- vdev_t *cvd = rvd->vdev_child[id];
- if (cvd->vdev_ashift != 0 && !cvd->vdev_islog)
- ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
- if (cvd->vdev_ops == &vdev_indirect_ops)
- num_indirect++;
- if (!vdev_is_concrete(cvd))
- continue;
- if (cvd->vdev_ops == &vdev_raidz_ops)
- return (SET_ERROR(EINVAL));
- /*
- * Need the mirror to be mirror of leaf vdevs only
- */
- if (cvd->vdev_ops == &vdev_mirror_ops) {
- for (uint64_t cid = 0;
- cid < cvd->vdev_children; cid++) {
- vdev_t *tmp = cvd->vdev_child[cid];
- if (!tmp->vdev_ops->vdev_op_leaf)
- return (SET_ERROR(EINVAL));
- }
- }
- }
-
- return (0);
-}
-
-/*
- * Initiate removal of a top-level vdev, reducing the total space in the pool.
- * The config lock is held for the specified TXG. Once initiated,
- * evacuation of all allocated space (copying it to other vdevs) happens
- * in the background (see spa_vdev_remove_thread()), and can be canceled
- * (see spa_vdev_remove_cancel()). If successful, the vdev will
- * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
- */
-static int
-spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
-{
- spa_t *spa = vd->vdev_spa;
- int error;
-
- /*
- * Check for errors up-front, so that we don't waste time
- * passivating the metaslab group and clearing the ZIL if there
- * are errors.
- */
- error = spa_vdev_remove_top_check(vd);
- if (error != 0)
- return (error);
-
- /*
- * Stop allocating from this vdev. Note that we must check
- * that this is not the only device in the pool before
- * passivating, otherwise we will not be able to make
- * progress because we can't allocate from any vdevs.
- * The above check for sufficient free space serves this
- * purpose.
- */
- metaslab_group_t *mg = vd->vdev_mg;
- metaslab_group_passivate(mg);
-
- /*
- * Wait for the youngest allocations and frees to sync,
- * and then wait for the deferral of those frees to finish.
- */
- spa_vdev_config_exit(spa, NULL,
- *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
-
- /*
- * We must ensure that no "stubby" log blocks are allocated
- * on the device to be removed. These blocks could be
- * written at any time, including while we are in the middle
- * of copying them.
- */
- error = spa_reset_logs(spa);
-
- /*
- * We stop any initializing that is currently in progress but leave
- * the state as "active". This will allow the initializing to resume
- * if the removal is canceled sometime later.
- */
- vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
-
- *txg = spa_vdev_config_enter(spa);
-
- /*
- * Things might have changed while the config lock was dropped
- * (e.g. space usage). Check for errors again.
- */
- if (error == 0)
- error = spa_vdev_remove_top_check(vd);
-
- if (error != 0) {
- metaslab_group_activate(mg);
- spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
- return (error);
- }
-
- vd->vdev_removing = B_TRUE;
-
- vdev_dirty_leaves(vd, VDD_DTL, *txg);
- vdev_config_dirty(vd);
- dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
- dsl_sync_task_nowait(spa->spa_dsl_pool,
- vdev_remove_initiate_sync,
- (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
- dmu_tx_commit(tx);
-
- return (0);
-}
-
-/*
- * Remove a device from the pool.
- *
- * Removing a device from the vdev namespace requires several steps
- * and can take a significant amount of time. As a result we use
- * the spa_vdev_config_[enter/exit] functions which allow us to
- * grab and release the spa_config_lock while still holding the namespace
- * lock. During each step the configuration is synced out.
- */
-int
-spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
-{
- vdev_t *vd;
- nvlist_t **spares, **l2cache, *nv;
- uint64_t txg = 0;
- uint_t nspares, nl2cache;
- int error = 0;
- boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
- sysevent_t *ev = NULL;
-
- ASSERT(spa_writeable(spa));
-
- if (!locked)
- txg = spa_vdev_enter(spa);
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
- error = (spa_has_checkpoint(spa)) ?
- ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
-
- if (!locked)
- return (spa_vdev_exit(spa, NULL, txg, error));
-
- return (error);
- }
-
- vd = spa_lookup_by_guid(spa, guid, B_FALSE);
-
- if (spa->spa_spares.sav_vdevs != NULL &&
- nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
- (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
- /*
- * Only remove the hot spare if it's not currently in use
- * in this pool.
- */
- if (vd == NULL || unspare) {
- char *nvstr = fnvlist_lookup_string(nv,
- ZPOOL_CONFIG_PATH);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev (%s) %s", spa_name(spa),
- VDEV_TYPE_SPARE, nvstr);
- if (vd == NULL)
- vd = spa_lookup_by_guid(spa, guid, B_TRUE);
- ev = spa_event_create(spa, vd, NULL,
- ESC_ZFS_VDEV_REMOVE_AUX);
- spa_vdev_remove_aux(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, nspares, nv);
- spa_load_spares(spa);
- spa->spa_spares.sav_sync = B_TRUE;
- } else {
- error = SET_ERROR(EBUSY);
- }
- } else if (spa->spa_l2cache.sav_vdevs != NULL &&
- nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
- (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
- char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
- /*
- * Cache devices can always be removed.
- */
- vd = spa_lookup_by_guid(spa, guid, B_TRUE);
- ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
- spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
- spa_load_l2cache(spa);
- spa->spa_l2cache.sav_sync = B_TRUE;
- } else if (vd != NULL && vd->vdev_islog) {
- ASSERT(!locked);
- error = spa_vdev_remove_log(vd, &txg);
- } else if (vd != NULL) {
- ASSERT(!locked);
- error = spa_vdev_remove_top(vd, &txg);
- } else {
- /*
- * There is no vdev of any kind with the specified guid.
- */
- error = SET_ERROR(ENOENT);
- }
-
- if (!locked)
- error = spa_vdev_exit(spa, NULL, txg, error);
-
- if (ev != NULL) {
- if (error != 0) {
- spa_event_discard(ev);
- } else {
- spa_event_post(ev);
- }
- }
-
- return (error);
-}
-
-int
-spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
-{
- prs->prs_state = spa->spa_removing_phys.sr_state;
-
- if (prs->prs_state == DSS_NONE)
- return (SET_ERROR(ENOENT));
-
- prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
- prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
- prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
- prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
- prs->prs_copied = spa->spa_removing_phys.sr_copied;
-
- if (spa->spa_vdev_removal != NULL) {
- for (int i = 0; i < TXG_SIZE; i++) {
- prs->prs_copied +=
- spa->spa_vdev_removal->svr_bytes_done[i];
- }
- }
-
- prs->prs_mapping_memory = 0;
- uint64_t indirect_vdev_id =
- spa->spa_removing_phys.sr_prev_indirect_vdev;
- while (indirect_vdev_id != -1) {
- vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
- vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
- prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
- indirect_vdev_id = vic->vic_prev_indirect_vdev;
- }
-
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -1,157 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for the pool's root vdev.
- */
-
-static uint64_t
-vdev_root_core_tvds(vdev_t *vd)
-{
- uint64_t tvds = 0;
-
- for (uint64_t c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
-
- if (!cvd->vdev_ishole && !cvd->vdev_islog &&
- cvd->vdev_ops != &vdev_indirect_ops) {
- tvds++;
- }
- }
-
- return (tvds);
-}
-
-/*
- * We should be able to tolerate one failure with absolutely no damage
- * to our metadata. Two failures will take out space maps, a bunch of
- * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
- * place to live. When we get smarter, we can liberalize this policy.
- * e.g. If we haven't lost two consecutive top-level vdevs, then we are
- * probably fine. Adding bean counters during alloc/free can make this
- * future guesswork more accurate.
- */
-static boolean_t
-too_many_errors(vdev_t *vd, uint64_t numerrors)
-{
- uint64_t tvds;
-
- if (numerrors == 0)
- return (B_FALSE);
-
- tvds = vdev_root_core_tvds(vd);
- ASSERT3U(numerrors, <=, tvds);
-
- if (numerrors == tvds)
- return (B_TRUE);
-
- return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
-}
-
-static int
-vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *logical_ashift, uint64_t *physical_ashift)
-{
- spa_t *spa = vd->vdev_spa;
- int lasterror = 0;
- int numerrors = 0;
-
- if (vd->vdev_children == 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (SET_ERROR(EINVAL));
- }
-
- vdev_open_children(vd);
-
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
-
- if (cvd->vdev_open_error && !cvd->vdev_islog) {
- lasterror = cvd->vdev_open_error;
- numerrors++;
- }
- }
-
- if (spa_load_state(spa) != SPA_LOAD_NONE)
- spa_set_missing_tvds(spa, numerrors);
-
- if (too_many_errors(vd, numerrors)) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- *asize = 0;
- *max_asize = 0;
- *logical_ashift = 0;
- *physical_ashift = 0;
-
- return (0);
-}
-
-static void
-vdev_root_close(vdev_t *vd)
-{
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-static void
-vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (too_many_errors(vd, faulted)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- } else if (degraded || faulted) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- } else {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
- }
-}
-
-vdev_ops_t vdev_root_ops = {
- vdev_root_open,
- vdev_root_close,
- vdev_default_asize,
- NULL, /* io_start - not applicable to the root */
- NULL, /* io_done - not applicable to the root */
- vdev_root_state_change,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- VDEV_TYPE_ROOT, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -1,1378 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- */
-
-/*
- * This file contains the top half of the zfs directory structure
- * implementation. The bottom half is in zap_leaf.c.
- *
- * The zdir is an extendable hash data structure. There is a table of
- * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
- * each a constant size and hold a variable number of directory entries.
- * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
- *
- * The pointer table holds a power of 2 number of pointers.
- * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
- * by the pointer at index i in the table holds entries whose hash value
- * has a zd_prefix_len - bit prefix
- */
-
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_znode.h>
-#include <sys/fs/zfs.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-
-/*
- * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
- * (all leaf blocks) when we start iterating over it.
- *
- * For zap_cursor_init(), the callers all intend to iterate through all the
- * entries. There are a few cases where an error (typically i/o error) could
- * cause it to bail out early.
- *
- * For zap_cursor_init_serialized(), there are callers that do the iteration
- * outside of ZFS. Typically they would iterate over everything, but we
- * don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
- * zcp_snapshots_iter(), and other iterators over things in the MOS - these
- * are called by /sbin/zfs and channel programs. The other example is
- * zfs_readdir() which iterates over directory entries for the getdents()
- * syscall. /sbin/ls iterates to the end (unless it receives a signal), but
- * userland doesn't have to.
- *
- * Given that the ZAP entries aren't returned in a specific order, the only
- * legitimate use cases for partial iteration would be:
- *
- * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
- * get the first 100 and then wait for the user to hit "next page", which
- * they may never do).
- *
- * 2. You want to know if there are more than X entries, without relying on
- * the zfs-specific implementation of the directory's st_size (which is
- * the number of entries).
- */
-boolean_t zap_iterate_prefetch = B_TRUE;
-
-int fzap_default_block_shift = 14; /* 16k blocksize */
-
-extern inline zap_phys_t *zap_f_phys(zap_t *zap);
-
-static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
-
-void
-fzap_byteswap(void *vbuf, size_t size)
-{
- uint64_t block_type = *(uint64_t *)vbuf;
-
- if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
- zap_leaf_byteswap(vbuf, size);
- else {
- /* it's a ptrtbl block */
- byteswap_uint64_array(vbuf, size);
- }
-}
-
-void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
-{
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- zap->zap_ismicro = FALSE;
-
- zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
- zap->zap_dbu.dbu_evict_func_async = NULL;
-
- mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
- zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
-
- zap_phys_t *zp = zap_f_phys(zap);
- /*
- * explicitly zero it since it might be coming from an
- * initialized microzap
- */
- bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
- zp->zap_block_type = ZBT_HEADER;
- zp->zap_magic = ZAP_MAGIC;
-
- zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
-
- zp->zap_freeblk = 2; /* block 1 will be the first leaf */
- zp->zap_num_leafs = 1;
- zp->zap_num_entries = 0;
- zp->zap_salt = zap->zap_salt;
- zp->zap_normflags = zap->zap_normflags;
- zp->zap_flags = flags;
-
- /* block 1 will be the first leaf */
- for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
- ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
-
- /*
- * set up block 1 - the first leaf
- */
- dmu_buf_t *db;
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
- 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
- dmu_buf_will_dirty(db, tx);
-
- zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
- l->l_dbuf = db;
-
- zap_leaf_init(l, zp->zap_normflags != 0);
-
- kmem_free(l, sizeof (zap_leaf_t));
- dmu_buf_rele(db, FTAG);
-}
-
-static int
-zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
-{
- if (RW_WRITE_HELD(&zap->zap_rwlock))
- return (1);
- if (rw_tryupgrade(&zap->zap_rwlock)) {
- dmu_buf_will_dirty(zap->zap_dbuf, tx);
- return (1);
- }
- return (0);
-}
-
-/*
- * Generic routines for dealing with the pointer & cookie tables.
- */
-
-static int
-zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
- void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
- dmu_tx_t *tx)
-{
- uint64_t newblk;
- int bs = FZAP_BLOCK_SHIFT(zap);
- int hepb = 1<<(bs-4);
- /* hepb = half the number of entries in a block */
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(tbl->zt_blk != 0);
- ASSERT(tbl->zt_numblks > 0);
-
- if (tbl->zt_nextblk != 0) {
- newblk = tbl->zt_nextblk;
- } else {
- newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
- tbl->zt_nextblk = newblk;
- ASSERT0(tbl->zt_blks_copied);
- dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
- tbl->zt_blk << bs, tbl->zt_numblks << bs,
- ZIO_PRIORITY_SYNC_READ);
- }
-
- /*
- * Copy the ptrtbl from the old to new location.
- */
-
- uint64_t b = tbl->zt_blks_copied;
- dmu_buf_t *db_old;
- int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
- if (err != 0)
- return (err);
-
- /* first half of entries in old[b] go to new[2*b+0] */
- dmu_buf_t *db_new;
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
- dmu_buf_will_dirty(db_new, tx);
- transfer_func(db_old->db_data, db_new->db_data, hepb);
- dmu_buf_rele(db_new, FTAG);
-
- /* second half of entries in old[b] go to new[2*b+1] */
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
- dmu_buf_will_dirty(db_new, tx);
- transfer_func((uint64_t *)db_old->db_data + hepb,
- db_new->db_data, hepb);
- dmu_buf_rele(db_new, FTAG);
-
- dmu_buf_rele(db_old, FTAG);
-
- tbl->zt_blks_copied++;
-
- dprintf("copied block %llu of %llu\n",
- tbl->zt_blks_copied, tbl->zt_numblks);
-
- if (tbl->zt_blks_copied == tbl->zt_numblks) {
- (void) dmu_free_range(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
-
- tbl->zt_blk = newblk;
- tbl->zt_numblks *= 2;
- tbl->zt_shift++;
- tbl->zt_nextblk = 0;
- tbl->zt_blks_copied = 0;
-
- dprintf("finished; numblocks now %llu (%lluk entries)\n",
- tbl->zt_numblks, 1<<(tbl->zt_shift-10));
- }
-
- return (0);
-}
-
-static int
-zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
- dmu_tx_t *tx)
-{
- int bs = FZAP_BLOCK_SHIFT(zap);
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- ASSERT(tbl->zt_blk != 0);
-
- dprintf("storing %llx at index %llx\n", val, idx);
-
- uint64_t blk = idx >> (bs-3);
- uint64_t off = idx & ((1<<(bs-3))-1);
-
- dmu_buf_t *db;
- int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
- if (err != 0)
- return (err);
- dmu_buf_will_dirty(db, tx);
-
- if (tbl->zt_nextblk != 0) {
- uint64_t idx2 = idx * 2;
- uint64_t blk2 = idx2 >> (bs-3);
- uint64_t off2 = idx2 & ((1<<(bs-3))-1);
- dmu_buf_t *db2;
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
- DMU_READ_NO_PREFETCH);
- if (err != 0) {
- dmu_buf_rele(db, FTAG);
- return (err);
- }
- dmu_buf_will_dirty(db2, tx);
- ((uint64_t *)db2->db_data)[off2] = val;
- ((uint64_t *)db2->db_data)[off2+1] = val;
- dmu_buf_rele(db2, FTAG);
- }
-
- ((uint64_t *)db->db_data)[off] = val;
- dmu_buf_rele(db, FTAG);
-
- return (0);
-}
-
-static int
-zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
-{
- int bs = FZAP_BLOCK_SHIFT(zap);
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- uint64_t blk = idx >> (bs-3);
- uint64_t off = idx & ((1<<(bs-3))-1);
-
- /*
- * Note: this is equivalent to dmu_buf_hold(), but we use
- * _dnode_enter / _by_dnode because it's faster because we don't
- * have to hold the dnode.
- */
- dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
- dmu_buf_t *db;
- int err = dmu_buf_hold_by_dnode(dn,
- (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
- dmu_buf_dnode_exit(zap->zap_dbuf);
- if (err != 0)
- return (err);
- *valp = ((uint64_t *)db->db_data)[off];
- dmu_buf_rele(db, FTAG);
-
- if (tbl->zt_nextblk != 0) {
- /*
- * read the nextblk for the sake of i/o error checking,
- * so that zap_table_load() will catch errors for
- * zap_table_store.
- */
- blk = (idx*2) >> (bs-3);
-
- dn = dmu_buf_dnode_enter(zap->zap_dbuf);
- err = dmu_buf_hold_by_dnode(dn,
- (tbl->zt_nextblk + blk) << bs, FTAG, &db,
- DMU_READ_NO_PREFETCH);
- dmu_buf_dnode_exit(zap->zap_dbuf);
- if (err == 0)
- dmu_buf_rele(db, FTAG);
- }
- return (err);
-}
-
-/*
- * Routines for growing the ptrtbl.
- */
-
-static void
-zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
-{
- for (int i = 0; i < n; i++) {
- uint64_t lb = src[i];
- dst[2 * i + 0] = lb;
- dst[2 * i + 1] = lb;
- }
-}
-
-static int
-zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
-{
- /*
- * The pointer table should never use more hash bits than we
- * have (otherwise we'd be using useless zero bits to index it).
- * If we are within 2 bits of running out, stop growing, since
- * this is already an aberrant condition.
- */
- if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
- return (SET_ERROR(ENOSPC));
-
- if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
- /*
- * We are outgrowing the "embedded" ptrtbl (the one
- * stored in the header block). Give it its own entire
- * block, which will double the size of the ptrtbl.
- */
- ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
- ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
-
- uint64_t newblk = zap_allocate_blocks(zap, 1);
- dmu_buf_t *db_new;
- int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
- DMU_READ_NO_PREFETCH);
- if (err != 0)
- return (err);
- dmu_buf_will_dirty(db_new, tx);
- zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
- db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- dmu_buf_rele(db_new, FTAG);
-
- zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
- zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
- zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
-
- ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
- zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
- (FZAP_BLOCK_SHIFT(zap)-3));
-
- return (0);
- } else {
- return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
- zap_ptrtbl_transfer, tx));
- }
-}
-
-static void
-zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
-{
- dmu_buf_will_dirty(zap->zap_dbuf, tx);
- mutex_enter(&zap->zap_f.zap_num_entries_mtx);
- ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
- zap_f_phys(zap)->zap_num_entries += delta;
- mutex_exit(&zap->zap_f.zap_num_entries_mtx);
-}
-
-static uint64_t
-zap_allocate_blocks(zap_t *zap, int nblocks)
-{
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
- zap_f_phys(zap)->zap_freeblk += nblocks;
- return (newblk);
-}
-
-static void
-zap_leaf_evict_sync(void *dbu)
-{
- zap_leaf_t *l = dbu;
-
- rw_destroy(&l->l_rwlock);
- kmem_free(l, sizeof (zap_leaf_t));
-}
-
-static zap_leaf_t *
-zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
-{
- zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- rw_init(&l->l_rwlock, 0, 0, 0);
- rw_enter(&l->l_rwlock, RW_WRITER);
- l->l_blkid = zap_allocate_blocks(zap, 1);
- l->l_dbuf = NULL;
-
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
- DMU_READ_NO_PREFETCH));
- dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
- VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
- dmu_buf_will_dirty(l->l_dbuf, tx);
-
- zap_leaf_init(l, zap->zap_normflags != 0);
-
- zap_f_phys(zap)->zap_num_leafs++;
-
- return (l);
-}
-
-int
-fzap_count(zap_t *zap, uint64_t *count)
-{
- ASSERT(!zap->zap_ismicro);
- mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
- *count = zap_f_phys(zap)->zap_num_entries;
- mutex_exit(&zap->zap_f.zap_num_entries_mtx);
- return (0);
-}
-
-/*
- * Routines for obtaining zap_leaf_t's
- */
-
-void
-zap_put_leaf(zap_leaf_t *l)
-{
- rw_exit(&l->l_rwlock);
- dmu_buf_rele(l->l_dbuf, NULL);
-}
-
-static zap_leaf_t *
-zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
-{
- ASSERT(blkid != 0);
-
- zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
- rw_init(&l->l_rwlock, 0, 0, 0);
- rw_enter(&l->l_rwlock, RW_WRITER);
- l->l_blkid = blkid;
- l->l_bs = highbit64(db->db_size) - 1;
- l->l_dbuf = db;
-
- dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
- zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
-
- rw_exit(&l->l_rwlock);
- if (winner != NULL) {
- /* someone else set it first */
- zap_leaf_evict_sync(&l->l_dbu);
- l = winner;
- }
-
- /*
- * lhr_pad was previously used for the next leaf in the leaf
- * chain. There should be no chained leafs (as we have removed
- * support for them).
- */
- ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
-
- /*
- * There should be more hash entries than there can be
- * chunks to put in the hash table
- */
- ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
-
- /* The chunks should begin at the end of the hash table */
- ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
- &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
-
- /* The chunks should end at the end of the block */
- ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
- (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
-
- return (l);
-}
-
-static int
-zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
- zap_leaf_t **lp)
-{
- dmu_buf_t *db;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- int bs = FZAP_BLOCK_SHIFT(zap);
- dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
- int err = dmu_buf_hold_by_dnode(dn,
- blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
- dmu_buf_dnode_exit(zap->zap_dbuf);
- if (err != 0)
- return (err);
-
- ASSERT3U(db->db_object, ==, zap->zap_object);
- ASSERT3U(db->db_offset, ==, blkid << bs);
- ASSERT3U(db->db_size, ==, 1 << bs);
- ASSERT(blkid != 0);
-
- zap_leaf_t *l = dmu_buf_get_user(db);
-
- if (l == NULL)
- l = zap_open_leaf(blkid, db);
-
- rw_enter(&l->l_rwlock, lt);
- /*
- * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
- * causing ASSERT below to fail.
- */
- if (lt == RW_WRITER)
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(l->l_blkid, ==, blkid);
- ASSERT3P(l->l_dbuf, ==, db);
- ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
- ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- *lp = l;
- return (0);
-}
-
-static int
-zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
-{
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
- ASSERT3U(idx, <,
- (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
- *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
- return (0);
- } else {
- return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
- idx, valp));
- }
-}
-
-static int
-zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
-{
- ASSERT(tx != NULL);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
- ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
- return (0);
- } else {
- return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
- idx, blk, tx));
- }
-}
-
-static int
-zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
-{
- uint64_t blk;
-
- ASSERT(zap->zap_dbuf == NULL ||
- zap_f_phys(zap) == zap->zap_dbuf->db_data);
-
- /* Reality check for corrupt zap objects (leaf or header). */
- if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
- zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
- zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
- return (SET_ERROR(EIO));
- }
-
- uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
- int err = zap_idx_to_blk(zap, idx, &blk);
- if (err != 0)
- return (err);
- err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
-
- ASSERT(err ||
- ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
- zap_leaf_phys(*lp)->l_hdr.lh_prefix);
- return (err);
-}
-
-static int
-zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
- void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
-{
- zap_t *zap = zn->zn_zap;
- uint64_t hash = zn->zn_hash;
- int err;
- int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
-
- ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- zap_leaf_phys(l)->l_hdr.lh_prefix);
-
- if (zap_tryupgradedir(zap, tx) == 0 ||
- old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
- /* We failed to upgrade, or need to grow the pointer table */
- objset_t *os = zap->zap_objset;
- uint64_t object = zap->zap_object;
-
- zap_put_leaf(l);
- zap_unlockdir(zap, tag);
- err = zap_lockdir(os, object, tx, RW_WRITER,
- FALSE, FALSE, tag, &zn->zn_zap);
- zap = zn->zn_zap;
- if (err != 0)
- return (err);
- ASSERT(!zap->zap_ismicro);
-
- while (old_prefix_len ==
- zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
- err = zap_grow_ptrtbl(zap, tx);
- if (err != 0)
- return (err);
- }
-
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
-
- if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
- /* it split while our locks were down */
- *lp = l;
- return (0);
- }
- }
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
- ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- zap_leaf_phys(l)->l_hdr.lh_prefix);
-
- int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
- (old_prefix_len + 1);
- uint64_t sibling =
- (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
-
- /* check for i/o errors before doing zap_leaf_split */
- for (int i = 0; i < (1ULL << prefix_diff); i++) {
- uint64_t blk;
- err = zap_idx_to_blk(zap, sibling + i, &blk);
- if (err != 0)
- return (err);
- ASSERT3U(blk, ==, l->l_blkid);
- }
-
- zap_leaf_t *nl = zap_create_leaf(zap, tx);
- zap_leaf_split(l, nl, zap->zap_normflags != 0);
-
- /* set sibling pointers */
- for (int i = 0; i < (1ULL << prefix_diff); i++) {
- err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
- ASSERT0(err); /* we checked for i/o errors above */
- }
-
- if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
- /* we want the sibling */
- zap_put_leaf(l);
- *lp = nl;
- } else {
- zap_put_leaf(nl);
- *lp = l;
- }
-
- return (0);
-}
-
-static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
- void *tag, dmu_tx_t *tx)
-{
- zap_t *zap = zn->zn_zap;
- int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
- int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
- zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
-
- zap_put_leaf(l);
-
- if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
- /*
- * We are in the middle of growing the pointer table, or
- * this leaf will soon make us grow it.
- */
- if (zap_tryupgradedir(zap, tx) == 0) {
- objset_t *os = zap->zap_objset;
- uint64_t zapobj = zap->zap_object;
-
- zap_unlockdir(zap, tag);
- int err = zap_lockdir(os, zapobj, tx,
- RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
- zap = zn->zn_zap;
- if (err != 0)
- return;
- }
-
- /* could have finished growing while our locks were down */
- if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
- (void) zap_grow_ptrtbl(zap, tx);
- }
-}
-
-static int
-fzap_checkname(zap_name_t *zn)
-{
- if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
- return (SET_ERROR(ENAMETOOLONG));
- return (0);
-}
-
-static int
-fzap_checksize(uint64_t integer_size, uint64_t num_integers)
-{
- /* Only integer sizes supported by C */
- switch (integer_size) {
- case 1:
- case 2:
- case 4:
- case 8:
- break;
- default:
- return (SET_ERROR(EINVAL));
- }
-
- if (integer_size * num_integers > ZAP_MAXVALUELEN)
- return (E2BIG);
-
- return (0);
-}
-
-static int
-fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
-{
- int err = fzap_checkname(zn);
- if (err != 0)
- return (err);
- return (fzap_checksize(integer_size, num_integers));
-}
-
-/*
- * Routines for manipulating attributes.
- */
-int
-fzap_lookup(zap_name_t *zn,
- uint64_t integer_size, uint64_t num_integers, void *buf,
- char *realname, int rn_len, boolean_t *ncp)
-{
- zap_leaf_t *l;
- zap_entry_handle_t zeh;
-
- int err = fzap_checkname(zn);
- if (err != 0)
- return (err);
-
- err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
- if (err != 0)
- return (err);
- err = zap_leaf_lookup(l, zn, &zeh);
- if (err == 0) {
- if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
- zap_put_leaf(l);
- return (err);
- }
-
- err = zap_entry_read(&zeh, integer_size, num_integers, buf);
- (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
- if (ncp) {
- *ncp = zap_entry_normalization_conflict(&zeh,
- zn, NULL, zn->zn_zap);
- }
- }
-
- zap_put_leaf(l);
- return (err);
-}
-
-int
-fzap_add_cd(zap_name_t *zn,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- int err;
- zap_entry_handle_t zeh;
- zap_t *zap = zn->zn_zap;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- ASSERT(!zap->zap_ismicro);
- ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
-
- err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
-retry:
- err = zap_leaf_lookup(l, zn, &zeh);
- if (err == 0) {
- err = SET_ERROR(EEXIST);
- goto out;
- }
- if (err != ENOENT)
- goto out;
-
- err = zap_entry_create(l, zn, cd,
- integer_size, num_integers, val, &zeh);
-
- if (err == 0) {
- zap_increment_num_entries(zap, 1, tx);
- } else if (err == EAGAIN) {
- err = zap_expand_leaf(zn, l, tag, tx, &l);
- zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
- if (err == 0)
- goto retry;
- }
-
-out:
- if (zap != NULL)
- zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
- return (err);
-}
-
-int
-fzap_add(zap_name_t *zn,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, void *tag, dmu_tx_t *tx)
-{
- int err = fzap_check(zn, integer_size, num_integers);
- if (err != 0)
- return (err);
-
- return (fzap_add_cd(zn, integer_size, num_integers,
- val, ZAP_NEED_CD, tag, tx));
-}
-
-int
-fzap_update(zap_name_t *zn,
- int integer_size, uint64_t num_integers, const void *val,
- void *tag, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- int err;
- boolean_t create;
- zap_entry_handle_t zeh;
- zap_t *zap = zn->zn_zap;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- err = fzap_check(zn, integer_size, num_integers);
- if (err != 0)
- return (err);
-
- err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
-retry:
- err = zap_leaf_lookup(l, zn, &zeh);
- create = (err == ENOENT);
- ASSERT(err == 0 || err == ENOENT);
-
- if (create) {
- err = zap_entry_create(l, zn, ZAP_NEED_CD,
- integer_size, num_integers, val, &zeh);
- if (err == 0)
- zap_increment_num_entries(zap, 1, tx);
- } else {
- err = zap_entry_update(&zeh, integer_size, num_integers, val);
- }
-
- if (err == EAGAIN) {
- err = zap_expand_leaf(zn, l, tag, tx, &l);
- zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
- if (err == 0)
- goto retry;
- }
-
- if (zap != NULL)
- zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
- return (err);
-}
-
-int
-fzap_length(zap_name_t *zn,
- uint64_t *integer_size, uint64_t *num_integers)
-{
- zap_leaf_t *l;
- int err;
- zap_entry_handle_t zeh;
-
- err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
- if (err != 0)
- return (err);
- err = zap_leaf_lookup(l, zn, &zeh);
- if (err != 0)
- goto out;
-
- if (integer_size != 0)
- *integer_size = zeh.zeh_integer_size;
- if (num_integers != 0)
- *num_integers = zeh.zeh_num_integers;
-out:
- zap_put_leaf(l);
- return (err);
-}
-
-int
-fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- int err;
- zap_entry_handle_t zeh;
-
- err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
- err = zap_leaf_lookup(l, zn, &zeh);
- if (err == 0) {
- zap_entry_remove(&zeh);
- zap_increment_num_entries(zn->zn_zap, -1, tx);
- }
- zap_put_leaf(l);
- return (err);
-}
-
-void
-fzap_prefetch(zap_name_t *zn)
-{
- uint64_t blk;
- zap_t *zap = zn->zn_zap;
-
- uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
- zap_f_phys(zap)->zap_ptrtbl.zt_shift);
- if (zap_idx_to_blk(zap, idx, &blk) != 0)
- return;
- int bs = FZAP_BLOCK_SHIFT(zap);
- dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
- ZIO_PRIORITY_SYNC_READ);
-}
-
-/*
- * Helper functions for consumers.
- */
-
-uint64_t
-zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
- const char *name, dmu_tx_t *tx)
-{
- return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
-}
-
-uint64_t
-zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
- const char *name, int dnodesize, dmu_tx_t *tx)
-{
- uint64_t new_obj;
-
- VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0,
- dnodesize, tx)) > 0);
- VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
- tx));
-
- return (new_obj);
-}
-
-int
-zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
- char *name)
-{
- zap_cursor_t zc;
- int err;
-
- if (mask == 0)
- mask = -1ULL;
-
- zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
- for (zap_cursor_init(&zc, os, zapobj);
- (err = zap_cursor_retrieve(&zc, za)) == 0;
- zap_cursor_advance(&zc)) {
- if ((za->za_first_integer & mask) == (value & mask)) {
- (void) strcpy(name, za->za_name);
- break;
- }
- }
- zap_cursor_fini(&zc);
- kmem_free(za, sizeof (*za));
- return (err);
-}
-
-int
-zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
-{
- zap_cursor_t zc;
- int err = 0;
-
- zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
- for (zap_cursor_init(&zc, os, fromobj);
- zap_cursor_retrieve(&zc, za) == 0;
- (void) zap_cursor_advance(&zc)) {
- if (za->za_integer_length != 8 || za->za_num_integers != 1) {
- err = SET_ERROR(EINVAL);
- break;
- }
- err = zap_add(os, intoobj, za->za_name,
- 8, 1, &za->za_first_integer, tx);
- if (err != 0)
- break;
- }
- zap_cursor_fini(&zc);
- kmem_free(za, sizeof (*za));
- return (err);
-}
-
-int
-zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
- uint64_t value, dmu_tx_t *tx)
-{
- zap_cursor_t zc;
- int err = 0;
-
- zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
- for (zap_cursor_init(&zc, os, fromobj);
- zap_cursor_retrieve(&zc, za) == 0;
- (void) zap_cursor_advance(&zc)) {
- if (za->za_integer_length != 8 || za->za_num_integers != 1) {
- err = SET_ERROR(EINVAL);
- break;
- }
- err = zap_add(os, intoobj, za->za_name,
- 8, 1, &value, tx);
- if (err != 0)
- break;
- }
- zap_cursor_fini(&zc);
- kmem_free(za, sizeof (*za));
- return (err);
-}
-
-int
-zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
- dmu_tx_t *tx)
-{
- zap_cursor_t zc;
- int err = 0;
-
- zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
- for (zap_cursor_init(&zc, os, fromobj);
- zap_cursor_retrieve(&zc, za) == 0;
- (void) zap_cursor_advance(&zc)) {
- uint64_t delta = 0;
-
- if (za->za_integer_length != 8 || za->za_num_integers != 1) {
- err = SET_ERROR(EINVAL);
- break;
- }
-
- err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta);
- if (err != 0 && err != ENOENT)
- break;
- delta += za->za_first_integer;
- err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx);
- if (err != 0)
- break;
- }
- zap_cursor_fini(&zc);
- kmem_free(za, sizeof (*za));
- return (err);
-}
-
-int
-zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
-{
- char name[20];
-
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
- return (zap_add(os, obj, name, 8, 1, &value, tx));
-}
-
-int
-zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
-{
- char name[20];
-
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
- return (zap_remove(os, obj, name, tx));
-}
-
-int
-zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
-{
- char name[20];
-
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
- return (zap_lookup(os, obj, name, 8, 1, &value));
-}
-
-int
-zap_add_int_key(objset_t *os, uint64_t obj,
- uint64_t key, uint64_t value, dmu_tx_t *tx)
-{
- char name[20];
-
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
- return (zap_add(os, obj, name, 8, 1, &value, tx));
-}
-
-int
-zap_update_int_key(objset_t *os, uint64_t obj,
- uint64_t key, uint64_t value, dmu_tx_t *tx)
-{
- char name[20];
-
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
- return (zap_update(os, obj, name, 8, 1, &value, tx));
-}
-
-int
-zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
-{
- char name[20];
-
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
- return (zap_lookup(os, obj, name, 8, 1, valuep));
-}
-
-int
-zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
- dmu_tx_t *tx)
-{
- uint64_t value = 0;
-
- if (delta == 0)
- return (0);
-
- int err = zap_lookup(os, obj, name, 8, 1, &value);
- if (err != 0 && err != ENOENT)
- return (err);
- value += delta;
- if (value == 0)
- err = zap_remove(os, obj, name, tx);
- else
- err = zap_update(os, obj, name, 8, 1, &value, tx);
- return (err);
-}
-
-int
-zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
- dmu_tx_t *tx)
-{
- char name[20];
-
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
- return (zap_increment(os, obj, name, delta, tx));
-}
-
-/*
- * Routines for iterating over the attributes.
- */
-
-int
-fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
-{
- int err = ENOENT;
- zap_entry_handle_t zeh;
- zap_leaf_t *l;
-
- /* retrieve the next entry at or after zc_hash/zc_cd */
- /* if no entry, return ENOENT */
-
- /*
- * If we are reading from the beginning, we're almost
- * certain to iterate over the entire ZAP object. If there are
- * multiple leaf blocks (freeblk > 2), prefetch the whole
- * object, so that we read the leaf blocks concurrently.
- * (Unless noprefetch was requested via zap_cursor_init_noprefetch()).
- */
- if (zc->zc_hash == 0 && zap_iterate_prefetch &&
- zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
- dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
- zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
- ZIO_PRIORITY_ASYNC_READ);
- }
-
- if (zc->zc_leaf &&
- (ZAP_HASH_IDX(zc->zc_hash,
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- }
-
-again:
- if (zc->zc_leaf == NULL) {
- err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
- &zc->zc_leaf);
- if (err != 0)
- return (err);
- } else {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- }
- l = zc->zc_leaf;
-
- err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
-
- if (err == ENOENT) {
- uint64_t nocare =
- (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
- zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
- zc->zc_cd = 0;
- if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
- zc->zc_hash == 0) {
- zc->zc_hash = -1ULL;
- } else {
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- goto again;
- }
- }
-
- if (err == 0) {
- zc->zc_hash = zeh.zeh_hash;
- zc->zc_cd = zeh.zeh_cd;
- za->za_integer_length = zeh.zeh_integer_size;
- za->za_num_integers = zeh.zeh_num_integers;
- if (zeh.zeh_num_integers == 0) {
- za->za_first_integer = 0;
- } else {
- err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
- ASSERT(err == 0 || err == EOVERFLOW);
- }
- err = zap_entry_read_name(zap, &zeh,
- sizeof (za->za_name), za->za_name);
- ASSERT(err == 0);
-
- za->za_normalization_conflict =
- zap_entry_normalization_conflict(&zeh,
- NULL, za->za_name, zap);
- }
- rw_exit(&zc->zc_leaf->l_rwlock);
- return (err);
-}
-
-static void
-zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
-{
- uint64_t lastblk = 0;
-
- /*
- * NB: if a leaf has more pointers than an entire ptrtbl block
- * can hold, then it'll be accounted for more than once, since
- * we won't have lastblk.
- */
- for (int i = 0; i < len; i++) {
- zap_leaf_t *l;
-
- if (tbl[i] == lastblk)
- continue;
- lastblk = tbl[i];
-
- int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
- if (err == 0) {
- zap_leaf_stats(zap, l, zs);
- zap_put_leaf(l);
- }
- }
-}
-
-int
-fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn)
-{
- int err;
- zap_leaf_t *l;
- zap_entry_handle_t zeh;
-
- if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
- return (SET_ERROR(ENAMETOOLONG));
-
- err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
- if (err != 0)
- return (err);
-
- err = zap_leaf_lookup(l, zn, &zeh);
- if (err != 0)
- return (err);
-
- zc->zc_leaf = l;
- zc->zc_hash = zeh.zeh_hash;
- zc->zc_cd = zeh.zeh_cd;
-
- return (err);
-}
-
-void
-fzap_get_stats(zap_t *zap, zap_stats_t *zs)
-{
- int bs = FZAP_BLOCK_SHIFT(zap);
- zs->zs_blocksize = 1ULL << bs;
-
- /*
- * Set zap_phys_t fields
- */
- zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
- zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
- zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
- zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
- zs->zs_magic = zap_f_phys(zap)->zap_magic;
- zs->zs_salt = zap_f_phys(zap)->zap_salt;
-
- /*
- * Set zap_ptrtbl fields
- */
- zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
- zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
- zs->zs_ptrtbl_blks_copied =
- zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
- zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
- zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
- zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
-
- if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
- /* the ptrtbl is entirely in the header block. */
- zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
- 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
- } else {
- dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
- zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
- zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
- ZIO_PRIORITY_SYNC_READ);
-
- for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
- b++) {
- dmu_buf_t *db;
- int err;
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
- FTAG, &db, DMU_READ_NO_PREFETCH);
- if (err == 0) {
- zap_stats_ptrtbl(zap, db->db_data,
- 1<<(bs-3), zs);
- dmu_buf_rele(db, FTAG);
- }
- }
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -1,849 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-/*
- * The 512-byte leaf is broken into 32 16-byte chunks.
- * chunk number n means l_chunk[n], even though the header precedes it.
- * the names are stored null-terminated.
- */
-
-#include <sys/zio.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zap.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/arc.h>
-
-static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
-
-#define CHAIN_END 0xffff /* end of the chunk chain */
-
-/* half the (current) minimum block size */
-#define MAX_ARRAY_BYTES (8<<10)
-
-#define LEAF_HASH(l, h) \
- ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
- ((h) >> \
- (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
-
-#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
-
-extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
-
-static void
-zap_memset(void *a, int c, size_t n)
-{
- char *cp = a;
- char *cpend = cp + n;
-
- while (cp < cpend)
- *cp++ = c;
-}
-
-static void
-stv(int len, void *addr, uint64_t value)
-{
- switch (len) {
- case 1:
- *(uint8_t *)addr = value;
- return;
- case 2:
- *(uint16_t *)addr = value;
- return;
- case 4:
- *(uint32_t *)addr = value;
- return;
- case 8:
- *(uint64_t *)addr = value;
- return;
- }
- ASSERT(!"bad int len");
-}
-
-static uint64_t
-ldv(int len, const void *addr)
-{
- switch (len) {
- case 1:
- return (*(uint8_t *)addr);
- case 2:
- return (*(uint16_t *)addr);
- case 4:
- return (*(uint32_t *)addr);
- case 8:
- return (*(uint64_t *)addr);
- }
- ASSERT(!"bad int len");
- return (0xFEEDFACEDEADBEEFULL);
-}
-
-void
-zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
-{
- zap_leaf_t l;
- dmu_buf_t l_dbuf;
-
- l_dbuf.db_data = buf;
- l.l_bs = highbit64(size) - 1;
- l.l_dbuf = &l_dbuf;
-
- buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
- buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
- buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
- buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
- buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
- buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
- buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
-
- for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
- buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
-
- for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
- zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
- struct zap_leaf_entry *le;
-
- switch (lc->l_free.lf_type) {
- case ZAP_CHUNK_ENTRY:
- le = &lc->l_entry;
-
- le->le_type = BSWAP_8(le->le_type);
- le->le_value_intlen = BSWAP_8(le->le_value_intlen);
- le->le_next = BSWAP_16(le->le_next);
- le->le_name_chunk = BSWAP_16(le->le_name_chunk);
- le->le_name_numints = BSWAP_16(le->le_name_numints);
- le->le_value_chunk = BSWAP_16(le->le_value_chunk);
- le->le_value_numints = BSWAP_16(le->le_value_numints);
- le->le_cd = BSWAP_32(le->le_cd);
- le->le_hash = BSWAP_64(le->le_hash);
- break;
- case ZAP_CHUNK_FREE:
- lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
- lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
- break;
- case ZAP_CHUNK_ARRAY:
- lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
- lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
- /* la_array doesn't need swapping */
- break;
- default:
- ASSERT(!"bad leaf type");
- }
- }
-}
-
-void
-zap_leaf_init(zap_leaf_t *l, boolean_t sort)
-{
- l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
- zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
- sizeof (struct zap_leaf_header));
- zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
- 2*ZAP_LEAF_HASH_NUMENTRIES(l));
- for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
- ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
- ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
- }
- ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
- zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
- zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
- zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
- if (sort)
- zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
-}
-
-/*
- * Routines which manipulate leaf chunks (l_chunk[]).
- */
-
-static uint16_t
-zap_leaf_chunk_alloc(zap_leaf_t *l)
-{
- ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
-
- int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
-
- zap_leaf_phys(l)->l_hdr.lh_freelist =
- ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
-
- zap_leaf_phys(l)->l_hdr.lh_nfree--;
-
- return (chunk);
-}
-
-static void
-zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
-{
- struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
- ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
-
- zlf->lf_type = ZAP_CHUNK_FREE;
- zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
- bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
- zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
-
- zap_leaf_phys(l)->l_hdr.lh_nfree++;
-}
-
-/*
- * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
- */
-
-static uint16_t
-zap_leaf_array_create(zap_leaf_t *l, const char *buf,
- int integer_size, int num_integers)
-{
- uint16_t chunk_head;
- uint16_t *chunkp = &chunk_head;
- int byten = 0;
- uint64_t value = 0;
- int shift = (integer_size - 1) * 8;
- int len = num_integers;
-
- ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
-
- while (len > 0) {
- uint16_t chunk = zap_leaf_chunk_alloc(l);
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
-
- la->la_type = ZAP_CHUNK_ARRAY;
- for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
- if (byten == 0)
- value = ldv(integer_size, buf);
- la->la_array[i] = value >> shift;
- value <<= 8;
- if (++byten == integer_size) {
- byten = 0;
- buf += integer_size;
- if (--len == 0)
- break;
- }
- }
-
- *chunkp = chunk;
- chunkp = &la->la_next;
- }
- *chunkp = CHAIN_END;
-
- return (chunk_head);
-}
-
-static void
-zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
-{
- uint16_t chunk = *chunkp;
-
- *chunkp = CHAIN_END;
-
- while (chunk != CHAIN_END) {
- int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
- ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
- ZAP_CHUNK_ARRAY);
- zap_leaf_chunk_free(l, chunk);
- chunk = nextchunk;
- }
-}
-
-/* array_len and buf_len are in integers, not bytes */
-static void
-zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
- int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
- void *buf)
-{
- int len = MIN(array_len, buf_len);
- int byten = 0;
- uint64_t value = 0;
- char *p = buf;
-
- ASSERT3U(array_int_len, <=, buf_int_len);
-
- /* Fast path for one 8-byte integer */
- if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- uint8_t *ip = la->la_array;
- uint64_t *buf64 = buf;
-
- *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
- (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
- (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
- (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
- return;
- }
-
- /* Fast path for an array of 1-byte integers (eg. the entry name) */
- if (array_int_len == 1 && buf_int_len == 1 &&
- buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
- while (chunk != CHAIN_END) {
- struct zap_leaf_array *la =
- &ZAP_LEAF_CHUNK(l, chunk).l_array;
- bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
- p += ZAP_LEAF_ARRAY_BYTES;
- chunk = la->la_next;
- }
- return;
- }
-
- while (len > 0) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
- value = (value << 8) | la->la_array[i];
- byten++;
- if (byten == array_int_len) {
- stv(buf_int_len, p, value);
- byten = 0;
- len--;
- if (len == 0)
- return;
- p += buf_int_len;
- }
- }
- chunk = la->la_next;
- }
-}
-
-static boolean_t
-zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
- int chunk, int array_numints)
-{
- int bseen = 0;
-
- if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
- uint64_t *thiskey =
- kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
- ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
-
- zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
- sizeof (*thiskey), array_numints, thiskey);
- boolean_t match = bcmp(thiskey, zn->zn_key_orig,
- array_numints * sizeof (*thiskey)) == 0;
- kmem_free(thiskey, array_numints * sizeof (*thiskey));
- return (match);
- }
-
- ASSERT(zn->zn_key_intlen == 1);
- if (zn->zn_matchtype & MT_NORMALIZE) {
- char *thisname = kmem_alloc(array_numints, KM_SLEEP);
-
- zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
- sizeof (char), array_numints, thisname);
- boolean_t match = zap_match(zn, thisname);
- kmem_free(thisname, array_numints);
- return (match);
- }
-
- /*
- * Fast path for exact matching.
- * First check that the lengths match, so that we don't read
- * past the end of the zn_key_orig array.
- */
- if (array_numints != zn->zn_key_orig_numints)
- return (B_FALSE);
- while (bseen < array_numints) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
- break;
- chunk = la->la_next;
- bseen += toread;
- }
- return (bseen == array_numints);
-}
-
-/*
- * Routines which manipulate leaf entries.
- */
-
-int
-zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
-{
- struct zap_leaf_entry *le;
-
- ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
- *chunkp != CHAIN_END; chunkp = &le->le_next) {
- uint16_t chunk = *chunkp;
- le = ZAP_LEAF_ENTRY(l, chunk);
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- if (le->le_hash != zn->zn_hash)
- continue;
-
- /*
- * NB: the entry chain is always sorted by cd on
- * normalized zap objects, so this will find the
- * lowest-cd match for MT_NORMALIZE.
- */
- ASSERT((zn->zn_matchtype == 0) ||
- (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
- if (zap_leaf_array_match(l, zn, le->le_name_chunk,
- le->le_name_numints)) {
- zeh->zeh_num_integers = le->le_value_numints;
- zeh->zeh_integer_size = le->le_value_intlen;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_chunkp = chunkp;
- zeh->zeh_leaf = l;
- return (0);
- }
- }
-
- return (SET_ERROR(ENOENT));
-}
-
-/* Return (h1,cd1 >= h2,cd2) */
-#define HCD_GTEQ(h1, cd1, h2, cd2) \
- ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
-
-int
-zap_leaf_lookup_closest(zap_leaf_t *l,
- uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
-{
- uint64_t besth = -1ULL;
- uint32_t bestcd = -1U;
- uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
- struct zap_leaf_entry *le;
-
- ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
- for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(l, chunk);
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
- HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
- ASSERT3U(bestlh, >=, lh);
- bestlh = lh;
- besth = le->le_hash;
- bestcd = le->le_cd;
-
- zeh->zeh_num_integers = le->le_value_numints;
- zeh->zeh_integer_size = le->le_value_intlen;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_fakechunk = chunk;
- zeh->zeh_chunkp = &zeh->zeh_fakechunk;
- zeh->zeh_leaf = l;
- }
- }
- }
-
- return (bestcd == -1U ? ENOENT : 0);
-}
-
-int
-zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf)
-{
- struct zap_leaf_entry *le =
- ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- if (le->le_value_intlen > integer_size)
- return (SET_ERROR(EINVAL));
-
- zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
- le->le_value_intlen, le->le_value_numints,
- integer_size, num_integers, buf);
-
- if (zeh->zeh_num_integers > num_integers)
- return (SET_ERROR(EOVERFLOW));
- return (0);
-
-}
-
-int
-zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
- char *buf)
-{
- struct zap_leaf_entry *le =
- ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
- zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
- le->le_name_numints, 8, buflen / 8, buf);
- } else {
- zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
- le->le_name_numints, 1, buflen, buf);
- }
- if (le->le_name_numints > buflen)
- return (SET_ERROR(EOVERFLOW));
- return (0);
-}
-
-int
-zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf)
-{
- zap_leaf_t *l = zeh->zeh_leaf;
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
-
- int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
-
- if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
- return (SET_ERROR(EAGAIN));
-
- zap_leaf_array_free(l, &le->le_value_chunk);
- le->le_value_chunk =
- zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_numints = num_integers;
- le->le_value_intlen = integer_size;
- return (0);
-}
-
-void
-zap_entry_remove(zap_entry_handle_t *zeh)
-{
- zap_leaf_t *l = zeh->zeh_leaf;
-
- ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
-
- uint16_t entry_chunk = *zeh->zeh_chunkp;
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- zap_leaf_array_free(l, &le->le_name_chunk);
- zap_leaf_array_free(l, &le->le_value_chunk);
-
- *zeh->zeh_chunkp = le->le_next;
- zap_leaf_chunk_free(l, entry_chunk);
-
- zap_leaf_phys(l)->l_hdr.lh_nentries--;
-}
-
-int
-zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh)
-{
- uint16_t chunk;
- struct zap_leaf_entry *le;
- uint64_t h = zn->zn_hash;
-
- uint64_t valuelen = integer_size * num_integers;
-
- int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
- zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
- if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
- return (E2BIG);
-
- if (cd == ZAP_NEED_CD) {
- /* find the lowest unused cd */
- if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
- cd = 0;
-
- for (chunk = *LEAF_HASH_ENTPTR(l, h);
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(l, chunk);
- if (le->le_cd > cd)
- break;
- if (le->le_hash == h) {
- ASSERT3U(cd, ==, le->le_cd);
- cd++;
- }
- }
- } else {
- /* old unsorted format; do it the O(n^2) way */
- for (cd = 0; ; cd++) {
- for (chunk = *LEAF_HASH_ENTPTR(l, h);
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(l, chunk);
- if (le->le_hash == h &&
- le->le_cd == cd) {
- break;
- }
- }
- /* If this cd is not in use, we are good. */
- if (chunk == CHAIN_END)
- break;
- }
- }
- /*
- * We would run out of space in a block before we could
- * store enough entries to run out of CD values.
- */
- ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
- }
-
- if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
- return (SET_ERROR(EAGAIN));
-
- /* make the entry */
- chunk = zap_leaf_chunk_alloc(l);
- le = ZAP_LEAF_ENTRY(l, chunk);
- le->le_type = ZAP_CHUNK_ENTRY;
- le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
- zn->zn_key_intlen, zn->zn_key_orig_numints);
- le->le_name_numints = zn->zn_key_orig_numints;
- le->le_value_chunk =
- zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_numints = num_integers;
- le->le_value_intlen = integer_size;
- le->le_hash = h;
- le->le_cd = cd;
-
- /* link it into the hash chain */
- /* XXX if we did the search above, we could just use that */
- uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
-
- zap_leaf_phys(l)->l_hdr.lh_nentries++;
-
- zeh->zeh_leaf = l;
- zeh->zeh_num_integers = num_integers;
- zeh->zeh_integer_size = le->le_value_intlen;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_chunkp = chunkp;
-
- return (0);
-}
-
-/*
- * Determine if there is another entry with the same normalized form.
- * For performance purposes, either zn or name must be provided (the
- * other can be NULL). Note, there usually won't be any hash
- * conflicts, in which case we don't need the concatenated/normalized
- * form of the name. But all callers have one of these on hand anyway,
- * so might as well take advantage. A cleaner but slower interface
- * would accept neither argument, and compute the normalized name as
- * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
- */
-boolean_t
-zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
- const char *name, zap_t *zap)
-{
- struct zap_leaf_entry *le;
- boolean_t allocdzn = B_FALSE;
-
- if (zap->zap_normflags == 0)
- return (B_FALSE);
-
- for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
- if (le->le_hash != zeh->zeh_hash)
- continue;
- if (le->le_cd == zeh->zeh_cd)
- continue;
-
- if (zn == NULL) {
- zn = zap_name_alloc(zap, name, MT_NORMALIZE);
- allocdzn = B_TRUE;
- }
- if (zap_leaf_array_match(zeh->zeh_leaf, zn,
- le->le_name_chunk, le->le_name_numints)) {
- if (allocdzn)
- zap_name_free(zn);
- return (B_TRUE);
- }
- }
- if (allocdzn)
- zap_name_free(zn);
- return (B_FALSE);
-}
-
-/*
- * Routines for transferring entries between leafs.
- */
-
-static uint16_t *
-zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
-{
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
- struct zap_leaf_entry *le2;
- uint16_t *chunkp;
-
- /*
- * keep the entry chain sorted by cd
- * NB: this will not cause problems for unsorted leafs, though
- * it is unnecessary there.
- */
- for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
- *chunkp != CHAIN_END; chunkp = &le2->le_next) {
- le2 = ZAP_LEAF_ENTRY(l, *chunkp);
- if (le2->le_cd > le->le_cd)
- break;
- }
-
- le->le_next = *chunkp;
- *chunkp = entry;
- return (chunkp);
-}
-
-static uint16_t
-zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
-{
- uint16_t new_chunk;
- uint16_t *nchunkp = &new_chunk;
-
- while (chunk != CHAIN_END) {
- uint16_t nchunk = zap_leaf_chunk_alloc(nl);
- struct zap_leaf_array *nla =
- &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
- struct zap_leaf_array *la =
- &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int nextchunk = la->la_next;
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
-
- *nla = *la; /* structure assignment */
-
- zap_leaf_chunk_free(l, chunk);
- chunk = nextchunk;
- *nchunkp = nchunk;
- nchunkp = &nla->la_next;
- }
- *nchunkp = CHAIN_END;
- return (new_chunk);
-}
-
-static void
-zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
-{
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- uint16_t chunk = zap_leaf_chunk_alloc(nl);
- struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
- *nle = *le; /* structure assignment */
-
- (void) zap_leaf_rehash_entry(nl, chunk);
-
- nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
- nle->le_value_chunk =
- zap_leaf_transfer_array(l, le->le_value_chunk, nl);
-
- zap_leaf_chunk_free(l, entry);
-
- zap_leaf_phys(l)->l_hdr.lh_nentries--;
- zap_leaf_phys(nl)->l_hdr.lh_nentries++;
-}
-
-/*
- * Transfer the entries whose hash prefix ends in 1 to the new leaf.
- */
-void
-zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
-{
- int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
-
- /* set new prefix and prefix_len */
- zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
- zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
- zap_leaf_phys(nl)->l_hdr.lh_prefix =
- zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
- zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
- zap_leaf_phys(l)->l_hdr.lh_prefix_len;
-
- /* break existing hash chains */
- zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
- 2*ZAP_LEAF_HASH_NUMENTRIES(l));
-
- if (sort)
- zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
-
- /*
- * Transfer entries whose hash bit 'bit' is set to nl; rehash
- * the remaining entries
- *
- * NB: We could find entries via the hashtable instead. That
- * would be O(hashents+numents) rather than O(numblks+numents),
- * but this accesses memory more sequentially, and when we're
- * called, the block is usually pretty full.
- */
- for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
- if (le->le_type != ZAP_CHUNK_ENTRY)
- continue;
-
- if (le->le_hash & (1ULL << bit))
- zap_leaf_transfer_entry(l, i, nl);
- else
- (void) zap_leaf_rehash_entry(l, i);
- }
-}
-
-void
-zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
-{
- int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
- zap_leaf_phys(l)->l_hdr.lh_prefix_len;
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_leafs_with_2n_pointers[n]++;
-
-
- n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_blocks_with_n5_entries[n]++;
-
- n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
- zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
- (1<<FZAP_BLOCK_SHIFT(zap));
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_blocks_n_tenths_full[n]++;
-
- for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
- int nentries = 0;
- int chunk = zap_leaf_phys(l)->l_hash[i];
-
- while (chunk != CHAIN_END) {
- struct zap_leaf_entry *le =
- ZAP_LEAF_ENTRY(l, chunk);
-
- n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
- le->le_value_intlen);
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_entries_using_n_chunks[n]++;
-
- chunk = le->le_next;
- nentries++;
- }
-
- n = nentries;
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_buckets_with_n_entries[n]++;
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -1,1609 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-#include <sys/zio.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zfs_context.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/avl.h>
-#include <sys/arc.h>
-#include <sys/dmu_objset.h>
-
-#ifdef _KERNEL
-#include <sys/sunddi.h>
-#endif
-
-extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
-
-static int mzap_upgrade(zap_t **zapp,
- void *tag, dmu_tx_t *tx, zap_flags_t flags);
-
-uint64_t
-zap_getflags(zap_t *zap)
-{
- if (zap->zap_ismicro)
- return (0);
- return (zap_f_phys(zap)->zap_flags);
-}
-
-int
-zap_hashbits(zap_t *zap)
-{
- if (zap_getflags(zap) & ZAP_FLAG_HASH64)
- return (48);
- else
- return (28);
-}
-
-uint32_t
-zap_maxcd(zap_t *zap)
-{
- if (zap_getflags(zap) & ZAP_FLAG_HASH64)
- return ((1<<16)-1);
- else
- return (-1U);
-}
-
-static uint64_t
-zap_hash(zap_name_t *zn)
-{
- zap_t *zap = zn->zn_zap;
- uint64_t h = 0;
-
- if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
- ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
- h = *(uint64_t *)zn->zn_key_orig;
- } else {
- h = zap->zap_salt;
- ASSERT(h != 0);
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
- if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
- const uint64_t *wp = zn->zn_key_norm;
-
- ASSERT(zn->zn_key_intlen == 8);
- for (int i = 0; i < zn->zn_key_norm_numints;
- wp++, i++) {
- uint64_t word = *wp;
-
- for (int j = 0; j < zn->zn_key_intlen; j++) {
- h = (h >> 8) ^
- zfs_crc64_table[(h ^ word) & 0xFF];
- word >>= NBBY;
- }
- }
- } else {
- const uint8_t *cp = zn->zn_key_norm;
-
- /*
- * We previously stored the terminating null on
- * disk, but didn't hash it, so we need to
- * continue to not hash it. (The
- * zn_key_*_numints includes the terminating
- * null for non-binary keys.)
- */
- int len = zn->zn_key_norm_numints - 1;
-
- ASSERT(zn->zn_key_intlen == 1);
- for (int i = 0; i < len; cp++, i++) {
- h = (h >> 8) ^
- zfs_crc64_table[(h ^ *cp) & 0xFF];
- }
- }
- }
- /*
- * Don't use all 64 bits, since we need some in the cookie for
- * the collision differentiator. We MUST use the high bits,
- * since those are the ones that we first pay attention to when
- * chosing the bucket.
- */
- h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
-
- return (h);
-}
-
-static int
-zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
-{
- ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
-
- size_t inlen = strlen(name) + 1;
- size_t outlen = ZAP_MAXNAMELEN;
-
- int err = 0;
- (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
- normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
- U8_UNICODE_LATEST, &err);
-
- return (err);
-}
-
-boolean_t
-zap_match(zap_name_t *zn, const char *matchname)
-{
- ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
-
- if (zn->zn_matchtype & MT_NORMALIZE) {
- char norm[ZAP_MAXNAMELEN];
-
- if (zap_normalize(zn->zn_zap, matchname, norm,
- zn->zn_normflags) != 0)
- return (B_FALSE);
-
- return (strcmp(zn->zn_key_norm, norm) == 0);
- } else {
- return (strcmp(zn->zn_key_orig, matchname) == 0);
- }
-}
-
-void
-zap_name_free(zap_name_t *zn)
-{
- kmem_free(zn, sizeof (zap_name_t));
-}
-
-zap_name_t *
-zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
-{
- zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
-
- zn->zn_zap = zap;
- zn->zn_key_intlen = sizeof (*key);
- zn->zn_key_orig = key;
- zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
- zn->zn_matchtype = mt;
- zn->zn_normflags = zap->zap_normflags;
-
- /*
- * If we're dealing with a case sensitive lookup on a mixed or
- * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
- * will fold case to all caps overriding the lookup request.
- */
- if (mt & MT_MATCH_CASE)
- zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
-
- if (zap->zap_normflags) {
- /*
- * We *must* use zap_normflags because this normalization is
- * what the hash is computed from.
- */
- if (zap_normalize(zap, key, zn->zn_normbuf,
- zap->zap_normflags) != 0) {
- zap_name_free(zn);
- return (NULL);
- }
- zn->zn_key_norm = zn->zn_normbuf;
- zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
- } else {
- if (mt != 0) {
- zap_name_free(zn);
- return (NULL);
- }
- zn->zn_key_norm = zn->zn_key_orig;
- zn->zn_key_norm_numints = zn->zn_key_orig_numints;
- }
-
- zn->zn_hash = zap_hash(zn);
-
- if (zap->zap_normflags != zn->zn_normflags) {
- /*
- * We *must* use zn_normflags because this normalization is
- * what the matching is based on. (Not the hash!)
- */
- if (zap_normalize(zap, key, zn->zn_normbuf,
- zn->zn_normflags) != 0) {
- zap_name_free(zn);
- return (NULL);
- }
- zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
- }
-
- return (zn);
-}
-
-zap_name_t *
-zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
-{
- zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
-
- ASSERT(zap->zap_normflags == 0);
- zn->zn_zap = zap;
- zn->zn_key_intlen = sizeof (*key);
- zn->zn_key_orig = zn->zn_key_norm = key;
- zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
- zn->zn_matchtype = 0;
-
- zn->zn_hash = zap_hash(zn);
- return (zn);
-}
-
-static void
-mzap_byteswap(mzap_phys_t *buf, size_t size)
-{
- buf->mz_block_type = BSWAP_64(buf->mz_block_type);
- buf->mz_salt = BSWAP_64(buf->mz_salt);
- buf->mz_normflags = BSWAP_64(buf->mz_normflags);
- int max = (size / MZAP_ENT_LEN) - 1;
- for (int i = 0; i < max; i++) {
- buf->mz_chunk[i].mze_value =
- BSWAP_64(buf->mz_chunk[i].mze_value);
- buf->mz_chunk[i].mze_cd =
- BSWAP_32(buf->mz_chunk[i].mze_cd);
- }
-}
-
-void
-zap_byteswap(void *buf, size_t size)
-{
- uint64_t block_type = *(uint64_t *)buf;
-
- if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
- /* ASSERT(magic == ZAP_LEAF_MAGIC); */
- mzap_byteswap(buf, size);
- } else {
- fzap_byteswap(buf, size);
- }
-}
-
-static int
-mze_compare(const void *arg1, const void *arg2)
-{
- const mzap_ent_t *mze1 = arg1;
- const mzap_ent_t *mze2 = arg2;
-
- int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
- if (likely(cmp))
- return (cmp);
-
- return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
-}
-
-static int
-mze_insert(zap_t *zap, int chunkid, uint64_t hash)
-{
- avl_index_t idx;
-
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
- mze->mze_chunkid = chunkid;
- mze->mze_hash = hash;
- mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
- ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
- if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) {
- kmem_free(mze, sizeof (mzap_ent_t));
- return (EEXIST);
- }
- avl_insert(&zap->zap_m.zap_avl, mze, idx);
- return (0);
-}
-
-static mzap_ent_t *
-mze_find(zap_name_t *zn)
-{
- mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
- avl_index_t idx;
- avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
-
- ASSERT(zn->zn_zap->zap_ismicro);
- ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
-
- mze_tofind.mze_hash = zn->zn_hash;
- mze_tofind.mze_cd = 0;
-
- mze = avl_find(avl, &mze_tofind, &idx);
- if (mze == NULL)
- mze = avl_nearest(avl, idx, AVL_AFTER);
- for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
- ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
- if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
- return (mze);
- }
-
- return (NULL);
-}
-
-static uint32_t
-mze_find_unused_cd(zap_t *zap, uint64_t hash)
-{
- mzap_ent_t mze_tofind;
- avl_index_t idx;
- avl_tree_t *avl = &zap->zap_m.zap_avl;
-
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- mze_tofind.mze_hash = hash;
- mze_tofind.mze_cd = 0;
-
- uint32_t cd = 0;
- for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
- mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
- if (mze->mze_cd != cd)
- break;
- cd++;
- }
-
- return (cd);
-}
-
-static void
-mze_remove(zap_t *zap, mzap_ent_t *mze)
-{
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- avl_remove(&zap->zap_m.zap_avl, mze);
- kmem_free(mze, sizeof (mzap_ent_t));
-}
-
-static void
-mze_destroy(zap_t *zap)
-{
- mzap_ent_t *mze;
- void *avlcookie = NULL;
-
- while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
- kmem_free(mze, sizeof (mzap_ent_t));
- avl_destroy(&zap->zap_m.zap_avl);
-}
-
-static zap_t *
-mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
-{
- zap_t *winner;
- uint64_t *zap_hdr = (uint64_t *)db->db_data;
- uint64_t zap_block_type = zap_hdr[0];
- uint64_t zap_magic = zap_hdr[1];
-
- ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
-
- zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
- rw_init(&zap->zap_rwlock, 0, 0, 0);
- rw_enter(&zap->zap_rwlock, RW_WRITER);
- zap->zap_objset = os;
- zap->zap_object = obj;
- zap->zap_dbuf = db;
-
- if (zap_block_type != ZBT_MICRO) {
- mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
- zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
- if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
- winner = NULL; /* No actual winner here... */
- goto handle_winner;
- }
- } else {
- zap->zap_ismicro = TRUE;
- }
-
- /*
- * Make sure that zap_ismicro is set before we let others see
- * it, because zap_lockdir() checks zap_ismicro without the lock
- * held.
- */
- dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
- winner = dmu_buf_set_user(db, &zap->zap_dbu);
-
- if (winner != NULL)
- goto handle_winner;
-
- if (zap->zap_ismicro) {
- zap->zap_salt = zap_m_phys(zap)->mz_salt;
- zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
- zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
- avl_create(&zap->zap_m.zap_avl, mze_compare,
- sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
-
- for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze =
- &zap_m_phys(zap)->mz_chunk[i];
- if (mze->mze_name[0]) {
- zap_name_t *zn;
-
- zn = zap_name_alloc(zap, mze->mze_name, 0);
- if (mze_insert(zap, i, zn->zn_hash) == 0)
- zap->zap_m.zap_num_entries++;
- else {
- printf("ZFS WARNING: Duplicated ZAP "
- "entry detected (%s).\n",
- mze->mze_name);
- }
- zap_name_free(zn);
- }
- }
- } else {
- zap->zap_salt = zap_f_phys(zap)->zap_salt;
- zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
-
- ASSERT3U(sizeof (struct zap_leaf_header), ==,
- 2*ZAP_LEAF_CHUNKSIZE);
-
- /*
- * The embedded pointer table should not overlap the
- * other members.
- */
- ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
- &zap_f_phys(zap)->zap_salt);
-
- /*
- * The embedded pointer table should end at the end of
- * the block
- */
- ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
- 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
- (uintptr_t)zap_f_phys(zap), ==,
- zap->zap_dbuf->db_size);
- }
- rw_exit(&zap->zap_rwlock);
- return (zap);
-
-handle_winner:
- rw_exit(&zap->zap_rwlock);
- rw_destroy(&zap->zap_rwlock);
- if (!zap->zap_ismicro)
- mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
- kmem_free(zap, sizeof (zap_t));
- return (winner);
-}
-
-/*
- * This routine "consumes" the caller's hold on the dbuf, which must
- * have the specified tag.
- */
-static int
-zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
- krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
-{
- ASSERT0(db->db_offset);
- objset_t *os = dmu_buf_get_objset(db);
- uint64_t obj = db->db_object;
-
- *zapp = NULL;
-
- zap_t *zap = dmu_buf_get_user(db);
- if (zap == NULL) {
- zap = mzap_open(os, obj, db);
- if (zap == NULL) {
- /*
- * mzap_open() didn't like what it saw on-disk.
- * Check for corruption!
- */
- return (SET_ERROR(EIO));
- }
- }
-
- /*
- * We're checking zap_ismicro without the lock held, in order to
- * tell what type of lock we want. Once we have some sort of
- * lock, see if it really is the right type. In practice this
- * can only be different if it was upgraded from micro to fat,
- * and micro wanted WRITER but fat only needs READER.
- */
- krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
- rw_enter(&zap->zap_rwlock, lt);
- if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
- /* it was upgraded, now we only need reader */
- ASSERT(lt == RW_WRITER);
- ASSERT(RW_READER ==
- (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
- rw_downgrade(&zap->zap_rwlock);
- lt = RW_READER;
- }
-
- zap->zap_objset = os;
-
- if (lt == RW_WRITER)
- dmu_buf_will_dirty(db, tx);
-
- ASSERT3P(zap->zap_dbuf, ==, db);
-
- ASSERT(!zap->zap_ismicro ||
- zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
- if (zap->zap_ismicro && tx && adding &&
- zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
- uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
- if (newsz > MZAP_MAX_BLKSZ) {
- dprintf("upgrading obj %llu: num_entries=%u\n",
- obj, zap->zap_m.zap_num_entries);
- *zapp = zap;
- int err = mzap_upgrade(zapp, tag, tx, 0);
- if (err != 0)
- rw_exit(&zap->zap_rwlock);
- return (err);
- }
- VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
- zap->zap_m.zap_num_chunks =
- db->db_size / MZAP_ENT_LEN - 1;
- }
-
- *zapp = zap;
- return (0);
-}
-
-static int
-zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
- krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
-{
- dmu_buf_t *db;
-
- int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
- if (err != 0) {
- return (err);
- }
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
- }
-#endif
-
- err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
- if (err != 0) {
- dmu_buf_rele(db, tag);
- }
- return (err);
-}
-
-int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
- krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
-{
- dmu_buf_t *db;
-
- int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
- if (err != 0)
- return (err);
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
- }
-#endif
- err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
- if (err != 0)
- dmu_buf_rele(db, tag);
- return (err);
-}
-
-void
-zap_unlockdir(zap_t *zap, void *tag)
-{
- rw_exit(&zap->zap_rwlock);
- dmu_buf_rele(zap->zap_dbuf, tag);
-}
-
-static int
-mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
-{
- int err = 0;
- zap_t *zap = *zapp;
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- int sz = zap->zap_dbuf->db_size;
- mzap_phys_t *mzp = zio_buf_alloc(sz);
- bcopy(zap->zap_dbuf->db_data, mzp, sz);
- int nchunks = zap->zap_m.zap_num_chunks;
-
- if (!flags) {
- err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
- 1ULL << fzap_default_block_shift, 0, tx);
- if (err != 0) {
- zio_buf_free(mzp, sz);
- return (err);
- }
- }
-
- dprintf("upgrading obj=%llu with %u chunks\n",
- zap->zap_object, nchunks);
- /* XXX destroy the avl later, so we can use the stored hash value */
- mze_destroy(zap);
-
- fzap_upgrade(zap, tx, flags);
-
- for (int i = 0; i < nchunks; i++) {
- mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
- if (mze->mze_name[0] == 0)
- continue;
- dprintf("adding %s=%llu\n",
- mze->mze_name, mze->mze_value);
- zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
- err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
- tag, tx);
- zap = zn->zn_zap; /* fzap_add_cd() may change zap */
- zap_name_free(zn);
- if (err != 0)
- break;
- }
- zio_buf_free(mzp, sz);
- *zapp = zap;
- return (err);
-}
-
-/*
- * The "normflags" determine the behavior of the matchtype_t which is
- * passed to zap_lookup_norm(). Names which have the same normalized
- * version will be stored with the same hash value, and therefore we can
- * perform normalization-insensitive lookups. We can be Unicode form-
- * insensitive and/or case-insensitive. The following flags are valid for
- * "normflags":
- *
- * U8_TEXTPREP_NFC
- * U8_TEXTPREP_NFD
- * U8_TEXTPREP_NFKC
- * U8_TEXTPREP_NFKD
- * U8_TEXTPREP_TOUPPER
- *
- * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
- * of them may be supplied.
- */
-void
-mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
- dmu_tx_t *tx)
-{
- dmu_buf_t *db;
-
- VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
-
- dmu_buf_will_dirty(db, tx);
- mzap_phys_t *zp = db->db_data;
- zp->mz_block_type = ZBT_MICRO;
- zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
- zp->mz_normflags = normflags;
-
- if (flags != 0) {
- zap_t *zap;
- /* Only fat zap supports flags; upgrade immediately. */
- VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
- B_FALSE, B_FALSE, &zap));
- VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
- zap_unlockdir(zap, FTAG);
- } else {
- dmu_buf_rele(db, FTAG);
- }
-}
-
-int
-zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
- 0, tx));
-}
-
-int
-zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
- return (zap_create_claim_norm_dnsize(os, obj,
- 0, ot, bonustype, bonuslen, dnodesize, tx));
-}
-
-int
-zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
- dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
- bonuslen, 0, tx));
-}
-
-int
-zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
- dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
- int dnodesize, dmu_tx_t *tx)
-{
- int err;
-
- err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
- dnodesize, tx);
- if (err != 0)
- return (err);
- mzap_create_impl(os, obj, normflags, 0, tx);
- return (0);
-}
-
-uint64_t
-zap_create(objset_t *os, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
-}
-
-uint64_t
-zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
- return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
- dnodesize, tx));
-}
-
-uint64_t
-zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
- return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
- 0, tx));
-}
-
-uint64_t
-zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
- uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
- dnodesize, tx);
-
- mzap_create_impl(os, obj, normflags, 0, tx);
- return (obj);
-}
-
-uint64_t
-zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
- dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
- return (zap_create_flags_dnsize(os, normflags, flags, ot,
- leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
-}
-
-uint64_t
-zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
- dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
- dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
- uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
- dnodesize, tx);
-
- ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
- leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
- indirect_blockshift >= SPA_MINBLOCKSHIFT &&
- indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
-
- VERIFY(dmu_object_set_blocksize(os, obj,
- 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
-
- mzap_create_impl(os, obj, normflags, flags, tx);
- return (obj);
-}
-
-int
-zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
-{
- /*
- * dmu_object_free will free the object number and free the
- * data. Freeing the data will cause our pageout function to be
- * called, which will destroy our data (zap_leaf_t's and zap_t).
- */
-
- return (dmu_object_free(os, zapobj, tx));
-}
-
-void
-zap_evict_sync(void *dbu)
-{
- zap_t *zap = dbu;
-
- rw_destroy(&zap->zap_rwlock);
-
- if (zap->zap_ismicro)
- mze_destroy(zap);
- else
- mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
-
- kmem_free(zap, sizeof (zap_t));
-}
-
-int
-zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_count(zap, count);
- } else {
- *count = zap->zap_m.zap_num_entries;
- }
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-/*
- * zn may be NULL; if not specified, it will be computed if needed.
- * See also the comment above zap_entry_normalization_conflict().
- */
-static boolean_t
-mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
-{
- int direction = AVL_BEFORE;
- boolean_t allocdzn = B_FALSE;
-
- if (zap->zap_normflags == 0)
- return (B_FALSE);
-
-again:
- for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
- other && other->mze_hash == mze->mze_hash;
- other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
-
- if (zn == NULL) {
- zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
- MT_NORMALIZE);
- allocdzn = B_TRUE;
- }
- if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
- if (allocdzn)
- zap_name_free(zn);
- return (B_TRUE);
- }
- }
-
- if (direction == AVL_BEFORE) {
- direction = AVL_AFTER;
- goto again;
- }
-
- if (allocdzn)
- zap_name_free(zn);
- return (B_FALSE);
-}
-
-/*
- * Routines for manipulating attributes.
- */
-
-int
-zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf)
-{
- return (zap_lookup_norm(os, zapobj, name, integer_size,
- num_integers, buf, 0, NULL, 0, NULL));
-}
-
-static int
-zap_lookup_impl(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf,
- matchtype_t mt, char *realname, int rn_len,
- boolean_t *ncp)
-{
- int err = 0;
-
- zap_name_t *zn = zap_name_alloc(zap, name, mt);
- if (zn == NULL)
- return (SET_ERROR(ENOTSUP));
-
- if (!zap->zap_ismicro) {
- err = fzap_lookup(zn, integer_size, num_integers, buf,
- realname, rn_len, ncp);
- } else {
- mzap_ent_t *mze = mze_find(zn);
- if (mze == NULL) {
- err = SET_ERROR(ENOENT);
- } else {
- if (num_integers < 1) {
- err = SET_ERROR(EOVERFLOW);
- } else if (integer_size != 8) {
- err = SET_ERROR(EINVAL);
- } else {
- *(uint64_t *)buf =
- MZE_PHYS(zap, mze)->mze_value;
- (void) strlcpy(realname,
- MZE_PHYS(zap, mze)->mze_name, rn_len);
- if (ncp) {
- *ncp = mzap_normalization_conflict(zap,
- zn, mze);
- }
- }
- }
- }
- zap_name_free(zn);
- return (err);
-}
-
-int
-zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf,
- matchtype_t mt, char *realname, int rn_len,
- boolean_t *ncp)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
- err = zap_lookup_impl(zap, name, integer_size,
- num_integers, buf, mt, realname, rn_len, ncp);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_lookup_by_dnode(dnode_t *dn, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf)
-{
- return (zap_lookup_norm_by_dnode(dn, name, integer_size,
- num_integers, buf, 0, NULL, 0, NULL));
-}
-
-int
-zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf,
- matchtype_t mt, char *realname, int rn_len,
- boolean_t *ncp)
-{
- zap_t *zap;
-
- int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
- FTAG, &zap);
- if (err != 0)
- return (err);
- err = zap_lookup_impl(zap, name, integer_size,
- num_integers, buf, mt, realname, rn_len, ncp);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
-
- fzap_prefetch(zn);
- zap_name_free(zn);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
-
- err = fzap_lookup(zn, integer_size, num_integers, buf,
- NULL, 0, NULL);
- zap_name_free(zn);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_contains(objset_t *os, uint64_t zapobj, const char *name)
-{
- int err = zap_lookup_norm(os, zapobj, name, 0,
- 0, NULL, 0, NULL, 0, NULL);
- if (err == EOVERFLOW || err == EINVAL)
- err = 0; /* found, but skipped reading the value */
- return (err);
-}
-
-int
-zap_length(objset_t *os, uint64_t zapobj, const char *name,
- uint64_t *integer_size, uint64_t *num_integers)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc(zap, name, 0);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- if (!zap->zap_ismicro) {
- err = fzap_length(zn, integer_size, num_integers);
- } else {
- mzap_ent_t *mze = mze_find(zn);
- if (mze == NULL) {
- err = SET_ERROR(ENOENT);
- } else {
- if (integer_size)
- *integer_size = 8;
- if (num_integers)
- *num_integers = 1;
- }
- }
- zap_name_free(zn);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints, uint64_t *integer_size, uint64_t *num_integers)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- err = fzap_length(zn, integer_size, num_integers);
- zap_name_free(zn);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-static void
-mzap_addent(zap_name_t *zn, uint64_t value)
-{
- zap_t *zap = zn->zn_zap;
- int start = zap->zap_m.zap_alloc_next;
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-#ifdef ZFS_DEBUG
- for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
- ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
- }
-#endif
-
- uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
- /* given the limited size of the microzap, this can't happen */
- ASSERT(cd < zap_maxcd(zap));
-
-again:
- for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
- if (mze->mze_name[0] == 0) {
- mze->mze_value = value;
- mze->mze_cd = cd;
- (void) strcpy(mze->mze_name, zn->zn_key_orig);
- zap->zap_m.zap_num_entries++;
- zap->zap_m.zap_alloc_next = i+1;
- if (zap->zap_m.zap_alloc_next ==
- zap->zap_m.zap_num_chunks)
- zap->zap_m.zap_alloc_next = 0;
- VERIFY(0 == mze_insert(zap, i, zn->zn_hash));
- return;
- }
- }
- if (start != 0) {
- start = 0;
- goto again;
- }
- ASSERT(!"out of entries!");
-}
-
-static int
-zap_add_impl(zap_t *zap, const char *key,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx, void *tag)
-{
- const uint64_t *intval = val;
- int err = 0;
-
- zap_name_t *zn = zap_name_alloc(zap, key, 0);
- if (zn == NULL) {
- zap_unlockdir(zap, tag);
- return (SET_ERROR(ENOTSUP));
- }
- if (!zap->zap_ismicro) {
- err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
- zap = zn->zn_zap; /* fzap_add() may change zap */
- } else if (integer_size != 8 || num_integers != 1 ||
- strlen(key) >= MZAP_NAME_LEN) {
- err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
- if (err == 0) {
- err = fzap_add(zn, integer_size, num_integers, val,
- tag, tx);
- }
- zap = zn->zn_zap; /* fzap_add() may change zap */
- } else {
- if (mze_find(zn) != NULL) {
- err = SET_ERROR(EEXIST);
- } else {
- mzap_addent(zn, *intval);
- }
- }
- ASSERT(zap == zn->zn_zap);
- zap_name_free(zn);
- if (zap != NULL) /* may be NULL if fzap_add() failed */
- zap_unlockdir(zap, tag);
- return (err);
-}
-
-int
-zap_add(objset_t *os, uint64_t zapobj, const char *key,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err != 0)
- return (err);
- err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
- /* zap_add_impl() calls zap_unlockdir() */
- return (err);
-}
-
-int
-zap_add_by_dnode(dnode_t *dn, const char *key,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
-
- err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err != 0)
- return (err);
- err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
- /* zap_add_impl() calls zap_unlockdir() */
- return (err);
-}
-
-int
-zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints, int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
- zap = zn->zn_zap; /* fzap_add() may change zap */
- zap_name_free(zn);
- if (zap != NULL) /* may be NULL if fzap_add() failed */
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_update(objset_t *os, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
- uint64_t oldval;
- const uint64_t *intval = val;
-
-#ifdef ZFS_DEBUG
- /*
- * If there is an old value, it shouldn't change across the
- * lockdir (eg, due to bprewrite's xlation).
- */
- if (integer_size == 8 && num_integers == 1)
- (void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
-#endif
-
- int err =
- zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc(zap, name, 0);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- if (!zap->zap_ismicro) {
- err = fzap_update(zn, integer_size, num_integers, val,
- FTAG, tx);
- zap = zn->zn_zap; /* fzap_update() may change zap */
- } else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
- if (err == 0) {
- err = fzap_update(zn, integer_size, num_integers,
- val, FTAG, tx);
- }
- zap = zn->zn_zap; /* fzap_update() may change zap */
- } else {
- mzap_ent_t *mze = mze_find(zn);
- if (mze != NULL) {
- ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
- MZE_PHYS(zap, mze)->mze_value = *intval;
- } else {
- mzap_addent(zn, *intval);
- }
- }
- ASSERT(zap == zn->zn_zap);
- zap_name_free(zn);
- if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
- zap = zn->zn_zap; /* fzap_update() may change zap */
- zap_name_free(zn);
- if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
-{
- return (zap_remove_norm(os, zapobj, name, 0, tx));
-}
-
-static int
-zap_remove_impl(zap_t *zap, const char *name,
- matchtype_t mt, dmu_tx_t *tx)
-{
- int err = 0;
-
- zap_name_t *zn = zap_name_alloc(zap, name, mt);
- if (zn == NULL)
- return (SET_ERROR(ENOTSUP));
- if (!zap->zap_ismicro) {
- err = fzap_remove(zn, tx);
- } else {
- mzap_ent_t *mze = mze_find(zn);
- if (mze == NULL) {
- err = SET_ERROR(ENOENT);
- } else {
- zap->zap_m.zap_num_entries--;
- bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
- sizeof (mzap_ent_phys_t));
- mze_remove(zap, mze);
- }
- }
- zap_name_free(zn);
- return (err);
-}
-
-int
-zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
- matchtype_t mt, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
- if (err)
- return (err);
- err = zap_remove_impl(zap, name, mt, tx);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
-
- err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
- if (err)
- return (err);
- err = zap_remove_impl(zap, name, 0, tx);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-int
-zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints, dmu_tx_t *tx)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- err = fzap_remove(zn, tx);
- zap_name_free(zn);
- zap_unlockdir(zap, FTAG);
- return (err);
-}
-
-/*
- * Routines for iterating over the attributes.
- */
-
-static void
-zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
- uint64_t serialized, boolean_t prefetch)
-{
- zc->zc_objset = os;
- zc->zc_zap = NULL;
- zc->zc_leaf = NULL;
- zc->zc_zapobj = zapobj;
- zc->zc_serialized = serialized;
- zc->zc_hash = 0;
- zc->zc_cd = 0;
- zc->zc_prefetch = prefetch;
-}
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
- uint64_t serialized)
-{
- zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
-}
-
-/*
- * Initialize a cursor at the beginning of the ZAP object. The entire
- * ZAP object will be prefetched.
- */
-void
-zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
-{
- zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
-}
-
-/*
- * Initialize a cursor at the beginning, but request that we not prefetch
- * the entire ZAP object.
- */
-void
-zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
-{
- zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
-}
-
-void
-zap_cursor_fini(zap_cursor_t *zc)
-{
- if (zc->zc_zap) {
- rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- zap_unlockdir(zc->zc_zap, NULL);
- zc->zc_zap = NULL;
- }
- if (zc->zc_leaf) {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- }
- zc->zc_objset = NULL;
-}
-
-uint64_t
-zap_cursor_serialize(zap_cursor_t *zc)
-{
- if (zc->zc_hash == -1ULL)
- return (-1ULL);
- if (zc->zc_zap == NULL)
- return (zc->zc_serialized);
- ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
- ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
-
- /*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this. So usually use a small
- * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
- * of the cursor.
- *
- * [ collision differentiator | zap_hashbits()-bit hash value ]
- */
- return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
- ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
-}
-
-int
-zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
-{
- int err;
-
- if (zc->zc_hash == -1ULL)
- return (SET_ERROR(ENOENT));
-
- if (zc->zc_zap == NULL) {
- int hb;
- err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
- RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
- if (err != 0)
- return (err);
-
- /*
- * To support zap_cursor_init_serialized, advance, retrieve,
- * we must add to the existing zc_cd, which may already
- * be 1 due to the zap_cursor_advance.
- */
- ASSERT(zc->zc_hash == 0);
- hb = zap_hashbits(zc->zc_zap);
- zc->zc_hash = zc->zc_serialized << (64 - hb);
- zc->zc_cd += zc->zc_serialized >> hb;
- if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
- zc->zc_cd = 0;
- } else {
- rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- }
- if (!zc->zc_zap->zap_ismicro) {
- err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
- } else {
- avl_index_t idx;
- mzap_ent_t mze_tofind;
-
- mze_tofind.mze_hash = zc->zc_hash;
- mze_tofind.mze_cd = zc->zc_cd;
-
- mzap_ent_t *mze =
- avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
- if (mze == NULL) {
- mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
- idx, AVL_AFTER);
- }
- if (mze) {
- mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
- ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
- za->za_normalization_conflict =
- mzap_normalization_conflict(zc->zc_zap, NULL, mze);
- za->za_integer_length = 8;
- za->za_num_integers = 1;
- za->za_first_integer = mzep->mze_value;
- (void) strcpy(za->za_name, mzep->mze_name);
- zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_cd;
- err = 0;
- } else {
- zc->zc_hash = -1ULL;
- err = SET_ERROR(ENOENT);
- }
- }
- rw_exit(&zc->zc_zap->zap_rwlock);
- return (err);
-}
-
-void
-zap_cursor_advance(zap_cursor_t *zc)
-{
- if (zc->zc_hash == -1ULL)
- return;
- zc->zc_cd++;
-}
-
-int
-zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
-{
- int err = 0;
- mzap_ent_t *mze;
- zap_name_t *zn;
-
- if (zc->zc_zap == NULL) {
- err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
- RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap);
- if (err)
- return (err);
- } else {
- rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- }
-
- zn = zap_name_alloc(zc->zc_zap, name, mt);
- if (zn == NULL) {
- rw_exit(&zc->zc_zap->zap_rwlock);
- return (SET_ERROR(ENOTSUP));
- }
-
- if (!zc->zc_zap->zap_ismicro) {
- err = fzap_cursor_move_to_key(zc, zn);
- } else {
- mze = mze_find(zn);
- if (mze == NULL) {
- err = SET_ERROR(ENOENT);
- goto out;
- }
- zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_cd;
- }
-
-out:
- zap_name_free(zn);
- rw_exit(&zc->zc_zap->zap_rwlock);
- return (err);
-}
-
-int
-zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
-{
- zap_t *zap;
-
- int err =
- zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err != 0)
- return (err);
-
- bzero(zs, sizeof (zap_stats_t));
-
- if (zap->zap_ismicro) {
- zs->zs_blocksize = zap->zap_dbuf->db_size;
- zs->zs_num_entries = zap->zap_m.zap_num_entries;
- zs->zs_num_blocks = 1;
- } else {
- fzap_get_stats(zap, zs);
- }
- zap_unlockdir(zap, FTAG);
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
@@ -1,1432 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
- */
-
-/*
- * ZFS Channel Programs (ZCP)
- *
- * The ZCP interface allows various ZFS commands and operations ZFS
- * administrative operations (e.g. creating and destroying snapshots, typically
- * performed via an ioctl to /dev/zfs by the zfs(1M) command and
- * libzfs/libzfs_core) to be run * programmatically as a Lua script. A ZCP
- * script is run as a dsl_sync_task and fully executed during one transaction
- * group sync. This ensures that no other changes can be written concurrently
- * with a running Lua script. Combining multiple calls to the exposed ZFS
- * functions into one script gives a number of benefits:
- *
- * 1. Atomicity. For some compound or iterative operations, it's useful to be
- * able to guarantee that the state of a pool has not changed between calls to
- * ZFS.
- *
- * 2. Performance. If a large number of changes need to be made (e.g. deleting
- * many filesystems), there can be a significant performance penalty as a
- * result of the need to wait for a transaction group sync to pass for every
- * single operation. When expressed as a single ZCP script, all these changes
- * can be performed at once in one txg sync.
- *
- * A modified version of the Lua 5.2 interpreter is used to run channel program
- * scripts. The Lua 5.2 manual can be found at:
- *
- * http://www.lua.org/manual/5.2/
- *
- * If being run by a user (via an ioctl syscall), executing a ZCP script
- * requires root privileges in the global zone.
- *
- * Scripts are passed to zcp_eval() as a string, then run in a synctask by
- * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist,
- * which will be converted to a Lua table. Similarly, values returned from
- * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl()
- * for details on exact allowed types and conversion.
- *
- * ZFS functionality is exposed to a ZCP script as a library of function calls.
- * These calls are sorted into submodules, such as zfs.list and zfs.sync, for
- * iterators and synctasks, respectively. Each of these submodules resides in
- * its own source file, with a zcp_*_info structure describing each library
- * call in the submodule.
- *
- * Error handling in ZCP scripts is handled by a number of different methods
- * based on severity:
- *
- * 1. Memory and time limits are in place to prevent a channel program from
- * consuming excessive system or running forever. If one of these limits is
- * hit, the channel program will be stopped immediately and return from
- * zcp_eval() with an error code. No attempt will be made to roll back or undo
- * any changes made by the channel program before the error occured.
- * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time
- * limit of 0, disabling the time limit.
- *
- * 2. Internal Lua errors can occur as a result of a syntax error, calling a
- * library function with incorrect arguments, invoking the error() function,
- * failing an assert(), or other runtime errors. In these cases the channel
- * program will stop executing and return from zcp_eval() with an error code.
- * In place of a return value, an error message will also be returned in the
- * 'result' nvlist containing information about the error. No attempt will be
- * made to roll back or undo any changes made by the channel program before the
- * error occured.
- *
- * 3. If an error occurs inside a ZFS library call which returns an error code,
- * the error is returned to the Lua script to be handled as desired.
- *
- * In the first two cases, Lua's error-throwing mechanism is used, which
- * longjumps out of the script execution with luaL_error() and returns with the
- * error.
- *
- * See zfs-program(1M) for more information on high level usage.
- */
-
-#include "lua.h"
-#include "lualib.h"
-#include "lauxlib.h"
-
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_dataset.h>
-#include <sys/zcp.h>
-#include <sys/zcp_iter.h>
-#include <sys/zcp_prop.h>
-#include <sys/zcp_global.h>
-#ifdef illumos
-#include <util/sscanf.h>
-#endif
-
-#ifdef __FreeBSD__
-#define ECHRNG EDOM
-#define ETIME ETIMEDOUT
-#endif
-
-#define ZCP_NVLIST_MAX_DEPTH 20
-
-uint64_t zfs_lua_check_instrlimit_interval = 100;
-uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
-uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
-
-/*
- * Forward declarations for mutually recursive functions
- */
-static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int);
-static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *,
- int);
-
-/*
- * The outer-most error callback handler for use with lua_pcall(). On
- * error Lua will call this callback with a single argument that
- * represents the error value. In most cases this will be a string
- * containing an error message, but channel programs can use Lua's
- * error() function to return arbitrary objects as errors. This callback
- * returns (on the Lua stack) the original error object along with a traceback.
- *
- * Fatal Lua errors can occur while resources are held, so we also call any
- * registered cleanup function here.
- */
-static int
-zcp_error_handler(lua_State *state)
-{
- const char *msg;
-
- zcp_cleanup(state);
-
- VERIFY3U(1, ==, lua_gettop(state));
- msg = lua_tostring(state, 1);
- luaL_traceback(state, state, msg, 1);
- return (1);
-}
-
-int
-zcp_argerror(lua_State *state, int narg, const char *msg, ...)
-{
- va_list alist;
-
- va_start(alist, msg);
- const char *buf = lua_pushvfstring(state, msg, alist);
- va_end(alist);
-
- return (luaL_argerror(state, narg, buf));
-}
-
-/*
- * Install a new cleanup function, which will be invoked with the given
- * opaque argument if a fatal error causes the Lua interpreter to longjump out
- * of a function call.
- *
- * If an error occurs, the cleanup function will be invoked exactly once and
- * then unreigstered.
- *
- * Returns the registered cleanup handler so the caller can deregister it
- * if no error occurs.
- */
-zcp_cleanup_handler_t *
-zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg)
-{
- zcp_run_info_t *ri = zcp_run_info(state);
-
- zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP);
- zch->zch_cleanup_func = cleanfunc;
- zch->zch_cleanup_arg = cleanarg;
- list_insert_head(&ri->zri_cleanup_handlers, zch);
-
- return (zch);
-}
-
-void
-zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch)
-{
- zcp_run_info_t *ri = zcp_run_info(state);
- list_remove(&ri->zri_cleanup_handlers, zch);
- kmem_free(zch, sizeof (*zch));
-}
-
-/*
- * Execute the currently registered cleanup handlers then free them and
- * destroy the handler list.
- */
-void
-zcp_cleanup(lua_State *state)
-{
- zcp_run_info_t *ri = zcp_run_info(state);
-
- for (zcp_cleanup_handler_t *zch =
- list_remove_head(&ri->zri_cleanup_handlers); zch != NULL;
- zch = list_remove_head(&ri->zri_cleanup_handlers)) {
- zch->zch_cleanup_func(zch->zch_cleanup_arg);
- kmem_free(zch, sizeof (*zch));
- }
-}
-
-/*
- * Convert the lua table at the given index on the Lua stack to an nvlist
- * and return it.
- *
- * If the table can not be converted for any reason, NULL is returned and
- * an error message is pushed onto the Lua stack.
- */
-static nvlist_t *
-zcp_table_to_nvlist(lua_State *state, int index, int depth)
-{
- nvlist_t *nvl;
- /*
- * Converting a Lua table to an nvlist with key uniqueness checking is
- * O(n^2) in the number of keys in the nvlist, which can take a long
- * time when we return a large table from a channel program.
- * Furthermore, Lua's table interface *almost* guarantees unique keys
- * on its own (details below). Therefore, we don't use fnvlist_alloc()
- * here to avoid the built-in uniqueness checking.
- *
- * The *almost* is because it's possible to have key collisions between
- * e.g. the string "1" and the number 1, or the string "true" and the
- * boolean true, so we explicitly check that when we're looking at a
- * key which is an integer / boolean or a string that can be parsed as
- * one of those types. In the worst case this could still devolve into
- * O(n^2), so we only start doing these checks on boolean/integer keys
- * once we've seen a string key which fits this weird usage pattern.
- *
- * Ultimately, we still want callers to know that the keys in this
- * nvlist are unique, so before we return this we set the nvlist's
- * flags to reflect that.
- */
- VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP));
-
- /*
- * Push an empty stack slot where lua_next() will store each
- * table key.
- */
- lua_pushnil(state);
- boolean_t saw_str_could_collide = B_FALSE;
- while (lua_next(state, index) != 0) {
- /*
- * The next key-value pair from the table at index is
- * now on the stack, with the key at stack slot -2 and
- * the value at slot -1.
- */
- int err = 0;
- char buf[32];
- const char *key = NULL;
- boolean_t key_could_collide = B_FALSE;
-
- switch (lua_type(state, -2)) {
- case LUA_TSTRING:
- key = lua_tostring(state, -2);
-
- /* check if this could collide with a number or bool */
- long long tmp;
- int parselen;
- if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 &&
- parselen == strlen(key)) ||
- strcmp(key, "true") == 0 ||
- strcmp(key, "false") == 0) {
- key_could_collide = B_TRUE;
- saw_str_could_collide = B_TRUE;
- }
- break;
- case LUA_TBOOLEAN:
- key = (lua_toboolean(state, -2) == B_TRUE ?
- "true" : "false");
- if (saw_str_could_collide) {
- key_could_collide = B_TRUE;
- }
- break;
- case LUA_TNUMBER:
- VERIFY3U(sizeof (buf), >,
- snprintf(buf, sizeof (buf), "%lld",
- (longlong_t)lua_tonumber(state, -2)));
- key = buf;
- if (saw_str_could_collide) {
- key_could_collide = B_TRUE;
- }
- break;
- default:
- fnvlist_free(nvl);
- (void) lua_pushfstring(state, "Invalid key "
- "type '%s' in table",
- lua_typename(state, lua_type(state, -2)));
- return (NULL);
- }
- /*
- * Check for type-mismatched key collisions, and throw an error.
- */
- if (key_could_collide && nvlist_exists(nvl, key)) {
- fnvlist_free(nvl);
- (void) lua_pushfstring(state, "Collision of "
- "key '%s' in table", key);
- return (NULL);
- }
- /*
- * Recursively convert the table value and insert into
- * the new nvlist with the parsed key. To prevent
- * stack overflow on circular or heavily nested tables,
- * we track the current nvlist depth.
- */
- if (depth >= ZCP_NVLIST_MAX_DEPTH) {
- fnvlist_free(nvl);
- (void) lua_pushfstring(state, "Maximum table "
- "depth (%d) exceeded for table",
- ZCP_NVLIST_MAX_DEPTH);
- return (NULL);
- }
- err = zcp_lua_to_nvlist_impl(state, -1, nvl, key,
- depth + 1);
- if (err != 0) {
- fnvlist_free(nvl);
- /*
- * Error message has been pushed to the lua
- * stack by the recursive call.
- */
- return (NULL);
- }
- /*
- * Pop the value pushed by lua_next().
- */
- lua_pop(state, 1);
- }
-
- /*
- * Mark the nvlist as having unique keys. This is a little ugly, but we
- * ensured above that there are no duplicate keys in the nvlist.
- */
- nvl->nvl_nvflag |= NV_UNIQUE_NAME;
-
- return (nvl);
-}
-
-/*
- * Convert a value from the given index into the lua stack to an nvpair, adding
- * it to an nvlist with the given key.
- *
- * Values are converted as follows:
- *
- * string -> string
- * number -> int64
- * boolean -> boolean
- * nil -> boolean (no value)
- *
- * Lua tables are converted to nvlists and then inserted. The table's keys
- * are converted to strings then used as keys in the nvlist to store each table
- * element. Keys are converted as follows:
- *
- * string -> no change
- * number -> "%lld"
- * boolean -> "true" | "false"
- * nil -> error
- *
- * In the case of a key collision, an error is thrown.
- *
- * If an error is encountered, a nonzero error code is returned, and an error
- * string will be pushed onto the Lua stack.
- */
-static int
-zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl,
- const char *key, int depth)
-{
- /*
- * Verify that we have enough remaining space in the lua stack to parse
- * a key-value pair and push an error.
- */
- if (!lua_checkstack(state, 3)) {
- (void) lua_pushstring(state, "Lua stack overflow");
- return (1);
- }
-
- index = lua_absindex(state, index);
-
- switch (lua_type(state, index)) {
- case LUA_TNIL:
- fnvlist_add_boolean(nvl, key);
- break;
- case LUA_TBOOLEAN:
- fnvlist_add_boolean_value(nvl, key,
- lua_toboolean(state, index));
- break;
- case LUA_TNUMBER:
- fnvlist_add_int64(nvl, key, lua_tonumber(state, index));
- break;
- case LUA_TSTRING:
- fnvlist_add_string(nvl, key, lua_tostring(state, index));
- break;
- case LUA_TTABLE: {
- nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth);
- if (value_nvl == NULL)
- return (EINVAL);
-
- fnvlist_add_nvlist(nvl, key, value_nvl);
- fnvlist_free(value_nvl);
- break;
- }
- default:
- (void) lua_pushfstring(state,
- "Invalid value type '%s' for key '%s'",
- lua_typename(state, lua_type(state, index)), key);
- return (EINVAL);
- }
-
- return (0);
-}
-
-/*
- * Convert a lua value to an nvpair, adding it to an nvlist with the given key.
- */
-static void
-zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key)
-{
- /*
- * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua
- * stack before returning with a nonzero error code. If an error is
- * returned, throw a fatal lua error with the given string.
- */
- if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0)
- (void) lua_error(state);
-}
-
-static int
-zcp_lua_to_nvlist_helper(lua_State *state)
-{
- nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2);
- const char *key = (const char *)lua_touserdata(state, 1);
- zcp_lua_to_nvlist(state, 3, nv, key);
- return (0);
-}
-
-static void
-zcp_convert_return_values(lua_State *state, nvlist_t *nvl,
- const char *key, int *result)
-{
- int err;
- VERIFY3U(1, ==, lua_gettop(state));
- lua_pushcfunction(state, zcp_lua_to_nvlist_helper);
- lua_pushlightuserdata(state, (char *)key);
- lua_pushlightuserdata(state, nvl);
- lua_pushvalue(state, 1);
- lua_remove(state, 1);
- err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */
- if (err != 0) {
- zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR);
- *result = SET_ERROR(ECHRNG);
- }
-}
-
-/*
- * Push a Lua table representing nvl onto the stack. If it can't be
- * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may
- * be specified as NULL, in which case no error string will be output.
- *
- * Most nvlists are converted as simple key->value Lua tables, but we make
- * an exception for the case where all nvlist entries are BOOLEANs (a string
- * key without a value). In Lua, a table key pointing to a value of Nil
- * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist
- * entry can't be directly converted to a Lua table entry. Nvlists of entirely
- * BOOLEAN entries are frequently used to pass around lists of datasets, so for
- * convenience we check for this case, and convert it to a simple Lua array of
- * strings.
- */
-int
-zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl,
- char *errbuf, int errbuf_len)
-{
- nvpair_t *pair;
- lua_newtable(state);
- boolean_t has_values = B_FALSE;
- /*
- * If the list doesn't have any values, just convert it to a string
- * array.
- */
- for (pair = nvlist_next_nvpair(nvl, NULL);
- pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
- if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) {
- has_values = B_TRUE;
- break;
- }
- }
- if (!has_values) {
- int i = 1;
- for (pair = nvlist_next_nvpair(nvl, NULL);
- pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
- (void) lua_pushinteger(state, i);
- (void) lua_pushstring(state, nvpair_name(pair));
- (void) lua_settable(state, -3);
- i++;
- }
- } else {
- for (pair = nvlist_next_nvpair(nvl, NULL);
- pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
- int err = zcp_nvpair_value_to_lua(state, pair,
- errbuf, errbuf_len);
- if (err != 0) {
- lua_pop(state, 1);
- return (err);
- }
- (void) lua_setfield(state, -2, nvpair_name(pair));
- }
- }
- return (0);
-}
-
-/*
- * Push a Lua object representing the value of "pair" onto the stack.
- *
- * Only understands boolean_value, string, int64, nvlist,
- * string_array, and int64_array type values. For other
- * types, returns EINVAL, fills in errbuf, and pushes nothing.
- */
-static int
-zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair,
- char *errbuf, int errbuf_len)
-{
- int err = 0;
-
- if (pair == NULL) {
- lua_pushnil(state);
- return (0);
- }
-
- switch (nvpair_type(pair)) {
- case DATA_TYPE_BOOLEAN_VALUE:
- (void) lua_pushboolean(state,
- fnvpair_value_boolean_value(pair));
- break;
- case DATA_TYPE_STRING:
- (void) lua_pushstring(state, fnvpair_value_string(pair));
- break;
- case DATA_TYPE_INT64:
- (void) lua_pushinteger(state, fnvpair_value_int64(pair));
- break;
- case DATA_TYPE_NVLIST:
- err = zcp_nvlist_to_lua(state,
- fnvpair_value_nvlist(pair), errbuf, errbuf_len);
- break;
- case DATA_TYPE_STRING_ARRAY: {
- char **strarr;
- uint_t nelem;
- (void) nvpair_value_string_array(pair, &strarr, &nelem);
- lua_newtable(state);
- for (int i = 0; i < nelem; i++) {
- (void) lua_pushinteger(state, i + 1);
- (void) lua_pushstring(state, strarr[i]);
- (void) lua_settable(state, -3);
- }
- break;
- }
- case DATA_TYPE_UINT64_ARRAY: {
- uint64_t *intarr;
- uint_t nelem;
- (void) nvpair_value_uint64_array(pair, &intarr, &nelem);
- lua_newtable(state);
- for (int i = 0; i < nelem; i++) {
- (void) lua_pushinteger(state, i + 1);
- (void) lua_pushinteger(state, intarr[i]);
- (void) lua_settable(state, -3);
- }
- break;
- }
- case DATA_TYPE_INT64_ARRAY: {
- int64_t *intarr;
- uint_t nelem;
- (void) nvpair_value_int64_array(pair, &intarr, &nelem);
- lua_newtable(state);
- for (int i = 0; i < nelem; i++) {
- (void) lua_pushinteger(state, i + 1);
- (void) lua_pushinteger(state, intarr[i]);
- (void) lua_settable(state, -3);
- }
- break;
- }
- default: {
- if (errbuf != NULL) {
- (void) snprintf(errbuf, errbuf_len,
- "Unhandled nvpair type %d for key '%s'",
- nvpair_type(pair), nvpair_name(pair));
- }
- return (EINVAL);
- }
- }
- return (err);
-}
-
-int
-zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname,
- int error)
-{
- if (error == ENOENT) {
- (void) zcp_argerror(state, 1, "no such dataset '%s'", dsname);
- return (0); /* not reached; zcp_argerror will longjmp */
- } else if (error == EXDEV) {
- (void) zcp_argerror(state, 1,
- "dataset '%s' is not in the target pool '%s'",
- dsname, spa_name(dp->dp_spa));
- return (0); /* not reached; zcp_argerror will longjmp */
- } else if (error == EIO) {
- (void) luaL_error(state,
- "I/O error while accessing dataset '%s'", dsname);
- return (0); /* not reached; luaL_error will longjmp */
- } else if (error != 0) {
- (void) luaL_error(state,
- "unexpected error %d while accessing dataset '%s'",
- error, dsname);
- return (0); /* not reached; luaL_error will longjmp */
- }
- return (0);
-}
-
-/*
- * Note: will longjmp (via lua_error()) on error.
- * Assumes that the dsname is argument #1 (for error reporting purposes).
- */
-dsl_dataset_t *
-zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname,
- void *tag)
-{
- dsl_dataset_t *ds;
- int error = dsl_dataset_hold(dp, dsname, tag, &ds);
- (void) zcp_dataset_hold_error(state, dp, dsname, error);
- return (ds);
-}
-
-static int zcp_debug(lua_State *);
-static zcp_lib_info_t zcp_debug_info = {
- .name = "debug",
- .func = zcp_debug,
- .pargs = {
- { .za_name = "debug string", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_debug(lua_State *state)
-{
- const char *dbgstring;
- zcp_run_info_t *ri = zcp_run_info(state);
- zcp_lib_info_t *libinfo = &zcp_debug_info;
-
- zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
-
- dbgstring = lua_tostring(state, 1);
-
- zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring);
-
- return (0);
-}
-
-static int zcp_exists(lua_State *);
-static zcp_lib_info_t zcp_exists_info = {
- .name = "exists",
- .func = zcp_exists,
- .pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_exists(lua_State *state)
-{
- zcp_run_info_t *ri = zcp_run_info(state);
- dsl_pool_t *dp = ri->zri_pool;
- zcp_lib_info_t *libinfo = &zcp_exists_info;
-
- zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
-
- const char *dsname = lua_tostring(state, 1);
-
- dsl_dataset_t *ds;
- int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (error == 0) {
- dsl_dataset_rele(ds, FTAG);
- lua_pushboolean(state, B_TRUE);
- } else if (error == ENOENT) {
- lua_pushboolean(state, B_FALSE);
- } else if (error == EXDEV) {
- return (luaL_error(state, "dataset '%s' is not in the "
- "target pool", dsname));
- } else if (error == EIO) {
- return (luaL_error(state, "I/O error opening dataset '%s'",
- dsname));
- } else if (error != 0) {
- return (luaL_error(state, "unexpected error %d", error));
- }
-
- return (1);
-}
-
-/*
- * Allocate/realloc/free a buffer for the lua interpreter.
- *
- * When nsize is 0, behaves as free() and returns NULL.
- *
- * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size
- * at least nsize.
- *
- * Otherwise, behaves as realloc(), changing the allocation from osize to nsize.
- * Shrinking the buffer size never fails.
- *
- * The original allocated buffer size is stored as a uint64 at the beginning of
- * the buffer to avoid actually reallocating when shrinking a buffer, since lua
- * requires that this operation never fail.
- */
-static void *
-zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
-{
- zcp_alloc_arg_t *allocargs = ud;
- int flags = (allocargs->aa_must_succeed) ?
- KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI);
-
- if (nsize == 0) {
- if (ptr != NULL) {
- int64_t *allocbuf = (int64_t *)ptr - 1;
- int64_t allocsize = *allocbuf;
- ASSERT3S(allocsize, >, 0);
- ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=,
- allocargs->aa_alloc_limit);
- allocargs->aa_alloc_remaining += allocsize;
- kmem_free(allocbuf, allocsize);
- }
- return (NULL);
- } else if (ptr == NULL) {
- int64_t *allocbuf;
- int64_t allocsize = nsize + sizeof (int64_t);
-
- if (!allocargs->aa_must_succeed &&
- (allocsize <= 0 ||
- allocsize > allocargs->aa_alloc_remaining)) {
- return (NULL);
- }
-
- allocbuf = kmem_alloc(allocsize, flags);
- if (allocbuf == NULL) {
- return (NULL);
- }
- allocargs->aa_alloc_remaining -= allocsize;
-
- *allocbuf = allocsize;
- return (allocbuf + 1);
- } else if (nsize <= osize) {
- /*
- * If shrinking the buffer, lua requires that the reallocation
- * never fail.
- */
- return (ptr);
- } else {
- ASSERT3U(nsize, >, osize);
-
- uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize);
- if (luabuf == NULL) {
- return (NULL);
- }
- (void) memcpy(luabuf, ptr, osize);
- VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL);
- return (luabuf);
- }
-}
-
-/* ARGSUSED */
-static void
-zcp_lua_counthook(lua_State *state, lua_Debug *ar)
-{
- lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
- zcp_run_info_t *ri = lua_touserdata(state, -1);
-
- /*
- * Check if we were canceled while waiting for the
- * txg to sync or from our open context thread
- */
- if (ri->zri_canceled ||
- (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) {
- ri->zri_canceled = B_TRUE;
- (void) lua_pushstring(state, "Channel program was canceled.");
- (void) lua_error(state);
- }
-
- /*
- * Check how many instructions the channel program has
- * executed so far, and compare against the limit.
- */
- ri->zri_curinstrs += zfs_lua_check_instrlimit_interval;
- if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) {
- ri->zri_timed_out = B_TRUE;
- (void) lua_pushstring(state,
- "Channel program timed out.");
- (void) lua_error(state);
- }
-}
-
-static int
-zcp_panic_cb(lua_State *state)
-{
- panic("unprotected error in call to Lua API (%s)\n",
- lua_tostring(state, -1));
- return (0);
-}
-
-static void
-zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri)
-{
- int err;
- lua_State *state = ri->zri_state;
-
- VERIFY3U(3, ==, lua_gettop(state));
-
- /* finish initializing our runtime state */
- ri->zri_pool = dmu_tx_pool(tx);
- ri->zri_tx = tx;
- list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t),
- offsetof(zcp_cleanup_handler_t, zch_node));
-
- /*
- * Store the zcp_run_info_t struct for this run in the Lua registry.
- * Registry entries are not directly accessible by the Lua scripts but
- * can be accessed by our callbacks.
- */
- lua_pushlightuserdata(state, ri);
- lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
- VERIFY3U(3, ==, lua_gettop(state));
-
- /*
- * Tell the Lua interpreter to call our handler every count
- * instructions. Channel programs that execute too many instructions
- * should die with ETIMEDOUT.
- */
- (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT,
- zfs_lua_check_instrlimit_interval);
-
- /*
- * Tell the Lua memory allocator to stop using KM_SLEEP before handing
- * off control to the channel program. Channel programs that use too
- * much memory should die with ENOSPC.
- */
- ri->zri_allocargs->aa_must_succeed = B_FALSE;
-
- /*
- * Call the Lua function that open-context passed us. This pops the
- * function and its input from the stack and pushes any return
- * or error values.
- */
- err = lua_pcall(state, 1, LUA_MULTRET, 1);
-
- /*
- * Let Lua use KM_SLEEP while we interpret the return values.
- */
- ri->zri_allocargs->aa_must_succeed = B_TRUE;
-
- /*
- * Remove the error handler callback from the stack. At this point,
- * there shouldn't be any cleanup handler registered in the handler
- * list (zri_cleanup_handlers), regardless of whether it ran or not.
- */
- list_destroy(&ri->zri_cleanup_handlers);
- lua_remove(state, 1);
-
- switch (err) {
- case LUA_OK: {
- /*
- * Lua supports returning multiple values in a single return
- * statement. Return values will have been pushed onto the
- * stack:
- * 1: Return value 1
- * 2: Return value 2
- * 3: etc...
- * To simplify the process of retrieving a return value from a
- * channel program, we disallow returning more than one value
- * to ZFS from the Lua script, yielding a singleton return
- * nvlist of the form { "return": Return value 1 }.
- */
- int return_count = lua_gettop(state);
-
- if (return_count == 1) {
- ri->zri_result = 0;
- zcp_convert_return_values(state, ri->zri_outnvl,
- ZCP_RET_RETURN, &ri->zri_result);
- } else if (return_count > 1) {
- ri->zri_result = SET_ERROR(ECHRNG);
- lua_settop(state, 0);
- (void) lua_pushfstring(state, "Multiple return "
- "values not supported");
- zcp_convert_return_values(state, ri->zri_outnvl,
- ZCP_RET_ERROR, &ri->zri_result);
- }
- break;
- }
- case LUA_ERRRUN:
- case LUA_ERRGCMM: {
- /*
- * The channel program encountered a fatal error within the
- * script, such as failing an assertion, or calling a function
- * with incompatible arguments. The error value and the
- * traceback generated by zcp_error_handler() should be on the
- * stack.
- */
- VERIFY3U(1, ==, lua_gettop(state));
- if (ri->zri_timed_out) {
- ri->zri_result = SET_ERROR(ETIME);
- } else if (ri->zri_canceled) {
- ri->zri_result = SET_ERROR(EINTR);
- } else {
- ri->zri_result = SET_ERROR(ECHRNG);
- }
-
- zcp_convert_return_values(state, ri->zri_outnvl,
- ZCP_RET_ERROR, &ri->zri_result);
- break;
- }
- case LUA_ERRERR: {
- /*
- * The channel program encountered a fatal error within the
- * script, and we encountered another error while trying to
- * compute the traceback in zcp_error_handler(). We can only
- * return the error message.
- */
- VERIFY3U(1, ==, lua_gettop(state));
- if (ri->zri_timed_out) {
- ri->zri_result = SET_ERROR(ETIME);
- } else if (ri->zri_canceled) {
- ri->zri_result = SET_ERROR(EINTR);
- } else {
- ri->zri_result = SET_ERROR(ECHRNG);
- }
-
- zcp_convert_return_values(state, ri->zri_outnvl,
- ZCP_RET_ERROR, &ri->zri_result);
- break;
- }
- case LUA_ERRMEM:
- /*
- * Lua ran out of memory while running the channel program.
- * There's not much we can do.
- */
- ri->zri_result = SET_ERROR(ENOSPC);
- break;
- default:
- VERIFY0(err);
- }
-}
-
-static void
-zcp_pool_error(zcp_run_info_t *ri, const char *poolname)
-{
- ri->zri_result = SET_ERROR(ECHRNG);
- lua_settop(ri->zri_state, 0);
- (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s",
- poolname);
- zcp_convert_return_values(ri->zri_state, ri->zri_outnvl,
- ZCP_RET_ERROR, &ri->zri_result);
-
-}
-
-/*
- * This callback is called when txg_wait_synced_sig encountered a signal.
- * The txg_wait_synced_sig will continue to wait for the txg to complete
- * after calling this callback.
- */
-/* ARGSUSED */
-static void
-zcp_eval_sig(void *arg, dmu_tx_t *tx)
-{
- zcp_run_info_t *ri = arg;
-
- ri->zri_canceled = B_TRUE;
-}
-
-static void
-zcp_eval_sync(void *arg, dmu_tx_t *tx)
-{
- zcp_run_info_t *ri = arg;
-
- /*
- * Open context should have setup the stack to contain:
- * 1: Error handler callback
- * 2: Script to run (converted to a Lua function)
- * 3: nvlist input to function (converted to Lua table or nil)
- */
- VERIFY3U(3, ==, lua_gettop(ri->zri_state));
-
- zcp_eval_impl(tx, ri);
-}
-
-static void
-zcp_eval_open(zcp_run_info_t *ri, const char *poolname)
-{
- int error;
- dsl_pool_t *dp;
- dmu_tx_t *tx;
-
- /*
- * See comment from the same assertion in zcp_eval_sync().
- */
- VERIFY3U(3, ==, lua_gettop(ri->zri_state));
-
- error = dsl_pool_hold(poolname, FTAG, &dp);
- if (error != 0) {
- zcp_pool_error(ri, poolname);
- return;
- }
-
- /*
- * As we are running in open-context, we have no transaction associated
- * with the channel program. At the same time, functions from the
- * zfs.check submodule need to be associated with a transaction as
- * they are basically dry-runs of their counterparts in the zfs.sync
- * submodule. These functions should be able to run in open-context.
- * Therefore we create a new transaction that we later abort once
- * the channel program has been evaluated.
- */
- tx = dmu_tx_create_dd(dp->dp_mos_dir);
-
- zcp_eval_impl(tx, ri);
-
- dmu_tx_abort(tx);
-
- dsl_pool_rele(dp, FTAG);
-}
-
-int
-zcp_eval(const char *poolname, const char *program, boolean_t sync,
- uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl)
-{
- int err;
- lua_State *state;
- zcp_run_info_t runinfo;
-
- if (instrlimit > zfs_lua_max_instrlimit)
- return (SET_ERROR(EINVAL));
- if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
- return (SET_ERROR(EINVAL));
-
- zcp_alloc_arg_t allocargs = {
- .aa_must_succeed = B_TRUE,
- .aa_alloc_remaining = (int64_t)memlimit,
- .aa_alloc_limit = (int64_t)memlimit,
- };
-
- /*
- * Creates a Lua state with a memory allocator that uses KM_SLEEP.
- * This should never fail.
- */
- state = lua_newstate(zcp_lua_alloc, &allocargs);
- VERIFY(state != NULL);
- (void) lua_atpanic(state, zcp_panic_cb);
-
- /*
- * Load core Lua libraries we want access to.
- */
- VERIFY3U(1, ==, luaopen_base(state));
- lua_pop(state, 1);
- VERIFY3U(1, ==, luaopen_coroutine(state));
- lua_setglobal(state, LUA_COLIBNAME);
- VERIFY0(lua_gettop(state));
- VERIFY3U(1, ==, luaopen_string(state));
- lua_setglobal(state, LUA_STRLIBNAME);
- VERIFY0(lua_gettop(state));
- VERIFY3U(1, ==, luaopen_table(state));
- lua_setglobal(state, LUA_TABLIBNAME);
- VERIFY0(lua_gettop(state));
-
- /*
- * Load globally visible variables such as errno aliases.
- */
- zcp_load_globals(state);
- VERIFY0(lua_gettop(state));
-
- /*
- * Load ZFS-specific modules.
- */
- lua_newtable(state);
- VERIFY3U(1, ==, zcp_load_list_lib(state));
- lua_setfield(state, -2, "list");
- VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE));
- lua_setfield(state, -2, "check");
- VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE));
- lua_setfield(state, -2, "sync");
- VERIFY3U(1, ==, zcp_load_get_lib(state));
- lua_pushcclosure(state, zcp_debug_info.func, 0);
- lua_setfield(state, -2, zcp_debug_info.name);
- lua_pushcclosure(state, zcp_exists_info.func, 0);
- lua_setfield(state, -2, zcp_exists_info.name);
- lua_setglobal(state, "zfs");
- VERIFY0(lua_gettop(state));
-
- /*
- * Push the error-callback that calculates Lua stack traces on
- * unexpected failures.
- */
- lua_pushcfunction(state, zcp_error_handler);
- VERIFY3U(1, ==, lua_gettop(state));
-
- /*
- * Load the actual script as a function onto the stack as text ("t").
- * The only valid error condition is a syntax error in the script.
- * ERRMEM should not be possible because our allocator is using
- * KM_SLEEP. ERRGCMM should not be possible because we have not added
- * any objects with __gc metamethods to the interpreter that could
- * fail.
- */
- err = luaL_loadbufferx(state, program, strlen(program),
- "channel program", "t");
- if (err == LUA_ERRSYNTAX) {
- fnvlist_add_string(outnvl, ZCP_RET_ERROR,
- lua_tostring(state, -1));
- lua_close(state);
- return (SET_ERROR(EINVAL));
- }
- VERIFY0(err);
- VERIFY3U(2, ==, lua_gettop(state));
-
- /*
- * Convert the input nvlist to a Lua object and put it on top of the
- * stack.
- */
- char errmsg[128];
- err = zcp_nvpair_value_to_lua(state, nvarg,
- errmsg, sizeof (errmsg));
- if (err != 0) {
- fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg);
- lua_close(state);
- return (SET_ERROR(EINVAL));
- }
- VERIFY3U(3, ==, lua_gettop(state));
-
- runinfo.zri_state = state;
- runinfo.zri_allocargs = &allocargs;
- runinfo.zri_outnvl = outnvl;
- runinfo.zri_result = 0;
- runinfo.zri_cred = CRED();
- runinfo.zri_timed_out = B_FALSE;
- runinfo.zri_canceled = B_FALSE;
- runinfo.zri_sync = sync;
- runinfo.zri_space_used = 0;
- runinfo.zri_curinstrs = 0;
- runinfo.zri_maxinstrs = instrlimit;
-
- if (sync) {
- err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync,
- zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
- if (err != 0)
- zcp_pool_error(&runinfo, poolname);
- } else {
- zcp_eval_open(&runinfo, poolname);
- }
- lua_close(state);
-
- return (runinfo.zri_result);
-}
-
-/*
- * Retrieve metadata about the currently running channel program.
- */
-zcp_run_info_t *
-zcp_run_info(lua_State *state)
-{
- zcp_run_info_t *ri;
-
- lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
- ri = lua_touserdata(state, -1);
- lua_pop(state, 1);
- return (ri);
-}
-
-/*
- * Argument Parsing
- * ================
- *
- * The Lua language allows methods to be called with any number
- * of arguments of any type. When calling back into ZFS we need to sanitize
- * arguments from channel programs to make sure unexpected arguments or
- * arguments of the wrong type result in clear error messages. To do this
- * in a uniform way all callbacks from channel programs should use the
- * zcp_parse_args() function to interpret inputs.
- *
- * Positional vs Keyword Arguments
- * ===============================
- *
- * Every callback function takes a fixed set of required positional arguments
- * and optional keyword arguments. For example, the destroy function takes
- * a single positional string argument (the name of the dataset to destroy)
- * and an optional "defer" keyword boolean argument. When calling lua functions
- * with parentheses, only positional arguments can be used:
- *
- * zfs.sync.snapshot("rpool@snap")
- *
- * To use keyword arguments functions should be called with a single argument
- * that is a lua table containing mappings of integer -> positional arguments
- * and string -> keyword arguments:
- *
- * zfs.sync.snapshot({1="rpool@snap", defer=true})
- *
- * The lua language allows curly braces to be used in place of parenthesis as
- * syntactic sugar for this calling convention:
- *
- * zfs.sync.snapshot{"rpool@snap", defer=true}
- */
-
-/*
- * Throw an error and print the given arguments. If there are too many
- * arguments to fit in the output buffer, only the error format string is
- * output.
- */
-static void
-zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs,
- const zcp_arg_t *kwargs, const char *fmt, ...)
-{
- int i;
- char errmsg[512];
- size_t len = sizeof (errmsg);
- size_t msglen = 0;
- va_list argp;
-
- va_start(argp, fmt);
- VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp));
- va_end(argp);
-
- /*
- * Calculate the total length of the final string, including extra
- * formatting characters. If the argument dump would be too large,
- * only print the error string.
- */
- msglen = strlen(errmsg);
- msglen += strlen(fname) + 4; /* : + {} + null terminator */
- for (i = 0; pargs[i].za_name != NULL; i++) {
- msglen += strlen(pargs[i].za_name);
- msglen += strlen(lua_typename(state, pargs[i].za_lua_type));
- if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL)
- msglen += 5; /* < + ( + )> + , */
- else
- msglen += 4; /* < + ( + )> */
- }
- for (i = 0; kwargs[i].za_name != NULL; i++) {
- msglen += strlen(kwargs[i].za_name);
- msglen += strlen(lua_typename(state, kwargs[i].za_lua_type));
- if (kwargs[i + 1].za_name != NULL)
- msglen += 4; /* =( + ) + , */
- else
- msglen += 3; /* =( + ) */
- }
-
- if (msglen >= len)
- (void) luaL_error(state, errmsg);
-
- VERIFY3U(len, >, strlcat(errmsg, ": ", len));
- VERIFY3U(len, >, strlcat(errmsg, fname, len));
- VERIFY3U(len, >, strlcat(errmsg, "{", len));
- for (i = 0; pargs[i].za_name != NULL; i++) {
- VERIFY3U(len, >, strlcat(errmsg, "<", len));
- VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len));
- VERIFY3U(len, >, strlcat(errmsg, "(", len));
- VERIFY3U(len, >, strlcat(errmsg,
- lua_typename(state, pargs[i].za_lua_type), len));
- VERIFY3U(len, >, strlcat(errmsg, ")>", len));
- if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) {
- VERIFY3U(len, >, strlcat(errmsg, ", ", len));
- }
- }
- for (i = 0; kwargs[i].za_name != NULL; i++) {
- VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len));
- VERIFY3U(len, >, strlcat(errmsg, "=(", len));
- VERIFY3U(len, >, strlcat(errmsg,
- lua_typename(state, kwargs[i].za_lua_type), len));
- VERIFY3U(len, >, strlcat(errmsg, ")", len));
- if (kwargs[i + 1].za_name != NULL) {
- VERIFY3U(len, >, strlcat(errmsg, ", ", len));
- }
- }
- VERIFY3U(len, >, strlcat(errmsg, "}", len));
-
- (void) luaL_error(state, errmsg);
- panic("unreachable code");
-}
-
-static void
-zcp_parse_table_args(lua_State *state, const char *fname,
- const zcp_arg_t *pargs, const zcp_arg_t *kwargs)
-{
- int i;
- int type;
-
- for (i = 0; pargs[i].za_name != NULL; i++) {
- /*
- * Check the table for this positional argument, leaving it
- * on the top of the stack once we finish validating it.
- */
- lua_pushinteger(state, i + 1);
- lua_gettable(state, 1);
-
- type = lua_type(state, -1);
- if (type == LUA_TNIL) {
- zcp_args_error(state, fname, pargs, kwargs,
- "too few arguments");
- panic("unreachable code");
- } else if (type != pargs[i].za_lua_type) {
- zcp_args_error(state, fname, pargs, kwargs,
- "arg %d wrong type (is '%s', expected '%s')",
- i + 1, lua_typename(state, type),
- lua_typename(state, pargs[i].za_lua_type));
- panic("unreachable code");
- }
-
- /*
- * Remove the positional argument from the table.
- */
- lua_pushinteger(state, i + 1);
- lua_pushnil(state);
- lua_settable(state, 1);
- }
-
- for (i = 0; kwargs[i].za_name != NULL; i++) {
- /*
- * Check the table for this keyword argument, which may be
- * nil if it was omitted. Leave the value on the top of
- * the stack after validating it.
- */
- lua_getfield(state, 1, kwargs[i].za_name);
-
- type = lua_type(state, -1);
- if (type != LUA_TNIL && type != kwargs[i].za_lua_type) {
- zcp_args_error(state, fname, pargs, kwargs,
- "kwarg '%s' wrong type (is '%s', expected '%s')",
- kwargs[i].za_name, lua_typename(state, type),
- lua_typename(state, kwargs[i].za_lua_type));
- panic("unreachable code");
- }
-
- /*
- * Remove the keyword argument from the table.
- */
- lua_pushnil(state);
- lua_setfield(state, 1, kwargs[i].za_name);
- }
-
- /*
- * Any entries remaining in the table are invalid inputs, print
- * an error message based on what the entry is.
- */
- lua_pushnil(state);
- if (lua_next(state, 1)) {
- if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) {
- zcp_args_error(state, fname, pargs, kwargs,
- "too many positional arguments");
- } else if (lua_isstring(state, -2)) {
- zcp_args_error(state, fname, pargs, kwargs,
- "invalid kwarg '%s'", lua_tostring(state, -2));
- } else {
- zcp_args_error(state, fname, pargs, kwargs,
- "kwarg keys must be strings");
- }
- panic("unreachable code");
- }
-
- lua_remove(state, 1);
-}
-
-static void
-zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
- const zcp_arg_t *kwargs)
-{
- int i;
- int type;
-
- for (i = 0; pargs[i].za_name != NULL; i++) {
- type = lua_type(state, i + 1);
- if (type == LUA_TNONE) {
- zcp_args_error(state, fname, pargs, kwargs,
- "too few arguments");
- panic("unreachable code");
- } else if (type != pargs[i].za_lua_type) {
- zcp_args_error(state, fname, pargs, kwargs,
- "arg %d wrong type (is '%s', expected '%s')",
- i + 1, lua_typename(state, type),
- lua_typename(state, pargs[i].za_lua_type));
- panic("unreachable code");
- }
- }
- if (lua_gettop(state) != i) {
- zcp_args_error(state, fname, pargs, kwargs,
- "too many positional arguments");
- panic("unreachable code");
- }
-
- for (i = 0; kwargs[i].za_name != NULL; i++) {
- lua_pushnil(state);
- }
-}
-
-/*
- * Checks the current Lua stack against an expected set of positional and
- * keyword arguments. If the stack does not match the expected arguments
- * aborts the current channel program with a useful error message, otherwise
- * it re-arranges the stack so that it contains the positional arguments
- * followed by the keyword argument values in declaration order. Any missing
- * keyword argument will be represented by a nil value on the stack.
- *
- * If the stack contains exactly one argument of type LUA_TTABLE the curly
- * braces calling convention is assumed, otherwise the stack is parsed for
- * positional arguments only.
- *
- * This function should be used by every function callback. It should be called
- * before the callback manipulates the Lua stack as it assumes the stack
- * represents the function arguments.
- */
-void
-zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
- const zcp_arg_t *kwargs)
-{
- if (lua_gettop(state) == 1 && lua_istable(state, 1)) {
- zcp_parse_table_args(state, fname, pargs, kwargs);
- } else {
- zcp_parse_pos_args(state, fname, pargs, kwargs);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c
@@ -1,865 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
- */
-
-#include "lua.h"
-#include "lualib.h"
-#include "lauxlib.h"
-
-#include <zfs_prop.h>
-
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dmu_objset.h>
-#include <sys/mntent.h>
-#include <sys/sunddi.h>
-#include <sys/zap.h>
-#include <sys/zcp.h>
-#include <sys/zcp_iter.h>
-#include <sys/zcp_global.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_znode.h>
-#include <sys/zvol.h>
-
-#ifdef _KERNEL
-#include <sys/zfs_vfsops.h>
-#endif
-
-static int
-get_objset_type(dsl_dataset_t *ds, zfs_type_t *type)
-{
- int error;
- objset_t *os;
- error = dmu_objset_from_ds(ds, &os);
- if (error != 0)
- return (error);
- if (ds->ds_is_snapshot) {
- *type = ZFS_TYPE_SNAPSHOT;
- } else {
- switch (os->os_phys->os_type) {
- case DMU_OST_ZFS:
- *type = ZFS_TYPE_FILESYSTEM;
- break;
- case DMU_OST_ZVOL:
- *type = ZFS_TYPE_VOLUME;
- break;
- default:
- return (EINVAL);
- }
- }
- return (0);
-}
-
-/*
- * Returns the string name of ds's type in str (a buffer which should be
- * at least 12 bytes long).
- */
-static int
-get_objset_type_name(dsl_dataset_t *ds, char *str)
-{
- int error;
- zfs_type_t type;
- error = get_objset_type(ds, &type);
- if (error != 0)
- return (error);
- switch (type) {
- case ZFS_TYPE_SNAPSHOT:
- (void) strcpy(str, "snapshot");
- break;
- case ZFS_TYPE_FILESYSTEM:
- (void) strcpy(str, "filesystem");
- break;
- case ZFS_TYPE_VOLUME:
- (void) strcpy(str, "volume");
- break;
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-/*
- * Determines the source of a property given its setpoint and
- * property type. It pushes the source to the lua stack.
- */
-static void
-get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop)
-{
- if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) {
- lua_pushnil(state);
- } else {
- const char *src;
- if (strcmp("", setpoint) == 0) {
- src = "default";
- } else {
- src = setpoint;
- }
- (void) lua_pushstring(state, src);
- }
-}
-
-/*
- * Given an error encountered while getting properties, either longjmp's for
- * a fatal error or pushes nothing to the stack for a non fatal one.
- */
-static int
-zcp_handle_error(lua_State *state, const char *dataset_name,
- const char *property_name, int error)
-{
- ASSERT3S(error, !=, 0);
- if (error == ENOENT) {
- return (0);
- } else if (error == EINVAL) {
- return (luaL_error(state,
- "property '%s' is not a valid property on dataset '%s'",
- property_name, dataset_name));
- } else if (error == EIO) {
- return (luaL_error(state,
- "I/O error while retrieving property '%s' on dataset '%s'",
- property_name, dataset_name));
- } else {
- return (luaL_error(state, "unexpected error %d while "
- "retrieving property '%s' on dataset '%s'",
- error, property_name, dataset_name));
- }
-}
-
-/*
- * Look up a user defined property in the zap object. If it exists, push it
- * and the setpoint onto the stack, otherwise don't push anything.
- */
-static int
-zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
- const char *property_name)
-{
- int error;
- char *buf;
- char setpoint[ZFS_MAX_DATASET_NAME_LEN];
- /*
- * zcp_dataset_hold will either successfully return the requested
- * dataset or throw a lua error and longjmp out of the zfs.get_prop call
- * without returning.
- */
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
-
- buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
- error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
- buf, setpoint);
- dsl_dataset_rele(ds, FTAG);
-
- if (error != 0) {
- kmem_free(buf, ZAP_MAXVALUELEN);
- return (zcp_handle_error(state, dataset_name, property_name,
- error));
- }
- (void) lua_pushstring(state, buf);
- (void) lua_pushstring(state, setpoint);
- kmem_free(buf, ZAP_MAXVALUELEN);
- return (2);
-}
-
-/*
- * Check if the property we're looking for is stored in the ds_dir. If so,
- * return it in the 'val' argument. Return 0 on success and ENOENT and if
- * the property is not present.
- */
-static int
-get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
- uint64_t *val)
-{
- dsl_dir_t *dd = ds->ds_dir;
- mutex_enter(&dd->dd_lock);
- switch (zfs_prop) {
- case ZFS_PROP_USEDSNAP:
- *val = dsl_dir_get_usedsnap(dd);
- break;
- case ZFS_PROP_USEDCHILD:
- *val = dsl_dir_get_usedchild(dd);
- break;
- case ZFS_PROP_USEDDS:
- *val = dsl_dir_get_usedds(dd);
- break;
- case ZFS_PROP_USEDREFRESERV:
- *val = dsl_dir_get_usedrefreserv(dd);
- break;
- case ZFS_PROP_LOGICALUSED:
- *val = dsl_dir_get_logicalused(dd);
- break;
- default:
- mutex_exit(&dd->dd_lock);
- return (ENOENT);
- }
- mutex_exit(&dd->dd_lock);
- return (0);
-}
-
-/*
- * Takes a dataset, a property, a value and that value's setpoint as
- * found in the ZAP. Checks if the property has been changed in the vfs.
- * If so, val and setpoint will be overwritten with updated content.
- * Otherwise, they are left unchanged.
- */
-static int
-get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
- char *setpoint)
-{
-#ifndef _KERNEL
- return (0);
-#else
- int error;
-#ifdef illumos
- zfsvfs_t *zfvp;
-#endif
- vfs_t *vfsp;
- objset_t *os;
- uint64_t tmp = *val;
-
- error = dmu_objset_from_ds(ds, &os);
- if (error != 0)
- return (error);
-
- error = getzfsvfs_impl(os, &vfsp);
- if (error != 0)
- return (error);
-#ifdef illumos
- vfsp = zfvp->z_vfs;
-#endif
- switch (zfs_prop) {
- case ZFS_PROP_ATIME:
- if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
- tmp = 0;
- if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
- tmp = 1;
- break;
- case ZFS_PROP_DEVICES:
- if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
- tmp = 0;
- if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
- tmp = 1;
- break;
- case ZFS_PROP_EXEC:
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
- tmp = 0;
- if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
- tmp = 1;
- break;
- case ZFS_PROP_SETUID:
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
- tmp = 0;
- if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
- tmp = 1;
- break;
- case ZFS_PROP_READONLY:
- if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
- tmp = 0;
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
- tmp = 1;
- break;
- case ZFS_PROP_XATTR:
- if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
- tmp = 0;
- if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
- tmp = 1;
- break;
- case ZFS_PROP_NBMAND:
- if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
- tmp = 0;
- if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
- tmp = 1;
- break;
- default:
-#ifdef illumos
- VFS_RELE(vfsp);
-#else
- vfs_rel(vfsp);
-#endif
- return (ENOENT);
- }
-
-#ifdef illumos
- VFS_RELE(vfsp);
-#else
- vfs_rel(vfsp);
-#endif
- if (tmp != *val) {
- (void) strcpy(setpoint, "temporary");
- *val = tmp;
- }
- return (0);
-#endif
-}
-
-/*
- * Check if the property we're looking for is stored at the dsl_dataset or
- * dsl_dir level. If so, push the property value and source onto the lua stack
- * and return 0. If it is not present or a failure occurs in lookup, return a
- * non-zero error value.
- */
-static int
-get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
- zfs_prop_t zfs_prop)
-{
- int error = 0;
- objset_t *os;
- uint64_t numval;
- char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
- char setpoint[ZFS_MAX_DATASET_NAME_LEN] =
- "Internal error - setpoint not determined";
- zfs_type_t ds_type;
- zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
- (void) get_objset_type(ds, &ds_type);
-
- switch (zfs_prop) {
- case ZFS_PROP_REFRATIO:
- numval = dsl_get_refratio(ds);
- break;
- case ZFS_PROP_USED:
- numval = dsl_get_used(ds);
- break;
- case ZFS_PROP_CLONES: {
- nvlist_t *clones = fnvlist_alloc();
- error = get_clones_stat_impl(ds, clones);
- if (error == 0) {
- /* push list to lua stack */
- VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0));
- /* source */
- (void) lua_pushnil(state);
- }
- nvlist_free(clones);
- kmem_free(strval, ZAP_MAXVALUELEN);
- return (error);
- }
- case ZFS_PROP_COMPRESSRATIO:
- numval = dsl_get_compressratio(ds);
- break;
- case ZFS_PROP_CREATION:
- numval = dsl_get_creation(ds);
- break;
- case ZFS_PROP_REFERENCED:
- numval = dsl_get_referenced(ds);
- break;
- case ZFS_PROP_AVAILABLE:
- numval = dsl_get_available(ds);
- break;
- case ZFS_PROP_LOGICALREFERENCED:
- numval = dsl_get_logicalreferenced(ds);
- break;
- case ZFS_PROP_CREATETXG:
- numval = dsl_get_creationtxg(ds);
- break;
- case ZFS_PROP_GUID:
- numval = dsl_get_guid(ds);
- break;
- case ZFS_PROP_UNIQUE:
- numval = dsl_get_unique(ds);
- break;
- case ZFS_PROP_OBJSETID:
- numval = dsl_get_objsetid(ds);
- break;
- case ZFS_PROP_ORIGIN:
- dsl_dir_get_origin(ds->ds_dir, strval);
- break;
- case ZFS_PROP_USERACCOUNTING:
- error = dmu_objset_from_ds(ds, &os);
- if (error == 0)
- numval = dmu_objset_userspace_present(os);
- break;
- case ZFS_PROP_WRITTEN:
- error = dsl_get_written(ds, &numval);
- break;
- case ZFS_PROP_TYPE:
- error = get_objset_type_name(ds, strval);
- break;
- case ZFS_PROP_PREV_SNAP:
- error = dsl_get_prev_snap(ds, strval);
- break;
- case ZFS_PROP_NAME:
- dsl_dataset_name(ds, strval);
- break;
- case ZFS_PROP_MOUNTPOINT:
- error = dsl_get_mountpoint(ds, dsname, strval, setpoint);
- break;
- case ZFS_PROP_VERSION:
- /* should be a snapshot or filesystem */
- ASSERT(ds_type != ZFS_TYPE_VOLUME);
- error = dmu_objset_from_ds(ds, &os);
- /* look in the master node for the version */
- if (error == 0) {
- error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
- sizeof (numval), 1, &numval);
- }
- break;
- case ZFS_PROP_DEFER_DESTROY:
- numval = dsl_get_defer_destroy(ds);
- break;
- case ZFS_PROP_USERREFS:
- numval = dsl_get_userrefs(ds);
- break;
- case ZFS_PROP_FILESYSTEM_COUNT:
- error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval);
- (void) strcpy(setpoint, "");
- break;
- case ZFS_PROP_SNAPSHOT_COUNT:
- error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval);
- (void) strcpy(setpoint, "");
- break;
- case ZFS_PROP_REMAPTXG:
- error = dsl_dir_get_remaptxg(ds->ds_dir, &numval);
- break;
- case ZFS_PROP_NUMCLONES:
- numval = dsl_get_numclones(ds);
- break;
- case ZFS_PROP_INCONSISTENT:
- numval = dsl_get_inconsistent(ds);
- break;
- case ZFS_PROP_RECEIVE_RESUME_TOKEN: {
- char *token = get_receive_resume_stats_impl(ds);
- VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <,
- ZAP_MAXVALUELEN);
- strfree(token);
- if (strcmp(strval, "") == 0) {
- token = get_child_receive_stats(ds);
- VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <,
- ZAP_MAXVALUELEN);
- strfree(token);
- if (strcmp(strval, "") == 0)
- error = ENOENT;
- }
- break;
- }
- case ZFS_PROP_VOLSIZE:
- ASSERT(ds_type == ZFS_TYPE_VOLUME);
- error = dmu_objset_from_ds(ds, &os);
- if (error == 0) {
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size",
- sizeof (numval), 1, &numval);
- }
- if (error == 0)
- (void) strcpy(setpoint, dsname);
-
- break;
- case ZFS_PROP_VOLBLOCKSIZE: {
- ASSERT(ds_type == ZFS_TYPE_VOLUME);
- dmu_object_info_t doi;
- error = dmu_objset_from_ds(ds, &os);
- if (error == 0) {
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
- if (error == 0)
- numval = doi.doi_data_block_size;
- }
- break;
- }
- default:
- /* Did not match these props, check in the dsl_dir */
- error = get_dsl_dir_prop(ds, zfs_prop, &numval);
- }
- if (error != 0) {
- kmem_free(strval, ZAP_MAXVALUELEN);
- return (error);
- }
-
- switch (prop_type) {
- case PROP_TYPE_NUMBER: {
- (void) lua_pushnumber(state, numval);
- break;
- }
- case PROP_TYPE_STRING: {
- (void) lua_pushstring(state, strval);
- break;
- }
- case PROP_TYPE_INDEX: {
- const char *propval;
- error = zfs_prop_index_to_string(zfs_prop, numval, &propval);
- if (error != 0) {
- kmem_free(strval, ZAP_MAXVALUELEN);
- return (error);
- }
- (void) lua_pushstring(state, propval);
- break;
- }
- }
- kmem_free(strval, ZAP_MAXVALUELEN);
-
- /* Push the source to the stack */
- get_prop_src(state, setpoint, zfs_prop);
- return (0);
-}
-
-/*
- * Look up a property and its source in the zap object. If the value is
- * present and successfully retrieved, push the value and source on the
- * lua stack and return 0. On failure, return a non-zero error value.
- */
-static int
-get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
-{
- int error = 0;
- char setpoint[ZFS_MAX_DATASET_NAME_LEN];
- char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
- uint64_t numval;
- const char *prop_name = zfs_prop_to_name(zfs_prop);
- zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
-
- if (prop_type == PROP_TYPE_STRING) {
- /* Push value to lua stack */
- error = dsl_prop_get_ds(ds, prop_name, 1,
- ZAP_MAXVALUELEN, strval, setpoint);
- if (error == 0)
- (void) lua_pushstring(state, strval);
- } else {
- error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
- 1, &numval, setpoint);
-
- /* Fill in temorary value for prop, if applicable */
- (void) get_temporary_prop(ds, zfs_prop, &numval, setpoint);
-
- /* Push value to lua stack */
- if (prop_type == PROP_TYPE_INDEX) {
- const char *propval;
- error = zfs_prop_index_to_string(zfs_prop, numval,
- &propval);
- if (error == 0)
- (void) lua_pushstring(state, propval);
- } else {
- if (error == 0)
- (void) lua_pushnumber(state, numval);
- }
- }
- kmem_free(strval, ZAP_MAXVALUELEN);
- if (error == 0)
- get_prop_src(state, setpoint, zfs_prop);
- return (error);
-}
-
-/*
- * Determine whether property is valid for a given dataset
- */
-boolean_t
-prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
-{
- int error;
- zfs_type_t zfs_type;
-
- /* properties not supported */
- if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
- (zfs_prop == ZFS_PROP_MOUNTED))
- return (B_FALSE);
-
- /* if we want the origin prop, ds must be a clone */
- if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
- return (B_FALSE);
-
- error = get_objset_type(ds, &zfs_type);
- if (error != 0)
- return (B_FALSE);
- return (zfs_prop_valid_for_type(zfs_prop, zfs_type));
-}
-
-/*
- * Look up a given dataset property. On success return 2, the number of
- * values pushed to the lua stack (property value and source). On a fatal
- * error, longjmp. On a non fatal error push nothing.
- */
-static int
-zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
- zfs_prop_t zfs_prop)
-{
- int error;
- /*
- * zcp_dataset_hold will either successfully return the requested
- * dataset or throw a lua error and longjmp out of the zfs.get_prop call
- * without returning.
- */
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
-
- /* Check that the property is valid for the given dataset */
- const char *prop_name = zfs_prop_to_name(zfs_prop);
- if (!prop_valid_for_ds(ds, zfs_prop)) {
- dsl_dataset_rele(ds, FTAG);
- return (0);
- }
-
- /* Check if the property can be accessed directly */
- error = get_special_prop(state, ds, dataset_name, zfs_prop);
- if (error == 0) {
- dsl_dataset_rele(ds, FTAG);
- /* The value and source have been pushed by get_special_prop */
- return (2);
- }
- if (error != ENOENT) {
- dsl_dataset_rele(ds, FTAG);
- return (zcp_handle_error(state, dataset_name,
- prop_name, error));
- }
-
- /* If we were unable to find it, look in the zap object */
- error = get_zap_prop(state, ds, zfs_prop);
- dsl_dataset_rele(ds, FTAG);
- if (error != 0) {
- return (zcp_handle_error(state, dataset_name,
- prop_name, error));
- }
- /* The value and source have been pushed by get_zap_prop */
- return (2);
-}
-
-static zfs_userquota_prop_t
-get_userquota_prop(const char *prop_name)
-{
- zfs_userquota_prop_t type;
- /* Figure out the property type ({user|group}{quota|used}) */
- for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
- if (strncmp(prop_name, zfs_userquota_prop_prefixes[type],
- strlen(zfs_userquota_prop_prefixes[type])) == 0)
- break;
- }
- return (type);
-}
-
-#ifdef _KERNEL
-/*
- * Given the name of a zfs_userquota_prop, this function determines the
- * prop type as well as the numeric group/user ids based on the string
- * following the '@' in the property name. On success, returns 0. On failure,
- * returns a non-zero error.
- * 'domain' must be free'd by caller using strfree()
- */
-static int
-parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
- char **domain, uint64_t *rid)
-{
- char *cp, *end, *domain_val;
-
- *type = get_userquota_prop(prop_name);
- if (*type >= ZFS_NUM_USERQUOTA_PROPS)
- return (EINVAL);
-
- *rid = 0;
- cp = strchr(prop_name, '@') + 1;
- if (strncmp(cp, "S-1-", 4) == 0) {
- /*
- * It's a numeric SID (eg "S-1-234-567-89") and we want to
- * seperate the domain id and the rid
- */
- int domain_len = strrchr(cp, '-') - cp;
- domain_val = kmem_alloc(domain_len + 1, KM_SLEEP);
- (void) strncpy(domain_val, cp, domain_len);
- domain_val[domain_len] = '\0';
- cp += domain_len + 1;
-
- (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
- if (*end != '\0') {
- strfree(domain_val);
- return (EINVAL);
- }
- } else {
- /* It's only a user/group ID (eg "12345"), just get the rid */
- domain_val = NULL;
- (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
- if (*end != '\0')
- return (EINVAL);
- }
- *domain = domain_val;
- return (0);
-}
-
-/*
- * Look up {user|group}{quota|used} property for given dataset. On success
- * push the value (quota or used amount) and the setpoint. On failure, push
- * a lua error.
- */
-static int
-zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp,
- const char *dataset_name, const char *prop_name)
-{
- zfsvfs_t *zfvp;
- zfsvfs_t *zfsvfs;
- int error;
- zfs_userquota_prop_t type;
- char *domain;
- uint64_t rid, value;
- objset_t *os;
-
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
-
- error = parse_userquota_prop(prop_name, &type, &domain, &rid);
- if (error == 0) {
- error = dmu_objset_from_ds(ds, &os);
- if (error == 0) {
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
- error = zfsvfs_create_impl(&zfvp, zfsvfs, os);
- if (error == 0) {
- error = zfs_userspace_one(zfvp, type, domain,
- rid, &value);
- zfsvfs_free(zfvp);
- }
- }
- if (domain != NULL)
- strfree(domain);
- }
- dsl_dataset_rele(ds, FTAG);
-
- if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) ||
- (type == ZFS_PROP_GROUPQUOTA)))
- error = ENOENT;
- if (error != 0) {
- return (zcp_handle_error(state, dataset_name,
- prop_name, error));
- }
-
- (void) lua_pushnumber(state, value);
- (void) lua_pushstring(state, dataset_name);
- return (2);
-}
-#endif
-
-/*
- * Determines the name of the snapshot referenced in the written property
- * name. Returns snapshot name in snap_name, a buffer that must be at least
- * as large as ZFS_MAX_DATASET_NAME_LEN
- */
-static void
-parse_written_prop(const char *dataset_name, const char *prop_name,
- char *snap_name)
-{
- ASSERT(zfs_prop_written(prop_name));
- const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN;
- if (strchr(name, '@') == NULL) {
- (void) sprintf(snap_name, "%s@%s", dataset_name, name);
- } else {
- (void) strcpy(snap_name, name);
- }
-}
-
-/*
- * Look up written@ property for given dataset. On success
- * push the value and the setpoint. If error is fatal, we will
- * longjmp, otherwise push nothing.
- */
-static int
-zcp_get_written_prop(lua_State *state, dsl_pool_t *dp,
- const char *dataset_name, const char *prop_name)
-{
- char snap_name[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t used, comp, uncomp;
- dsl_dataset_t *old;
- int error = 0;
-
- parse_written_prop(dataset_name, prop_name, snap_name);
- dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG);
- if (new == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
-
- error = dsl_dataset_hold(dp, snap_name, FTAG, &old);
- if (error != 0) {
- dsl_dataset_rele(new, FTAG);
- return (zcp_dataset_hold_error(state, dp, snap_name,
- error));
- }
- error = dsl_dataset_space_written(old, new,
- &used, &comp, &uncomp);
-
- dsl_dataset_rele(old, FTAG);
- dsl_dataset_rele(new, FTAG);
-
- if (error != 0) {
- return (zcp_handle_error(state, dataset_name,
- snap_name, error));
- }
- (void) lua_pushnumber(state, used);
- (void) lua_pushstring(state, dataset_name);
- return (2);
-}
-
-static int zcp_get_prop(lua_State *state);
-static zcp_lib_info_t zcp_get_prop_info = {
- .name = "get_prop",
- .func = zcp_get_prop,
- .pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
- { .za_name = "property", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_get_prop(lua_State *state)
-{
- const char *dataset_name;
- const char *property_name;
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- zcp_lib_info_t *libinfo = &zcp_get_prop_info;
-
- zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
-
- dataset_name = lua_tostring(state, 1);
- property_name = lua_tostring(state, 2);
-
- /* User defined property */
- if (zfs_prop_user(property_name)) {
- return (zcp_get_user_prop(state, dp,
- dataset_name, property_name));
- }
- /* userspace property */
- if (zfs_prop_userquota(property_name)) {
-#ifdef _KERNEL
- return (zcp_get_userquota_prop(state, dp,
- dataset_name, property_name));
-#else
- return (luaL_error(state,
- "user quota properties only supported in kernel mode",
- property_name));
-#endif
- }
- /* written@ property */
- if (zfs_prop_written(property_name)) {
- return (zcp_get_written_prop(state, dp,
- dataset_name, property_name));
- }
-
- zfs_prop_t zfs_prop = zfs_name_to_prop(property_name);
- /* Valid system property */
- if (zfs_prop != ZPROP_INVAL) {
- return (zcp_get_system_prop(state, dp, dataset_name,
- zfs_prop));
- }
-
- /* Invalid property name */
- return (luaL_error(state,
- "'%s' is not a valid property", property_name));
-}
-
-int
-zcp_load_get_lib(lua_State *state)
-{
- lua_pushcclosure(state, zcp_get_prop_info.func, 0);
- lua_setfield(state, -2, zcp_get_prop_info.name);
-
- return (1);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/zcp_global.h>
-
-#include "lua.h"
-#include "lauxlib.h"
-
-typedef struct zcp_errno_global {
- const char *zeg_name;
- int zeg_errno;
-} zcp_errno_global_t;
-
-static const zcp_errno_global_t errno_globals[] = {
- {"EPERM", EPERM},
- {"ENOENT", ENOENT},
- {"ESRCH", ESRCH},
- {"EINTR", EINTR},
- {"EIO", EIO},
- {"ENXIO", ENXIO},
- {"E2BIG", E2BIG},
- {"ENOEXEC", ENOEXEC},
- {"EBADF", EBADF},
- {"ECHILD", ECHILD},
- {"EAGAIN", EAGAIN},
- {"ENOMEM", ENOMEM},
- {"EACCES", EACCES},
- {"EFAULT", EFAULT},
- {"ENOTBLK", ENOTBLK},
- {"EBUSY", EBUSY},
- {"EEXIST", EEXIST},
- {"EXDEV", EXDEV},
- {"ENODEV", ENODEV},
- {"ENOTDIR", ENOTDIR},
- {"EISDIR", EISDIR},
- {"EINVAL", EINVAL},
- {"ENFILE", ENFILE},
- {"EMFILE", EMFILE},
- {"ENOTTY", ENOTTY},
- {"ETXTBSY", ETXTBSY},
- {"EFBIG", EFBIG},
- {"ENOSPC", ENOSPC},
- {"ESPIPE", ESPIPE},
- {"EROFS", EROFS},
- {"EMLINK", EMLINK},
- {"EPIPE", EPIPE},
- {"EDOM", EDOM},
- {"ERANGE", ERANGE},
- {"EDEADLK", EDEADLK},
- {"ENOLCK", ENOLCK},
- {"ECANCELED", ECANCELED},
- {"ENOTSUP", ENOTSUP},
- {"EDQUOT", EDQUOT},
- {"ENAMETOOLONG", ENAMETOOLONG},
- {NULL, 0}
-};
-
-static void
-zcp_load_errno_globals(lua_State *state)
-{
- const zcp_errno_global_t *global = errno_globals;
- while (global->zeg_name != NULL) {
- lua_pushnumber(state, (lua_Number)global->zeg_errno);
- lua_setglobal(state, global->zeg_name);
- global++;
- }
-}
-
-void
-zcp_load_globals(lua_State *state)
-{
- zcp_load_errno_globals(state);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c
@@ -1,531 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-#include "lua.h"
-#include "lauxlib.h"
-
-#include <sys/dmu.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_pool.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/zap.h>
-#include <sys/dsl_dir.h>
-#include <sys/zcp_prop.h>
-
-#include <sys/zcp.h>
-
-typedef int (zcp_list_func_t)(lua_State *);
-typedef struct zcp_list_info {
- const char *name;
- zcp_list_func_t *func;
- zcp_list_func_t *gc;
- const zcp_arg_t pargs[4];
- const zcp_arg_t kwargs[2];
-} zcp_list_info_t;
-
-static int
-zcp_clones_iter(lua_State *state)
-{
- int err;
- char clonename[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
- uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- dsl_dataset_t *ds, *clone;
- zap_attribute_t za;
- zap_cursor_t zc;
-
- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
- if (err == ENOENT) {
- return (0);
- } else if (err != 0) {
- return (luaL_error(state,
- "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
- err));
- }
-
- if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) {
- dsl_dataset_rele(ds, FTAG);
- return (0);
- }
-
- zap_cursor_init_serialized(&zc, dp->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_next_clones_obj, cursor);
- dsl_dataset_rele(ds, FTAG);
-
- err = zap_cursor_retrieve(&zc, &za);
- if (err != 0) {
- zap_cursor_fini(&zc);
- if (err != ENOENT) {
- return (luaL_error(state,
- "unexpected error %d from zap_cursor_retrieve()",
- err));
- }
- return (0);
- }
- zap_cursor_advance(&zc);
- cursor = zap_cursor_serialize(&zc);
- zap_cursor_fini(&zc);
-
- err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone);
- if (err != 0) {
- return (luaL_error(state,
- "unexpected error %d from "
- "dsl_dataset_hold_obj(za_first_integer)", err));
- }
-
- dsl_dir_name(clone->ds_dir, clonename);
- dsl_dataset_rele(clone, FTAG);
-
- lua_pushnumber(state, cursor);
- lua_replace(state, lua_upvalueindex(2));
-
- (void) lua_pushstring(state, clonename);
- return (1);
-}
-
-static int zcp_clones_list(lua_State *);
-static zcp_list_info_t zcp_clones_list_info = {
- .name = "clones",
- .func = zcp_clones_list,
- .gc = NULL,
- .pargs = {
- { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_clones_list(lua_State *state)
-{
- const char *snapname = lua_tostring(state, 1);
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- boolean_t issnap;
- uint64_t dsobj, cursor;
-
- /*
- * zcp_dataset_hold will either successfully return the requested
- * dataset or throw a lua error and longjmp out of the zfs.list.clones
- * call without returning.
- */
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
- cursor = 0;
- issnap = ds->ds_is_snapshot;
- dsobj = ds->ds_object;
- dsl_dataset_rele(ds, FTAG);
-
- if (!issnap) {
- return (zcp_argerror(state, 1, "%s is not a snapshot",
- snapname));
- }
-
- lua_pushnumber(state, dsobj);
- lua_pushnumber(state, cursor);
- lua_pushcclosure(state, &zcp_clones_iter, 2);
- return (1);
-}
-
-static int
-zcp_snapshots_iter(lua_State *state)
-{
- int err;
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
- uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- dsl_dataset_t *ds;
- objset_t *os;
- char *p;
-
- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
- if (err != 0) {
- return (luaL_error(state,
- "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
- err));
- }
-
- dsl_dataset_name(ds, snapname);
- VERIFY3U(sizeof (snapname), >,
- strlcat(snapname, "@", sizeof (snapname)));
-
- p = strchr(snapname, '\0');
- VERIFY0(dmu_objset_from_ds(ds, &os));
- err = dmu_snapshot_list_next(os,
- sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL);
- dsl_dataset_rele(ds, FTAG);
-
- if (err == ENOENT) {
- return (0);
- } else if (err != 0) {
- return (luaL_error(state,
- "unexpected error %d from dmu_snapshot_list_next()", err));
- }
-
- lua_pushnumber(state, cursor);
- lua_replace(state, lua_upvalueindex(2));
-
- (void) lua_pushstring(state, snapname);
- return (1);
-}
-
-static int zcp_snapshots_list(lua_State *);
-static zcp_list_info_t zcp_snapshots_list_info = {
- .name = "snapshots",
- .func = zcp_snapshots_list,
- .gc = NULL,
- .pargs = {
- { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_snapshots_list(lua_State *state)
-{
- const char *fsname = lua_tostring(state, 1);
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- boolean_t issnap;
- uint64_t dsobj;
-
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
- issnap = ds->ds_is_snapshot;
- dsobj = ds->ds_object;
- dsl_dataset_rele(ds, FTAG);
-
- if (issnap) {
- return (zcp_argerror(state, 1,
- "argument %s cannot be a snapshot", fsname));
- }
-
- lua_pushnumber(state, dsobj);
- lua_pushnumber(state, 0);
- lua_pushcclosure(state, &zcp_snapshots_iter, 2);
- return (1);
-}
-
-/*
- * Note: channel programs only run in the global zone, so all datasets
- * are visible to this zone.
- */
-static boolean_t
-dataset_name_hidden(const char *name)
-{
- if (strchr(name, '$') != NULL)
- return (B_TRUE);
- if (strchr(name, '%') != NULL)
- return (B_TRUE);
- return (B_FALSE);
-}
-
-static int
-zcp_children_iter(lua_State *state)
-{
- int err;
- char childname[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
- uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
- zcp_run_info_t *ri = zcp_run_info(state);
- dsl_pool_t *dp = ri->zri_pool;
- dsl_dataset_t *ds;
- objset_t *os;
- char *p;
-
- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
- if (err != 0) {
- return (luaL_error(state,
- "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
- err));
- }
-
- dsl_dataset_name(ds, childname);
- VERIFY3U(sizeof (childname), >,
- strlcat(childname, "/", sizeof (childname)));
- p = strchr(childname, '\0');
-
- VERIFY0(dmu_objset_from_ds(ds, &os));
- do {
- err = dmu_dir_list_next(os,
- sizeof (childname) - (p - childname), p, NULL, &cursor);
- } while (err == 0 && dataset_name_hidden(childname));
- dsl_dataset_rele(ds, FTAG);
-
- if (err == ENOENT) {
- return (0);
- } else if (err != 0) {
- return (luaL_error(state,
- "unexpected error %d from dmu_dir_list_next()",
- err));
- }
-
- lua_pushnumber(state, cursor);
- lua_replace(state, lua_upvalueindex(2));
-
- (void) lua_pushstring(state, childname);
- return (1);
-}
-
-static int zcp_children_list(lua_State *);
-static zcp_list_info_t zcp_children_list_info = {
- .name = "children",
- .func = zcp_children_list,
- .gc = NULL,
- .pargs = {
- { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_children_list(lua_State *state)
-{
- const char *fsname = lua_tostring(state, 1);
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- boolean_t issnap;
- uint64_t dsobj;
-
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
-
- issnap = ds->ds_is_snapshot;
- dsobj = ds->ds_object;
- dsl_dataset_rele(ds, FTAG);
-
- if (issnap) {
- return (zcp_argerror(state, 1,
- "argument %s cannot be a snapshot", fsname));
- }
-
- lua_pushnumber(state, dsobj);
- lua_pushnumber(state, 0);
- lua_pushcclosure(state, &zcp_children_iter, 2);
- return (1);
-}
-
-static int
-zcp_props_list_gc(lua_State *state)
-{
- nvlist_t **props = lua_touserdata(state, 1);
- if (*props != NULL)
- fnvlist_free(*props);
- return (0);
-}
-
-static int
-zcp_props_iter(lua_State *state)
-{
- char *source, *val;
- nvlist_t *nvprop;
- nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1));
- nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2));
-
- do {
- pair = nvlist_next_nvpair(*props, pair);
- if (pair == NULL) {
- fnvlist_free(*props);
- *props = NULL;
- return (0);
- }
- } while (!zfs_prop_user(nvpair_name(pair)));
-
- lua_pushlightuserdata(state, pair);
- lua_replace(state, lua_upvalueindex(2));
-
- nvprop = fnvpair_value_nvlist(pair);
- val = fnvlist_lookup_string(nvprop, ZPROP_VALUE);
- source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE);
-
- (void) lua_pushstring(state, nvpair_name(pair));
- (void) lua_pushstring(state, val);
- (void) lua_pushstring(state, source);
- return (3);
-}
-
-static int zcp_props_list(lua_State *);
-static zcp_list_info_t zcp_props_list_info = {
- .name = "properties",
- .func = zcp_props_list,
- .gc = zcp_props_list_gc,
- .pargs = {
- { .za_name = "filesystem | snapshot | volume",
- .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_props_list(lua_State *state)
-{
- const char *dsname = lua_tostring(state, 1);
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- objset_t *os;
- nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *));
-
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
- VERIFY0(dmu_objset_from_ds(ds, &os));
- VERIFY0(dsl_prop_get_all(os, props));
- dsl_dataset_rele(ds, FTAG);
-
- /*
- * Set the metatable for the properties list to free it on completion.
- */
- luaL_getmetatable(state, zcp_props_list_info.name);
- (void) lua_setmetatable(state, -2);
-
- lua_pushlightuserdata(state, NULL);
- lua_pushcclosure(state, &zcp_props_iter, 2);
- return (1);
-}
-
-
-/*
- * Populate nv with all valid properties and their values for the given
- * dataset.
- */
-static void
-zcp_dataset_props(dsl_dataset_t *ds, nvlist_t *nv)
-{
- for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) {
- /* Do not display hidden props */
- if (!zfs_prop_visible(prop))
- continue;
- /* Do not display props not valid for this dataset */
- if (!prop_valid_for_ds(ds, prop))
- continue;
- fnvlist_add_boolean(nv, zfs_prop_to_name(prop));
- }
-}
-
-static int zcp_system_props_list(lua_State *);
-static zcp_list_info_t zcp_system_props_list_info = {
- .name = "system_properties",
- .func = zcp_system_props_list,
- .pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-/*
- * Get a list of all visble properties and their values for a given dataset.
- * Returned on the stack as a Lua table.
- */
-static int
-zcp_system_props_list(lua_State *state)
-{
- int error;
- char errbuf[128];
- const char *dataset_name;
- dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- zcp_list_info_t *libinfo = &zcp_system_props_list_info;
- zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
- dataset_name = lua_tostring(state, 1);
- nvlist_t *nv = fnvlist_alloc();
-
- dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
- if (ds == NULL)
- return (1); /* not reached; zcp_dataset_hold() longjmp'd */
-
- /* Get the names of all valid properties for this dataset */
- zcp_dataset_props(ds, nv);
- dsl_dataset_rele(ds, FTAG);
-
- /* push list as lua table */
- error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf));
- nvlist_free(nv);
- if (error != 0) {
- return (luaL_error(state,
- "Error returning nvlist: %s", errbuf));
- }
- return (1);
-}
-
-static int
-zcp_list_func(lua_State *state)
-{
- zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
-
- zcp_parse_args(state, info->name, info->pargs, info->kwargs);
-
- return (info->func(state));
-}
-
-int
-zcp_load_list_lib(lua_State *state)
-{
- int i;
- zcp_list_info_t *zcp_list_funcs[] = {
- &zcp_children_list_info,
- &zcp_snapshots_list_info,
- &zcp_props_list_info,
- &zcp_clones_list_info,
- &zcp_system_props_list_info,
- NULL
- };
-
- lua_newtable(state);
-
- for (i = 0; zcp_list_funcs[i] != NULL; i++) {
- zcp_list_info_t *info = zcp_list_funcs[i];
-
- if (info->gc != NULL) {
- /*
- * If the function requires garbage collection, create
- * a metatable with its name and register the __gc
- * function.
- */
- (void) luaL_newmetatable(state, info->name);
- (void) lua_pushstring(state, "__gc");
- lua_pushcfunction(state, info->gc);
- lua_settable(state, -3);
- lua_pop(state, 1);
- }
-
- lua_pushlightuserdata(state, info);
- lua_pushcclosure(state, &zcp_list_func, 1);
- lua_setfield(state, -2, info->name);
- info++;
- }
-
- return (1);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c
@@ -1,360 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
- */
-
-#include "lua.h"
-#include "lauxlib.h"
-
-#include <sys/zcp.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_bookmark.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dmu_objset.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfeature.h>
-#include <sys/metaslab.h>
-
-#define DST_AVG_BLKSHIFT 14
-
-typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *);
-typedef struct zcp_synctask_info {
- const char *name;
- zcp_synctask_func_t *func;
- const zcp_arg_t pargs[4];
- const zcp_arg_t kwargs[2];
- zfs_space_check_t space_check;
- int blocks_modified;
-} zcp_synctask_info_t;
-
-/*
- * Generic synctask interface for channel program syncfuncs.
- *
- * To perform some action in syncing context, we'd generally call
- * dsl_sync_task(), but since the Lua script is already running inside a
- * synctask we need to leave out some actions (such as acquiring the config
- * rwlock and performing space checks).
- *
- * If 'sync' is false, executes a dry run and returns the error code.
- *
- * If we are not running in syncing context and we are not doing a dry run
- * (meaning we are running a zfs.sync function in open-context) then we
- * return a Lua error.
- *
- * This function also handles common fatal error cases for channel program
- * library functions. If a fatal error occurs, err_dsname will be the dataset
- * name reported in error messages, if supplied.
- */
-static int
-zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc,
- dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname)
-{
- int err;
- zcp_run_info_t *ri = zcp_run_info(state);
-
- err = checkfunc(arg, ri->zri_tx);
- if (!sync)
- return (err);
-
- if (!ri->zri_sync) {
- return (luaL_error(state, "running functions from the zfs.sync "
- "submodule requires passing sync=TRUE to "
- "lzc_channel_program() (i.e. do not specify the \"-n\" "
- "command line argument)"));
- }
-
- if (err == 0) {
- syncfunc(arg, ri->zri_tx);
- } else if (err == EIO) {
- if (err_dsname != NULL) {
- return (luaL_error(state,
- "I/O error while accessing dataset '%s'",
- err_dsname));
- } else {
- return (luaL_error(state,
- "I/O error while accessing dataset."));
- }
- }
-
- return (err);
-}
-
-
-static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *);
-static zcp_synctask_info_t zcp_synctask_destroy_info = {
- .name = "destroy",
- .func = zcp_synctask_destroy,
- .pargs = {
- {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
- {NULL, 0}
- },
- .space_check = ZFS_SPACE_CHECK_DESTROY,
- .blocks_modified = 0
-};
-
-/* ARGSUSED */
-static int
-zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details)
-{
- int err;
- const char *dsname = lua_tostring(state, 1);
-
- boolean_t issnap = (strchr(dsname, '@') != NULL);
-
- if (!issnap && !lua_isnil(state, 2)) {
- return (luaL_error(state,
- "'deferred' kwarg only supported for snapshots: %s",
- dsname));
- }
-
- if (issnap) {
- dsl_destroy_snapshot_arg_t ddsa = { 0 };
- ddsa.ddsa_name = dsname;
- if (!lua_isnil(state, 2)) {
- ddsa.ddsa_defer = lua_toboolean(state, 2);
- } else {
- ddsa.ddsa_defer = B_FALSE;
- }
-
- err = zcp_sync_task(state, dsl_destroy_snapshot_check,
- dsl_destroy_snapshot_sync, &ddsa, sync, dsname);
- } else {
- dsl_destroy_head_arg_t ddha = { 0 };
- ddha.ddha_name = dsname;
-
- err = zcp_sync_task(state, dsl_destroy_head_check,
- dsl_destroy_head_sync, &ddha, sync, dsname);
- }
-
- return (err);
-}
-
-static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *);
-static zcp_synctask_info_t zcp_synctask_promote_info = {
- .name = "promote",
- .func = zcp_synctask_promote,
- .pargs = {
- {.za_name = "clone", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- },
- .space_check = ZFS_SPACE_CHECK_RESERVED,
- .blocks_modified = 3
-};
-
-static int
-zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details)
-{
- int err;
- dsl_dataset_promote_arg_t ddpa = { 0 };
- const char *dsname = lua_tostring(state, 1);
- zcp_run_info_t *ri = zcp_run_info(state);
-
- ddpa.ddpa_clonename = dsname;
- ddpa.err_ds = err_details;
- ddpa.cr = ri->zri_cred;
-
- /*
- * If there was a snapshot name conflict, then err_ds will be filled
- * with a list of conflicting snapshot names.
- */
- err = zcp_sync_task(state, dsl_dataset_promote_check,
- dsl_dataset_promote_sync, &ddpa, sync, dsname);
-
- return (err);
-}
-
-static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details);
-static zcp_synctask_info_t zcp_synctask_rollback_info = {
- .name = "rollback",
- .func = zcp_synctask_rollback,
- .space_check = ZFS_SPACE_CHECK_RESERVED,
- .blocks_modified = 1,
- .pargs = {
- {.za_name = "filesystem", .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- }
-};
-
-static int
-zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details)
-{
- int err;
- const char *dsname = lua_tostring(state, 1);
- dsl_dataset_rollback_arg_t ddra = { 0 };
-
- ddra.ddra_fsname = dsname;
- ddra.ddra_result = err_details;
-
- err = zcp_sync_task(state, dsl_dataset_rollback_check,
- dsl_dataset_rollback_sync, &ddra, sync, dsname);
-
- return (err);
-}
-
-static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *);
-static zcp_synctask_info_t zcp_synctask_snapshot_info = {
- .name = "snapshot",
- .func = zcp_synctask_snapshot,
- .pargs = {
- {.za_name = "filesystem@snapname | volume@snapname",
- .za_lua_type = LUA_TSTRING},
- {NULL, 0}
- },
- .kwargs = {
- {NULL, 0}
- },
- .space_check = ZFS_SPACE_CHECK_NORMAL,
- .blocks_modified = 3
-};
-
-/* ARGSUSED */
-static int
-zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
-{
- int err;
- dsl_dataset_snapshot_arg_t ddsa = { 0 };
- const char *dsname = lua_tostring(state, 1);
- zcp_run_info_t *ri = zcp_run_info(state);
-
- /*
- * On old pools, the ZIL must not be active when a snapshot is created,
- * but we can't suspend the ZIL because we're already in syncing
- * context.
- */
- if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) {
- return (ENOTSUP);
- }
-
- /*
- * We only allow for a single snapshot rather than a list, so the
- * error list output is unnecessary.
- */
- ddsa.ddsa_errors = NULL;
- ddsa.ddsa_props = NULL;
- ddsa.ddsa_cr = ri->zri_cred;
- ddsa.ddsa_snaps = fnvlist_alloc();
- fnvlist_add_boolean(ddsa.ddsa_snaps, dsname);
-
- zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
- (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps);
-
- err = zcp_sync_task(state, dsl_dataset_snapshot_check,
- dsl_dataset_snapshot_sync, &ddsa, sync, dsname);
-
- zcp_deregister_cleanup(state, zch);
- fnvlist_free(ddsa.ddsa_snaps);
-
- return (err);
-}
-
-static int
-zcp_synctask_wrapper(lua_State *state)
-{
- int err;
- zcp_cleanup_handler_t *zch;
- int num_ret = 1;
- nvlist_t *err_details = fnvlist_alloc();
-
- /*
- * Make sure err_details is properly freed, even if a fatal error is
- * thrown during the synctask.
- */
- zch = zcp_register_cleanup(state,
- (zcp_cleanup_t *)&fnvlist_free, err_details);
-
- zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
- boolean_t sync = lua_toboolean(state, lua_upvalueindex(2));
-
- zcp_run_info_t *ri = zcp_run_info(state);
- dsl_pool_t *dp = ri->zri_pool;
-
- /* MOS space is triple-dittoed, so we multiply by 3. */
- uint64_t funcspace = (info->blocks_modified << DST_AVG_BLKSHIFT) * 3;
-
- zcp_parse_args(state, info->name, info->pargs, info->kwargs);
-
- err = 0;
- if (info->space_check != ZFS_SPACE_CHECK_NONE) {
- uint64_t quota = dsl_pool_unreserved_space(dp,
- info->space_check);
- uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes +
- ri->zri_space_used;
-
- if (used + funcspace > quota) {
- err = SET_ERROR(ENOSPC);
- }
- }
-
- if (err == 0) {
- err = info->func(state, sync, err_details);
- }
-
- if (err == 0) {
- ri->zri_space_used += funcspace;
- }
-
- lua_pushnumber(state, (lua_Number)err);
- if (fnvlist_num_pairs(err_details) > 0) {
- (void) zcp_nvlist_to_lua(state, err_details, NULL, 0);
- num_ret++;
- }
-
- zcp_deregister_cleanup(state, zch);
- fnvlist_free(err_details);
-
- return (num_ret);
-}
-
-int
-zcp_load_synctask_lib(lua_State *state, boolean_t sync)
-{
- int i;
- zcp_synctask_info_t *zcp_synctask_funcs[] = {
- &zcp_synctask_destroy_info,
- &zcp_synctask_promote_info,
- &zcp_synctask_rollback_info,
- &zcp_synctask_snapshot_info,
- NULL
- };
-
- lua_newtable(state);
-
- for (i = 0; zcp_synctask_funcs[i] != NULL; i++) {
- zcp_synctask_info_t *info = zcp_synctask_funcs[i];
- lua_pushlightuserdata(state, info);
- lua_pushboolean(state, sync);
- lua_pushcclosure(state, &zcp_synctask_wrapper, 2);
- lua_setfield(state, -2, info->name);
- info++;
- }
-
- return (1);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
@@ -1,505 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zfeature.h>
-#include <sys/dmu.h>
-#include <sys/nvpair.h>
-#include <sys/zap.h>
-#include <sys/dmu_tx.h>
-#include "zfeature_common.h"
-#include <sys/spa_impl.h>
-
-/*
- * ZFS Feature Flags
- * -----------------
- *
- * ZFS feature flags are used to provide fine-grained versioning to the ZFS
- * on-disk format. Once enabled on a pool feature flags replace the old
- * spa_version() number.
- *
- * Each new on-disk format change will be given a uniquely identifying string
- * guid rather than a version number. This avoids the problem of different
- * organizations creating new on-disk formats with the same version number. To
- * keep feature guids unique they should consist of the reverse dns name of the
- * organization which implemented the feature and a short name for the feature,
- * separated by a colon (e.g. com.delphix:async_destroy).
- *
- * Reference Counts
- * ----------------
- *
- * Within each pool features can be in one of three states: disabled, enabled,
- * or active. These states are differentiated by a reference count stored on
- * disk for each feature:
- *
- * 1) If there is no reference count stored on disk the feature is disabled.
- * 2) If the reference count is 0 a system administrator has enabled the
- * feature, but the feature has not been used yet, so no on-disk
- * format changes have been made.
- * 3) If the reference count is greater than 0 the feature is active.
- * The format changes required by the feature are currently on disk.
- * Note that if the feature's format changes are reversed the feature
- * may choose to set its reference count back to 0.
- *
- * Feature flags makes no differentiation between non-zero reference counts
- * for an active feature (e.g. a reference count of 1 means the same thing as a
- * reference count of 27834721), but feature implementations may choose to use
- * the reference count to store meaningful information. For example, a new RAID
- * implementation might set the reference count to the number of vdevs using
- * it. If all those disks are removed from the pool the feature goes back to
- * having a reference count of 0.
- *
- * It is the responsibility of the individual features to maintain a non-zero
- * reference count as long as the feature's format changes are present on disk.
- *
- * Dependencies
- * ------------
- *
- * Each feature may depend on other features. The only effect of this
- * relationship is that when a feature is enabled all of its dependencies are
- * automatically enabled as well. Any future work to support disabling of
- * features would need to ensure that features cannot be disabled if other
- * enabled features depend on them.
- *
- * On-disk Format
- * --------------
- *
- * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
- * (5000). In order for this to work the pool is automatically upgraded to
- * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk
- * format changes will be in use.
- *
- * Information about features is stored in 3 ZAP objects in the pool's MOS.
- * These objects are linked to by the following names in the pool directory
- * object:
- *
- * 1) features_for_read: feature guid -> reference count
- * Features needed to open the pool for reading.
- * 2) features_for_write: feature guid -> reference count
- * Features needed to open the pool for writing.
- * 3) feature_descriptions: feature guid -> descriptive string
- * A human readable string.
- *
- * All enabled features appear in either features_for_read or
- * features_for_write, but not both.
- *
- * To open a pool in read-only mode only the features listed in
- * features_for_read need to be supported.
- *
- * To open the pool in read-write mode features in both features_for_read and
- * features_for_write need to be supported.
- *
- * Some features may be required to read the ZAP objects containing feature
- * information. To allow software to check for compatibility with these features
- * before the pool is opened their names must be stored in the label in a
- * new "features_for_read" entry (note that features that are only required
- * to write to a pool never need to be stored in the label since the
- * features_for_write ZAP object can be read before the pool is written to).
- * To save space in the label features must be explicitly marked as needing to
- * be written to the label. Also, reference counts are not stored in the label,
- * instead any feature whose reference count drops to 0 is removed from the
- * label.
- *
- * Adding New Features
- * -------------------
- *
- * Features must be registered in zpool_feature_init() function in
- * zfeature_common.c using the zfeature_register() function. This function
- * has arguments to specify if the feature should be stored in the
- * features_for_read or features_for_write ZAP object and if it needs to be
- * written to the label when active.
- *
- * Once a feature is registered it will appear as a "feature@<feature name>"
- * property which can be set by an administrator. Feature implementors should
- * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
- * query the state of a feature and the spa_feature_incr() and
- * spa_feature_decr() functions to change an enabled feature's reference count.
- * Reference counts may only be updated in the syncing context.
- *
- * Features may not perform enable-time initialization. Instead, any such
- * initialization should occur when the feature is first used. This design
- * enforces that on-disk changes be made only when features are used. Code
- * should only check if a feature is enabled using spa_feature_is_enabled(),
- * not by relying on any feature specific metadata existing. If a feature is
- * enabled, but the feature's metadata is not on disk yet then it should be
- * created as needed.
- *
- * As an example, consider the com.delphix:async_destroy feature. This feature
- * relies on the existence of a bptree in the MOS that store blocks for
- * asynchronous freeing. This bptree is not created when async_destroy is
- * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
- * called to check if async_destroy is enabled. If it is and the bptree object
- * does not exist yet, the bptree object is created as part of the dataset
- * destroy and async_destroy's reference count is incremented to indicate it
- * has made an on-disk format change. Later, after the destroyed dataset's
- * blocks have all been asynchronously freed there is no longer any use for the
- * bptree object, so it is destroyed and async_destroy's reference count is
- * decremented back to 0 to indicate that it has undone its on-disk format
- * changes.
- */
-
-typedef enum {
- FEATURE_ACTION_INCR,
- FEATURE_ACTION_DECR,
-} feature_action_t;
-
-/*
- * Checks that the active features in the pool are supported by
- * this software. Adds each unsupported feature (name -> description) to
- * the supplied nvlist.
- */
-boolean_t
-spa_features_check(spa_t *spa, boolean_t for_write,
- nvlist_t *unsup_feat, nvlist_t *enabled_feat)
-{
- objset_t *os = spa->spa_meta_objset;
- boolean_t supported;
- zap_cursor_t zc;
- zap_attribute_t za;
- uint64_t obj = for_write ?
- spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
-
- supported = B_TRUE;
- for (zap_cursor_init(&zc, os, obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- ASSERT(za.za_integer_length == sizeof (uint64_t) &&
- za.za_num_integers == 1);
-
- if (NULL != enabled_feat) {
- fnvlist_add_uint64(enabled_feat, za.za_name,
- za.za_first_integer);
- }
-
- if (za.za_first_integer != 0 &&
- !zfeature_is_supported(za.za_name)) {
- supported = B_FALSE;
-
- if (NULL != unsup_feat) {
- char *desc = "";
- char buf[MAXPATHLEN];
-
- if (zap_lookup(os, spa->spa_feat_desc_obj,
- za.za_name, 1, sizeof (buf), buf) == 0)
- desc = buf;
-
- VERIFY(nvlist_add_string(unsup_feat, za.za_name,
- desc) == 0);
- }
- }
- }
- zap_cursor_fini(&zc);
-
- return (supported);
-}
-
-/*
- * Use an in-memory cache of feature refcounts for quick retrieval.
- *
- * Note: well-designed features will not need to use this; they should
- * use spa_feature_is_enabled() and spa_feature_is_active() instead.
- * However, this is non-static for zdb, zhack, and spa_add_feature_stats().
- */
-int
-feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
-{
- ASSERT(VALID_FEATURE_FID(feature->fi_feature));
- if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
- SPA_FEATURE_DISABLED) {
- return (SET_ERROR(ENOTSUP));
- }
- *res = spa->spa_feat_refcount_cache[feature->fi_feature];
- return (0);
-}
-
-/*
- * Note: well-designed features will not need to use this; they should
- * use spa_feature_is_enabled() and spa_feature_is_active() instead.
- * However, this is non-static for zdb and zhack.
- */
-int
-feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
- uint64_t *res)
-{
- int err;
- uint64_t refcount;
- uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
- spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
-
- /*
- * If the pool is currently being created, the feature objects may not
- * have been allocated yet. Act as though all features are disabled.
- */
- if (zapobj == 0)
- return (SET_ERROR(ENOTSUP));
-
- err = zap_lookup(spa->spa_meta_objset, zapobj,
- feature->fi_guid, sizeof (uint64_t), 1, &refcount);
- if (err != 0) {
- if (err == ENOENT)
- return (SET_ERROR(ENOTSUP));
- else
- return (err);
- }
- *res = refcount;
- return (0);
-}
-
-
-static int
-feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
-{
- uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
-
- ASSERT(zfeature_depends_on(feature->fi_feature,
- SPA_FEATURE_ENABLED_TXG));
-
- if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
- return (SET_ERROR(ENOTSUP));
- }
-
- ASSERT(enabled_txg_obj != 0);
-
- VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
- feature->fi_guid, sizeof (uint64_t), 1, res));
-
- return (0);
-}
-
-/*
- * This function is non-static for zhack; it should otherwise not be used
- * outside this file.
- */
-void
-feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
- dmu_tx_t *tx)
-{
- ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
- uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
- spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
-
- VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
- sizeof (uint64_t), 1, &refcount, tx));
-
- /*
- * feature_sync is called directly from zhack, allowing the
- * creation of arbitrary features whose fi_feature field may
- * be greater than SPA_FEATURES. When called from zhack, the
- * zfeature_info_t object's fi_feature field will be set to
- * SPA_FEATURE_NONE.
- */
- if (feature->fi_feature != SPA_FEATURE_NONE) {
- uint64_t *refcount_cache =
- &spa->spa_feat_refcount_cache[feature->fi_feature];
- VERIFY3U(*refcount_cache, ==,
- atomic_swap_64(refcount_cache, refcount));
- }
-
- if (refcount == 0)
- spa_deactivate_mos_feature(spa, feature->fi_guid);
- else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
- spa_activate_mos_feature(spa, feature->fi_guid, tx);
-}
-
-/*
- * This function is non-static for zhack; it should otherwise not be used
- * outside this file.
- */
-void
-feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
-{
- uint64_t initial_refcount =
- (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
- uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
- spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
-
- ASSERT(0 != zapobj);
- ASSERT(zfeature_is_valid_guid(feature->fi_guid));
- ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
-
- /*
- * If the feature is already enabled, ignore the request.
- */
- if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
- return;
-
- for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
- spa_feature_enable(spa, feature->fi_depends[i], tx);
-
- VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
- feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
- feature->fi_desc, tx));
-
- feature_sync(spa, feature, initial_refcount, tx);
-
- if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
- uint64_t enabling_txg = dmu_tx_get_txg(tx);
-
- if (spa->spa_feat_enabled_txg_obj == 0ULL) {
- spa->spa_feat_enabled_txg_obj =
- zap_create_link(spa->spa_meta_objset,
- DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_FEATURE_ENABLED_TXG, tx);
- }
- spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
-
- VERIFY0(zap_add(spa->spa_meta_objset,
- spa->spa_feat_enabled_txg_obj, feature->fi_guid,
- sizeof (uint64_t), 1, &enabling_txg, tx));
- }
-}
-
-static void
-feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
- dmu_tx_t *tx)
-{
- uint64_t refcount;
- zfeature_info_t *feature = &spa_feature_table[fid];
- uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
- spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
-
- ASSERT(VALID_FEATURE_FID(fid));
- ASSERT(0 != zapobj);
- ASSERT(zfeature_is_valid_guid(feature->fi_guid));
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
-
- VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
-
- switch (action) {
- case FEATURE_ACTION_INCR:
- VERIFY3U(refcount, !=, UINT64_MAX);
- refcount++;
- break;
- case FEATURE_ACTION_DECR:
- VERIFY3U(refcount, !=, 0);
- refcount--;
- break;
- default:
- ASSERT(0);
- break;
- }
-
- feature_sync(spa, feature, refcount, tx);
-}
-
-void
-spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
-{
- /*
- * We create feature flags ZAP objects in two instances: during pool
- * creation and during pool upgrade.
- */
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
- tx->tx_txg == TXG_INITIAL));
-
- spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
- DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_FEATURES_FOR_READ, tx);
- spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
- DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_FEATURES_FOR_WRITE, tx);
- spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
- DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_FEATURE_DESCRIPTIONS, tx);
-}
-
-/*
- * Enable any required dependencies, then enable the requested feature.
- */
-void
-spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
-{
- ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
- ASSERT(VALID_FEATURE_FID(fid));
- feature_enable_sync(spa, &spa_feature_table[fid], tx);
-}
-
-void
-spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
-{
- feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
-}
-
-void
-spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
-{
- feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
-}
-
-boolean_t
-spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
-{
- int err;
- uint64_t refcount;
-
- ASSERT(VALID_FEATURE_FID(fid));
- if (spa_version(spa) < SPA_VERSION_FEATURES)
- return (B_FALSE);
-
- err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
- ASSERT(err == 0 || err == ENOTSUP);
- return (err == 0);
-}
-
-boolean_t
-spa_feature_is_active(spa_t *spa, spa_feature_t fid)
-{
- int err;
- uint64_t refcount;
-
- ASSERT(VALID_FEATURE_FID(fid));
- if (spa_version(spa) < SPA_VERSION_FEATURES)
- return (B_FALSE);
-
- err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
- ASSERT(err == 0 || err == ENOTSUP);
- return (err == 0 && refcount > 0);
-}
-
-/*
- * For the feature specified by fid (which must depend on
- * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
- * OUT txg argument.
- *
- * Returns B_TRUE if the feature is enabled, in which case txg will be filled
- * with the transaction group in which the specified feature was enabled.
- * Returns B_FALSE otherwise (i.e. if the feature is not enabled).
- */
-boolean_t
-spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
-{
- int err;
-
- ASSERT(VALID_FEATURE_FID(fid));
- if (spa_version(spa) < SPA_VERSION_FEATURES)
- return (B_FALSE);
-
- err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
- ASSERT(err == 0 || err == ENOTSUP);
-
- return (err == 0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
@@ -1,28 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
-# ident "%Z%%M% %I% %E% SMI"
-#
-name="zfs" parent="pseudo";
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -1,2778 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/sdt.h>
-#include <sys/fs/zfs.h>
-#include <sys/policy.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_fuid.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/dmu.h>
-#include <sys/dnode.h>
-#include <sys/zap.h>
-#include <sys/sa.h>
-#include <acl/acl_common.h>
-
-#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
-#define DENY ACE_ACCESS_DENIED_ACE_TYPE
-#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
-#define MIN_ACE_TYPE ALLOW
-
-#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
-#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
- ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
-#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-
-#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
- ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
- ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
- ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
-
-#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
-#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
- ACE_DELETE|ACE_DELETE_CHILD)
-#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
-
-#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
- ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
-
-#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
-
-#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
- ZFS_ACL_PROTECTED)
-
-#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
- ZFS_ACL_OBJ_ACE)
-
-#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
-
-static uint16_t
-zfs_ace_v0_get_type(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_type);
-}
-
-static uint16_t
-zfs_ace_v0_get_flags(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_flags);
-}
-
-static uint32_t
-zfs_ace_v0_get_mask(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_access_mask);
-}
-
-static uint64_t
-zfs_ace_v0_get_who(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_fuid);
-}
-
-static void
-zfs_ace_v0_set_type(void *acep, uint16_t type)
-{
- ((zfs_oldace_t *)acep)->z_type = type;
-}
-
-static void
-zfs_ace_v0_set_flags(void *acep, uint16_t flags)
-{
- ((zfs_oldace_t *)acep)->z_flags = flags;
-}
-
-static void
-zfs_ace_v0_set_mask(void *acep, uint32_t mask)
-{
- ((zfs_oldace_t *)acep)->z_access_mask = mask;
-}
-
-static void
-zfs_ace_v0_set_who(void *acep, uint64_t who)
-{
- ((zfs_oldace_t *)acep)->z_fuid = who;
-}
-
-/*ARGSUSED*/
-static size_t
-zfs_ace_v0_size(void *acep)
-{
- return (sizeof (zfs_oldace_t));
-}
-
-static size_t
-zfs_ace_v0_abstract_size(void)
-{
- return (sizeof (zfs_oldace_t));
-}
-
-static int
-zfs_ace_v0_mask_off(void)
-{
- return (offsetof(zfs_oldace_t, z_access_mask));
-}
-
-/*ARGSUSED*/
-static int
-zfs_ace_v0_data(void *acep, void **datap)
-{
- *datap = NULL;
- return (0);
-}
-
-static acl_ops_t zfs_acl_v0_ops = {
- zfs_ace_v0_get_mask,
- zfs_ace_v0_set_mask,
- zfs_ace_v0_get_flags,
- zfs_ace_v0_set_flags,
- zfs_ace_v0_get_type,
- zfs_ace_v0_set_type,
- zfs_ace_v0_get_who,
- zfs_ace_v0_set_who,
- zfs_ace_v0_size,
- zfs_ace_v0_abstract_size,
- zfs_ace_v0_mask_off,
- zfs_ace_v0_data
-};
-
-static uint16_t
-zfs_ace_fuid_get_type(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_type);
-}
-
-static uint16_t
-zfs_ace_fuid_get_flags(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_flags);
-}
-
-static uint32_t
-zfs_ace_fuid_get_mask(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_access_mask);
-}
-
-static uint64_t
-zfs_ace_fuid_get_who(void *args)
-{
- uint16_t entry_type;
- zfs_ace_t *acep = args;
-
- entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
-
- if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return (-1);
- return (((zfs_ace_t *)acep)->z_fuid);
-}
-
-static void
-zfs_ace_fuid_set_type(void *acep, uint16_t type)
-{
- ((zfs_ace_hdr_t *)acep)->z_type = type;
-}
-
-static void
-zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
-{
- ((zfs_ace_hdr_t *)acep)->z_flags = flags;
-}
-
-static void
-zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
-{
- ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
-}
-
-static void
-zfs_ace_fuid_set_who(void *arg, uint64_t who)
-{
- zfs_ace_t *acep = arg;
-
- uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
-
- if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return;
- acep->z_fuid = who;
-}
-
-static size_t
-zfs_ace_fuid_size(void *acep)
-{
- zfs_ace_hdr_t *zacep = acep;
- uint16_t entry_type;
-
- switch (zacep->z_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- return (sizeof (zfs_object_ace_t));
- case ALLOW:
- case DENY:
- entry_type =
- (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
- if (entry_type == ACE_OWNER ||
- entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return (sizeof (zfs_ace_hdr_t));
- /*FALLTHROUGH*/
- default:
- return (sizeof (zfs_ace_t));
- }
-}
-
-static size_t
-zfs_ace_fuid_abstract_size(void)
-{
- return (sizeof (zfs_ace_hdr_t));
-}
-
-static int
-zfs_ace_fuid_mask_off(void)
-{
- return (offsetof(zfs_ace_hdr_t, z_access_mask));
-}
-
-static int
-zfs_ace_fuid_data(void *acep, void **datap)
-{
- zfs_ace_t *zacep = acep;
- zfs_object_ace_t *zobjp;
-
- switch (zacep->z_hdr.z_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- zobjp = acep;
- *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
- return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
- default:
- *datap = NULL;
- return (0);
- }
-}
-
-static acl_ops_t zfs_acl_fuid_ops = {
- zfs_ace_fuid_get_mask,
- zfs_ace_fuid_set_mask,
- zfs_ace_fuid_get_flags,
- zfs_ace_fuid_set_flags,
- zfs_ace_fuid_get_type,
- zfs_ace_fuid_set_type,
- zfs_ace_fuid_get_who,
- zfs_ace_fuid_set_who,
- zfs_ace_fuid_size,
- zfs_ace_fuid_abstract_size,
- zfs_ace_fuid_mask_off,
- zfs_ace_fuid_data
-};
-
-/*
- * The following three functions are provided for compatibility with
- * older ZPL version in order to determine if the file use to have
- * an external ACL and what version of ACL previously existed on the
- * file. Would really be nice to not need this, sigh.
- */
-uint64_t
-zfs_external_acl(znode_t *zp)
-{
- zfs_acl_phys_t acl_phys;
- int error;
-
- if (zp->z_is_sa)
- return (0);
-
- /*
- * Need to deal with a potential
- * race where zfs_sa_upgrade could cause
- * z_isa_sa to change.
- *
- * If the lookup fails then the state of z_is_sa should have
- * changed.
- */
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
- &acl_phys, sizeof (acl_phys))) == 0)
- return (acl_phys.z_acl_extern_obj);
- else {
- /*
- * after upgrade the SA_ZPL_ZNODE_ACL should have been
- * removed
- */
- VERIFY(zp->z_is_sa && error == ENOENT);
- return (0);
- }
-}
-
-/*
- * Determine size of ACL in bytes
- *
- * This is more complicated than it should be since we have to deal
- * with old external ACLs.
- */
-static int
-zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
- zfs_acl_phys_t *aclphys)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t acl_count;
- int size;
- int error;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- if (zp->z_is_sa) {
- if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
- &size)) != 0)
- return (error);
- *aclsize = size;
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
- &acl_count, sizeof (acl_count))) != 0)
- return (error);
- *aclcount = acl_count;
- } else {
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
- aclphys, sizeof (*aclphys))) != 0)
- return (error);
-
- if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
- *aclcount = aclphys->z_acl_size;
- } else {
- *aclsize = aclphys->z_acl_size;
- *aclcount = aclphys->z_acl_count;
- }
- }
- return (0);
-}
-
-int
-zfs_znode_acl_version(znode_t *zp)
-{
- zfs_acl_phys_t acl_phys;
-
- if (zp->z_is_sa)
- return (ZFS_ACL_VERSION_FUID);
- else {
- int error;
-
- /*
- * Need to deal with a potential
- * race where zfs_sa_upgrade could cause
- * z_isa_sa to change.
- *
- * If the lookup fails then the state of z_is_sa should have
- * changed.
- */
- if ((error = sa_lookup(zp->z_sa_hdl,
- SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
- &acl_phys, sizeof (acl_phys))) == 0)
- return (acl_phys.z_acl_version);
- else {
- /*
- * After upgrade SA_ZPL_ZNODE_ACL should have
- * been removed.
- */
- VERIFY(zp->z_is_sa && error == ENOENT);
- return (ZFS_ACL_VERSION_FUID);
- }
- }
-}
-
-static int
-zfs_acl_version(int version)
-{
- if (version < ZPL_VERSION_FUID)
- return (ZFS_ACL_VERSION_INITIAL);
- else
- return (ZFS_ACL_VERSION_FUID);
-}
-
-static int
-zfs_acl_version_zp(znode_t *zp)
-{
- return (zfs_acl_version(zp->z_zfsvfs->z_version));
-}
-
-zfs_acl_t *
-zfs_acl_alloc(int vers)
-{
- zfs_acl_t *aclp;
-
- aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
- list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
- offsetof(zfs_acl_node_t, z_next));
- aclp->z_version = vers;
- if (vers == ZFS_ACL_VERSION_FUID)
- aclp->z_ops = zfs_acl_fuid_ops;
- else
- aclp->z_ops = zfs_acl_v0_ops;
- return (aclp);
-}
-
-zfs_acl_node_t *
-zfs_acl_node_alloc(size_t bytes)
-{
- zfs_acl_node_t *aclnode;
-
- aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
- if (bytes) {
- aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
- aclnode->z_allocdata = aclnode->z_acldata;
- aclnode->z_allocsize = bytes;
- aclnode->z_size = bytes;
- }
-
- return (aclnode);
-}
-
-static void
-zfs_acl_node_free(zfs_acl_node_t *aclnode)
-{
- if (aclnode->z_allocsize)
- kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
- kmem_free(aclnode, sizeof (zfs_acl_node_t));
-}
-
-static void
-zfs_acl_release_nodes(zfs_acl_t *aclp)
-{
- zfs_acl_node_t *aclnode;
-
- while (aclnode = list_head(&aclp->z_acl)) {
- list_remove(&aclp->z_acl, aclnode);
- zfs_acl_node_free(aclnode);
- }
- aclp->z_acl_count = 0;
- aclp->z_acl_bytes = 0;
-}
-
-void
-zfs_acl_free(zfs_acl_t *aclp)
-{
- zfs_acl_release_nodes(aclp);
- list_destroy(&aclp->z_acl);
- kmem_free(aclp, sizeof (zfs_acl_t));
-}
-
-static boolean_t
-zfs_acl_valid_ace_type(uint_t type, uint_t flags)
-{
- uint16_t entry_type;
-
- switch (type) {
- case ALLOW:
- case DENY:
- case ACE_SYSTEM_AUDIT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_ACE_TYPE:
- entry_type = flags & ACE_TYPE_FLAGS;
- return (entry_type == ACE_OWNER ||
- entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE || entry_type == 0 ||
- entry_type == ACE_IDENTIFIER_GROUP);
- default:
- if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-static boolean_t
-zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
-{
- /*
- * first check type of entry
- */
-
- if (!zfs_acl_valid_ace_type(type, iflags))
- return (B_FALSE);
-
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- if (aclp->z_version < ZFS_ACL_VERSION_FUID)
- return (B_FALSE);
- aclp->z_hints |= ZFS_ACL_OBJ_ACE;
- }
-
- /*
- * next check inheritance level flags
- */
-
- if (obj_type == VDIR &&
- (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
- aclp->z_hints |= ZFS_INHERIT_ACE;
-
- if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) == 0) {
- return (B_FALSE);
- }
- }
-
- return (B_TRUE);
-}
-
-static void *
-zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
- uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
-{
- zfs_acl_node_t *aclnode;
-
- ASSERT(aclp);
-
- if (start == NULL) {
- aclnode = list_head(&aclp->z_acl);
- if (aclnode == NULL)
- return (NULL);
-
- aclp->z_next_ace = aclnode->z_acldata;
- aclp->z_curr_node = aclnode;
- aclnode->z_ace_idx = 0;
- }
-
- aclnode = aclp->z_curr_node;
-
- if (aclnode == NULL)
- return (NULL);
-
- if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
- aclnode = list_next(&aclp->z_acl, aclnode);
- if (aclnode == NULL)
- return (NULL);
- else {
- aclp->z_curr_node = aclnode;
- aclnode->z_ace_idx = 0;
- aclp->z_next_ace = aclnode->z_acldata;
- }
- }
-
- if (aclnode->z_ace_idx < aclnode->z_ace_count) {
- void *acep = aclp->z_next_ace;
- size_t ace_size;
-
- /*
- * Make sure we don't overstep our bounds
- */
- ace_size = aclp->z_ops.ace_size(acep);
-
- if (((caddr_t)acep + ace_size) >
- ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
- return (NULL);
- }
-
- *iflags = aclp->z_ops.ace_flags_get(acep);
- *type = aclp->z_ops.ace_type_get(acep);
- *access_mask = aclp->z_ops.ace_mask_get(acep);
- *who = aclp->z_ops.ace_who_get(acep);
- aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
- aclnode->z_ace_idx++;
-
- return ((void *)acep);
- }
- return (NULL);
-}
-
-/*ARGSUSED*/
-static uint64_t
-zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
- uint16_t *flags, uint16_t *type, uint32_t *mask)
-{
- zfs_acl_t *aclp = datap;
- zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
- uint64_t who;
-
- acep = zfs_acl_next_ace(aclp, acep, &who, mask,
- flags, type);
- return ((uint64_t)(uintptr_t)acep);
-}
-
-static zfs_acl_node_t *
-zfs_acl_curr_node(zfs_acl_t *aclp)
-{
- ASSERT(aclp->z_curr_node);
- return (aclp->z_curr_node);
-}
-
-/*
- * Copy ACE to internal ZFS format.
- * While processing the ACL each ACE will be validated for correctness.
- * ACE FUIDs will be created later.
- */
-int
-zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
- void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
- zfs_fuid_info_t **fuidp, cred_t *cr)
-{
- int i;
- uint16_t entry_type;
- zfs_ace_t *aceptr = z_acl;
- ace_t *acep = datap;
- zfs_object_ace_t *zobjacep;
- ace_object_t *aceobjp;
-
- for (i = 0; i != aclcnt; i++) {
- aceptr->z_hdr.z_access_mask = acep->a_access_mask;
- aceptr->z_hdr.z_flags = acep->a_flags;
- aceptr->z_hdr.z_type = acep->a_type;
- entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
- if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
- entry_type != ACE_EVERYONE) {
- aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
- cr, (entry_type == 0) ?
- ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
- }
-
- /*
- * Make sure ACE is valid
- */
- if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
- aceptr->z_hdr.z_flags) != B_TRUE)
- return (SET_ERROR(EINVAL));
-
- switch (acep->a_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- zobjacep = (zfs_object_ace_t *)aceptr;
- aceobjp = (ace_object_t *)acep;
-
- bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
- sizeof (aceobjp->a_obj_type));
- bcopy(aceobjp->a_inherit_obj_type,
- zobjacep->z_inherit_type,
- sizeof (aceobjp->a_inherit_obj_type));
- acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
- break;
- default:
- acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
- }
-
- aceptr = (zfs_ace_t *)((caddr_t)aceptr +
- aclp->z_ops.ace_size(aceptr));
- }
-
- *size = (caddr_t)aceptr - (caddr_t)z_acl;
-
- return (0);
-}
-
-/*
- * Copy ZFS ACEs to fixed size ace_t layout
- */
-static void
-zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
- void *datap, int filter)
-{
- uint64_t who;
- uint32_t access_mask;
- uint16_t iflags, type;
- zfs_ace_hdr_t *zacep = NULL;
- ace_t *acep = datap;
- ace_object_t *objacep;
- zfs_object_ace_t *zobjacep;
- size_t ace_size;
- uint16_t entry_type;
-
- while (zacep = zfs_acl_next_ace(aclp, zacep,
- &who, &access_mask, &iflags, &type)) {
-
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- if (filter) {
- continue;
- }
- zobjacep = (zfs_object_ace_t *)zacep;
- objacep = (ace_object_t *)acep;
- bcopy(zobjacep->z_object_type,
- objacep->a_obj_type,
- sizeof (zobjacep->z_object_type));
- bcopy(zobjacep->z_inherit_type,
- objacep->a_inherit_obj_type,
- sizeof (zobjacep->z_inherit_type));
- ace_size = sizeof (ace_object_t);
- break;
- default:
- ace_size = sizeof (ace_t);
- break;
- }
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
- if ((entry_type != ACE_OWNER &&
- entry_type != OWNING_GROUP &&
- entry_type != ACE_EVERYONE)) {
- acep->a_who = zfs_fuid_map_id(zfsvfs, who,
- cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
- ZFS_ACE_GROUP : ZFS_ACE_USER);
- } else {
- acep->a_who = (uid_t)(int64_t)who;
- }
- acep->a_access_mask = access_mask;
- acep->a_flags = iflags;
- acep->a_type = type;
- acep = (ace_t *)((caddr_t)acep + ace_size);
- }
-}
-
-static int
-zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
- zfs_oldace_t *z_acl, int aclcnt, size_t *size)
-{
- int i;
- zfs_oldace_t *aceptr = z_acl;
-
- for (i = 0; i != aclcnt; i++, aceptr++) {
- aceptr->z_access_mask = acep[i].a_access_mask;
- aceptr->z_type = acep[i].a_type;
- aceptr->z_flags = acep[i].a_flags;
- aceptr->z_fuid = acep[i].a_who;
- /*
- * Make sure ACE is valid
- */
- if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
- aceptr->z_flags) != B_TRUE)
- return (SET_ERROR(EINVAL));
- }
- *size = (caddr_t)aceptr - (caddr_t)z_acl;
- return (0);
-}
-
-/*
- * convert old ACL format to new
- */
-void
-zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
-{
- zfs_oldace_t *oldaclp;
- int i;
- uint16_t type, iflags;
- uint32_t access_mask;
- uint64_t who;
- void *cookie = NULL;
- zfs_acl_node_t *newaclnode;
-
- ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
- /*
- * First create the ACE in a contiguous piece of memory
- * for zfs_copy_ace_2_fuid().
- *
- * We only convert an ACL once, so this won't happen
- * everytime.
- */
- oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
- KM_SLEEP);
- i = 0;
- while (cookie = zfs_acl_next_ace(aclp, cookie, &who,
- &access_mask, &iflags, &type)) {
- oldaclp[i].z_flags = iflags;
- oldaclp[i].z_type = type;
- oldaclp[i].z_fuid = who;
- oldaclp[i++].z_access_mask = access_mask;
- }
-
- newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
- sizeof (zfs_object_ace_t));
- aclp->z_ops = zfs_acl_fuid_ops;
- VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
- oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
- &newaclnode->z_size, NULL, cr) == 0);
- newaclnode->z_ace_count = aclp->z_acl_count;
- aclp->z_version = ZFS_ACL_VERSION;
- kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
-
- /*
- * Release all previous ACL nodes
- */
-
- zfs_acl_release_nodes(aclp);
-
- list_insert_head(&aclp->z_acl, newaclnode);
-
- aclp->z_acl_bytes = newaclnode->z_size;
- aclp->z_acl_count = newaclnode->z_ace_count;
-
-}
-
-/*
- * Convert unix access mask to v4 access mask
- */
-static uint32_t
-zfs_unix_to_v4(uint32_t access_mask)
-{
- uint32_t new_mask = 0;
-
- if (access_mask & S_IXOTH)
- new_mask |= ACE_EXECUTE;
- if (access_mask & S_IWOTH)
- new_mask |= ACE_WRITE_DATA;
- if (access_mask & S_IROTH)
- new_mask |= ACE_READ_DATA;
- return (new_mask);
-}
-
-static void
-zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
- uint16_t access_type, uint64_t fuid, uint16_t entry_type)
-{
- uint16_t type = entry_type & ACE_TYPE_FLAGS;
-
- aclp->z_ops.ace_mask_set(acep, access_mask);
- aclp->z_ops.ace_type_set(acep, access_type);
- aclp->z_ops.ace_flags_set(acep, entry_type);
- if ((type != ACE_OWNER && type != OWNING_GROUP &&
- type != ACE_EVERYONE))
- aclp->z_ops.ace_who_set(acep, fuid);
-}
-
-/*
- * Determine mode of file based on ACL.
- */
-uint64_t
-zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
- uint64_t *pflags, uint64_t fuid, uint64_t fgid)
-{
- int entry_type;
- mode_t mode;
- mode_t seen = 0;
- zfs_ace_hdr_t *acep = NULL;
- uint64_t who;
- uint16_t iflags, type;
- uint32_t access_mask;
- boolean_t an_exec_denied = B_FALSE;
-
- mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
-
- while (acep = zfs_acl_next_ace(aclp, acep, &who,
- &access_mask, &iflags, &type)) {
-
- if (!zfs_acl_valid_ace_type(type, iflags))
- continue;
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
-
- /*
- * Skip over any inherit_only ACEs
- */
- if (iflags & ACE_INHERIT_ONLY_ACE)
- continue;
-
- if (entry_type == ACE_OWNER || (entry_type == 0 &&
- who == fuid)) {
- if ((access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRUSR))) {
- seen |= S_IRUSR;
- if (type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if ((access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWUSR))) {
- seen |= S_IWUSR;
- if (type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if ((access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXUSR))) {
- seen |= S_IXUSR;
- if (type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- } else if (entry_type == OWNING_GROUP ||
- (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
- if ((access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRGRP))) {
- seen |= S_IRGRP;
- if (type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if ((access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWGRP))) {
- seen |= S_IWGRP;
- if (type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if ((access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXGRP))) {
- seen |= S_IXGRP;
- if (type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- } else if (entry_type == ACE_EVERYONE) {
- if ((access_mask & ACE_READ_DATA)) {
- if (!(seen & S_IRUSR)) {
- seen |= S_IRUSR;
- if (type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if (!(seen & S_IRGRP)) {
- seen |= S_IRGRP;
- if (type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if (!(seen & S_IROTH)) {
- seen |= S_IROTH;
- if (type == ALLOW) {
- mode |= S_IROTH;
- }
- }
- }
- if ((access_mask & ACE_WRITE_DATA)) {
- if (!(seen & S_IWUSR)) {
- seen |= S_IWUSR;
- if (type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if (!(seen & S_IWGRP)) {
- seen |= S_IWGRP;
- if (type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if (!(seen & S_IWOTH)) {
- seen |= S_IWOTH;
- if (type == ALLOW) {
- mode |= S_IWOTH;
- }
- }
- }
- if ((access_mask & ACE_EXECUTE)) {
- if (!(seen & S_IXUSR)) {
- seen |= S_IXUSR;
- if (type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- if (!(seen & S_IXGRP)) {
- seen |= S_IXGRP;
- if (type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- if (!(seen & S_IXOTH)) {
- seen |= S_IXOTH;
- if (type == ALLOW) {
- mode |= S_IXOTH;
- }
- }
- }
- } else {
- /*
- * Only care if this IDENTIFIER_GROUP or
- * USER ACE denies execute access to someone,
- * mode is not affected
- */
- if ((access_mask & ACE_EXECUTE) && type == DENY)
- an_exec_denied = B_TRUE;
- }
- }
-
- /*
- * Failure to allow is effectively a deny, so execute permission
- * is denied if it was never mentioned or if we explicitly
- * weren't allowed it.
- */
- if (!an_exec_denied &&
- ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
- (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
- an_exec_denied = B_TRUE;
-
- if (an_exec_denied)
- *pflags &= ~ZFS_NO_EXECS_DENIED;
- else
- *pflags |= ZFS_NO_EXECS_DENIED;
-
- return (mode);
-}
-
-/*
- * Read an external acl object. If the intent is to modify, always
- * create a new acl and leave any cached acl in place.
- */
-static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
-{
- zfs_acl_t *aclp;
- int aclsize;
- int acl_count;
- zfs_acl_node_t *aclnode;
- zfs_acl_phys_t znode_acl;
- int version;
- int error;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
-
- if (zp->z_acl_cached && !will_modify) {
- *aclpp = zp->z_acl_cached;
- return (0);
- }
-
- version = zfs_znode_acl_version(zp);
-
- if ((error = zfs_acl_znode_info(zp, &aclsize,
- &acl_count, &znode_acl)) != 0) {
- goto done;
- }
-
- aclp = zfs_acl_alloc(version);
-
- aclp->z_acl_count = acl_count;
- aclp->z_acl_bytes = aclsize;
-
- aclnode = zfs_acl_node_alloc(aclsize);
- aclnode->z_ace_count = aclp->z_acl_count;
- aclnode->z_size = aclsize;
-
- if (!zp->z_is_sa) {
- if (znode_acl.z_acl_extern_obj) {
- error = dmu_read(zp->z_zfsvfs->z_os,
- znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
- aclnode->z_acldata, DMU_READ_PREFETCH);
- } else {
- bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
- aclnode->z_size);
- }
- } else {
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
- aclnode->z_acldata, aclnode->z_size);
- }
-
- if (error != 0) {
- zfs_acl_free(aclp);
- zfs_acl_node_free(aclnode);
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- goto done;
- }
-
- list_insert_head(&aclp->z_acl, aclnode);
-
- *aclpp = aclp;
- if (!will_modify)
- zp->z_acl_cached = aclp;
-done:
- return (error);
-}
-
-/*ARGSUSED*/
-void
-zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
- boolean_t start, void *userdata)
-{
- zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
-
- if (start) {
- cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
- } else {
- cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
- cb->cb_acl_node);
- }
- *dataptr = cb->cb_acl_node->z_acldata;
- *length = cb->cb_acl_node->z_size;
-}
-
-int
-zfs_acl_chown_setattr(znode_t *zp)
-{
- int error;
- zfs_acl_t *aclp;
-
- ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- ASSERT_VOP_IN_SEQC(ZTOV(zp));
-
- if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
- zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
- &zp->z_pflags, zp->z_uid, zp->z_gid);
- return (error);
-}
-
-/*
- * common code for setting ACLs.
- *
- * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
- * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
- * already checked the acl and knows whether to inherit.
- */
-int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
-{
- int error;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_object_type_t otype;
- zfs_acl_locator_cb_t locate = { 0 };
- uint64_t mode;
- sa_bulk_attr_t bulk[5];
- uint64_t ctime[2];
- int count = 0;
- zfs_acl_phys_t acl_phys;
-
- ASSERT_VOP_IN_SEQC(ZTOV(zp));
-
- mode = zp->z_mode;
-
- mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
- zp->z_uid, zp->z_gid);
-
- zp->z_mode = mode;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
- &mode, sizeof (mode));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, sizeof (ctime));
-
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
- }
-
- /*
- * Upgrade needed?
- */
- if (!zfsvfs->z_use_fuids) {
- otype = DMU_OT_OLDACL;
- } else {
- if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
- (zfsvfs->z_version >= ZPL_VERSION_FUID))
- zfs_acl_xform(zp, aclp, cr);
- ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
- otype = DMU_OT_ACL;
- }
-
- /*
- * Arrgh, we have to handle old on disk format
- * as well as newer (preferred) SA format.
- */
-
- if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
- locate.cb_aclp = aclp;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
- zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
- NULL, &aclp->z_acl_count, sizeof (uint64_t));
- } else { /* Painful legacy way */
- zfs_acl_node_t *aclnode;
- uint64_t off = 0;
- uint64_t aoid;
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
- &acl_phys, sizeof (acl_phys))) != 0)
- return (error);
-
- aoid = acl_phys.z_acl_extern_obj;
-
- if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- /*
- * If ACL was previously external and we are now
- * converting to new ACL format then release old
- * ACL object and create a new one.
- */
- if (aoid &&
- aclp->z_version != acl_phys.z_acl_version) {
- error = dmu_object_free(zfsvfs->z_os, aoid, tx);
- if (error)
- return (error);
- aoid = 0;
- }
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- otype, aclp->z_acl_bytes,
- otype == DMU_OT_ACL ?
- DMU_OT_SYSACL : DMU_OT_NONE,
- otype == DMU_OT_ACL ?
- DN_OLD_MAX_BONUSLEN : 0, tx);
- } else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os,
- aoid, aclp->z_acl_bytes, 0, tx);
- }
- acl_phys.z_acl_extern_obj = aoid;
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- dmu_write(zfsvfs->z_os, aoid, off,
- aclnode->z_size, aclnode->z_acldata, tx);
- off += aclnode->z_size;
- }
- } else {
- void *start = acl_phys.z_ace_data;
- /*
- * Migrating back embedded?
- */
- if (acl_phys.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- acl_phys.z_acl_extern_obj, tx);
- if (error)
- return (error);
- acl_phys.z_acl_extern_obj = 0;
- }
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- bcopy(aclnode->z_acldata, start,
- aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
- }
- }
- /*
- * If Old version then swap count/bytes to match old
- * layout of znode_acl_phys_t.
- */
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- acl_phys.z_acl_size = aclp->z_acl_count;
- acl_phys.z_acl_count = aclp->z_acl_bytes;
- } else {
- acl_phys.z_acl_size = aclp->z_acl_bytes;
- acl_phys.z_acl_count = aclp->z_acl_count;
- }
- acl_phys.z_acl_version = aclp->z_version;
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
- &acl_phys, sizeof (acl_phys));
- }
-
- /*
- * Replace ACL wide bits, but first clear them.
- */
- zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
-
- zp->z_pflags |= aclp->z_hints;
-
- if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
- zp->z_pflags |= ZFS_ACL_TRIVIAL;
-
- zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
- return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
-}
-
-static void
-zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
- zfs_acl_t *aclp)
-{
- void *acep = NULL;
- uint64_t who;
- int new_count, new_bytes;
- int ace_size;
- int entry_type;
- uint16_t iflags, type;
- uint32_t access_mask;
- zfs_acl_node_t *newnode;
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
- void *zacep;
- boolean_t isdir;
- trivial_acl_t masks;
-
- new_count = new_bytes = 0;
-
- isdir = (vtype == VDIR);
-
- acl_trivial_access_masks((mode_t)mode, isdir, &masks);
-
- newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
-
- zacep = newnode->z_acldata;
- if (masks.allow0) {
- zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- new_count++;
- new_bytes += abstract_size;
- }
- if (masks.deny1) {
- zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- new_count++;
- new_bytes += abstract_size;
- }
- if (masks.deny2) {
- zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- new_count++;
- new_bytes += abstract_size;
- }
-
- while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
- &iflags, &type)) {
- entry_type = (iflags & ACE_TYPE_FLAGS);
- /*
- * ACEs used to represent the file mode may be divided
- * into an equivalent pair of inherit-only and regular
- * ACEs, if they are inheritable.
- * Skip regular ACEs, which are replaced by the new mode.
- */
- if (split && (entry_type == ACE_OWNER ||
- entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)) {
- if (!isdir || !(iflags &
- (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
- continue;
- /*
- * We preserve owner@, group@, or @everyone
- * permissions, if they are inheritable, by
- * copying them to inherit_only ACEs. This
- * prevents inheritable permissions from being
- * altered along with the file mode.
- */
- iflags |= ACE_INHERIT_ONLY_ACE;
- }
-
- /*
- * If this ACL has any inheritable ACEs, mark that in
- * the hints (which are later masked into the pflags)
- * so create knows to do inheritance.
- */
- if (isdir && (iflags &
- (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
- aclp->z_hints |= ZFS_INHERIT_ACE;
-
- if ((type != ALLOW && type != DENY) ||
- (iflags & ACE_INHERIT_ONLY_ACE)) {
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- aclp->z_hints |= ZFS_ACL_OBJ_ACE;
- break;
- }
- } else {
- /*
- * Limit permissions granted by ACEs to be no greater
- * than permissions of the requested group mode.
- * Applies when the "aclmode" property is set to
- * "groupmask".
- */
- if ((type == ALLOW) && trim)
- access_mask &= masks.group;
- }
- zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
- ace_size = aclp->z_ops.ace_size(acep);
- zacep = (void *)((uintptr_t)zacep + ace_size);
- new_count++;
- new_bytes += ace_size;
- }
- zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
-
- new_count += 3;
- new_bytes += abstract_size * 3;
- zfs_acl_release_nodes(aclp);
- aclp->z_acl_count = new_count;
- aclp->z_acl_bytes = new_bytes;
- newnode->z_ace_count = new_count;
- newnode->z_size = new_bytes;
- list_insert_tail(&aclp->z_acl, newnode);
-}
-
-int
-zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
-{
- int error = 0;
-
- mutex_enter(&zp->z_acl_lock);
- ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
- if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
- *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
- else
- error = zfs_acl_node_read(zp, aclp, B_TRUE);
-
- if (error == 0) {
- (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
- zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
- (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
- }
- mutex_exit(&zp->z_acl_lock);
-
- return (error);
-}
-
-/*
- * Should ACE be inherited?
- */
-static int
-zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
-{
- int iflags = (acep_flags & 0xf);
-
- if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
- return (1);
- else if (iflags & ACE_FILE_INHERIT_ACE)
- return (!((vtype == VDIR) &&
- (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
- return (0);
-}
-
-/*
- * inherit inheritable ACEs from parent
- */
-static zfs_acl_t *
-zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
- uint64_t mode, boolean_t *need_chmod)
-{
- void *pacep = NULL;
- void *acep;
- zfs_acl_node_t *aclnode;
- zfs_acl_t *aclp = NULL;
- uint64_t who;
- uint32_t access_mask;
- uint16_t iflags, newflags, type;
- size_t ace_size;
- void *data1, *data2;
- size_t data1sz, data2sz;
- uint_t aclinherit;
- boolean_t isdir = (vtype == VDIR);
- boolean_t isreg = (vtype == VREG);
-
- *need_chmod = B_TRUE;
-
- aclp = zfs_acl_alloc(paclp->z_version);
- aclinherit = zfsvfs->z_acl_inherit;
- if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
- return (aclp);
-
- while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
- &access_mask, &iflags, &type)) {
-
- /*
- * don't inherit bogus ACEs
- */
- if (!zfs_acl_valid_ace_type(type, iflags))
- continue;
-
- /*
- * Check if ACE is inheritable by this vnode
- */
- if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
- !zfs_ace_can_use(vtype, iflags))
- continue;
-
- /*
- * If owner@, group@, or everyone@ inheritable
- * then zfs_acl_chmod() isn't needed.
- */
- if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
- aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
- ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
- ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
- (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
- *need_chmod = B_FALSE;
-
- /*
- * Strip inherited execute permission from file if
- * not in mode
- */
- if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
- !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
- access_mask &= ~ACE_EXECUTE;
- }
-
- /*
- * Strip write_acl and write_owner from permissions
- * when inheriting an ACE
- */
- if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
- access_mask &= ~RESTRICTED_CLEAR;
- }
-
- ace_size = aclp->z_ops.ace_size(pacep);
- aclnode = zfs_acl_node_alloc(ace_size);
- list_insert_tail(&aclp->z_acl, aclnode);
- acep = aclnode->z_acldata;
-
- zfs_set_ace(aclp, acep, access_mask, type,
- who, iflags|ACE_INHERITED_ACE);
-
- /*
- * Copy special opaque data if any
- */
- if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) {
- VERIFY((data2sz = aclp->z_ops.ace_data(acep,
- &data2)) == data1sz);
- bcopy(data1, data2, data2sz);
- }
-
- aclp->z_acl_count++;
- aclnode->z_ace_count++;
- aclp->z_acl_bytes += aclnode->z_size;
- newflags = aclp->z_ops.ace_flags_get(acep);
-
- /*
- * If ACE is not to be inherited further, or if the vnode is
- * not a directory, remove all inheritance flags
- */
- if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
- newflags &= ~ALL_INHERIT;
- aclp->z_ops.ace_flags_set(acep,
- newflags|ACE_INHERITED_ACE);
- continue;
- }
-
- /*
- * This directory has an inheritable ACE
- */
- aclp->z_hints |= ZFS_INHERIT_ACE;
-
- /*
- * If only FILE_INHERIT is set then turn on
- * inherit_only
- */
- if ((iflags & (ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
- newflags |= ACE_INHERIT_ONLY_ACE;
- aclp->z_ops.ace_flags_set(acep,
- newflags|ACE_INHERITED_ACE);
- } else {
- newflags &= ~ACE_INHERIT_ONLY_ACE;
- aclp->z_ops.ace_flags_set(acep,
- newflags|ACE_INHERITED_ACE);
- }
- }
-
- return (aclp);
-}
-
-/*
- * Create file system object initial permissions
- * including inheritable ACEs.
- * Also, create FUIDs for owner and group.
- */
-int
-zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
- vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
-{
- int error;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_acl_t *paclp;
- gid_t gid;
- boolean_t need_chmod = B_TRUE;
- boolean_t trim = B_FALSE;
- boolean_t inherited = B_FALSE;
-
- if ((flag & IS_ROOT_NODE) == 0)
- ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
- else
- ASSERT(dzp->z_vnode == NULL);
- bzero(acl_ids, sizeof (zfs_acl_ids_t));
- acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
-
- if (vsecp)
- if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
- &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
- return (error);
- /*
- * Determine uid and gid.
- */
- if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
- ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
- acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
- (uint64_t)vap->va_uid, cr,
- ZFS_OWNER, &acl_ids->z_fuidp);
- acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
- (uint64_t)vap->va_gid, cr,
- ZFS_GROUP, &acl_ids->z_fuidp);
- gid = vap->va_gid;
- } else {
- acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
- cr, &acl_ids->z_fuidp);
- acl_ids->z_fgid = 0;
- if (vap->va_mask & AT_GID) {
- acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
- (uint64_t)vap->va_gid,
- cr, ZFS_GROUP, &acl_ids->z_fuidp);
- gid = vap->va_gid;
- if (acl_ids->z_fgid != dzp->z_gid &&
- !groupmember(vap->va_gid, cr) &&
- secpolicy_vnode_create_gid(cr) != 0)
- acl_ids->z_fgid = 0;
- }
- if (acl_ids->z_fgid == 0) {
-#ifndef __FreeBSD_kernel__
- if (dzp->z_mode & S_ISGID) {
-#endif
- char *domain;
- uint32_t rid;
-
- acl_ids->z_fgid = dzp->z_gid;
- gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
- cr, ZFS_GROUP);
-
- if (zfsvfs->z_use_fuids &&
- IS_EPHEMERAL(acl_ids->z_fgid)) {
- domain = zfs_fuid_idx_domain(
- &zfsvfs->z_fuid_idx,
- FUID_INDEX(acl_ids->z_fgid));
- rid = FUID_RID(acl_ids->z_fgid);
- zfs_fuid_node_add(&acl_ids->z_fuidp,
- domain, rid,
- FUID_INDEX(acl_ids->z_fgid),
- acl_ids->z_fgid, ZFS_GROUP);
- }
-#ifndef __FreeBSD_kernel__
- } else {
- acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
- ZFS_GROUP, cr, &acl_ids->z_fuidp);
- gid = crgetgid(cr);
- }
-#endif
- }
- }
-
- /*
- * If we're creating a directory, and the parent directory has the
- * set-GID bit set, set in on the new directory.
- * Otherwise, if the user is neither privileged nor a member of the
- * file's new group, clear the file's set-GID bit.
- */
-
- if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
- (vap->va_type == VDIR)) {
- acl_ids->z_mode |= S_ISGID;
- } else {
- if ((acl_ids->z_mode & S_ISGID) &&
- secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
- acl_ids->z_mode &= ~S_ISGID;
- }
-
- if (acl_ids->z_aclp == NULL) {
- mutex_enter(&dzp->z_acl_lock);
- if (!(flag & IS_ROOT_NODE) &&
- (dzp->z_pflags & ZFS_INHERIT_ACE) &&
- !(dzp->z_pflags & ZFS_XATTR)) {
- VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
- acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
- vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
- inherited = B_TRUE;
- } else {
- acl_ids->z_aclp =
- zfs_acl_alloc(zfs_acl_version_zp(dzp));
- acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
- }
- mutex_exit(&dzp->z_acl_lock);
-
- if (need_chmod) {
- if (vap->va_type == VDIR)
- acl_ids->z_aclp->z_hints |=
- ZFS_ACL_AUTO_INHERIT;
-
- if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
- zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
- zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
- trim = B_TRUE;
- zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE,
- trim, acl_ids->z_aclp);
- }
- }
-
- if (inherited || vsecp) {
- acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
- acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
- acl_ids->z_fuid, acl_ids->z_fgid);
- if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
- acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
- }
-
- return (0);
-}
-
-/*
- * Free ACL and fuid_infop, but not the acl_ids structure
- */
-void
-zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
-{
- if (acl_ids->z_aclp)
- zfs_acl_free(acl_ids->z_aclp);
- if (acl_ids->z_fuidp)
- zfs_fuid_info_free(acl_ids->z_fuidp);
- acl_ids->z_aclp = NULL;
- acl_ids->z_fuidp = NULL;
-}
-
-boolean_t
-zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
-{
- return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
- zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
-}
-
-/*
- * Retrieve a file's ACL
- */
-int
-zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
- zfs_acl_t *aclp;
- ulong_t mask;
- int error;
- int count = 0;
- int largeace = 0;
-
- mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
- VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
-
- if (mask == 0)
- return (SET_ERROR(ENOSYS));
-
- if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
- return (error);
-
- mutex_enter(&zp->z_acl_lock);
-
- ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
- /*
- * Scan ACL to determine number of ACEs
- */
- if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
- void *zacep = NULL;
- uint64_t who;
- uint32_t access_mask;
- uint16_t type, iflags;
-
- while (zacep = zfs_acl_next_ace(aclp, zacep,
- &who, &access_mask, &iflags, &type)) {
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- largeace++;
- continue;
- default:
- count++;
- }
- }
- vsecp->vsa_aclcnt = count;
- } else
- count = (int)aclp->z_acl_count;
-
- if (mask & VSA_ACECNT) {
- vsecp->vsa_aclcnt = count;
- }
-
- if (mask & VSA_ACE) {
- size_t aclsz;
-
- aclsz = count * sizeof (ace_t) +
- sizeof (ace_object_t) * largeace;
-
- vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
- vsecp->vsa_aclentsz = aclsz;
-
- if (aclp->z_version == ZFS_ACL_VERSION_FUID)
- zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
- vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
- else {
- zfs_acl_node_t *aclnode;
- void *start = vsecp->vsa_aclentp;
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- bcopy(aclnode->z_acldata, start,
- aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
- }
- ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
- aclp->z_acl_bytes);
- }
- }
- if (mask & VSA_ACE_ACLFLAGS) {
- vsecp->vsa_aclflags = 0;
- if (zp->z_pflags & ZFS_ACL_DEFAULTED)
- vsecp->vsa_aclflags |= ACL_DEFAULTED;
- if (zp->z_pflags & ZFS_ACL_PROTECTED)
- vsecp->vsa_aclflags |= ACL_PROTECTED;
- if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
- vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
- }
-
- mutex_exit(&zp->z_acl_lock);
-
- return (0);
-}
-
-int
-zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
- vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
-{
- zfs_acl_t *aclp;
- zfs_acl_node_t *aclnode;
- int aclcnt = vsecp->vsa_aclcnt;
- int error;
-
- if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
- return (SET_ERROR(EINVAL));
-
- aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
-
- aclp->z_hints = 0;
- aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
- (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
- aclcnt, &aclnode->z_size)) != 0) {
- zfs_acl_free(aclp);
- zfs_acl_node_free(aclnode);
- return (error);
- }
- } else {
- if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
- vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
- &aclnode->z_size, fuidp, cr)) != 0) {
- zfs_acl_free(aclp);
- zfs_acl_node_free(aclnode);
- return (error);
- }
- }
- aclp->z_acl_bytes = aclnode->z_size;
- aclnode->z_ace_count = aclcnt;
- aclp->z_acl_count = aclcnt;
- list_insert_head(&aclp->z_acl, aclnode);
-
- /*
- * If flags are being set then add them to z_hints
- */
- if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
- if (vsecp->vsa_aclflags & ACL_PROTECTED)
- aclp->z_hints |= ZFS_ACL_PROTECTED;
- if (vsecp->vsa_aclflags & ACL_DEFAULTED)
- aclp->z_hints |= ZFS_ACL_DEFAULTED;
- if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
- aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
- }
-
- *zaclp = aclp;
-
- return (0);
-}
-
-/*
- * Set a file's ACL
- */
-int
-zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
- dmu_tx_t *tx;
- int error;
- zfs_acl_t *aclp;
- zfs_fuid_info_t *fuidp = NULL;
- boolean_t fuid_dirtied;
- uint64_t acl_obj;
-
- ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
- if (mask == 0)
- return (SET_ERROR(ENOSYS));
-
- if (zp->z_pflags & ZFS_IMMUTABLE)
- return (SET_ERROR(EPERM));
-
- if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
- return (error);
-
- error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
- &aclp);
- if (error)
- return (error);
-
- /*
- * If ACL wide flags aren't being set then preserve any
- * existing flags.
- */
- if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
- aclp->z_hints |=
- (zp->z_pflags & V4_ACL_WIDE_FLAGS);
- }
-top:
- mutex_enter(&zp->z_acl_lock);
-
- tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
-
- /*
- * If old version and ACL won't fit in bonus and we aren't
- * upgrading then take out necessary DMU holds
- */
-
- if ((acl_obj = zfs_external_acl(zp)) != 0) {
- if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
- zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx, acl_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
- }
- } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
- }
-
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
- if (error) {
- mutex_exit(&zp->z_acl_lock);
-
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- zfs_acl_free(aclp);
- return (error);
- }
-
- error = zfs_aclset_common(zp, aclp, cr, tx);
- ASSERT(error == 0);
- ASSERT(zp->z_acl_cached == NULL);
- zp->z_acl_cached = aclp;
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
-
- if (fuidp)
- zfs_fuid_info_free(fuidp);
- dmu_tx_commit(tx);
- mutex_exit(&zp->z_acl_lock);
-
- return (error);
-}
-
-/*
- * Check accesses of interest (AoI) against attributes of the dataset
- * such as read-only. Returns zero if no AoI conflict with dataset
- * attributes, otherwise an appropriate errno is returned.
- */
-static int
-zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
-{
- if ((v4_mode & WRITE_MASK) &&
- (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
- (!IS_DEVVP(ZTOV(zp)) ||
- (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
- return (SET_ERROR(EROFS));
- }
-
- /*
- * Intentionally allow ZFS_READONLY through here.
- * See zfs_zaccess_common().
- */
- if ((v4_mode & WRITE_MASK_DATA) &&
- (zp->z_pflags & ZFS_IMMUTABLE)) {
- return (SET_ERROR(EPERM));
- }
-
-#ifdef illumos
- if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
- (zp->z_pflags & ZFS_NOUNLINK)) {
- return (SET_ERROR(EPERM));
- }
-#else
- /*
- * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK
- * (sunlnk) is set. We just don't allow directory removal, which is
- * handled in zfs_zaccess_delete().
- */
- if ((v4_mode & ACE_DELETE) &&
- (zp->z_pflags & ZFS_NOUNLINK)) {
- return (EPERM);
- }
-#endif
-
- if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
- (zp->z_pflags & ZFS_AV_QUARANTINED))) {
- return (SET_ERROR(EACCES));
- }
-
- return (0);
-}
-
-/*
- * The primary usage of this function is to loop through all of the
- * ACEs in the znode, determining what accesses of interest (AoI) to
- * the caller are allowed or denied. The AoI are expressed as bits in
- * the working_mode parameter. As each ACE is processed, bits covered
- * by that ACE are removed from the working_mode. This removal
- * facilitates two things. The first is that when the working mode is
- * empty (= 0), we know we've looked at all the AoI. The second is
- * that the ACE interpretation rules don't allow a later ACE to undo
- * something granted or denied by an earlier ACE. Removing the
- * discovered access or denial enforces this rule. At the end of
- * processing the ACEs, all AoI that were found to be denied are
- * placed into the working_mode, giving the caller a mask of denied
- * accesses. Returns:
- * 0 if all AoI granted
- * EACCESS if the denied mask is non-zero
- * other error if abnormal failure (e.g., IO error)
- *
- * A secondary usage of the function is to determine if any of the
- * AoI are granted. If an ACE grants any access in
- * the working_mode, we immediately short circuit out of the function.
- * This mode is chosen by setting anyaccess to B_TRUE. The
- * working_mode is not a denied access mask upon exit if the function
- * is used in this manner.
- */
-static int
-zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
- boolean_t anyaccess, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zfs_acl_t *aclp;
- int error;
- uid_t uid = crgetuid(cr);
- uint64_t who;
- uint16_t type, iflags;
- uint16_t entry_type;
- uint32_t access_mask;
- uint32_t deny_mask = 0;
- zfs_ace_hdr_t *acep = NULL;
- boolean_t checkit;
- uid_t gowner;
- uid_t fowner;
-
- zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
-
- mutex_enter(&zp->z_acl_lock);
-
- ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
- ASSERT(zp->z_acl_cached);
-
- while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
- &iflags, &type)) {
- uint32_t mask_matched;
-
- if (!zfs_acl_valid_ace_type(type, iflags))
- continue;
-
- if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
- continue;
-
- /* Skip ACE if it does not affect any AoI */
- mask_matched = (access_mask & *working_mode);
- if (!mask_matched)
- continue;
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
-
- checkit = B_FALSE;
-
- switch (entry_type) {
- case ACE_OWNER:
- if (uid == fowner)
- checkit = B_TRUE;
- break;
- case OWNING_GROUP:
- who = gowner;
- /*FALLTHROUGH*/
- case ACE_IDENTIFIER_GROUP:
- checkit = zfs_groupmember(zfsvfs, who, cr);
- break;
- case ACE_EVERYONE:
- checkit = B_TRUE;
- break;
-
- /* USER Entry */
- default:
- if (entry_type == 0) {
- uid_t newid;
-
- newid = zfs_fuid_map_id(zfsvfs, who, cr,
- ZFS_ACE_USER);
- if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
- uid == newid)
- checkit = B_TRUE;
- break;
- } else {
- mutex_exit(&zp->z_acl_lock);
- return (SET_ERROR(EIO));
- }
- }
-
- if (checkit) {
- if (type == DENY) {
- DTRACE_PROBE3(zfs__ace__denies,
- znode_t *, zp,
- zfs_ace_hdr_t *, acep,
- uint32_t, mask_matched);
- deny_mask |= mask_matched;
- } else {
- DTRACE_PROBE3(zfs__ace__allows,
- znode_t *, zp,
- zfs_ace_hdr_t *, acep,
- uint32_t, mask_matched);
- if (anyaccess) {
- mutex_exit(&zp->z_acl_lock);
- return (0);
- }
- }
- *working_mode &= ~mask_matched;
- }
-
- /* Are we done? */
- if (*working_mode == 0)
- break;
- }
-
- mutex_exit(&zp->z_acl_lock);
-
- /* Put the found 'denies' back on the working mode */
- if (deny_mask) {
- *working_mode |= deny_mask;
- return (SET_ERROR(EACCES));
- } else if (*working_mode) {
- return (-1);
- }
-
- return (0);
-}
-
-/*
- * Return true if any access whatsoever granted, we don't actually
- * care what access is granted.
- */
-boolean_t
-zfs_has_access(znode_t *zp, cred_t *cr)
-{
- uint32_t have = ACE_ALL_PERMS;
-
- if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
- uid_t owner;
-
- owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
- return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
- }
- return (B_TRUE);
-}
-
-static int
-zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
- boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int err;
-
- *working_mode = v4_mode;
- *check_privs = B_TRUE;
-
- /*
- * Short circuit empty requests
- */
- if (v4_mode == 0 || zfsvfs->z_replay) {
- *working_mode = 0;
- return (0);
- }
-
- if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
- *check_privs = B_FALSE;
- return (err);
- }
-
- /*
- * The caller requested that the ACL check be skipped. This
- * would only happen if the caller checked VOP_ACCESS() with a
- * 32 bit ACE mask and already had the appropriate permissions.
- */
- if (skipaclchk) {
- *working_mode = 0;
- return (0);
- }
-
- /*
- * Note: ZFS_READONLY represents the "DOS R/O" attribute.
- * When that flag is set, we should behave as if write access
- * were not granted by anything in the ACL. In particular:
- * We _must_ allow writes after opening the file r/w, then
- * setting the DOS R/O attribute, and writing some more.
- * (Similar to how you can write after fchmod(fd, 0444).)
- *
- * Therefore ZFS_READONLY is ignored in the dataset check
- * above, and checked here as if part of the ACL check.
- * Also note: DOS R/O is ignored for directories.
- */
- if ((v4_mode & WRITE_MASK_DATA) &&
- (ZTOV(zp)->v_type != VDIR) &&
- (zp->z_pflags & ZFS_READONLY)) {
- return (SET_ERROR(EPERM));
- }
-
- return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
-}
-
-static int
-zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
- cred_t *cr)
-{
- if (*working_mode != ACE_WRITE_DATA)
- return (SET_ERROR(EACCES));
-
- return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
- check_privs, B_FALSE, cr));
-}
-
-/*
- * Check if VEXEC is allowed.
- *
- * This routine is based on zfs_fastaccesschk_execute which has slowpath
- * calling zfs_zaccess. This would be incorrect on FreeBSD (see
- * zfs_freebsd_access for the difference). Thus this variant let's the
- * caller handle the slowpath (if necessary).
- *
- * We only check for ZFS_NO_EXECS_DENIED and fail early. This routine can
- * be extended to cover more cases, but the flag covers the majority.
- */
-int
-zfs_freebsd_fastaccesschk_execute(struct vnode *vp, cred_t *cr)
-{
- boolean_t is_attr;
- znode_t *zdp = VTOZ(vp);
-
- ASSERT_VOP_LOCKED(vp, __func__);
-
- if (zdp->z_pflags & ZFS_AV_QUARANTINED)
- return (1);
-
- is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
- (ZTOV(zdp)->v_type == VDIR));
- if (is_attr)
- return (1);
-
- if (zdp->z_pflags & ZFS_NO_EXECS_DENIED)
- return (0);
-
- return (1);
-}
-
-#ifdef illumos
-int
-zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
-{
- boolean_t owner = B_FALSE;
- boolean_t groupmbr = B_FALSE;
- boolean_t is_attr;
- uid_t uid = crgetuid(cr);
- int error;
-
- if (zdp->z_pflags & ZFS_AV_QUARANTINED)
- return (SET_ERROR(EACCES));
-
- is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
- (ZTOV(zdp)->v_type == VDIR));
- if (is_attr)
- goto slow;
-
-
- mutex_enter(&zdp->z_acl_lock);
-
- if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- }
-
- if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
- mutex_exit(&zdp->z_acl_lock);
- goto slow;
- }
-
- if (uid == zdp->z_uid) {
- owner = B_TRUE;
- if (zdp->z_mode & S_IXUSR) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- } else {
- mutex_exit(&zdp->z_acl_lock);
- goto slow;
- }
- }
- if (groupmember(zdp->z_gid, cr)) {
- groupmbr = B_TRUE;
- if (zdp->z_mode & S_IXGRP) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- } else {
- mutex_exit(&zdp->z_acl_lock);
- goto slow;
- }
- }
- if (!owner && !groupmbr) {
- if (zdp->z_mode & S_IXOTH) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- }
- }
-
- mutex_exit(&zdp->z_acl_lock);
-
-slow:
- DTRACE_PROBE(zfs__fastpath__execute__access__miss);
- ZFS_ENTER(zdp->z_zfsvfs);
- error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
- ZFS_EXIT(zdp->z_zfsvfs);
- return (error);
-}
-#endif
-
-/*
- * Determine whether Access should be granted/denied.
- *
- * The least priv subsystem is always consulted as a basic privilege
- * can define any form of access.
- */
-int
-zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
-{
- uint32_t working_mode;
- int error;
- int is_attr;
- boolean_t check_privs;
- znode_t *xzp;
- znode_t *check_zp = zp;
- mode_t needed_bits;
- uid_t owner;
-
- is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
-
-#ifdef __FreeBSD_kernel__
- /*
- * In FreeBSD, we don't care about permissions of individual ADS.
- * Note that not checking them is not just an optimization - without
- * this shortcut, EA operations may bogusly fail with EACCES.
- */
- if (zp->z_pflags & ZFS_XATTR)
- return (0);
-#else
- /*
- * If attribute then validate against base file
- */
- if (is_attr) {
- uint64_t parent;
-
- if ((error = sa_lookup(zp->z_sa_hdl,
- SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
- sizeof (parent))) != 0)
- return (error);
-
- if ((error = zfs_zget(zp->z_zfsvfs,
- parent, &xzp)) != 0) {
- return (error);
- }
-
- check_zp = xzp;
-
- /*
- * fixup mode to map to xattr perms
- */
-
- if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
- mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mode |= ACE_WRITE_NAMED_ATTRS;
- }
-
- if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
- mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
- mode |= ACE_READ_NAMED_ATTRS;
- }
- }
-#endif
-
- owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
- /*
- * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
- * in needed_bits. Map the bits mapped by working_mode (currently
- * missing) in missing_bits.
- * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
- * needed_bits.
- */
- needed_bits = 0;
-
- working_mode = mode;
- if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- owner == crgetuid(cr))
- working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
-
- if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
- ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
- needed_bits |= VREAD;
- if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
- ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
- needed_bits |= VWRITE;
- if (working_mode & ACE_EXECUTE)
- needed_bits |= VEXEC;
-
- if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
- &check_privs, skipaclchk, cr)) == 0) {
- if (is_attr)
- VN_RELE(ZTOV(xzp));
- return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
- needed_bits, needed_bits));
- }
-
- if (error && !check_privs) {
- if (is_attr)
- VN_RELE(ZTOV(xzp));
- return (error);
- }
-
- if (error && (flags & V_APPEND)) {
- error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
- }
-
- if (error && check_privs) {
- mode_t checkmode = 0;
-
- /*
- * First check for implicit owner permission on
- * read_acl/read_attributes
- */
-
- error = 0;
- ASSERT(working_mode != 0);
-
- if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
- owner == crgetuid(cr)))
- working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
-
- if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
- ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
- checkmode |= VREAD;
- if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
- ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
- checkmode |= VWRITE;
- if (working_mode & ACE_EXECUTE)
- checkmode |= VEXEC;
-
- error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
- needed_bits & ~checkmode, needed_bits);
-
- if (error == 0 && (working_mode & ACE_WRITE_OWNER))
- error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
- if (error == 0 && (working_mode & ACE_WRITE_ACL))
- error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner);
-
- if (error == 0 && (working_mode &
- (ACE_DELETE|ACE_DELETE_CHILD)))
- error = secpolicy_vnode_remove(ZTOV(check_zp), cr);
-
- if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
- error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
- }
- if (error == 0) {
- /*
- * See if any bits other than those already checked
- * for are still present. If so then return EACCES
- */
- if (working_mode & ~(ZFS_CHECKED_MASKS)) {
- error = SET_ERROR(EACCES);
- }
- }
- } else if (error == 0) {
- error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
- needed_bits, needed_bits);
- }
-
-
- if (is_attr)
- VN_RELE(ZTOV(xzp));
-
- return (error);
-}
-
-/*
- * Translate traditional unix VREAD/VWRITE/VEXEC mode into
- * native ACL format and call zfs_zaccess()
- */
-int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
-{
- return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
-}
-
-/*
- * Access function for secpolicy_vnode_setattr
- */
-int
-zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
-{
- int v4_mode = zfs_unix_to_v4(mode >> 6);
-
- return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
-}
-
-static int
-zfs_delete_final_check(znode_t *zp, znode_t *dzp,
- mode_t available_perms, cred_t *cr)
-{
- int error;
- uid_t downer;
-
- downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
-
- error = secpolicy_vnode_access2(cr, ZTOV(dzp),
- downer, available_perms, VWRITE|VEXEC);
-
- if (error == 0)
- error = zfs_sticky_remove_access(dzp, zp, cr);
-
- return (error);
-}
-
-/*
- * Determine whether Access should be granted/deny, without
- * consulting least priv subsystem.
- *
- * The following chart is the recommended NFSv4 enforcement for
- * ability to delete an object.
- *
- * -------------------------------------------------------
- * | Parent Dir | Target Object Permissions |
- * | permissions | |
- * -------------------------------------------------------
- * | | ACL Allows | ACL Denies| Delete |
- * | | Delete | Delete | unspecified|
- * -------------------------------------------------------
- * | ACL Allows | Permit | Permit | Permit |
- * | DELETE_CHILD | |
- * -------------------------------------------------------
- * | ACL Denies | Permit | Deny | Deny |
- * | DELETE_CHILD | | | |
- * -------------------------------------------------------
- * | ACL specifies | | | |
- * | only allow | Permit | Permit | Permit |
- * | write and | | | |
- * | execute | | | |
- * -------------------------------------------------------
- * | ACL denies | | | |
- * | write and | Permit | Deny | Deny |
- * | execute | | | |
- * -------------------------------------------------------
- * ^
- * |
- * No search privilege, can't even look up file?
- *
- */
-int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
-{
- uint32_t dzp_working_mode = 0;
- uint32_t zp_working_mode = 0;
- int dzp_error, zp_error;
- mode_t available_perms;
- boolean_t dzpcheck_privs = B_TRUE;
- boolean_t zpcheck_privs = B_TRUE;
-
- /*
- * We want specific DELETE permissions to
- * take precedence over WRITE/EXECUTE. We don't
- * want an ACL such as this to mess us up.
- * user:joe:write_data:deny,user:joe:delete:allow
- *
- * However, deny permissions may ultimately be overridden
- * by secpolicy_vnode_access().
- *
- * We will ask for all of the necessary permissions and then
- * look at the working modes from the directory and target object
- * to determine what was found.
- */
-
- if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
- return (SET_ERROR(EPERM));
-
- /*
- * First row
- * If the directory permissions allow the delete, we are done.
- */
- if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
- return (0);
-
- /*
- * If target object has delete permission then we are done
- */
- if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
- &zpcheck_privs, B_FALSE, cr)) == 0)
- return (0);
-
- ASSERT(dzp_error && zp_error);
-
- if (!dzpcheck_privs)
- return (dzp_error);
- if (!zpcheck_privs)
- return (zp_error);
-
- /*
- * Second row
- *
- * If directory returns EACCES then delete_child was denied
- * due to deny delete_child. In this case send the request through
- * secpolicy_vnode_remove(). We don't use zfs_delete_final_check()
- * since that *could* allow the delete based on write/execute permission
- * and we want delete permissions to override write/execute.
- */
-
- if (dzp_error == EACCES)
- return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */
-
- /*
- * Third Row
- * only need to see if we have write/execute on directory.
- */
-
- dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
-
- if (dzp_error != 0 && !dzpcheck_privs)
- return (dzp_error);
-
- /*
- * Fourth row
- */
-
- available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
- available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
-
- return (zfs_delete_final_check(zp, dzp, available_perms, cr));
-
-}
-
-int
-zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
- znode_t *tzp, cred_t *cr)
-{
- int add_perm;
- int error;
-
- if (szp->z_pflags & ZFS_AV_QUARANTINED)
- return (SET_ERROR(EACCES));
-
- add_perm = (ZTOV(szp)->v_type == VDIR) ?
- ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
-
- /*
- * Rename permissions are combination of delete permission +
- * add file/subdir permission.
- *
- * BSD operating systems also require write permission
- * on the directory being moved from one parent directory
- * to another.
- */
- if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
- if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))
- return (error);
- }
-
- /*
- * first make sure we do the delete portion.
- *
- * If that succeeds then check for add_file/add_subdir permissions
- */
-
- if (error = zfs_zaccess_delete(sdzp, szp, cr))
- return (error);
-
- /*
- * If we have a tzp, see if we can delete it?
- */
- if (tzp) {
- if (error = zfs_zaccess_delete(tdzp, tzp, cr))
- return (error);
- }
-
- /*
- * Now check for add permissions
- */
- error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
-
- return (error);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
@@ -1,199 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/vfs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_sa.h>
-#include <sys/zfs_acl.h>
-
-void
-zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
-{
- int i;
-
- for (i = 0; i != ace_cnt; i++, ace++) {
- ace->a_who = BSWAP_32(ace->a_who);
- ace->a_access_mask = BSWAP_32(ace->a_access_mask);
- ace->a_flags = BSWAP_16(ace->a_flags);
- ace->a_type = BSWAP_16(ace->a_type);
- }
-}
-
-/*
- * swap ace_t and ace_oject_t
- */
-void
-zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
-{
- caddr_t end;
- caddr_t ptr;
- zfs_ace_t *zacep = NULL;
- ace_t *acep;
- uint16_t entry_type;
- size_t entry_size;
- int ace_type;
-
- end = (caddr_t)buf + size;
- ptr = buf;
-
- while (ptr < end) {
- if (zfs_layout) {
- /*
- * Avoid overrun. Embedded aces can have one
- * of several sizes. We don't know exactly
- * how many our present, only the size of the
- * buffer containing them. That size may be
- * larger than needed to hold the aces
- * present. As long as we do not do any
- * swapping beyond the end of our block we are
- * okay. It it safe to swap any non-ace data
- * within the block since it is just zeros.
- */
- if (ptr + sizeof (zfs_ace_hdr_t) > end) {
- break;
- }
- zacep = (zfs_ace_t *)ptr;
- zacep->z_hdr.z_access_mask =
- BSWAP_32(zacep->z_hdr.z_access_mask);
- zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
- ace_type = zacep->z_hdr.z_type =
- BSWAP_16(zacep->z_hdr.z_type);
- entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
- } else {
- /* Overrun avoidance */
- if (ptr + sizeof (ace_t) > end) {
- break;
- }
- acep = (ace_t *)ptr;
- acep->a_access_mask = BSWAP_32(acep->a_access_mask);
- acep->a_flags = BSWAP_16(acep->a_flags);
- ace_type = acep->a_type = BSWAP_16(acep->a_type);
- acep->a_who = BSWAP_32(acep->a_who);
- entry_type = acep->a_flags & ACE_TYPE_FLAGS;
- }
- switch (entry_type) {
- case ACE_OWNER:
- case ACE_EVERYONE:
- case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
- entry_size = zfs_layout ?
- sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
- break;
- case ACE_IDENTIFIER_GROUP:
- default:
- /* Overrun avoidance */
- if (zfs_layout) {
- if (ptr + sizeof (zfs_ace_t) <= end) {
- zacep->z_fuid = BSWAP_64(zacep->z_fuid);
- } else {
- entry_size = sizeof (zfs_ace_t);
- break;
- }
- }
- switch (ace_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- entry_size = zfs_layout ?
- sizeof (zfs_object_ace_t) :
- sizeof (ace_object_t);
- break;
- default:
- entry_size = zfs_layout ? sizeof (zfs_ace_t) :
- sizeof (ace_t);
- break;
- }
- }
- ptr = ptr + entry_size;
- }
-}
-
-/* ARGSUSED */
-void
-zfs_oldacl_byteswap(void *buf, size_t size)
-{
- int cnt;
-
- /*
- * Arggh, since we don't know how many ACEs are in
- * the array, we have to swap the entire block
- */
-
- cnt = size / sizeof (ace_t);
-
- zfs_oldace_byteswap((ace_t *)buf, cnt);
-}
-
-/* ARGSUSED */
-void
-zfs_acl_byteswap(void *buf, size_t size)
-{
- zfs_ace_byteswap(buf, size, B_TRUE);
-}
-
-void
-zfs_znode_byteswap(void *buf, size_t size)
-{
- znode_phys_t *zp = buf;
-
- ASSERT(size >= sizeof (znode_phys_t));
-
- zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
- zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
- zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
- zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
- zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
- zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
- zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
- zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
- zp->zp_gen = BSWAP_64(zp->zp_gen);
- zp->zp_mode = BSWAP_64(zp->zp_mode);
- zp->zp_size = BSWAP_64(zp->zp_size);
- zp->zp_parent = BSWAP_64(zp->zp_parent);
- zp->zp_links = BSWAP_64(zp->zp_links);
- zp->zp_xattr = BSWAP_64(zp->zp_xattr);
- zp->zp_rdev = BSWAP_64(zp->zp_rdev);
- zp->zp_flags = BSWAP_64(zp->zp_flags);
- zp->zp_uid = BSWAP_64(zp->zp_uid);
- zp->zp_gid = BSWAP_64(zp->zp_gid);
- zp->zp_zap = BSWAP_64(zp->zp_zap);
- zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
- zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
- zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
-
- zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
- zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size);
- zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
- zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count);
- if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
- zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
- ZFS_ACE_SPACE);
- } else {
- zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
- ACE_SLOT_CNT);
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -1,1364 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- */
-
-/*
- * ZFS control directory (a.k.a. ".zfs")
- *
- * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' directory, but this may expand in the
- * future. The elements are built using the GFS primitives, as the hierarchy
- * does not actually exist on disk.
- *
- * For 'snapshot', we don't want to have all snapshots always mounted, because
- * this would take up a huge amount of space in /etc/mnttab. We have three
- * types of objects:
- *
- * ctldir ------> snapshotdir -------> snapshot
- * |
- * |
- * V
- * mounted fs
- *
- * The 'snapshot' node contains just enough information to lookup '..' and act
- * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
- * perform an automount of the underlying filesystem and return the
- * corresponding vnode.
- *
- * All mounts are handled automatically by the kernel, but unmounts are
- * (currently) handled from user land. The main reason is that there is no
- * reliable way to auto-unmount the filesystem when it's "no longer in use".
- * When the user unmounts a filesystem, we call zfsctl_unmount(), which
- * unmounts any snapshots within the snapshot directory.
- *
- * The '.zfs', '.zfs/snapshot', and all directories created under
- * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
- * share the same vfs_t as the head filesystem (what '.zfs' lives under).
- *
- * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
- * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
- * However, vnodes within these mounted on file systems have their v_vfsp
- * fields set to the head filesystem to make NFS happy (see
- * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
- * so that it cannot be freed until all snapshots have been unmounted.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/namei.h>
-#include <sys/stat.h>
-#include <sys/dmu.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_deleg.h>
-#include <sys/mount.h>
-#include <sys/zap.h>
-
-#include "zfs_namecheck.h"
-
-/* Common access mode for all virtual directories under the ctldir */
-const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
- S_IROTH | S_IXOTH;
-
-/*
- * "Synthetic" filesystem implementation.
- */
-
-/*
- * Assert that A implies B.
- */
-#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg));
-
-static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
-
-typedef struct sfs_node {
- char sn_name[ZFS_MAX_DATASET_NAME_LEN];
- uint64_t sn_parent_id;
- uint64_t sn_id;
-} sfs_node_t;
-
-/*
- * Check the parent's ID as well as the node's to account for a chance
- * that IDs originating from different domains (snapshot IDs, artifical
- * IDs, znode IDs) may clash.
- */
-static int
-sfs_compare_ids(struct vnode *vp, void *arg)
-{
- sfs_node_t *n1 = vp->v_data;
- sfs_node_t *n2 = arg;
- bool equal;
-
- equal = n1->sn_id == n2->sn_id &&
- n1->sn_parent_id == n2->sn_parent_id;
-
- /* Zero means equality. */
- return (!equal);
-}
-
-static int
-sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
- uint64_t id, struct vnode **vpp)
-{
- sfs_node_t search;
- int err;
-
- search.sn_id = id;
- search.sn_parent_id = parent_id;
- err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
- sfs_compare_ids, &search);
- return (err);
-}
-
-static int
-sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
- uint64_t id, struct vnode **vpp)
-{
- int err;
-
- KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
- err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
- sfs_compare_ids, vp->v_data);
- return (err);
-}
-
-static void
-sfs_vnode_remove(struct vnode *vp)
-{
- vfs_hash_remove(vp);
-}
-
-typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
-
-static int
-sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
- const char *tag, struct vop_vector *vops,
- sfs_vnode_setup_fn setup, void *arg,
- struct vnode **vpp)
-{
- struct vnode *vp;
- int error;
-
- error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
- if (error != 0 || *vpp != NULL) {
- KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
- "sfs vnode with no data");
- return (error);
- }
-
- /* Allocate a new vnode/inode. */
- error = getnewvnode(tag, mp, vops, &vp);
- if (error != 0) {
- *vpp = NULL;
- return (error);
- }
-
- /*
- * Exclusively lock the vnode vnode while it's being constructed.
- */
- lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
- error = insmntque(vp, mp);
- if (error != 0) {
- *vpp = NULL;
- return (error);
- }
-
- setup(vp, arg);
-
- error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
- if (error != 0 || *vpp != NULL) {
- KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
- "sfs vnode with no data");
- return (error);
- }
-
- *vpp = vp;
- return (0);
-}
-
-static void
-sfs_print_node(sfs_node_t *node)
-{
- printf("\tname = %s\n", node->sn_name);
- printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
- printf("\tid = %ju\n", (uintmax_t)node->sn_id);
-}
-
-static sfs_node_t *
-sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
-{
- struct sfs_node *node;
-
- KASSERT(strlen(name) < sizeof(node->sn_name),
- ("sfs node name is too long"));
- KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
- node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
- strlcpy(node->sn_name, name, sizeof(node->sn_name));
- node->sn_parent_id = parent_id;
- node->sn_id = id;
-
- return (node);
-}
-
-static void
-sfs_destroy_node(sfs_node_t *node)
-{
- free(node, M_SFSNODES);
-}
-
-static void *
-sfs_reclaim_vnode(vnode_t *vp)
-{
- sfs_node_t *node;
- void *data;
-
- sfs_vnode_remove(vp);
- data = vp->v_data;
- vp->v_data = NULL;
- return (data);
-}
-
-static int
-sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
- uio_t *uio, off_t *offp)
-{
- struct dirent entry;
- int error;
-
- /* Reset ncookies for subsequent use of vfs_read_dirent. */
- if (ap->a_ncookies != NULL)
- *ap->a_ncookies = 0;
-
- if (uio->uio_resid < sizeof(entry))
- return (SET_ERROR(EINVAL));
-
- if (uio->uio_offset < 0)
- return (SET_ERROR(EINVAL));
- if (uio->uio_offset == 0) {
- entry.d_fileno = id;
- entry.d_type = DT_DIR;
- entry.d_name[0] = '.';
- entry.d_namlen = 1;
- entry.d_reclen = sizeof(entry);
- dirent_terminate(&entry);
- error = vfs_read_dirent(ap, &entry, uio->uio_offset);
- if (error != 0)
- return (SET_ERROR(error));
- }
-
- if (uio->uio_offset < sizeof(entry))
- return (SET_ERROR(EINVAL));
- if (uio->uio_offset == sizeof(entry)) {
- entry.d_fileno = parent_id;
- entry.d_type = DT_DIR;
- entry.d_name[0] = '.';
- entry.d_name[1] = '.';
- entry.d_namlen = 2;
- entry.d_reclen = sizeof(entry);
- dirent_terminate(&entry);
- error = vfs_read_dirent(ap, &entry, uio->uio_offset);
- if (error != 0)
- return (SET_ERROR(error));
- }
-
- if (offp != NULL)
- *offp = 2 * sizeof(entry);
- return (0);
-}
-
-
-/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem. We use the following scheme:
- *
- * ENTRY ZFSCTL_INODE
- * .zfs 1
- * .zfs/snapshot 2
- * .zfs/snapshot/<snap> objectid(snap)
- */
-#define ZFSCTL_INO_SNAP(id) (id)
-
-static struct vop_vector zfsctl_ops_root;
-static struct vop_vector zfsctl_ops_snapdir;
-static struct vop_vector zfsctl_ops_snapshot;
-static struct vop_vector zfsctl_ops_shares_dir;
-
-void
-zfsctl_init(void)
-{
-}
-
-void
-zfsctl_fini(void)
-{
-}
-
-boolean_t
-zfsctl_is_node(vnode_t *vp)
-{
- return (vn_matchops(vp, zfsctl_ops_root) ||
- vn_matchops(vp, zfsctl_ops_snapdir) ||
- vn_matchops(vp, zfsctl_ops_snapshot) ||
- vn_matchops(vp, zfsctl_ops_shares_dir));
-
-}
-
-typedef struct zfsctl_root {
- sfs_node_t node;
- sfs_node_t *snapdir;
- timestruc_t cmtime;
-} zfsctl_root_t;
-
-
-/*
- * Create the '.zfs' directory.
- */
-void
-zfsctl_create(zfsvfs_t *zfsvfs)
-{
- zfsctl_root_t *dot_zfs;
- sfs_node_t *snapdir;
- vnode_t *rvp;
- uint64_t crtime[2];
-
- ASSERT(zfsvfs->z_ctldir == NULL);
-
- snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
- ZFSCTL_INO_SNAPDIR);
- dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
- ZFSCTL_INO_ROOT);
- dot_zfs->snapdir = snapdir;
-
- VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
- VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
- &crtime, sizeof(crtime)));
- ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
- vput(rvp);
-
- zfsvfs->z_ctldir = dot_zfs;
-}
-
-/*
- * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
- * The nodes must not have any associated vnodes by now as they should be
- * vflush-ed.
- */
-void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
-{
- sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
- sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
- zfsvfs->z_ctldir = NULL;
-}
-
-static int
-zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
- struct vnode **vpp)
-{
- return (VFS_ROOT(mp, flags, vpp));
-}
-
-static void
-zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
-{
- ASSERT_VOP_ELOCKED(vp, __func__);
-
- /* We support shared locking. */
- VN_LOCK_ASHARE(vp);
- vp->v_type = VDIR;
- vp->v_data = arg;
-}
-
-static int
-zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
- struct vnode **vpp)
-{
- void *node;
- int err;
-
- node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
- err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
- zfsctl_common_vnode_setup, node, vpp);
- return (err);
-}
-
-static int
-zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
- struct vnode **vpp)
-{
- void *node;
- int err;
-
- node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
- err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
- &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
- return (err);
-}
-
-/*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
- */
-int
-zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
-{
- vnode_t *vp;
- int error;
-
- error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
- return (error);
-}
-
-/*
- * Common open routine. Disallow any write access.
- */
-static int
-zfsctl_common_open(struct vop_open_args *ap)
-{
- int flags = ap->a_mode;
-
- if (flags & FWRITE)
- return (SET_ERROR(EACCES));
-
- return (0);
-}
-
-/*
- * Common close routine. Nothing to do here.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_close(struct vop_close_args *ap)
-{
- return (0);
-}
-
-/*
- * Common access routine. Disallow writes.
- */
-static int
-zfsctl_common_access(ap)
- struct vop_access_args /* {
- struct vnode *a_vp;
- accmode_t a_accmode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- accmode_t accmode = ap->a_accmode;
-
- if (accmode & VWRITE)
- return (SET_ERROR(EACCES));
- return (0);
-}
-
-/*
- * Common getattr function. Fill in basic information.
- */
-static void
-zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
-{
- timestruc_t now;
- sfs_node_t *node;
-
- node = vp->v_data;
-
- vap->va_uid = 0;
- vap->va_gid = 0;
- vap->va_rdev = 0;
- /*
- * We are a purely virtual object, so we have no
- * blocksize or allocated blocks.
- */
- vap->va_blksize = 0;
- vap->va_nblocks = 0;
- vap->va_seq = 0;
- vn_fsid(vp, vap);
- vap->va_mode = zfsctl_ctldir_mode;
- vap->va_type = VDIR;
- /*
- * We live in the now (for atime).
- */
- gethrestime(&now);
- vap->va_atime = now;
- /* FreeBSD: Reset chflags(2) flags. */
- vap->va_flags = 0;
-
- vap->va_nodeid = node->sn_id;
-
- /* At least '.' and '..'. */
- vap->va_nlink = 2;
-}
-
-static int
-zfsctl_common_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- fid_t *fidp = (void *)ap->a_fid;
- sfs_node_t *node = vp->v_data;
- uint64_t object = node->sn_id;
- zfid_short_t *zfid;
- int i;
-
- zfid = (zfid_short_t *)fidp;
- zfid->zf_len = SHORT_FID_LEN;
-
- for (i = 0; i < sizeof(zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* .zfs nodes always have a generation number of 0 */
- for (i = 0; i < sizeof(zfid->zf_gen); i++)
- zfid->zf_gen[i] = 0;
-
- return (0);
-}
-
-static int
-zfsctl_common_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
-
- (void) sfs_reclaim_vnode(vp);
- return (0);
-}
-
-static int
-zfsctl_common_print(ap)
- struct vop_print_args /* {
- struct vnode *a_vp;
- } */ *ap;
-{
- sfs_print_node(ap->a_vp->v_data);
- return (0);
-}
-
-/*
- * Get root directory attributes.
- */
-static int
-zfsctl_root_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- } */ *ap;
-{
- struct vnode *vp = ap->a_vp;
- struct vattr *vap = ap->a_vap;
- zfsctl_root_t *node = vp->v_data;
-
- zfsctl_common_getattr(vp, vap);
- vap->va_ctime = node->cmtime;
- vap->va_mtime = vap->va_ctime;
- vap->va_birthtime = vap->va_ctime;
- vap->va_nlink += 1; /* snapdir */
- vap->va_size = vap->va_nlink;
- return (0);
-}
-
-/*
- * When we lookup "." we still can be asked to lock it
- * differently, can't we?
- */
-int
-zfsctl_relock_dot(vnode_t *dvp, int ltype)
-{
- vref(dvp);
- if (ltype != VOP_ISLOCKED(dvp)) {
- if (ltype == LK_EXCLUSIVE)
- vn_lock(dvp, LK_UPGRADE | LK_RETRY);
- else /* if (ltype == LK_SHARED) */
- vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
-
- /* Relock for the "." case may left us with reclaimed vnode. */
- if (VN_IS_DOOMED(dvp)) {
- vrele(dvp);
- return (SET_ERROR(ENOENT));
- }
- }
- return (0);
-}
-
-/*
- * Special case the handling of "..".
- */
-int
-zfsctl_root_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- cred_t *cr = ap->a_cnp->cn_cred;
- int flags = ap->a_cnp->cn_flags;
- int lkflags = ap->a_cnp->cn_lkflags;
- int nameiop = ap->a_cnp->cn_nameiop;
- int err;
- int ltype;
-
- ASSERT(dvp->v_type == VDIR);
-
- if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
- return (SET_ERROR(ENOTSUP));
-
- if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
- err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
- if (err == 0)
- *vpp = dvp;
- } else if ((flags & ISDOTDOT) != 0) {
- err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
- lkflags, vpp);
- } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
- err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
- } else {
- err = SET_ERROR(ENOENT);
- }
- if (err != 0)
- *vpp = NULL;
- return (err);
-}
-
-static int
-zfsctl_root_readdir(ap)
- struct vop_readdir_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- int *a_eofflag;
- int *ncookies;
- u_long **a_cookies;
- } */ *ap;
-{
- struct dirent entry;
- vnode_t *vp = ap->a_vp;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_root_t *node = vp->v_data;
- uio_t *uio = ap->a_uio;
- int *eofp = ap->a_eofflag;
- off_t dots_offset;
- int error;
-
- ASSERT(vp->v_type == VDIR);
-
- error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
- &dots_offset);
- if (error != 0) {
- if (error == ENAMETOOLONG) /* ran out of destination space */
- error = 0;
- return (error);
- }
- if (uio->uio_offset != dots_offset)
- return (SET_ERROR(EINVAL));
-
- CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
- entry.d_fileno = node->snapdir->sn_id;
- entry.d_type = DT_DIR;
- strcpy(entry.d_name, node->snapdir->sn_name);
- entry.d_namlen = strlen(entry.d_name);
- entry.d_reclen = sizeof(entry);
- dirent_terminate(&entry);
- error = vfs_read_dirent(ap, &entry, uio->uio_offset);
- if (error != 0) {
- if (error == ENAMETOOLONG)
- error = 0;
- return (SET_ERROR(error));
- }
- if (eofp != NULL)
- *eofp = 1;
- return (0);
-}
-
-static int
-zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
-{
- static const char dotzfs_name[4] = ".zfs";
- vnode_t *dvp;
- int error;
-
- if (*ap->a_buflen < sizeof (dotzfs_name))
- return (SET_ERROR(ENOMEM));
-
- error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
- LK_SHARED, &dvp);
- if (error != 0)
- return (SET_ERROR(error));
-
- VOP_UNLOCK(dvp);
- *ap->a_vpp = dvp;
- *ap->a_buflen -= sizeof (dotzfs_name);
- bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
- return (0);
-}
-
-static int
-zfsctl_common_pathconf(ap)
- struct vop_pathconf_args /* {
- struct vnode *a_vp;
- int a_name;
- int *a_retval;
- } */ *ap;
-{
- /*
- * We care about ACL variables so that user land utilities like ls
- * can display them correctly. Since the ctldir's st_dev is set to be
- * the same as the parent dataset, we must support all variables that
- * it supports.
- */
- switch (ap->a_name) {
- case _PC_LINK_MAX:
- *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
- return (0);
-
- case _PC_FILESIZEBITS:
- *ap->a_retval = 64;
- return (0);
-
- case _PC_MIN_HOLE_SIZE:
- *ap->a_retval = (int)SPA_MINBLOCKSIZE;
- return (0);
-
- case _PC_ACL_NFS4:
- *ap->a_retval = 1;
- return (0);
-
- case _PC_ACL_PATH_MAX:
- *ap->a_retval = ACL_MAX_ENTRIES;
- return (0);
-
- case _PC_NAME_MAX:
- *ap->a_retval = NAME_MAX;
- return (0);
-
- default:
- return (vop_stdpathconf(ap));
- }
-}
-
-/**
- * Returns a trivial ACL
- */
-int
-zfsctl_common_getacl(ap)
- struct vop_getacl_args /* {
- struct vnode *vp;
- acl_type_t a_type;
- struct acl *a_aclp;
- struct ucred *cred;
- struct thread *td;
- } */ *ap;
-{
- int i;
-
- if (ap->a_type != ACL_TYPE_NFS4)
- return (EINVAL);
-
- acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
- /*
- * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
- * attributes. That is not the case for the ctldir, so we must clear
- * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
- * aren't supported by the ctldir.
- */
- for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
- struct acl_entry *entry;
- entry = &(ap->a_aclp->acl_entry[i]);
- uint32_t old_perm = entry->ae_perm;
- entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
- ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
- ACL_READ_NAMED_ATTRS );
- }
-
- return (0);
-}
-
-static struct vop_vector zfsctl_ops_root = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
- .vop_getattr = zfsctl_root_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = zfsctl_root_readdir,
- .vop_lookup = zfsctl_root_lookup,
- .vop_inactive = VOP_NULL,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_fid = zfsctl_common_fid,
- .vop_print = zfsctl_common_print,
- .vop_vptocnp = zfsctl_root_vptocnp,
- .vop_pathconf = zfsctl_common_pathconf,
- .vop_getacl = zfsctl_common_getacl,
-};
-VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root);
-
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
- objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
- dmu_objset_name(os, zname);
- if (strlen(zname) + 1 + strlen(name) >= len)
- return (SET_ERROR(ENAMETOOLONG));
- (void) strcat(zname, "@");
- (void) strcat(zname, name);
- return (0);
-}
-
-static int
-zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
-{
- objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
- int err;
-
- err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
- return (err);
-}
-
-/*
- * Given a vnode get a root vnode of a filesystem mounted on top of
- * the vnode, if any. The root vnode is referenced and locked.
- * If no filesystem is mounted then the orinal vnode remains referenced
- * and locked. If any error happens the orinal vnode is unlocked and
- * released.
- */
-static int
-zfsctl_mounted_here(vnode_t **vpp, int flags)
-{
- struct mount *mp;
- int err;
-
- ASSERT_VOP_LOCKED(*vpp, __func__);
- ASSERT3S((*vpp)->v_type, ==, VDIR);
-
- if ((mp = (*vpp)->v_mountedhere) != NULL) {
- err = vfs_busy(mp, 0);
- KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
- KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
- vput(*vpp);
- err = VFS_ROOT(mp, flags, vpp);
- vfs_unbusy(mp);
- return (err);
- }
- return (EJUSTRETURN);
-}
-
-typedef struct {
- const char *snap_name;
- uint64_t snap_id;
-} snapshot_setup_arg_t;
-
-static void
-zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
-{
- snapshot_setup_arg_t *ssa = arg;
- sfs_node_t *node;
-
- ASSERT_VOP_ELOCKED(vp, __func__);
-
- node = sfs_alloc_node(sizeof(sfs_node_t),
- ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
- zfsctl_common_vnode_setup(vp, node);
-
- /* We have to support recursive locking. */
- VN_LOCK_AREC(vp);
-}
-
-/*
- * Lookup entry point for the 'snapshot' directory. Try to open the
- * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
- * Perform a mount of the associated dataset on top of the vnode.
- * There are four possibilities:
- * - the snapshot node and vnode do not exist
- * - the snapshot vnode is covered by the mounted snapshot
- * - the snapshot vnode is not covered yet, the mount operation is in progress
- * - the snapshot vnode is not covered, because the snapshot has been unmounted
- * The last two states are transient and should be relatively short-lived.
- */
-int
-zfsctl_snapdir_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- struct componentname *cnp = ap->a_cnp;
- char name[NAME_MAX + 1];
- char fullname[ZFS_MAX_DATASET_NAME_LEN];
- char *mountpoint;
- size_t mountpoint_len;
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- uint64_t snap_id;
- int nameiop = cnp->cn_nameiop;
- int lkflags = cnp->cn_lkflags;
- int flags = cnp->cn_flags;
- int err;
-
- ASSERT(dvp->v_type == VDIR);
-
- if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
- return (SET_ERROR(ENOTSUP));
-
- if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
- err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
- if (err == 0)
- *vpp = dvp;
- return (err);
- }
- if (flags & ISDOTDOT) {
- err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
- vpp);
- return (err);
- }
-
- if (cnp->cn_namelen >= sizeof(name))
- return (SET_ERROR(ENAMETOOLONG));
-
- strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
- err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
- if (err != 0)
- return (SET_ERROR(ENOENT));
-
- for (;;) {
- snapshot_setup_arg_t ssa;
-
- ssa.snap_name = name;
- ssa.snap_id = snap_id;
- err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
- snap_id, "zfs", &zfsctl_ops_snapshot,
- zfsctl_snapshot_vnode_setup, &ssa, vpp);
- if (err != 0)
- return (err);
-
- /* Check if a new vnode has just been created. */
- if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
- break;
-
- /*
- * Check if a snapshot is already mounted on top of the vnode.
- */
- err = zfsctl_mounted_here(vpp, lkflags);
- if (err != EJUSTRETURN)
- return (err);
-
- /*
- * If the vnode is not covered, then either the mount operation
- * is in progress or the snapshot has already been unmounted
- * but the vnode hasn't been inactivated and reclaimed yet.
- * We can try to re-use the vnode in the latter case.
- */
- VI_LOCK(*vpp);
- if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
- /* Upgrade to exclusive lock in order to:
- * - avoid race conditions
- * - satisfy the contract of mount_snapshot()
- */
- err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
- if (err == 0)
- break;
- } else {
- VI_UNLOCK(*vpp);
- }
-
- /*
- * In this state we can loop on uncontested locks and starve
- * the thread doing the lengthy, non-trivial mount operation.
- * So, yield to prevent that from happening.
- */
- vput(*vpp);
- kern_yield(PRI_USER);
- }
-
- VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
-
- mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
- strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
- mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
- (void) snprintf(mountpoint, mountpoint_len,
- "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
- dvp->v_vfsp->mnt_stat.f_mntonname, name);
-
- err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
- kmem_free(mountpoint, mountpoint_len);
- if (err == 0) {
- /*
- * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
- *
- * This is where we lie about our v_vfsp in order to
- * make .zfs/snapshot/<snapname> accessible over NFS
- * without requiring manual mounts of <snapname>.
- */
- ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
- VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
-
- /* Clear the root flag (set via VFS_ROOT) as well. */
- (*vpp)->v_vflag &= ~VV_ROOT;
- }
-
- if (err != 0)
- *vpp = NULL;
- return (err);
-}
-
-static int
-zfsctl_snapdir_readdir(ap)
- struct vop_readdir_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- int *a_eofflag;
- int *ncookies;
- u_long **a_cookies;
- } */ *ap;
-{
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- struct dirent entry;
- vnode_t *vp = ap->a_vp;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- uio_t *uio = ap->a_uio;
- int *eofp = ap->a_eofflag;
- off_t dots_offset;
- int error;
-
- ASSERT(vp->v_type == VDIR);
-
- error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
- &dots_offset);
- if (error != 0) {
- if (error == ENAMETOOLONG) /* ran out of destination space */
- error = 0;
- return (error);
- }
-
- ZFS_ENTER(zfsvfs);
- for (;;) {
- uint64_t cookie;
- uint64_t id;
-
- cookie = uio->uio_offset - dots_offset;
-
- dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
- error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
- snapname, &id, &cookie, NULL);
- dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
- if (error != 0) {
- if (error == ENOENT) {
- if (eofp != NULL)
- *eofp = 1;
- error = 0;
- }
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- entry.d_fileno = id;
- entry.d_type = DT_DIR;
- strcpy(entry.d_name, snapname);
- entry.d_namlen = strlen(entry.d_name);
- entry.d_reclen = sizeof(entry);
- /* NOTE: d_off is the offset for the *next* entry. */
- entry.d_off = cookie + dots_offset;
- dirent_terminate(&entry);
- error = vfs_read_dirent(ap, &entry, uio->uio_offset);
- if (error != 0) {
- if (error == ENAMETOOLONG)
- error = 0;
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(error));
- }
- uio->uio_offset = cookie + dots_offset;
- }
- /* NOTREACHED */
-}
-
-static int
-zfsctl_snapdir_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- vattr_t *vap = ap->a_vap;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- dsl_dataset_t *ds;
- sfs_node_t *node = vp->v_data;
- uint64_t snap_count;
- int err;
-
- ZFS_ENTER(zfsvfs);
- ds = dmu_objset_ds(zfsvfs->z_os);
- zfsctl_common_getattr(vp, vap);
- vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
- vap->va_mtime = vap->va_ctime;
- vap->va_birthtime = vap->va_ctime;
- if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
- err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
- if (err != 0) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- vap->va_nlink += snap_count;
- }
- vap->va_size = vap->va_nlink;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-static struct vop_vector zfsctl_ops_snapdir = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_getattr = zfsctl_snapdir_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = zfsctl_snapdir_readdir,
- .vop_lookup = zfsctl_snapdir_lookup,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_fid = zfsctl_common_fid,
- .vop_print = zfsctl_common_print,
- .vop_pathconf = zfsctl_common_pathconf,
- .vop_getacl = zfsctl_common_getacl,
-};
-VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir);
-
-static int
-zfsctl_snapshot_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
-
- VERIFY(vrecycle(vp) == 1);
- return (0);
-}
-
-static int
-zfsctl_snapshot_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- void *data = vp->v_data;
-
- sfs_reclaim_vnode(vp);
- sfs_destroy_node(data);
- return (0);
-}
-
-static int
-zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
-{
- struct mount *mp;
- vnode_t *dvp;
- vnode_t *vp;
- sfs_node_t *node;
- size_t len;
- enum vgetstate vs;
- int locked;
- int error;
-
- vp = ap->a_vp;
- node = vp->v_data;
- len = strlen(node->sn_name);
- if (*ap->a_buflen < len)
- return (SET_ERROR(ENOMEM));
-
- /*
- * Prevent unmounting of the snapshot while the vnode lock
- * is not held. That is not strictly required, but allows
- * us to assert that an uncovered snapshot vnode is never
- * "leaked".
- */
- mp = vp->v_mountedhere;
- if (mp == NULL)
- return (SET_ERROR(ENOENT));
- error = vfs_busy(mp, 0);
- KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
-
- /*
- * We can vput the vnode as we can now depend on the reference owned
- * by the busied mp. But we also need to hold the vnode, because
- * the reference may go after vfs_unbusy() which has to be called
- * before we can lock the vnode again.
- */
- locked = VOP_ISLOCKED(vp);
- vs = vget_prep(vp);
- vput(vp);
-
- /* Look up .zfs/snapshot, our parent. */
- error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
- if (error == 0) {
- VOP_UNLOCK(dvp);
- *ap->a_vpp = dvp;
- *ap->a_buflen -= len;
- bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
- }
- vfs_unbusy(mp);
- vget_finish(vp, locked | LK_RETRY, vs);
- return (error);
-}
-
-/*
- * These VP's should never see the light of day. They should always
- * be covered.
- */
-static struct vop_vector zfsctl_ops_snapshot = {
- .vop_default = NULL, /* ensure very restricted access */
- .vop_inactive = zfsctl_snapshot_inactive,
- .vop_need_inactive = vop_stdneed_inactive,
- .vop_reclaim = zfsctl_snapshot_reclaim,
- .vop_vptocnp = zfsctl_snapshot_vptocnp,
- .vop_lock1 = vop_stdlock,
- .vop_unlock = vop_stdunlock,
- .vop_islocked = vop_stdislocked,
- .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */
- .vop_print = zfsctl_common_print,
-};
-VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
-
-int
-zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
-{
- struct mount *mp;
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *vp;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- *zfsvfsp = NULL;
- error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
- ZFSCTL_INO_SNAPDIR, objsetid, &vp);
- if (error == 0 && vp != NULL) {
- /*
- * XXX Probably need to at least reference, if not busy, the mp.
- */
- if (vp->v_mountedhere != NULL)
- *zfsvfsp = vp->v_mountedhere->mnt_data;
- vput(vp);
- }
- if (*zfsvfsp == NULL)
- return (SET_ERROR(EINVAL));
- return (0);
-}
-
-/*
- * Unmount any snapshots for the given filesystem. This is called from
- * zfs_umount() - if we have a ctldir, then go through and unmount all the
- * snapshots.
- */
-int
-zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
-{
- char snapname[ZFS_MAX_DATASET_NAME_LEN];
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- struct mount *mp;
- vnode_t *dvp;
- vnode_t *vp;
- sfs_node_t *node;
- sfs_node_t *snap;
- uint64_t cookie;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
-
- cookie = 0;
- for (;;) {
- uint64_t id;
-
- dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
- error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
- snapname, &id, &cookie, NULL);
- dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
- if (error != 0) {
- if (error == ENOENT)
- error = 0;
- break;
- }
-
- for (;;) {
- error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
- ZFSCTL_INO_SNAPDIR, id, &vp);
- if (error != 0 || vp == NULL)
- break;
-
- mp = vp->v_mountedhere;
-
- /*
- * v_mountedhere being NULL means that the
- * (uncovered) vnode is in a transient state
- * (mounting or unmounting), so loop until it
- * settles down.
- */
- if (mp != NULL)
- break;
- vput(vp);
- }
- if (error != 0)
- break;
- if (vp == NULL)
- continue; /* no mountpoint, nothing to do */
-
- /*
- * The mount-point vnode is kept locked to avoid spurious EBUSY
- * from a concurrent umount.
- * The vnode lock must have recursive locking enabled.
- */
- vfs_ref(mp);
- error = dounmount(mp, fflags, curthread);
- KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
- ("extra references after unmount"));
- vput(vp);
- if (error != 0)
- break;
- }
- KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
- ("force unmounting failed"));
- return (error);
-}
-
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
@@ -1,112 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-
-list_t zfs_dbgmsgs;
-int zfs_dbgmsg_size;
-kmutex_t zfs_dbgmsgs_lock;
-int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
-
-void
-zfs_dbgmsg_init(void)
-{
- list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
- offsetof(zfs_dbgmsg_t, zdm_node));
- mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-zfs_dbgmsg_fini(void)
-{
- zfs_dbgmsg_t *zdm;
-
- while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
- int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
- kmem_free(zdm, size);
- zfs_dbgmsg_size -= size;
- }
- mutex_destroy(&zfs_dbgmsgs_lock);
- ASSERT0(zfs_dbgmsg_size);
-}
-
-/*
- * Print these messages by running:
- * echo ::zfs_dbgmsg | mdb -k
- *
- * Monitor these messages by running:
- * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
- *
- * When used with libzpool, monitor with:
- * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
- */
-void
-zfs_dbgmsg(const char *fmt, ...)
-{
- int size;
- va_list adx;
- zfs_dbgmsg_t *zdm;
-
- va_start(adx, fmt);
- size = vsnprintf(NULL, 0, fmt, adx);
- va_end(adx);
-
- /*
- * There is one byte of string in sizeof (zfs_dbgmsg_t), used
- * for the terminating null.
- */
- zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
- zdm->zdm_timestamp = gethrestime_sec();
-
- va_start(adx, fmt);
- (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
- va_end(adx);
-
- DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
-
- mutex_enter(&zfs_dbgmsgs_lock);
- list_insert_tail(&zfs_dbgmsgs, zdm);
- zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
- while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
- zdm = list_remove_head(&zfs_dbgmsgs);
- size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
- kmem_free(zdm, size);
- zfs_dbgmsg_size -= size;
- }
- mutex_exit(&zfs_dbgmsgs_lock);
-}
-
-void
-zfs_dbgmsg_print(const char *tag)
-{
- zfs_dbgmsg_t *zdm;
-
- (void) printf("ZFS_DBGMSG(%s):\n", tag);
- mutex_enter(&zfs_dbgmsgs_lock);
- for (zdm = list_head(&zfs_dbgmsgs); zdm;
- zdm = list_next(&zfs_dbgmsgs, zdm))
- (void) printf("%s\n", zdm->zdm_msg);
- mutex_exit(&zfs_dbgmsgs_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -1,968 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/uio.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/stat.h>
-#include <sys/unistd.h>
-#include <sys/sunddi.h>
-#include <sys/random.h>
-#include <sys/policy.h>
-#include <sys/kcondvar.h>
-#include <sys/callb.h>
-#include <sys/smp.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zap.h>
-#include <sys/dmu.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/sa.h>
-#include <sys/zfs_sa.h>
-#include <sys/dnlc.h>
-#include <sys/extdirent.h>
-
-/*
- * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups
- * of names after deciding which is the appropriate lookup interface.
- */
-static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
- matchtype_t mt, uint64_t *zoid)
-{
- int error;
-
- if (zfsvfs->z_norm) {
-
- /*
- * In the non-mixed case we only expect there would ever
- * be one match, but we need to use the normalizing lookup.
- */
- error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
- zoid, mt, NULL, 0, NULL);
- } else {
- error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
- }
- *zoid = ZFS_DIRENT_OBJ(*zoid);
-
- return (error);
-}
-
-/*
- * Look up a directory entry under a locked vnode.
- * dvp being locked gives us a guarantee that there are no concurrent
- * modification of the directory and, thus, if a node can be found in
- * the directory, then it must not be unlinked.
- *
- * Input arguments:
- * dzp - znode for directory
- * name - name of entry to lock
- * flag - ZNEW: if the entry already exists, fail with EEXIST.
- * ZEXISTS: if the entry does not exist, fail with ENOENT.
- * ZXATTR: we want dzp's xattr directory
- *
- * Output arguments:
- * zpp - pointer to the znode for the entry (NULL if there isn't one)
- *
- * Return value: 0 on success or errno on failure.
- *
- * NOTE: Always checks for, and rejects, '.' and '..'.
- */
-int
-zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
-{
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- matchtype_t mt = 0;
- uint64_t zoid;
- vnode_t *vp = NULL;
- int error = 0;
-
- ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
-
- *zpp = NULL;
-
- /*
- * Verify that we are not trying to lock '.', '..', or '.zfs'
- */
- if (name[0] == '.' &&
- (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
- zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
- return (SET_ERROR(EEXIST));
-
- /*
- * Case sensitivity and normalization preferences are set when
- * the file system is created. These are stored in the
- * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
- * affect how we perform zap lookups.
- *
- * When matching we may need to normalize & change case according to
- * FS settings.
- *
- * Note that a normalized match is necessary for a case insensitive
- * filesystem when the lookup request is not exact because normalization
- * can fold case independent of normalizing code point sequences.
- *
- * See the table above zfs_dropname().
- */
- if (zfsvfs->z_norm != 0) {
- mt = MT_NORMALIZE;
-
- /*
- * Determine if the match needs to honor the case specified in
- * lookup, and if so keep track of that so that during
- * normalization we don't fold case.
- */
- if (zfsvfs->z_case == ZFS_CASE_MIXED) {
- mt |= MT_MATCH_CASE;
- }
- }
-
- /*
- * Only look in or update the DNLC if we are looking for the
- * name on a file system that does not require normalization
- * or case folding. We can also look there if we happen to be
- * on a non-normalizing, mixed sensitivity file system IF we
- * are looking for the exact name.
- *
- * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
- * because in that case MT_EXACT and MT_FIRST should produce exactly
- * the same result.
- */
-
- if (dzp->z_unlinked && !(flag & ZXATTR))
- return (ENOENT);
- if (flag & ZXATTR) {
- error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
- sizeof (zoid));
- if (error == 0)
- error = (zoid == 0 ? ENOENT : 0);
- } else {
- error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
- }
- if (error) {
- if (error != ENOENT || (flag & ZEXISTS)) {
- return (error);
- }
- } else {
- if (flag & ZNEW) {
- return (SET_ERROR(EEXIST));
- }
- error = zfs_zget(zfsvfs, zoid, zpp);
- if (error)
- return (error);
- ASSERT(!(*zpp)->z_unlinked);
- }
-
- return (0);
-}
-
-static int
-zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
-{
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- znode_t *zp;
- uint64_t parent;
- int error;
-
- ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
- ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
-
- if (dzp->z_unlinked)
- return (ENOENT);
-
- if ((error = sa_lookup(dzp->z_sa_hdl,
- SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
- return (error);
-
- error = zfs_zget(zfsvfs, parent, &zp);
- if (error == 0)
- *zpp = zp;
- return (error);
-}
-
-int
-zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
-{
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- znode_t *zp;
- int error = 0;
-
- ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
- ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
-
- if (dzp->z_unlinked)
- return (SET_ERROR(ENOENT));
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- *zpp = dzp;
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- error = zfs_dd_lookup(dzp, zpp);
- } else {
- error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
- if (error == 0) {
- dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
- *zpp = zp;
- }
- }
- return (error);
-}
-
-/*
- * unlinked Set (formerly known as the "delete queue") Error Handling
- *
- * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
- * don't specify the name of the entry that we will be manipulating. We
- * also fib and say that we won't be adding any new entries to the
- * unlinked set, even though we might (this is to lower the minimum file
- * size that can be deleted in a full filesystem). So on the small
- * chance that the nlink list is using a fat zap (ie. has more than
- * 2000 entries), we *may* not pre-read a block that's needed.
- * Therefore it is remotely possible for some of the assertions
- * regarding the unlinked set below to fail due to i/o error. On a
- * nondebug system, this will result in the space being leaked.
- */
-void
-zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ASSERT(zp->z_unlinked);
- ASSERT(zp->z_links == 0);
-
- VERIFY3U(0, ==,
- zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
-}
-
-/*
- * Clean up any znodes that had no links when we either crashed or
- * (force) umounted the file system.
- */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- dmu_object_info_t doi;
- znode_t *zp;
- dmu_tx_t *tx;
- int error;
-
- /*
- * Interate over the contents of the unlinked set.
- */
- for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
- zap_cursor_retrieve(&zc, &zap) == 0;
- zap_cursor_advance(&zc)) {
-
- /*
- * See what kind of object we have in list
- */
-
- error = dmu_object_info(zfsvfs->z_os,
- zap.za_first_integer, &doi);
- if (error != 0)
- continue;
-
- ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
- (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
- /*
- * We need to re-mark these list entries for deletion,
- * so we pull them back into core and set zp->z_unlinked.
- */
- error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
-
- /*
- * We may pick up znodes that are already marked for deletion.
- * This could happen during the purge of an extended attribute
- * directory. All we need to do is skip over them, since they
- * are already in the system marked z_unlinked.
- */
- if (error != 0)
- continue;
-
- vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
-#if defined(__FreeBSD__)
- /*
- * Due to changes in zfs_rmnode we need to make sure the
- * link count is set to zero here.
- */
- if (zp->z_links != 0) {
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0) {
- dmu_tx_abort(tx);
- vput(ZTOV(zp));
- continue;
- }
- zp->z_links = 0;
- VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
- &zp->z_links, sizeof (zp->z_links), tx));
- dmu_tx_commit(tx);
- }
-#endif
- zp->z_unlinked = B_TRUE;
- vput(ZTOV(zp));
- }
- zap_cursor_fini(&zc);
-}
-
-/*
- * Delete the entire contents of a directory. Return a count
- * of the number of entries that could not be deleted. If we encounter
- * an error, return a count of at least one so that the directory stays
- * in the unlinked set.
- *
- * NOTE: this function assumes that the directory is inactive,
- * so there is no need to lock its entries before deletion.
- * Also, it assumes the directory contents is *only* regular
- * files.
- */
-static int
-zfs_purgedir(znode_t *dzp)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- znode_t *xzp;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- int skipped = 0;
- int error;
-
- for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
- (error = zap_cursor_retrieve(&zc, &zap)) == 0;
- zap_cursor_advance(&zc)) {
- error = zfs_zget(zfsvfs,
- ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
- if (error) {
- skipped += 1;
- continue;
- }
-
- vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
- ASSERT((ZTOV(xzp)->v_type == VREG) ||
- (ZTOV(xzp)->v_type == VLNK));
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- /* Is this really needed ? */
- zfs_sa_upgrade_txholds(tx, xzp);
- dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- vput(ZTOV(xzp));
- skipped += 1;
- continue;
- }
-
- error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
- if (error)
- skipped += 1;
- dmu_tx_commit(tx);
-
- vput(ZTOV(xzp));
- }
- zap_cursor_fini(&zc);
- if (error != ENOENT)
- skipped += 1;
- return (skipped);
-}
-
-#if defined(__FreeBSD__)
-extern taskq_t *zfsvfs_taskq;
-#endif
-
-void
-zfs_rmnode(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
- dmu_tx_t *tx;
- uint64_t acl_obj;
- uint64_t xattr_obj;
- int error;
-
- ASSERT(zp->z_links == 0);
- ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
-
- /*
- * If this is an attribute directory, purge its contents.
- */
- if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
- (zp->z_pflags & ZFS_XATTR)) {
- if (zfs_purgedir(zp) != 0) {
- /*
- * Not enough space to delete some xattrs.
- * Leave it in the unlinked set.
- */
- zfs_znode_dmu_fini(zp);
- zfs_znode_free(zp);
- return;
- }
- } else {
- /*
- * Free up all the data in the file. We don't do this for
- * XATTR directories because we need truncate and remove to be
- * in the same tx, like in zfs_znode_delete(). Otherwise, if
- * we crash here we'll end up with an inconsistent truncated
- * zap object in the delete queue. Note a truncated file is
- * harmless since it only contains user data.
- */
- error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
- if (error) {
- /*
- * Not enough space or we were interrupted by unmount.
- * Leave the file in the unlinked set.
- */
- zfs_znode_dmu_fini(zp);
- zfs_znode_free(zp);
- return;
- }
- }
-
- /*
- * If the file has extended attributes, we're going to unlink
- * the xattr dir.
- */
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
- if (error)
- xattr_obj = 0;
-
- acl_obj = zfs_external_acl(zp);
-
- /*
- * Set up the final transaction.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- if (xattr_obj)
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
- if (acl_obj)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
-
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- /*
- * Not enough space to delete the file. Leave it in the
- * unlinked set, leaking it until the fs is remounted (at
- * which point we'll call zfs_unlinked_drain() to process it).
- */
- dmu_tx_abort(tx);
- zfs_znode_dmu_fini(zp);
- zfs_znode_free(zp);
- return;
- }
-
-#if defined(__FreeBSD__)
- /*
- * FreeBSD's implemention of zfs_zget requires a vnode to back it.
- * This means that we could end up calling into getnewvnode while
- * calling zfs_rmnode as a result of a prior call to getnewvnode
- * trying to clear vnodes out of the cache. If this repeats we can
- * recurse enough that we overflow our stack. To avoid this, we
- * avoid calling zfs_zget on the xattr znode and instead simply add
- * it to the unlinked set and schedule a call to zfs_unlinked_drain.
- */
- if (xattr_obj) {
- /* Add extended attribute directory to the unlinked set. */
- VERIFY3U(0, ==,
- zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
- }
-#else
- if (xzp) {
- ASSERT(error == 0);
- xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- xzp->z_links = 0; /* no more links to it */
- VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
- &xzp->z_links, sizeof (xzp->z_links), tx));
- zfs_unlinked_add(xzp, tx);
- }
-#endif
-
- /* Remove this znode from the unlinked set */
- VERIFY3U(0, ==,
- zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
-
- zfs_znode_delete(zp, tx);
-
- dmu_tx_commit(tx);
-
-#if defined(__FreeBSD__)
- if (xattr_obj) {
- /*
- * We're using the FreeBSD taskqueue API here instead of
- * the Solaris taskq API since the FreeBSD API allows for a
- * task to be enqueued multiple times but executed once.
- */
- taskqueue_enqueue(zfsvfs_taskq->tq_queue,
- &zfsvfs->z_unlinked_drain_task);
- }
-#endif
-}
-
-static uint64_t
-zfs_dirent(znode_t *zp, uint64_t mode)
-{
- uint64_t de = zp->z_id;
-
- if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
- de |= IFTODT(mode) << 60;
- return (de);
-}
-
-/*
- * Link zp into dzp. Can only fail if zp has been unlinked.
- */
-int
-zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
- int flag)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- vnode_t *vp = ZTOV(zp);
- uint64_t value;
- int zp_is_dir = (vp->v_type == VDIR);
- sa_bulk_attr_t bulk[5];
- uint64_t mtime[2], ctime[2];
- int count = 0;
- int error;
-
- ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
- ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
-#ifdef __FreeBSD__
- if (zp_is_dir) {
- if (dzp->z_links >= ZFS_LINK_MAX)
- return (SET_ERROR(EMLINK));
- }
-#endif
- if (!(flag & ZRENAMING)) {
- if (zp->z_unlinked) { /* no new links to unlinked zp */
- ASSERT(!(flag & (ZNEW | ZEXISTS)));
- return (SET_ERROR(ENOENT));
- }
-#ifdef __FreeBSD__
- if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
- return (SET_ERROR(EMLINK));
- }
-#endif
- zp->z_links++;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
- &zp->z_links, sizeof (zp->z_links));
-
- } else {
- ASSERT(zp->z_unlinked == 0);
- }
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
- &dzp->z_id, sizeof (dzp->z_id));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
-
- if (!(flag & ZNEW)) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- ctime, sizeof (ctime));
- zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
- ctime, B_TRUE);
- }
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT0(error);
-
- dzp->z_size++;
- dzp->z_links += zp_is_dir;
- count = 0;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &dzp->z_size, sizeof (dzp->z_size));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
- &dzp->z_links, sizeof (dzp->z_links));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- mtime, sizeof (mtime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- ctime, sizeof (ctime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &dzp->z_pflags, sizeof (dzp->z_pflags));
- zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
- error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT0(error);
-
- value = zfs_dirent(zp, zp->z_mode);
- error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
- 8, 1, &value, tx);
- VERIFY0(error);
-
- return (0);
-}
-
-/*
- * The match type in the code for this function should conform to:
- *
- * ------------------------------------------------------------------------
- * fs type | z_norm | lookup type | match type
- * ---------|-------------|-------------|----------------------------------
- * CS !norm | 0 | 0 | 0 (exact)
- * CS norm | formX | 0 | MT_NORMALIZE
- * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
- * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
- * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
- * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
- * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
- * CM !norm | upper | ZCILOOK | MT_NORMALIZE
- * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
- * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
- *
- * Abbreviations:
- * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
- * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
- * formX = unicode normalization form set on fs creation
- */
-static int
-zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
- int flag)
-{
- int error;
-
- if (zp->z_zfsvfs->z_norm) {
- matchtype_t mt = MT_NORMALIZE;
-
- if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
- mt |= MT_MATCH_CASE;
- }
-
- error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
- name, mt, tx);
- } else {
- error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
- }
-
- return (error);
-}
-
-/*
- * Unlink zp from dzp, and mark zp for deletion if this was the last link.
- * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
- * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
- * If it's non-NULL, we use it to indicate whether the znode needs deletion,
- * and it's the caller's job to do it.
- */
-int
-zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
- int flag, boolean_t *unlinkedp)
-{
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- vnode_t *vp = ZTOV(zp);
- int zp_is_dir = (vp->v_type == VDIR);
- boolean_t unlinked = B_FALSE;
- sa_bulk_attr_t bulk[5];
- uint64_t mtime[2], ctime[2];
- int count = 0;
- int error;
-
- ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
- ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
-
- if (!(flag & ZRENAMING)) {
-
- if (zp_is_dir && !zfs_dirempty(zp)) {
-#ifdef illumos
- return (SET_ERROR(EEXIST));
-#else
- return (SET_ERROR(ENOTEMPTY));
-#endif
- }
-
- /*
- * If we get here, we are going to try to remove the object.
- * First try removing the name from the directory; if that
- * fails, return the error.
- */
- error = zfs_dropname(dzp, name, zp, tx, flag);
- if (error != 0) {
- return (error);
- }
-
- if (zp->z_links <= zp_is_dir) {
- zfs_panic_recover("zfs: link count on vnode %p is %u, "
- "should be at least %u", zp->z_vnode,
- (int)zp->z_links,
- zp_is_dir + 1);
- zp->z_links = zp_is_dir + 1;
- }
- if (--zp->z_links == zp_is_dir) {
- zp->z_unlinked = B_TRUE;
- zp->z_links = 0;
- unlinked = B_TRUE;
- } else {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
- NULL, &ctime, sizeof (ctime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
- NULL, &zp->z_pflags, sizeof (zp->z_pflags));
- zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
- B_TRUE);
- }
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
- NULL, &zp->z_links, sizeof (zp->z_links));
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- count = 0;
- ASSERT0(error);
- } else {
- ASSERT(zp->z_unlinked == 0);
- error = zfs_dropname(dzp, name, zp, tx, flag);
- if (error != 0)
- return (error);
- }
-
- dzp->z_size--; /* one dirent removed */
- dzp->z_links -= zp_is_dir; /* ".." link from zp */
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
- NULL, &dzp->z_links, sizeof (dzp->z_links));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
- NULL, &dzp->z_size, sizeof (dzp->z_size));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
- NULL, ctime, sizeof (ctime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
- NULL, mtime, sizeof (mtime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
- NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
- zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
- error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT0(error);
-
- if (unlinkedp != NULL)
- *unlinkedp = unlinked;
- else if (unlinked)
- zfs_unlinked_add(zp, tx);
-
- return (0);
-}
-
-/*
- * Indicate whether the directory is empty.
- */
-boolean_t
-zfs_dirempty(znode_t *dzp)
-{
- return (dzp->z_size == 2);
-}
-
-int
-zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_t *xzp;
- dmu_tx_t *tx;
- int error;
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
- uint64_t parent;
-
- *xvpp = NULL;
-
- /*
- * In FreeBSD, access checking for creating an EA is being done
- * in zfs_setextattr(),
- */
-#ifndef __FreeBSD_kernel__
- if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
- return (error);
-#endif
-
- if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
- &acl_ids)) != 0)
- return (error);
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
- zfs_acl_ids_free(&acl_ids);
- return (SET_ERROR(EDQUOT));
- }
-
- getnewvnode_reserve();
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- getnewvnode_drop_reserve();
- return (error);
- }
- zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
-#ifdef DEBUG
- error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
- &parent, sizeof (parent));
- ASSERT(error == 0 && parent == zp->z_id);
-#endif
-
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
- sizeof (xzp->z_id), tx));
-
- (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
- xzp, "", NULL, acl_ids.z_fuidp, vap);
-
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
-
- getnewvnode_drop_reserve();
-
- *xvpp = ZTOV(xzp);
-
- return (0);
-}
-
-/*
- * Return a znode for the extended attribute directory for zp.
- * ** If the directory does not already exist, it is created **
- *
- * IN: zp - znode to obtain attribute directory from
- * cr - credentials of caller
- * flags - flags from the VOP_LOOKUP call
- *
- * OUT: xzpp - pointer to extended attribute znode
- *
- * RETURN: 0 on success
- * error number on failure
- */
-int
-zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_t *xzp;
- vattr_t va;
- int error;
-top:
- error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
- if (error)
- return (error);
-
- if (xzp != NULL) {
- *xvpp = ZTOV(xzp);
- return (0);
- }
-
-
- if (!(flags & CREATE_XATTR_DIR)) {
-#ifdef illumos
- return (SET_ERROR(ENOENT));
-#else
- return (SET_ERROR(ENOATTR));
-#endif
- }
-
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- return (SET_ERROR(EROFS));
- }
-
- /*
- * The ability to 'create' files in an attribute
- * directory comes from the write_xattr permission on the base file.
- *
- * The ability to 'search' an attribute directory requires
- * read_xattr permission on the base file.
- *
- * Once in a directory the ability to read/write attributes
- * is controlled by the permissions on the attribute file.
- */
- va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
- va.va_type = VDIR;
- va.va_mode = S_IFDIR | S_ISVTX | 0777;
- zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
-
- error = zfs_make_xattrdir(zp, &va, xvpp, cr);
-
- if (error == ERESTART) {
- /* NB: we already did dmu_tx_wait() if necessary */
- goto top;
- }
- if (error == 0)
- VOP_UNLOCK(*xvpp);
-
- return (error);
-}
-
-/*
- * Decide whether it is okay to remove within a sticky directory.
- *
- * In sticky directories, write access is not sufficient;
- * you can remove entries from a directory only if:
- *
- * you own the directory,
- * you own the entry,
- * the entry is a plain file and you have write access,
- * or you are privileged (checked in secpolicy...).
- *
- * The function returns 0 if remove access is granted.
- */
-int
-zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
-{
- uid_t uid;
- uid_t downer;
- uid_t fowner;
- zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
-
- if (zdp->z_zfsvfs->z_replay)
- return (0);
-
- if ((zdp->z_mode & S_ISVTX) == 0)
- return (0);
-
- downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
- fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
-
- if ((uid = crgetuid(cr)) == downer || uid == fowner ||
- (ZTOV(zp)->v_type == VREG &&
- zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
- return (0);
- else
- return (secpolicy_vnode_remove(ZTOV(zp), cr));
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -1,871 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-#include <sys/fm/fs/zfs.h>
-#include <sys/fm/protocol.h>
-#include <sys/fm/util.h>
-#include <sys/sysevent.h>
-
-/*
- * This general routine is responsible for generating all the different ZFS
- * ereports. The payload is dependent on the class, and which arguments are
- * supplied to the function:
- *
- * EREPORT POOL VDEV IO
- * block X X X
- * data X X
- * device X X
- * pool X
- *
- * If we are in a loading state, all errors are chained together by the same
- * SPA-wide ENA (Error Numeric Association).
- *
- * For isolated I/O requests, we get the ENA from the zio_t. The propagation
- * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
- * to chain together all ereports associated with a logical piece of data. For
- * read I/Os, there are basically three 'types' of I/O, which form a roughly
- * layered diagram:
- *
- * +---------------+
- * | Aggregate I/O | No associated logical data or device
- * +---------------+
- * |
- * V
- * +---------------+ Reads associated with a piece of logical data.
- * | Read I/O | This includes reads on behalf of RAID-Z,
- * +---------------+ mirrors, gang blocks, retries, etc.
- * |
- * V
- * +---------------+ Reads associated with a particular device, but
- * | Physical I/O | no logical data. Issued as part of vdev caching
- * +---------------+ and I/O aggregation.
- *
- * Note that 'physical I/O' here is not the same terminology as used in the rest
- * of ZIO. Typically, 'physical I/O' simply means that there is no attached
- * blockpointer. But I/O with no associated block pointer can still be related
- * to a logical piece of data (i.e. RAID-Z requests).
- *
- * Purely physical I/O always have unique ENAs. They are not related to a
- * particular piece of logical data, and therefore cannot be chained together.
- * We still generate an ereport, but the DE doesn't correlate it with any
- * logical piece of data. When such an I/O fails, the delegated I/O requests
- * will issue a retry, which will trigger the 'real' ereport with the correct
- * ENA.
- *
- * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
- * When a new logical I/O is issued, we set this to point to itself. Child I/Os
- * then inherit this pointer, so that when it is first set subsequent failures
- * will use the same ENA. For vdev cache fill and queue aggregation I/O,
- * this pointer is set to NULL, and no ereport will be generated (since it
- * doesn't actually correspond to any particular device or piece of data,
- * and the caller will always retry without caching or queueing anyway).
- *
- * For checksum errors, we want to include more information about the actual
- * error which occurs. Accordingly, we build an ereport when the error is
- * noticed, but instead of sending it in immediately, we hang it off of the
- * io_cksum_report field of the logical IO. When the logical IO completes
- * (successfully or not), zfs_ereport_finish_checksum() is called with the
- * good and bad versions of the buffer (if available), and we annotate the
- * ereport with information about the differences.
- */
-#ifdef _KERNEL
-static void
-zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
- const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
- uint64_t stateoroffset, uint64_t size)
-{
- nvlist_t *ereport, *detector;
-
- uint64_t ena;
- char class[64];
-
- /*
- * If we are doing a spa_tryimport() or in recovery mode,
- * ignore errors.
- */
- if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
- spa_load_state(spa) == SPA_LOAD_RECOVER)
- return;
-
- /*
- * If we are in the middle of opening a pool, and the previous attempt
- * failed, don't bother logging any new ereports - we're just going to
- * get the same diagnosis anyway.
- */
- if (spa_load_state(spa) != SPA_LOAD_NONE &&
- spa->spa_last_open_failed)
- return;
-
- if (zio != NULL) {
- /*
- * If this is not a read or write zio, ignore the error. This
- * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
- */
- if (zio->io_type != ZIO_TYPE_READ &&
- zio->io_type != ZIO_TYPE_WRITE)
- return;
-
- /*
- * Ignore any errors from speculative I/Os, as failure is an
- * expected result.
- */
- if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
- return;
-
- /*
- * If this I/O is not a retry I/O, don't post an ereport.
- * Otherwise, we risk making bad diagnoses based on B_FAILFAST
- * I/Os.
- */
- if (zio->io_error == EIO &&
- !(zio->io_flags & ZIO_FLAG_IO_RETRY))
- return;
-
- if (vd != NULL) {
- /*
- * If the vdev has already been marked as failing due
- * to a failed probe, then ignore any subsequent I/O
- * errors, as the DE will automatically fault the vdev
- * on the first such failure. This also catches cases
- * where vdev_remove_wanted is set and the device has
- * not yet been asynchronously placed into the REMOVED
- * state.
- */
- if (zio->io_vd == vd && !vdev_accessible(vd, zio))
- return;
-
- /*
- * Ignore checksum errors for reads from DTL regions of
- * leaf vdevs.
- */
- if (zio->io_type == ZIO_TYPE_READ &&
- zio->io_error == ECKSUM &&
- vd->vdev_ops->vdev_op_leaf &&
- vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
- return;
- }
- }
-
- /*
- * For probe failure, we want to avoid posting ereports if we've
- * already removed the device in the meantime.
- */
- if (vd != NULL &&
- strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
- (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
- return;
-
- if ((ereport = fm_nvlist_create(NULL)) == NULL)
- return;
-
- if ((detector = fm_nvlist_create(NULL)) == NULL) {
- fm_nvlist_destroy(ereport, FM_NVA_FREE);
- return;
- }
-
- /*
- * Serialize ereport generation
- */
- mutex_enter(&spa->spa_errlist_lock);
-
- /*
- * Determine the ENA to use for this event. If we are in a loading
- * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
- * a root zio-wide ENA. Otherwise, simply use a unique ENA.
- */
- if (spa_load_state(spa) != SPA_LOAD_NONE) {
- if (spa->spa_ena == 0)
- spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
- ena = spa->spa_ena;
- } else if (zio != NULL && zio->io_logical != NULL) {
- if (zio->io_logical->io_ena == 0)
- zio->io_logical->io_ena =
- fm_ena_generate(0, FM_ENA_FMT1);
- ena = zio->io_logical->io_ena;
- } else {
- ena = fm_ena_generate(0, FM_ENA_FMT1);
- }
-
- /*
- * Construct the full class, detector, and other standard FMA fields.
- */
- (void) snprintf(class, sizeof (class), "%s.%s",
- ZFS_ERROR_CLASS, subclass);
-
- fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
- vd != NULL ? vd->vdev_guid : 0);
-
- fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
-
- /*
- * Construct the per-ereport payload, depending on which parameters are
- * passed in.
- */
-
- /*
- * Generic payload members common to all ereports.
- */
- fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
- DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- DATA_TYPE_UINT64, spa_guid(spa),
- FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
- spa_load_state(spa), NULL);
-
- if (spa != NULL) {
- fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
- DATA_TYPE_STRING,
- spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
- FM_EREPORT_FAILMODE_WAIT :
- spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
- FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
- NULL);
- }
-
- if (vd != NULL) {
- vdev_t *pvd = vd->vdev_parent;
-
- fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- DATA_TYPE_UINT64, vd->vdev_guid,
- FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
- DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
- if (vd->vdev_path != NULL)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
- DATA_TYPE_STRING, vd->vdev_path, NULL);
- if (vd->vdev_devid != NULL)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
- DATA_TYPE_STRING, vd->vdev_devid, NULL);
- if (vd->vdev_fru != NULL)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
- DATA_TYPE_STRING, vd->vdev_fru, NULL);
-
- if (pvd != NULL) {
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
- DATA_TYPE_UINT64, pvd->vdev_guid,
- FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
- DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
- NULL);
- if (pvd->vdev_path)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
- DATA_TYPE_STRING, pvd->vdev_path, NULL);
- if (pvd->vdev_devid)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
- DATA_TYPE_STRING, pvd->vdev_devid, NULL);
- }
- }
-
- if (zio != NULL) {
- /*
- * Payload common to all I/Os.
- */
- fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
- DATA_TYPE_INT32, zio->io_error, NULL);
-
- /*
- * If the 'size' parameter is non-zero, it indicates this is a
- * RAID-Z or other I/O where the physical offset and length are
- * provided for us, instead of within the zio_t.
- */
- if (vd != NULL) {
- if (size)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- DATA_TYPE_UINT64, stateoroffset,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
- DATA_TYPE_UINT64, size, NULL);
- else
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- DATA_TYPE_UINT64, zio->io_offset,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
- DATA_TYPE_UINT64, zio->io_size, NULL);
- }
-
- /*
- * Payload for I/Os with corresponding logical information.
- */
- if (zio->io_logical != NULL)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
- DATA_TYPE_UINT64,
- zio->io_logical->io_bookmark.zb_objset,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
- DATA_TYPE_UINT64,
- zio->io_logical->io_bookmark.zb_object,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
- DATA_TYPE_INT64,
- zio->io_logical->io_bookmark.zb_level,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
- DATA_TYPE_UINT64,
- zio->io_logical->io_bookmark.zb_blkid, NULL);
- } else if (vd != NULL) {
- /*
- * If we have a vdev but no zio, this is a device fault, and the
- * 'stateoroffset' parameter indicates the previous state of the
- * vdev.
- */
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
- DATA_TYPE_UINT64, stateoroffset, NULL);
- }
-
- mutex_exit(&spa->spa_errlist_lock);
-
- *ereport_out = ereport;
- *detector_out = detector;
-}
-
-/* if it's <= 128 bytes, save the corruption directly */
-#define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
-
-#define MAX_RANGES 16
-
-typedef struct zfs_ecksum_info {
- /* histograms of set and cleared bits by bit number in a 64-bit word */
- uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
- uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
-
- /* inline arrays of bits set and cleared. */
- uint64_t zei_bits_set[ZFM_MAX_INLINE];
- uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
-
- /*
- * for each range, the number of bits set and cleared. The Hamming
- * distance between the good and bad buffers is the sum of them all.
- */
- uint32_t zei_range_sets[MAX_RANGES];
- uint32_t zei_range_clears[MAX_RANGES];
-
- struct zei_ranges {
- uint32_t zr_start;
- uint32_t zr_end;
- } zei_ranges[MAX_RANGES];
-
- size_t zei_range_count;
- uint32_t zei_mingap;
- uint32_t zei_allowed_mingap;
-
-} zfs_ecksum_info_t;
-
-static void
-update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
-{
- size_t i;
- size_t bits = 0;
- uint64_t value = BE_64(value_arg);
-
- /* We store the bits in big-endian (largest-first) order */
- for (i = 0; i < 64; i++) {
- if (value & (1ull << i)) {
- hist[63 - i]++;
- ++bits;
- }
- }
- /* update the count of bits changed */
- *count += bits;
-}
-
-/*
- * We've now filled up the range array, and need to increase "mingap" and
- * shrink the range list accordingly. zei_mingap is always the smallest
- * distance between array entries, so we set the new_allowed_gap to be
- * one greater than that. We then go through the list, joining together
- * any ranges which are closer than the new_allowed_gap.
- *
- * By construction, there will be at least one. We also update zei_mingap
- * to the new smallest gap, to prepare for our next invocation.
- */
-static void
-shrink_ranges(zfs_ecksum_info_t *eip)
-{
- uint32_t mingap = UINT32_MAX;
- uint32_t new_allowed_gap = eip->zei_mingap + 1;
-
- size_t idx, output;
- size_t max = eip->zei_range_count;
-
- struct zei_ranges *r = eip->zei_ranges;
-
- ASSERT3U(eip->zei_range_count, >, 0);
- ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
-
- output = idx = 0;
- while (idx < max - 1) {
- uint32_t start = r[idx].zr_start;
- uint32_t end = r[idx].zr_end;
-
- while (idx < max - 1) {
- idx++;
-
- uint32_t nstart = r[idx].zr_start;
- uint32_t nend = r[idx].zr_end;
-
- uint32_t gap = nstart - end;
- if (gap < new_allowed_gap) {
- end = nend;
- continue;
- }
- if (gap < mingap)
- mingap = gap;
- break;
- }
- r[output].zr_start = start;
- r[output].zr_end = end;
- output++;
- }
- ASSERT3U(output, <, eip->zei_range_count);
- eip->zei_range_count = output;
- eip->zei_mingap = mingap;
- eip->zei_allowed_mingap = new_allowed_gap;
-}
-
-static void
-add_range(zfs_ecksum_info_t *eip, int start, int end)
-{
- struct zei_ranges *r = eip->zei_ranges;
- size_t count = eip->zei_range_count;
-
- if (count >= MAX_RANGES) {
- shrink_ranges(eip);
- count = eip->zei_range_count;
- }
- if (count == 0) {
- eip->zei_mingap = UINT32_MAX;
- eip->zei_allowed_mingap = 1;
- } else {
- int gap = start - r[count - 1].zr_end;
-
- if (gap < eip->zei_allowed_mingap) {
- r[count - 1].zr_end = end;
- return;
- }
- if (gap < eip->zei_mingap)
- eip->zei_mingap = gap;
- }
- r[count].zr_start = start;
- r[count].zr_end = end;
- eip->zei_range_count++;
-}
-
-static size_t
-range_total_size(zfs_ecksum_info_t *eip)
-{
- struct zei_ranges *r = eip->zei_ranges;
- size_t count = eip->zei_range_count;
- size_t result = 0;
- size_t idx;
-
- for (idx = 0; idx < count; idx++)
- result += (r[idx].zr_end - r[idx].zr_start);
-
- return (result);
-}
-
-static zfs_ecksum_info_t *
-annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
- const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
- boolean_t drop_if_identical)
-{
- const uint64_t *good = (const uint64_t *)goodbuf;
- const uint64_t *bad = (const uint64_t *)badbuf;
-
- uint64_t allset = 0;
- uint64_t allcleared = 0;
-
- size_t nui64s = size / sizeof (uint64_t);
-
- size_t inline_size;
- int no_inline = 0;
- size_t idx;
- size_t range;
-
- size_t offset = 0;
- ssize_t start = -1;
-
- zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
-
- /* don't do any annotation for injected checksum errors */
- if (info != NULL && info->zbc_injected)
- return (eip);
-
- if (info != NULL && info->zbc_has_cksum) {
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
- DATA_TYPE_UINT64_ARRAY,
- sizeof (info->zbc_expected) / sizeof (uint64_t),
- (uint64_t *)&info->zbc_expected,
- FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
- DATA_TYPE_UINT64_ARRAY,
- sizeof (info->zbc_actual) / sizeof (uint64_t),
- (uint64_t *)&info->zbc_actual,
- FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
- DATA_TYPE_STRING,
- info->zbc_checksum_name,
- NULL);
-
- if (info->zbc_byteswapped) {
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
- DATA_TYPE_BOOLEAN, 1,
- NULL);
- }
- }
-
- if (badbuf == NULL || goodbuf == NULL)
- return (eip);
-
- ASSERT3U(nui64s, <=, UINT32_MAX);
- ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(size, <=, UINT32_MAX);
-
- /* build up the range list by comparing the two buffers. */
- for (idx = 0; idx < nui64s; idx++) {
- if (good[idx] == bad[idx]) {
- if (start == -1)
- continue;
-
- add_range(eip, start, idx);
- start = -1;
- } else {
- if (start != -1)
- continue;
-
- start = idx;
- }
- }
- if (start != -1)
- add_range(eip, start, idx);
-
- /* See if it will fit in our inline buffers */
- inline_size = range_total_size(eip);
- if (inline_size > ZFM_MAX_INLINE)
- no_inline = 1;
-
- /*
- * If there is no change and we want to drop if the buffers are
- * identical, do so.
- */
- if (inline_size == 0 && drop_if_identical) {
- kmem_free(eip, sizeof (*eip));
- return (NULL);
- }
-
- /*
- * Now walk through the ranges, filling in the details of the
- * differences. Also convert our uint64_t-array offsets to byte
- * offsets.
- */
- for (range = 0; range < eip->zei_range_count; range++) {
- size_t start = eip->zei_ranges[range].zr_start;
- size_t end = eip->zei_ranges[range].zr_end;
-
- for (idx = start; idx < end; idx++) {
- uint64_t set, cleared;
-
- // bits set in bad, but not in good
- set = ((~good[idx]) & bad[idx]);
- // bits set in good, but not in bad
- cleared = (good[idx] & (~bad[idx]));
-
- allset |= set;
- allcleared |= cleared;
-
- if (!no_inline) {
- ASSERT3U(offset, <, inline_size);
- eip->zei_bits_set[offset] = set;
- eip->zei_bits_cleared[offset] = cleared;
- offset++;
- }
-
- update_histogram(set, eip->zei_histogram_set,
- &eip->zei_range_sets[range]);
- update_histogram(cleared, eip->zei_histogram_cleared,
- &eip->zei_range_clears[range]);
- }
-
- /* convert to byte offsets */
- eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
- eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
- }
- eip->zei_allowed_mingap *= sizeof (uint64_t);
- inline_size *= sizeof (uint64_t);
-
- /* fill in ereport */
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
- DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
- (uint32_t *)eip->zei_ranges,
- FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
- DATA_TYPE_UINT32, eip->zei_allowed_mingap,
- FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
- DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
- FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
- DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
- NULL);
-
- if (!no_inline) {
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
- DATA_TYPE_UINT8_ARRAY,
- inline_size, (uint8_t *)eip->zei_bits_set,
- FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
- DATA_TYPE_UINT8_ARRAY,
- inline_size, (uint8_t *)eip->zei_bits_cleared,
- NULL);
- } else {
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
- DATA_TYPE_UINT32_ARRAY,
- NBBY * sizeof (uint64_t), eip->zei_histogram_set,
- FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
- DATA_TYPE_UINT32_ARRAY,
- NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
- NULL);
- }
- return (eip);
-}
-#endif
-
-void
-zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
- uint64_t stateoroffset, uint64_t size)
-{
-#ifdef _KERNEL
- nvlist_t *ereport = NULL;
- nvlist_t *detector = NULL;
-
- zfs_ereport_start(&ereport, &detector,
- subclass, spa, vd, zio, stateoroffset, size);
-
- if (ereport == NULL)
- return;
-
- fm_ereport_post(ereport, EVCH_SLEEP);
-
- fm_nvlist_destroy(ereport, FM_NVA_FREE);
- fm_nvlist_destroy(detector, FM_NVA_FREE);
-#endif
-}
-
-void
-zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t offset, uint64_t length, void *arg,
- zio_bad_cksum_t *info)
-{
- zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
-
- if (zio->io_vsd != NULL)
- zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
- else
- zio_vsd_default_cksum_report(zio, report, arg);
-
- /* copy the checksum failure information if it was provided */
- if (info != NULL) {
- report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
- bcopy(info, report->zcr_ckinfo, sizeof (*info));
- }
-
- report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
- report->zcr_length = length;
-
-#ifdef _KERNEL
- zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
- FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
-
- if (report->zcr_ereport == NULL) {
- report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
- if (report->zcr_ckinfo != NULL) {
- kmem_free(report->zcr_ckinfo,
- sizeof (*report->zcr_ckinfo));
- }
- kmem_free(report, sizeof (*report));
- return;
- }
-#endif
-
- mutex_enter(&spa->spa_errlist_lock);
- report->zcr_next = zio->io_logical->io_cksum_report;
- zio->io_logical->io_cksum_report = report;
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-void
-zfs_ereport_finish_checksum(zio_cksum_report_t *report,
- const void *good_data, const void *bad_data, boolean_t drop_if_identical)
-{
-#ifdef _KERNEL
- zfs_ecksum_info_t *info = NULL;
- info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
- good_data, bad_data, report->zcr_length, drop_if_identical);
-
- if (info != NULL)
- fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
-
- fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
- fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
- report->zcr_ereport = report->zcr_detector = NULL;
-
- if (info != NULL)
- kmem_free(info, sizeof (*info));
-#endif
-}
-
-void
-zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
-{
-#ifdef _KERNEL
- if (rpt->zcr_ereport != NULL) {
- fm_nvlist_destroy(rpt->zcr_ereport,
- FM_NVA_FREE);
- fm_nvlist_destroy(rpt->zcr_detector,
- FM_NVA_FREE);
- }
-#endif
- rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
-
- if (rpt->zcr_ckinfo != NULL)
- kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
-
- kmem_free(rpt, sizeof (*rpt));
-}
-
-void
-zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
-{
-#ifdef _KERNEL
- fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
-#endif
-}
-
-void
-zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t offset, uint64_t length,
- const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
-{
-#ifdef _KERNEL
- nvlist_t *ereport = NULL;
- nvlist_t *detector = NULL;
- zfs_ecksum_info_t *info;
-
- zfs_ereport_start(&ereport, &detector,
- FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
-
- if (ereport == NULL)
- return;
-
- info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
- B_FALSE);
-
- if (info != NULL)
- fm_ereport_post(ereport, EVCH_SLEEP);
-
- fm_nvlist_destroy(ereport, FM_NVA_FREE);
- fm_nvlist_destroy(detector, FM_NVA_FREE);
-
- if (info != NULL)
- kmem_free(info, sizeof (*info));
-#endif
-}
-
-static void
-zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
-{
-#ifdef _KERNEL
- nvlist_t *resource;
- char class[64];
-
- if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
- return;
-
- if ((resource = fm_nvlist_create(NULL)) == NULL)
- return;
-
- (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
- ZFS_ERROR_CLASS, name);
- VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
- VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
- VERIFY(nvlist_add_uint64(resource,
- FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
- if (vd)
- VERIFY(nvlist_add_uint64(resource,
- FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
-
- fm_ereport_post(resource, EVCH_SLEEP);
-
- fm_nvlist_destroy(resource, FM_NVA_FREE);
-#endif
-}
-
-/*
- * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
- * has been removed from the system. This will cause the DE to ignore any
- * recent I/O errors, inferring that they are due to the asynchronous device
- * removal.
- */
-void
-zfs_post_remove(spa_t *spa, vdev_t *vd)
-{
- zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
-}
-
-/*
- * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
- * has the 'autoreplace' property set, and therefore any broken vdevs will be
- * handled by higher level logic, and no vdev fault should be generated.
- */
-void
-zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
-{
- zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
-}
-
-/*
- * The 'resource.fs.zfs.statechange' event is an internal signal that the
- * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
- * cause the retire agent to repair any outstanding fault management cases
- * open because the device was not found (fault.fs.zfs.device).
- */
-void
-zfs_post_state_change(spa_t *spa, vdev_t *vd)
-{
- zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
@@ -1,762 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dmu.h>
-#include <sys/avl.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/nvpair.h>
-#ifdef _KERNEL
-#include <sys/kidmap.h>
-#include <sys/sid.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_znode.h>
-#endif
-#include <sys/zfs_fuid.h>
-
-/*
- * FUID Domain table(s).
- *
- * The FUID table is stored as a packed nvlist of an array
- * of nvlists which contain an index, domain string and offset
- *
- * During file system initialization the nvlist(s) are read and
- * two AVL trees are created. One tree is keyed by the index number
- * and the other by the domain string. Nodes are never removed from
- * trees, but new entries may be added. If a new entry is added then
- * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then
- * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
- *
- */
-
-#define FUID_IDX "fuid_idx"
-#define FUID_DOMAIN "fuid_domain"
-#define FUID_OFFSET "fuid_offset"
-#define FUID_NVP_ARRAY "fuid_nvlist"
-
-typedef struct fuid_domain {
- avl_node_t f_domnode;
- avl_node_t f_idxnode;
- ksiddomain_t *f_ksid;
- uint64_t f_idx;
-} fuid_domain_t;
-
-static char *nulldomain = "";
-
-/*
- * Compare two indexes.
- */
-static int
-idx_compare(const void *arg1, const void *arg2)
-{
- const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
- const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
-
- return (AVL_CMP(node1->f_idx, node2->f_idx));
-}
-
-/*
- * Compare two domain strings.
- */
-static int
-domain_compare(const void *arg1, const void *arg2)
-{
- const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
- const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
- int val;
-
- val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
-
- return (AVL_ISIGN(val));
-}
-
-void
-zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
-{
- avl_create(idx_tree, idx_compare,
- sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
- avl_create(domain_tree, domain_compare,
- sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
-}
-
-/*
- * load initial fuid domain and idx trees. This function is used by
- * both the kernel and zdb.
- */
-uint64_t
-zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
- avl_tree_t *domain_tree)
-{
- dmu_buf_t *db;
- uint64_t fuid_size;
-
- ASSERT(fuid_obj != 0);
- VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
- FTAG, &db));
- fuid_size = *(uint64_t *)db->db_data;
- dmu_buf_rele(db, FTAG);
-
- if (fuid_size) {
- nvlist_t **fuidnvp;
- nvlist_t *nvp = NULL;
- uint_t count;
- char *packed;
- int i;
-
- packed = kmem_alloc(fuid_size, KM_SLEEP);
- VERIFY(dmu_read(os, fuid_obj, 0,
- fuid_size, packed, DMU_READ_PREFETCH) == 0);
- VERIFY(nvlist_unpack(packed, fuid_size,
- &nvp, 0) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
- &fuidnvp, &count) == 0);
-
- for (i = 0; i != count; i++) {
- fuid_domain_t *domnode;
- char *domain;
- uint64_t idx;
-
- VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
- &domain) == 0);
- VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX,
- &idx) == 0);
-
- domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
-
- domnode->f_idx = idx;
- domnode->f_ksid = ksid_lookupdomain(domain);
- avl_add(idx_tree, domnode);
- avl_add(domain_tree, domnode);
- }
- nvlist_free(nvp);
- kmem_free(packed, fuid_size);
- }
- return (fuid_size);
-}
-
-void
-zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
-{
- fuid_domain_t *domnode;
- void *cookie;
-
- cookie = NULL;
- while (domnode = avl_destroy_nodes(domain_tree, &cookie))
- ksiddomain_rele(domnode->f_ksid);
-
- avl_destroy(domain_tree);
- cookie = NULL;
- while (domnode = avl_destroy_nodes(idx_tree, &cookie))
- kmem_free(domnode, sizeof (fuid_domain_t));
- avl_destroy(idx_tree);
-}
-
-char *
-zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
-{
- fuid_domain_t searchnode, *findnode;
- avl_index_t loc;
-
- searchnode.f_idx = idx;
-
- findnode = avl_find(idx_tree, &searchnode, &loc);
-
- return (findnode ? findnode->f_ksid->kd_name : nulldomain);
-}
-
-#ifdef _KERNEL
-/*
- * Load the fuid table(s) into memory.
- */
-static void
-zfs_fuid_init(zfsvfs_t *zfsvfs)
-{
- rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
-
- if (zfsvfs->z_fuid_loaded) {
- rw_exit(&zfsvfs->z_fuid_lock);
- return;
- }
-
- zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
-
- (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
- if (zfsvfs->z_fuid_obj != 0) {
- zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
- zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
- &zfsvfs->z_fuid_domain);
- }
-
- zfsvfs->z_fuid_loaded = B_TRUE;
- rw_exit(&zfsvfs->z_fuid_lock);
-}
-
-/*
- * sync out AVL trees to persistent storage.
- */
-void
-zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
-{
- nvlist_t *nvp;
- nvlist_t **fuids;
- size_t nvsize = 0;
- char *packed;
- dmu_buf_t *db;
- fuid_domain_t *domnode;
- int numnodes;
- int i;
-
- if (!zfsvfs->z_fuid_dirty) {
- return;
- }
-
- rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
-
- /*
- * First see if table needs to be created?
- */
- if (zfsvfs->z_fuid_obj == 0) {
- zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
- DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
- sizeof (uint64_t), tx);
- VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_FUID_TABLES, sizeof (uint64_t), 1,
- &zfsvfs->z_fuid_obj, tx) == 0);
- }
-
- VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
- fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
- for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
- domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
- VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
- domnode->f_idx) == 0);
- VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
- VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
- domnode->f_ksid->kd_name) == 0);
- }
- VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
- fuids, numnodes) == 0);
- for (i = 0; i != numnodes; i++)
- nvlist_free(fuids[i]);
- kmem_free(fuids, numnodes * sizeof (void *));
- VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
- packed = kmem_alloc(nvsize, KM_SLEEP);
- VERIFY(nvlist_pack(nvp, &packed, &nvsize,
- NV_ENCODE_XDR, KM_SLEEP) == 0);
- nvlist_free(nvp);
- zfsvfs->z_fuid_size = nvsize;
- dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
- zfsvfs->z_fuid_size, packed, tx);
- kmem_free(packed, zfsvfs->z_fuid_size);
- VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
- FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- *(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
- dmu_buf_rele(db, FTAG);
-
- zfsvfs->z_fuid_dirty = B_FALSE;
- rw_exit(&zfsvfs->z_fuid_lock);
-}
-
-/*
- * Query domain table for a given domain.
- *
- * If domain isn't found and addok is set, it is added to AVL trees and
- * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be
- * necessary for the caller or another thread to detect the dirty table
- * and sync out the changes.
- */
-int
-zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
- char **retdomain, boolean_t addok)
-{
- fuid_domain_t searchnode, *findnode;
- avl_index_t loc;
- krw_t rw = RW_READER;
-
- /*
- * If the dummy "nobody" domain then return an index of 0
- * to cause the created FUID to be a standard POSIX id
- * for the user nobody.
- */
- if (domain[0] == '\0') {
- if (retdomain)
- *retdomain = nulldomain;
- return (0);
- }
-
- searchnode.f_ksid = ksid_lookupdomain(domain);
- if (retdomain)
- *retdomain = searchnode.f_ksid->kd_name;
- if (!zfsvfs->z_fuid_loaded)
- zfs_fuid_init(zfsvfs);
-
-retry:
- rw_enter(&zfsvfs->z_fuid_lock, rw);
- findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
-
- if (findnode) {
- rw_exit(&zfsvfs->z_fuid_lock);
- ksiddomain_rele(searchnode.f_ksid);
- return (findnode->f_idx);
- } else if (addok) {
- fuid_domain_t *domnode;
- uint64_t retidx;
-
- if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
- rw_exit(&zfsvfs->z_fuid_lock);
- rw = RW_WRITER;
- goto retry;
- }
-
- domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
- domnode->f_ksid = searchnode.f_ksid;
-
- retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
-
- avl_add(&zfsvfs->z_fuid_domain, domnode);
- avl_add(&zfsvfs->z_fuid_idx, domnode);
- zfsvfs->z_fuid_dirty = B_TRUE;
- rw_exit(&zfsvfs->z_fuid_lock);
- return (retidx);
- } else {
- rw_exit(&zfsvfs->z_fuid_lock);
- return (-1);
- }
-}
-
-/*
- * Query domain table by index, returning domain string
- *
- * Returns a pointer from an avl node of the domain string.
- *
- */
-const char *
-zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
-{
- char *domain;
-
- if (idx == 0 || !zfsvfs->z_use_fuids)
- return (NULL);
-
- if (!zfsvfs->z_fuid_loaded)
- zfs_fuid_init(zfsvfs);
-
- rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
-
- if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
- domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
- else
- domain = nulldomain;
- rw_exit(&zfsvfs->z_fuid_lock);
-
- ASSERT(domain);
- return (domain);
-}
-
-void
-zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
-{
- *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
- *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
-}
-
-uid_t
-zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
- cred_t *cr, zfs_fuid_type_t type)
-{
- uint32_t index = FUID_INDEX(fuid);
- const char *domain;
- uid_t id;
-
- if (index == 0)
- return (fuid);
-
- domain = zfs_fuid_find_by_idx(zfsvfs, index);
- ASSERT(domain != NULL);
-
-#ifdef illumos
- if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
- (void) kidmap_getuidbysid(crgetzone(cr), domain,
- FUID_RID(fuid), &id);
- } else {
- (void) kidmap_getgidbysid(crgetzone(cr), domain,
- FUID_RID(fuid), &id);
- }
-#else
- id = UID_NOBODY;
-#endif
- return (id);
-}
-
-/*
- * Add a FUID node to the list of fuid's being created for this
- * ACL
- *
- * If ACL has multiple domains, then keep only one copy of each unique
- * domain.
- */
-void
-zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
- uint64_t idx, uint64_t id, zfs_fuid_type_t type)
-{
- zfs_fuid_t *fuid;
- zfs_fuid_domain_t *fuid_domain;
- zfs_fuid_info_t *fuidp;
- uint64_t fuididx;
- boolean_t found = B_FALSE;
-
- if (*fuidpp == NULL)
- *fuidpp = zfs_fuid_info_alloc();
-
- fuidp = *fuidpp;
- /*
- * First find fuid domain index in linked list
- *
- * If one isn't found then create an entry.
- */
-
- for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains);
- fuid_domain; fuid_domain = list_next(&fuidp->z_domains,
- fuid_domain), fuididx++) {
- if (idx == fuid_domain->z_domidx) {
- found = B_TRUE;
- break;
- }
- }
-
- if (!found) {
- fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
- fuid_domain->z_domain = domain;
- fuid_domain->z_domidx = idx;
- list_insert_tail(&fuidp->z_domains, fuid_domain);
- fuidp->z_domain_str_sz += strlen(domain) + 1;
- fuidp->z_domain_cnt++;
- }
-
- if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
-
- /*
- * Now allocate fuid entry and add it on the end of the list
- */
-
- fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
- fuid->z_id = id;
- fuid->z_domidx = idx;
- fuid->z_logfuid = FUID_ENCODE(fuididx, rid);
-
- list_insert_tail(&fuidp->z_fuids, fuid);
- fuidp->z_fuid_cnt++;
- } else {
- if (type == ZFS_OWNER)
- fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid);
- else
- fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid);
- }
-}
-
-/*
- * Create a file system FUID, based on information in the users cred
- *
- * If cred contains KSID_OWNER then it should be used to determine
- * the uid otherwise cred's uid will be used. By default cred's gid
- * is used unless it's an ephemeral ID in which case KSID_GROUP will
- * be used if it exists.
- */
-uint64_t
-zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
- cred_t *cr, zfs_fuid_info_t **fuidp)
-{
- uint64_t idx;
- ksid_t *ksid;
- uint32_t rid;
- char *kdomain;
- const char *domain;
- uid_t id;
-
- VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
-
- ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
-
- if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
- id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
-
- if (IS_EPHEMERAL(id))
- return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
-
- return ((uint64_t)id);
- }
-
- /*
- * ksid is present and FUID is supported
- */
- id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
-
- if (!IS_EPHEMERAL(id))
- return ((uint64_t)id);
-
- if (type == ZFS_GROUP)
- id = ksid_getid(ksid);
-
- rid = ksid_getrid(ksid);
- domain = ksid_getdomain(ksid);
-
- idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
-
- zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
-
- return (FUID_ENCODE(idx, rid));
-}
-
-/*
- * Create a file system FUID for an ACL ace
- * or a chown/chgrp of the file.
- * This is similar to zfs_fuid_create_cred, except that
- * we can't find the domain + rid information in the
- * cred. Instead we have to query Winchester for the
- * domain and rid.
- *
- * During replay operations the domain+rid information is
- * found in the zfs_fuid_info_t that the replay code has
- * attached to the zfsvfs of the file system.
- */
-uint64_t
-zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
- zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
-{
- const char *domain;
- char *kdomain;
- uint32_t fuid_idx = FUID_INDEX(id);
- uint32_t rid;
- idmap_stat status;
- uint64_t idx = 0;
- zfs_fuid_t *zfuid = NULL;
- zfs_fuid_info_t *fuidp = NULL;
-
- /*
- * If POSIX ID, or entry is already a FUID then
- * just return the id
- *
- * We may also be handed an already FUID'ized id via
- * chmod.
- */
-
- if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
- return (id);
-
- if (zfsvfs->z_replay) {
- fuidp = zfsvfs->z_fuid_replay;
-
- /*
- * If we are passed an ephemeral id, but no
- * fuid_info was logged then return NOBODY.
- * This is most likely a result of idmap service
- * not being available.
- */
- if (fuidp == NULL)
- return (UID_NOBODY);
-
- VERIFY3U(type, >=, ZFS_OWNER);
- VERIFY3U(type, <=, ZFS_ACE_GROUP);
-
- switch (type) {
- case ZFS_ACE_USER:
- case ZFS_ACE_GROUP:
- zfuid = list_head(&fuidp->z_fuids);
- rid = FUID_RID(zfuid->z_logfuid);
- idx = FUID_INDEX(zfuid->z_logfuid);
- break;
- case ZFS_OWNER:
- rid = FUID_RID(fuidp->z_fuid_owner);
- idx = FUID_INDEX(fuidp->z_fuid_owner);
- break;
- case ZFS_GROUP:
- rid = FUID_RID(fuidp->z_fuid_group);
- idx = FUID_INDEX(fuidp->z_fuid_group);
- break;
- };
- domain = fuidp->z_domain_table[idx - 1];
- } else {
- if (type == ZFS_OWNER || type == ZFS_ACE_USER)
- status = kidmap_getsidbyuid(crgetzone(cr), id,
- &domain, &rid);
- else
- status = kidmap_getsidbygid(crgetzone(cr), id,
- &domain, &rid);
-
- if (status != 0) {
- /*
- * When returning nobody we will need to
- * make a dummy fuid table entry for logging
- * purposes.
- */
- rid = UID_NOBODY;
- domain = nulldomain;
- }
- }
-
- idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
-
- if (!zfsvfs->z_replay)
- zfs_fuid_node_add(fuidpp, kdomain,
- rid, idx, id, type);
- else if (zfuid != NULL) {
- list_remove(&fuidp->z_fuids, zfuid);
- kmem_free(zfuid, sizeof (zfs_fuid_t));
- }
- return (FUID_ENCODE(idx, rid));
-}
-
-void
-zfs_fuid_destroy(zfsvfs_t *zfsvfs)
-{
- rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
- if (!zfsvfs->z_fuid_loaded) {
- rw_exit(&zfsvfs->z_fuid_lock);
- return;
- }
- zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
- rw_exit(&zfsvfs->z_fuid_lock);
-}
-
-/*
- * Allocate zfs_fuid_info for tracking FUIDs created during
- * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR()
- */
-zfs_fuid_info_t *
-zfs_fuid_info_alloc(void)
-{
- zfs_fuid_info_t *fuidp;
-
- fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP);
- list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t),
- offsetof(zfs_fuid_domain_t, z_next));
- list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t),
- offsetof(zfs_fuid_t, z_next));
- return (fuidp);
-}
-
-/*
- * Release all memory associated with zfs_fuid_info_t
- */
-void
-zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
-{
- zfs_fuid_t *zfuid;
- zfs_fuid_domain_t *zdomain;
-
- while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
- list_remove(&fuidp->z_fuids, zfuid);
- kmem_free(zfuid, sizeof (zfs_fuid_t));
- }
-
- if (fuidp->z_domain_table != NULL)
- kmem_free(fuidp->z_domain_table,
- (sizeof (char **)) * fuidp->z_domain_cnt);
-
- while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
- list_remove(&fuidp->z_domains, zdomain);
- kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
- }
-
- kmem_free(fuidp, sizeof (zfs_fuid_info_t));
-}
-
-/*
- * Check to see if id is a groupmember. If cred
- * has ksid info then sidlist is checked first
- * and if still not found then POSIX groups are checked
- *
- * Will use a straight FUID compare when possible.
- */
-boolean_t
-zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
-{
-#ifdef illumos
- ksid_t *ksid = crgetsid(cr, KSID_GROUP);
- ksidlist_t *ksidlist = crgetsidlist(cr);
-#endif
- uid_t gid;
-
-#ifdef illumos
- if (ksid && ksidlist) {
- int i;
- ksid_t *ksid_groups;
- uint32_t idx = FUID_INDEX(id);
- uint32_t rid = FUID_RID(id);
-
- ksid_groups = ksidlist->ksl_sids;
-
- for (i = 0; i != ksidlist->ksl_nsid; i++) {
- if (idx == 0) {
- if (id != IDMAP_WK_CREATOR_GROUP_GID &&
- id == ksid_groups[i].ks_id) {
- return (B_TRUE);
- }
- } else {
- const char *domain;
-
- domain = zfs_fuid_find_by_idx(zfsvfs, idx);
- ASSERT(domain != NULL);
-
- if (strcmp(domain,
- IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
- return (B_FALSE);
-
- if ((strcmp(domain,
- ksid_groups[i].ks_domain->kd_name) == 0) &&
- rid == ksid_groups[i].ks_rid)
- return (B_TRUE);
- }
- }
- }
-#endif /* illumos */
-
- /*
- * Not found in ksidlist, check posix groups
- */
- gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
- return (groupmember(gid, cr));
-}
-
-void
-zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
-{
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
-}
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -1,7692 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
- * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
- * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Toomas Soome <tsoome@me.com>
- * Copyright 2017 RackTop Systems.
- * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
- * Copyright (c) 2019 Datto Inc.
- */
-
-/*
- * ZFS ioctls.
- *
- * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
- * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
- *
- * There are two ways that we handle ioctls: the legacy way where almost
- * all of the logic is in the ioctl callback, and the new way where most
- * of the marshalling is handled in the common entry point, zfsdev_ioctl().
- *
- * Non-legacy ioctls should be registered by calling
- * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
- * from userland by lzc_ioctl().
- *
- * The registration arguments are as follows:
- *
- * const char *name
- * The name of the ioctl. This is used for history logging. If the
- * ioctl returns successfully (the callback returns 0), and allow_log
- * is true, then a history log entry will be recorded with the input &
- * output nvlists. The log entry can be printed with "zpool history -i".
- *
- * zfs_ioc_t ioc
- * The ioctl request number, which userland will pass to ioctl(2).
- * We want newer versions of libzfs and libzfs_core to run against
- * existing zfs kernel modules (i.e. a deferred reboot after an update).
- * Therefore the ioctl numbers cannot change from release to release.
- *
- * zfs_secpolicy_func_t *secpolicy
- * This function will be called before the zfs_ioc_func_t, to
- * determine if this operation is permitted. It should return EPERM
- * on failure, and 0 on success. Checks include determining if the
- * dataset is visible in this zone, and if the user has either all
- * zfs privileges in the zone (SYS_MOUNT), or has been granted permission
- * to do this operation on this dataset with "zfs allow".
- *
- * zfs_ioc_namecheck_t namecheck
- * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
- * name, a dataset name, or nothing. If the name is not well-formed,
- * the ioctl will fail and the callback will not be called.
- * Therefore, the callback can assume that the name is well-formed
- * (e.g. is null-terminated, doesn't have more than one '@' character,
- * doesn't have invalid characters).
- *
- * zfs_ioc_poolcheck_t pool_check
- * This specifies requirements on the pool state. If the pool does
- * not meet them (is suspended or is readonly), the ioctl will fail
- * and the callback will not be called. If any checks are specified
- * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
- * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
- * POOL_CHECK_READONLY).
- *
- * zfs_ioc_key_t *nvl_keys
- * The list of expected/allowable innvl input keys. This list is used
- * to validate the nvlist input to the ioctl.
- *
- * boolean_t smush_outnvlist
- * If smush_outnvlist is true, then the output is presumed to be a
- * list of errors, and it will be "smushed" down to fit into the
- * caller's buffer, by removing some entries and replacing them with a
- * single "N_MORE_ERRORS" entry indicating how many were removed. See
- * nvlist_smush() for details. If smush_outnvlist is false, and the
- * outnvlist does not fit into the userland-provided buffer, then the
- * ioctl will fail with ENOMEM.
- *
- * zfs_ioc_func_t *func
- * The callback function that will perform the operation.
- *
- * The callback should return 0 on success, or an error number on
- * failure. If the function fails, the userland ioctl will return -1,
- * and errno will be set to the callback's return value. The callback
- * will be called with the following arguments:
- *
- * const char *name
- * The name of the pool or dataset to operate on, from
- * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
- * expected type (pool, dataset, or none).
- *
- * nvlist_t *innvl
- * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or
- * NULL if no input nvlist was provided. Changes to this nvlist are
- * ignored. If the input nvlist could not be deserialized, the
- * ioctl will fail and the callback will not be called.
- *
- * nvlist_t *outnvl
- * The output nvlist, initially empty. The callback can fill it in,
- * and it will be returned to userland by serializing it into
- * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization
- * fails (e.g. because the caller didn't supply a large enough
- * buffer), then the overall ioctl will fail. See the
- * 'smush_nvlist' argument above for additional behaviors.
- *
- * There are two typical uses of the output nvlist:
- * - To return state, e.g. property values. In this case,
- * smush_outnvlist should be false. If the buffer was not large
- * enough, the caller will reallocate a larger buffer and try
- * the ioctl again.
- *
- * - To return multiple errors from an ioctl which makes on-disk
- * changes. In this case, smush_outnvlist should be true.
- * Ioctls which make on-disk modifications should generally not
- * use the outnvl if they succeed, because the caller can not
- * distinguish between the operation failing, and
- * deserialization failing.
- *
- *
- * IOCTL Interface Errors
- *
- * The following ioctl input errors can be returned:
- * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel
- * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel
- * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing
- * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type
- */
-
-#ifdef __FreeBSD__
-#include "opt_kstack_pages.h"
-#endif
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/conf.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/buf.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_znode.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/dmu.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_deleg.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/sunddi.h>
-#include <sys/policy.h>
-#include <sys/zone.h>
-#include <sys/nvpair.h>
-#include <sys/mount.h>
-#include <sys/taskqueue.h>
-#include <sys/sdt.h>
-#include <sys/varargs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_onexit.h>
-#include <sys/zvol.h>
-#include <sys/dsl_scan.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_send.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_bookmark.h>
-#include <sys/dsl_userhold.h>
-#include <sys/zfeature.h>
-#include <sys/zcp.h>
-#include <sys/zio_checksum.h>
-#include <sys/vdev_removal.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_initialize.h>
-
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-#include "zfs_deleg.h"
-#include "zfs_comutil.h"
-#include "zfs_ioctl_compat.h"
-
-#include "lua.h"
-#include "lauxlib.h"
-
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) nitems(x)
-#endif
-
-static struct cdev *zfsdev;
-
-extern void zfs_init(void);
-extern void zfs_fini(void);
-
-uint_t zfs_fsyncer_key;
-extern uint_t rrw_tsd_key;
-static uint_t zfs_allow_log_key;
-extern uint_t zfs_geom_probe_vdev_key;
-
-typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
-typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
-typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
-
-/*
- * IOC Keys are used to document and validate user->kernel interface inputs.
- * See zfs_keys_recv_new for an example declaration. Any key name that is not
- * listed will be rejected as input.
- *
- * The keyname 'optional' is always allowed, and must be an nvlist if present.
- * Arguments which older kernels can safely ignore can be placed under the
- * "optional" key.
- *
- * When adding new keys to an existing ioc for new functionality, consider:
- * - adding an entry into zfs_sysfs.c zfs_features[] list
- * - updating the libzfs_input_check.c test utility
- *
- * Note: in the ZK_WILDCARDLIST case, the name serves as documentation
- * for the expected name (bookmark, snapshot, property, etc) but there
- * is no validation in the preflight zfs_check_input_nvpairs() check.
- */
-typedef enum {
- ZK_OPTIONAL = 1 << 0, /* pair is optional */
- ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */
-} ioc_key_flag_t;
-
-/* DATA_TYPE_ANY is used when zkey_type can vary. */
-#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN
-
-typedef struct zfs_ioc_key {
- const char *zkey_name;
- data_type_t zkey_type;
- ioc_key_flag_t zkey_flags;
-} zfs_ioc_key_t;
-
-typedef enum {
- NO_NAME,
- POOL_NAME,
- DATASET_NAME,
- ENTITY_NAME
-} zfs_ioc_namecheck_t;
-
-typedef enum {
- POOL_CHECK_NONE = 1 << 0,
- POOL_CHECK_SUSPENDED = 1 << 1,
- POOL_CHECK_READONLY = 1 << 2,
-} zfs_ioc_poolcheck_t;
-
-typedef struct zfs_ioc_vec {
- zfs_ioc_legacy_func_t *zvec_legacy_func;
- zfs_ioc_func_t *zvec_func;
- zfs_secpolicy_func_t *zvec_secpolicy;
- zfs_ioc_namecheck_t zvec_namecheck;
- boolean_t zvec_allow_log;
- zfs_ioc_poolcheck_t zvec_pool_check;
- boolean_t zvec_smush_outnvlist;
- const char *zvec_name;
- const zfs_ioc_key_t *zvec_nvl_keys;
- size_t zvec_nvl_key_count;
-} zfs_ioc_vec_t;
-
-/* This array is indexed by zfs_userquota_prop_t */
-static const char *userquota_perms[] = {
- ZFS_DELEG_PERM_USERUSED,
- ZFS_DELEG_PERM_USERQUOTA,
- ZFS_DELEG_PERM_GROUPUSED,
- ZFS_DELEG_PERM_GROUPQUOTA,
-};
-
-static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
-static int zfs_check_settable(const char *name, nvpair_t *property,
- cred_t *cr);
-static int zfs_check_clearable(char *dataset, nvlist_t *props,
- nvlist_t **errors);
-static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
- boolean_t *);
-int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
-static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
-
-static void zfsdev_close(void *data);
-
-static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
-
-/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
- const char *newfile;
- char buf[512];
- va_list adx;
-
- /*
- * Get rid of annoying "../common/" prefix to filename.
- */
- newfile = strrchr(file, '/');
- if (newfile != NULL) {
- newfile = newfile + 1; /* Get rid of leading / */
- } else {
- newfile = file;
- }
-
- va_start(adx, fmt);
- (void) vsnprintf(buf, sizeof (buf), fmt, adx);
- va_end(adx);
-
- /*
- * To get this data, use the zfs-dprintf probe as so:
- * dtrace -q -n 'zfs-dprintf \
- * /stringof(arg0) == "dbuf.c"/ \
- * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
- * arg0 = file name
- * arg1 = function name
- * arg2 = line number
- * arg3 = message
- */
- DTRACE_PROBE4(zfs__dprintf,
- char *, newfile, char *, func, int, line, char *, buf);
-}
-
-static void
-history_str_free(char *buf)
-{
- kmem_free(buf, HIS_MAX_RECORD_LEN);
-}
-
-static char *
-history_str_get(zfs_cmd_t *zc)
-{
- char *buf;
-
- if (zc->zc_history == 0)
- return (NULL);
-
- buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
- if (copyinstr((void *)(uintptr_t)zc->zc_history,
- buf, HIS_MAX_RECORD_LEN, NULL) != 0) {
- history_str_free(buf);
- return (NULL);
- }
-
- buf[HIS_MAX_RECORD_LEN -1] = '\0';
-
- return (buf);
-}
-
-/*
- * Check to see if the named dataset is currently defined as bootable
- */
-static boolean_t
-zfs_is_bootfs(const char *name)
-{
- objset_t *os;
-
- if (dmu_objset_hold(name, FTAG, &os) == 0) {
- boolean_t ret;
- ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
- dmu_objset_rele(os, FTAG);
- return (ret);
- }
- return (B_FALSE);
-}
-
-/*
- * Return non-zero if the spa version is less than requested version.
- */
-static int
-zfs_earlier_version(const char *name, int version)
-{
- spa_t *spa;
-
- if (spa_open(name, &spa, FTAG) == 0) {
- if (spa_version(spa) < version) {
- spa_close(spa, FTAG);
- return (1);
- }
- spa_close(spa, FTAG);
- }
- return (0);
-}
-
-/*
- * Return TRUE if the ZPL version is less than requested version.
- */
-static boolean_t
-zpl_earlier_version(const char *name, int version)
-{
- objset_t *os;
- boolean_t rc = B_TRUE;
-
- if (dmu_objset_hold(name, FTAG, &os) == 0) {
- uint64_t zplversion;
-
- if (dmu_objset_type(os) != DMU_OST_ZFS) {
- dmu_objset_rele(os, FTAG);
- return (B_TRUE);
- }
- /* XXX reading from non-owned objset */
- if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
- rc = zplversion < version;
- dmu_objset_rele(os, FTAG);
- }
- return (rc);
-}
-
-static void
-zfs_log_history(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *buf;
-
- if ((buf = history_str_get(zc)) == NULL)
- return;
-
- if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
- if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
- (void) spa_history_log(spa, buf);
- spa_close(spa, FTAG);
- }
- history_str_free(buf);
-}
-
-/*
- * Policy for top-level read operations (list pools). Requires no privileges,
- * and can be used in the local zone, as there is no associated dataset.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (0);
-}
-
-/*
- * Policy for dataset read operations (list children, get statistics). Requires
- * no privileges, but must be visible in the local zone.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- if (INGLOBALZONE(curthread) ||
- zone_dataset_visible(zc->zc_name, NULL))
- return (0);
-
- return (SET_ERROR(ENOENT));
-}
-
-static int
-zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
-{
- int writable = 1;
-
- /*
- * The dataset must be visible by this zone -- check this first
- * so they don't see EPERM on something they shouldn't know about.
- */
- if (!INGLOBALZONE(curthread) &&
- !zone_dataset_visible(dataset, &writable))
- return (SET_ERROR(ENOENT));
-
- if (INGLOBALZONE(curthread)) {
- /*
- * If the fs is zoned, only root can access it from the
- * global zone.
- */
- if (secpolicy_zfs(cr) && zoned)
- return (SET_ERROR(EPERM));
- } else {
- /*
- * If we are in a local zone, the 'zoned' property must be set.
- */
- if (!zoned)
- return (SET_ERROR(EPERM));
-
- /* must be writable by this zone */
- if (!writable)
- return (SET_ERROR(EPERM));
- }
- return (0);
-}
-
-static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
-{
- uint64_t zoned;
-
- if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
- return (SET_ERROR(ENOENT));
-
- return (zfs_dozonecheck_impl(dataset, zoned, cr));
-}
-
-static int
-zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
-{
- uint64_t zoned;
-
- if (dsl_prop_get_int_ds(ds, "jailed", &zoned))
- return (SET_ERROR(ENOENT));
-
- return (zfs_dozonecheck_impl(dataset, zoned, cr));
-}
-
-static int
-zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
- const char *perm, cred_t *cr)
-{
- int error;
-
- error = zfs_dozonecheck_ds(name, ds, cr);
- if (error == 0) {
- error = secpolicy_zfs(cr);
- if (error != 0)
- error = dsl_deleg_access_impl(ds, perm, cr);
- }
- return (error);
-}
-
-static int
-zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
-{
- int error;
- dsl_dataset_t *ds;
- dsl_pool_t *dp;
-
- /*
- * First do a quick check for root in the global zone, which
- * is allowed to do all write_perms. This ensures that zfs_ioc_*
- * will get to handle nonexistent datasets.
- */
- if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
- return (0);
-
- error = dsl_pool_hold(name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold(dp, name, FTAG, &ds);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
-
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
-}
-
-#ifdef SECLABEL
-/*
- * Policy for setting the security label property.
- *
- * Returns 0 for success, non-zero for access and other errors.
- */
-static int
-zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
-{
- char ds_hexsl[MAXNAMELEN];
- bslabel_t ds_sl, new_sl;
- boolean_t new_default = FALSE;
- uint64_t zoned;
- int needed_priv = -1;
- int error;
-
- /* First get the existing dataset label. */
- error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
- 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
- if (error != 0)
- return (SET_ERROR(EPERM));
-
- if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
- new_default = TRUE;
-
- /* The label must be translatable */
- if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
- return (SET_ERROR(EINVAL));
-
- /*
- * In a non-global zone, disallow attempts to set a label that
- * doesn't match that of the zone; otherwise no other checks
- * are needed.
- */
- if (!INGLOBALZONE(curproc)) {
- if (new_default || !blequal(&new_sl, CR_SL(CRED())))
- return (SET_ERROR(EPERM));
- return (0);
- }
-
- /*
- * For global-zone datasets (i.e., those whose zoned property is
- * "off", verify that the specified new label is valid for the
- * global zone.
- */
- if (dsl_prop_get_integer(name,
- zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
- return (SET_ERROR(EPERM));
- if (!zoned) {
- if (zfs_check_global_label(name, strval) != 0)
- return (SET_ERROR(EPERM));
- }
-
- /*
- * If the existing dataset label is nondefault, check if the
- * dataset is mounted (label cannot be changed while mounted).
- * Get the zfsvfs; if there isn't one, then the dataset isn't
- * mounted (or isn't a dataset, doesn't exist, ...).
- */
- if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
- objset_t *os;
- static char *setsl_tag = "setsl_tag";
-
- /*
- * Try to own the dataset; abort if there is any error,
- * (e.g., already mounted, in use, or other error).
- */
- error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
- setsl_tag, &os);
- if (error != 0)
- return (SET_ERROR(EPERM));
-
- dmu_objset_disown(os, setsl_tag);
-
- if (new_default) {
- needed_priv = PRIV_FILE_DOWNGRADE_SL;
- goto out_check;
- }
-
- if (hexstr_to_label(strval, &new_sl) != 0)
- return (SET_ERROR(EPERM));
-
- if (blstrictdom(&ds_sl, &new_sl))
- needed_priv = PRIV_FILE_DOWNGRADE_SL;
- else if (blstrictdom(&new_sl, &ds_sl))
- needed_priv = PRIV_FILE_UPGRADE_SL;
- } else {
- /* dataset currently has a default label */
- if (!new_default)
- needed_priv = PRIV_FILE_UPGRADE_SL;
- }
-
-out_check:
- if (needed_priv != -1)
- return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
- return (0);
-}
-#endif /* SECLABEL */
-
-static int
-zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
- cred_t *cr)
-{
- char *strval;
-
- /*
- * Check permissions for special properties.
- */
- switch (prop) {
- case ZFS_PROP_ZONED:
- /*
- * Disallow setting of 'zoned' from within a local zone.
- */
- if (!INGLOBALZONE(curthread))
- return (SET_ERROR(EPERM));
- break;
-
- case ZFS_PROP_QUOTA:
- case ZFS_PROP_FILESYSTEM_LIMIT:
- case ZFS_PROP_SNAPSHOT_LIMIT:
- if (!INGLOBALZONE(curthread)) {
- uint64_t zoned;
- char setpoint[ZFS_MAX_DATASET_NAME_LEN];
- /*
- * Unprivileged users are allowed to modify the
- * limit on things *under* (ie. contained by)
- * the thing they own.
- */
- if (dsl_prop_get_integer(dsname, "jailed", &zoned,
- setpoint))
- return (SET_ERROR(EPERM));
- if (!zoned || strlen(dsname) <= strlen(setpoint))
- return (SET_ERROR(EPERM));
- }
- break;
-
- case ZFS_PROP_MLSLABEL:
-#ifdef SECLABEL
- if (!is_system_labeled())
- return (SET_ERROR(EPERM));
-
- if (nvpair_value_string(propval, &strval) == 0) {
- int err;
-
- err = zfs_set_slabel_policy(dsname, strval, CRED());
- if (err != 0)
- return (err);
- }
-#else
- return (EOPNOTSUPP);
-#endif
- break;
- }
-
- return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- int error;
-
- error = zfs_dozonecheck(zc->zc_name, cr);
- if (error != 0)
- return (error);
-
- /*
- * permission to set permissions will be evaluated later in
- * dsl_deleg_can_allow()
- */
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_ROLLBACK, cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- char *cp;
- int error;
-
- /*
- * Generate the current snapshot name from the given objsetid, then
- * use that name for the secpolicy/zone checks.
- */
- cp = strchr(zc->zc_name, '@');
- if (cp == NULL)
- return (SET_ERROR(EINVAL));
- error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- dsl_dataset_name(ds, zc->zc_name);
-
- error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
- ZFS_DELEG_PERM_SEND, cr);
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_SEND, cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- vnode_t *vp;
- int error;
-
- if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
- NO_FOLLOW, NULL, &vp)) != 0)
- return (error);
-
- /* Now make sure mntpnt and dataset are ZFS */
-
- if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
- (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
- zc->zc_name) != 0)) {
- VN_RELE(vp);
- return (SET_ERROR(EPERM));
- }
-
- VN_RELE(vp);
- return (dsl_deleg_access(zc->zc_name,
- ZFS_DELEG_PERM_SHARE, cr));
-}
-
-int
-zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- if (!INGLOBALZONE(curthread))
- return (SET_ERROR(EPERM));
-
- if (secpolicy_nfs(cr) == 0) {
- return (0);
- } else {
- return (zfs_secpolicy_deleg_share(zc, innvl, cr));
- }
-}
-
-int
-zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- if (!INGLOBALZONE(curthread))
- return (SET_ERROR(EPERM));
-
- if (secpolicy_smb(cr) == 0) {
- return (0);
- } else {
- return (zfs_secpolicy_deleg_share(zc, innvl, cr));
- }
-}
-
-static int
-zfs_get_parent(const char *datasetname, char *parent, int parentsize)
-{
- char *cp;
-
- /*
- * Remove the @bla or /bla from the end of the name to get the parent.
- */
- (void) strncpy(parent, datasetname, parentsize);
- cp = strrchr(parent, '@');
- if (cp != NULL) {
- cp[0] = '\0';
- } else {
- cp = strrchr(parent, '/');
- if (cp == NULL)
- return (SET_ERROR(ENOENT));
- cp[0] = '\0';
- }
-
- return (0);
-}
-
-int
-zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
-{
- int error;
-
- if ((error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_MOUNT, cr)) != 0)
- return (error);
-
- return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
-}
-
-/*
- * Destroying snapshots with delegated permissions requires
- * descendant mount and destroy permissions.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- nvlist_t *snaps;
- nvpair_t *pair, *nextpair;
- int error = 0;
-
- snaps = fnvlist_lookup_nvlist(innvl, "snaps");
-
- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
- pair = nextpair) {
- nextpair = nvlist_next_nvpair(snaps, pair);
- error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
- if (error == ENOENT) {
- /*
- * Ignore any snapshots that don't exist (we consider
- * them "already destroyed"). Remove the name from the
- * nvl here in case the snapshot is created between
- * now and when we try to destroy it (in which case
- * we don't want to destroy it since we haven't
- * checked for permission).
- */
- fnvlist_remove_nvpair(snaps, pair);
- error = 0;
- }
- if (error != 0)
- break;
- }
-
- return (error);
-}
-
-int
-zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
-{
- char parentname[ZFS_MAX_DATASET_NAME_LEN];
- int error;
-
- if ((error = zfs_secpolicy_write_perms(from,
- ZFS_DELEG_PERM_RENAME, cr)) != 0)
- return (error);
-
- if ((error = zfs_secpolicy_write_perms(from,
- ZFS_DELEG_PERM_MOUNT, cr)) != 0)
- return (error);
-
- if ((error = zfs_get_parent(to, parentname,
- sizeof (parentname))) != 0)
- return (error);
-
- if ((error = zfs_secpolicy_write_perms(parentname,
- ZFS_DELEG_PERM_CREATE, cr)) != 0)
- return (error);
-
- if ((error = zfs_secpolicy_write_perms(parentname,
- ZFS_DELEG_PERM_MOUNT, cr)) != 0)
- return (error);
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- char *at = NULL;
- char *pound;
- int error;
-
- if ((pound = strchr(zc->zc_name, '#')) != NULL) {
- *pound = '\0';
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_RENAME, cr);
- if (error == 0) {
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_BOOKMARK, cr);
- }
- *pound = '#';
- return (error);
- }
-
- if ((zc->zc_cookie & 1) != 0) {
- /*
- * This is recursive rename, so the starting snapshot might
- * not exist. Check file system or volume permission instead.
- */
- at = strchr(zc->zc_name, '@');
- if (at == NULL)
- return (EINVAL);
- *at = '\0';
- }
-
- error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);
-
- if (at != NULL)
- *at = '@';
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *clone;
- int error;
-
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_PROMOTE, cr);
- if (error != 0)
- return (error);
-
- error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
-
- if (error == 0) {
- char parentname[ZFS_MAX_DATASET_NAME_LEN];
- dsl_dataset_t *origin = NULL;
- dsl_dir_t *dd;
- dd = clone->ds_dir;
-
- error = dsl_dataset_hold_obj(dd->dd_pool,
- dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
- if (error != 0) {
- dsl_dataset_rele(clone, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
- ZFS_DELEG_PERM_MOUNT, cr);
-
- dsl_dataset_name(origin, parentname);
- if (error == 0) {
- error = zfs_secpolicy_write_perms_ds(parentname, origin,
- ZFS_DELEG_PERM_PROMOTE, cr);
- }
- dsl_dataset_rele(clone, FTAG);
- dsl_dataset_rele(origin, FTAG);
- }
- dsl_pool_rele(dp, FTAG);
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- int error;
-
- if ((error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
- return (error);
-
- if ((error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_MOUNT, cr)) != 0)
- return (error);
-
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_CREATE, cr));
-}
-
-int
-zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
-{
- return (zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_SNAPSHOT, cr));
-}
-
-/*
- * Check for permission to create each snapshot in the nvlist.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- nvlist_t *snaps;
- int error;
- nvpair_t *pair;
-
- snaps = fnvlist_lookup_nvlist(innvl, "snaps");
-
- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
- pair = nvlist_next_nvpair(snaps, pair)) {
- char *name = nvpair_name(pair);
- char *atp = strchr(name, '@');
-
- if (atp == NULL) {
- error = SET_ERROR(EINVAL);
- break;
- }
- *atp = '\0';
- error = zfs_secpolicy_snapshot_perms(name, cr);
- *atp = '@';
- if (error != 0)
- break;
- }
- return (error);
-}
-
-/*
- * Check for permission to create each bookmark in the nvlist.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- int error = 0;
-
- for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
- pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
- char *name = nvpair_name(pair);
- char *hashp = strchr(name, '#');
-
- if (hashp == NULL) {
- error = SET_ERROR(EINVAL);
- break;
- }
- *hashp = '\0';
- error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_BOOKMARK, cr);
- *hashp = '#';
- if (error != 0)
- break;
- }
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_REMAP, cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- nvpair_t *pair, *nextpair;
- int error = 0;
-
- for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
- pair = nextpair) {
- char *name = nvpair_name(pair);
- char *hashp = strchr(name, '#');
- nextpair = nvlist_next_nvpair(innvl, pair);
-
- if (hashp == NULL) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
- *hashp = '\0';
- error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_DESTROY, cr);
- *hashp = '#';
- if (error == ENOENT) {
- /*
- * Ignore any filesystems that don't exist (we consider
- * their bookmarks "already destroyed"). Remove
- * the name from the nvl here in case the filesystem
- * is created between now and when we try to destroy
- * the bookmark (in which case we don't want to
- * destroy it since we haven't checked for permission).
- */
- fnvlist_remove_nvpair(innvl, pair);
- error = 0;
- }
- if (error != 0)
- break;
- }
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- /*
- * Even root must have a proper TSD so that we know what pool
- * to log to.
- */
- if (tsd_get(zfs_allow_log_key) == NULL)
- return (SET_ERROR(EPERM));
- return (0);
-}
-
-static int
-zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- char parentname[ZFS_MAX_DATASET_NAME_LEN];
- int error;
- char *origin;
-
- if ((error = zfs_get_parent(zc->zc_name, parentname,
- sizeof (parentname))) != 0)
- return (error);
-
- if (nvlist_lookup_string(innvl, "origin", &origin) == 0 &&
- (error = zfs_secpolicy_write_perms(origin,
- ZFS_DELEG_PERM_CLONE, cr)) != 0)
- return (error);
-
- if ((error = zfs_secpolicy_write_perms(parentname,
- ZFS_DELEG_PERM_CREATE, cr)) != 0)
- return (error);
-
- return (zfs_secpolicy_write_perms(parentname,
- ZFS_DELEG_PERM_MOUNT, cr));
-}
-
-/*
- * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
- * SYS_CONFIG privilege, which is not available in a local zone.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- if (secpolicy_sys_config(cr, B_FALSE) != 0)
- return (SET_ERROR(EPERM));
-
- return (0);
-}
-
-/*
- * Policy for object to name lookups.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- int error;
-
- if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
- return (0);
-
- error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
- return (error);
-}
-
-/*
- * Policy for fault injection. Requires all privileges.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (secpolicy_zinject(cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
-
- if (prop == ZPROP_INVAL) {
- if (!zfs_prop_user(zc->zc_value))
- return (SET_ERROR(EINVAL));
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_USERPROP, cr));
- } else {
- return (zfs_secpolicy_setprop(zc->zc_name, prop,
- NULL, cr));
- }
-}
-
-static int
-zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- int err = zfs_secpolicy_read(zc, innvl, cr);
- if (err)
- return (err);
-
- if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
- return (SET_ERROR(EINVAL));
-
- if (zc->zc_value[0] == 0) {
- /*
- * They are asking about a posix uid/gid. If it's
- * themself, allow it.
- */
- if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
- zc->zc_objset_type == ZFS_PROP_USERQUOTA) {
- if (zc->zc_guid == crgetuid(cr))
- return (0);
- } else {
- if (groupmember(zc->zc_guid, cr))
- return (0);
- }
- }
-
- return (zfs_secpolicy_write_perms(zc->zc_name,
- userquota_perms[zc->zc_objset_type], cr));
-}
-
-static int
-zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- int err = zfs_secpolicy_read(zc, innvl, cr);
- if (err)
- return (err);
-
- if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
- return (SET_ERROR(EINVAL));
-
- return (zfs_secpolicy_write_perms(zc->zc_name,
- userquota_perms[zc->zc_objset_type], cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
- NULL, cr));
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- nvpair_t *pair;
- nvlist_t *holds;
- int error;
-
- holds = fnvlist_lookup_nvlist(innvl, "holds");
-
- for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
- pair = nvlist_next_nvpair(holds, pair)) {
- char fsname[ZFS_MAX_DATASET_NAME_LEN];
- error = dmu_fsname(nvpair_name(pair), fsname);
- if (error != 0)
- return (error);
- error = zfs_secpolicy_write_perms(fsname,
- ZFS_DELEG_PERM_HOLD, cr);
- if (error != 0)
- return (error);
- }
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- nvpair_t *pair;
- int error;
-
- for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
- pair = nvlist_next_nvpair(innvl, pair)) {
- char fsname[ZFS_MAX_DATASET_NAME_LEN];
- error = dmu_fsname(nvpair_name(pair), fsname);
- if (error != 0)
- return (error);
- error = zfs_secpolicy_write_perms(fsname,
- ZFS_DELEG_PERM_RELEASE, cr);
- if (error != 0)
- return (error);
- }
- return (0);
-}
-
-/*
- * Policy for allowing temporary snapshots to be taken or released
- */
-static int
-zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- /*
- * A temporary snapshot is the same as a snapshot,
- * hold, destroy and release all rolled into one.
- * Delegated diff alone is sufficient that we allow this.
- */
- int error;
-
- if ((error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_DIFF, cr)) == 0)
- return (0);
-
- error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
- if (innvl != NULL) {
- if (error == 0)
- error = zfs_secpolicy_hold(zc, innvl, cr);
- if (error == 0)
- error = zfs_secpolicy_release(zc, innvl, cr);
- if (error == 0)
- error = zfs_secpolicy_destroy(zc, innvl, cr);
- }
- return (error);
-}
-
-/*
- * Returns the nvlist as specified by the user in the zfs_cmd_t.
- */
-static int
-get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
-{
- char *packed;
- int error;
- nvlist_t *list = NULL;
-
- /*
- * Read in and unpack the user-supplied nvlist.
- */
- if (size == 0)
- return (SET_ERROR(EINVAL));
-
- packed = kmem_alloc(size, KM_SLEEP);
-
- if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
- iflag)) != 0) {
- kmem_free(packed, size);
- return (SET_ERROR(EFAULT));
- }
-
- if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
- kmem_free(packed, size);
- return (error);
- }
-
- kmem_free(packed, size);
-
- *nvp = list;
- return (0);
-}
-
-/*
- * Reduce the size of this nvlist until it can be serialized in 'max' bytes.
- * Entries will be removed from the end of the nvlist, and one int32 entry
- * named "N_MORE_ERRORS" will be added indicating how many entries were
- * removed.
- */
-static int
-nvlist_smush(nvlist_t *errors, size_t max)
-{
- size_t size;
-
- size = fnvlist_size(errors);
-
- if (size > max) {
- nvpair_t *more_errors;
- int n = 0;
-
- if (max < 1024)
- return (SET_ERROR(ENOMEM));
-
- fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
- more_errors = nvlist_prev_nvpair(errors, NULL);
-
- do {
- nvpair_t *pair = nvlist_prev_nvpair(errors,
- more_errors);
- fnvlist_remove_nvpair(errors, pair);
- n++;
- size = fnvlist_size(errors);
- } while (size > max);
-
- fnvlist_remove_nvpair(errors, more_errors);
- fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
- ASSERT3U(fnvlist_size(errors), <=, max);
- }
-
- return (0);
-}
-
-static int
-put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
-{
- char *packed = NULL;
- int error = 0;
- size_t size;
-
- size = fnvlist_size(nvl);
-
- if (size > zc->zc_nvlist_dst_size) {
- /*
- * Solaris returns ENOMEM here, because even if an error is
- * returned from an ioctl(2), new zc_nvlist_dst_size will be
- * passed to the userland. This is not the case for FreeBSD.
- * We need to return 0, so the kernel will copy the
- * zc_nvlist_dst_size back and the userland can discover that a
- * bigger buffer is needed.
- */
- error = 0;
- } else {
- packed = fnvlist_pack(nvl, &size);
- if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
- size, zc->zc_iflags) != 0)
- error = SET_ERROR(EFAULT);
- fnvlist_pack_free(packed, size);
- }
-
- zc->zc_nvlist_dst_size = size;
- zc->zc_nvlist_dst_filled = B_TRUE;
- return (error);
-}
-
-int
-getzfsvfs_impl(objset_t *os, vfs_t **vfsp)
-{
- zfsvfs_t *zfvp;
- int error = 0;
-
- if (dmu_objset_type(os) != DMU_OST_ZFS) {
- return (SET_ERROR(EINVAL));
- }
-
- mutex_enter(&os->os_user_ptr_lock);
- zfvp = dmu_objset_get_user(os);
- if (zfvp) {
- *vfsp = zfvp->z_vfs;
- vfs_ref(zfvp->z_vfs);
- } else {
- error = SET_ERROR(ESRCH);
- }
- mutex_exit(&os->os_user_ptr_lock);
- return (error);
-}
-
-int
-getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
-{
- objset_t *os;
- vfs_t *vfsp;
- int error;
-
- error = dmu_objset_hold(dsname, FTAG, &os);
- if (error != 0)
- return (error);
- error = getzfsvfs_impl(os, &vfsp);
- dmu_objset_rele(os, FTAG);
- if (error != 0)
- return (error);
-
- error = vfs_busy(vfsp, 0);
- vfs_rel(vfsp);
- if (error != 0) {
- *zfvp = NULL;
- error = SET_ERROR(ESRCH);
- } else {
- *zfvp = vfsp->vfs_data;
- }
- return (error);
-}
-
-/*
- * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
- * case its z_vfs will be NULL, and it will be opened as the owner.
- * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
- * which prevents all vnode ops from running.
- */
-static int
-zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
-{
- int error = 0;
-
- if (getzfsvfs(name, zfvp) != 0)
- error = zfsvfs_create(name, zfvp);
- if (error == 0) {
- rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
- RW_READER, tag);
-#ifdef illumos
- if ((*zfvp)->z_unmounted) {
- /*
- * XXX we could probably try again, since the unmounting
- * thread should be just about to disassociate the
- * objset from the zfsvfs.
- */
- rrm_exit(&(*zfvp)->z_teardown_lock, tag);
- return (SET_ERROR(EBUSY));
- }
-#else
- /*
- * vfs_busy() ensures that the filesystem is not and
- * can not be unmounted.
- */
- ASSERT(!(*zfvp)->z_unmounted);
-#endif
- }
- return (error);
-}
-
-static void
-zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
-{
- rrm_exit(&zfsvfs->z_teardown_lock, tag);
-
- if (zfsvfs->z_vfs) {
-#ifdef illumos
- VFS_RELE(zfsvfs->z_vfs);
-#else
- vfs_unbusy(zfsvfs->z_vfs);
-#endif
- } else {
- dmu_objset_disown(zfsvfs->z_os, zfsvfs);
- zfsvfs_free(zfsvfs);
- }
-}
-
-static int
-zfs_ioc_pool_create(zfs_cmd_t *zc)
-{
- int error;
- nvlist_t *config, *props = NULL;
- nvlist_t *rootprops = NULL;
- nvlist_t *zplprops = NULL;
- char *spa_name = zc->zc_name;
-
- if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- zc->zc_iflags, &config))
- return (error);
-
- if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &props))) {
- nvlist_free(config);
- return (error);
- }
-
- if (props) {
- nvlist_t *nvl = NULL;
- uint64_t version = SPA_VERSION;
- char *tname;
-
- (void) nvlist_lookup_uint64(props,
- zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
- if (!SPA_VERSION_IS_SUPPORTED(version)) {
- error = SET_ERROR(EINVAL);
- goto pool_props_bad;
- }
- (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
- if (nvl) {
- error = nvlist_dup(nvl, &rootprops, KM_SLEEP);
- if (error != 0) {
- nvlist_free(config);
- nvlist_free(props);
- return (error);
- }
- (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
- }
- VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- error = zfs_fill_zplprops_root(version, rootprops,
- zplprops, NULL);
- if (error != 0)
- goto pool_props_bad;
-
- if (nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0)
- spa_name = tname;
- }
-
- error = spa_create(zc->zc_name, config, props, zplprops);
-
- /*
- * Set the remaining root properties
- */
- if (!error && (error = zfs_set_prop_nvlist(spa_name,
- ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
- (void) spa_destroy(spa_name);
-
-pool_props_bad:
- nvlist_free(rootprops);
- nvlist_free(zplprops);
- nvlist_free(config);
- nvlist_free(props);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_destroy(zfs_cmd_t *zc)
-{
- int error;
- zfs_log_history(zc);
- error = spa_destroy(zc->zc_name);
-#ifndef __FreeBSD__
- if (error == 0)
- zvol_remove_minors(zc->zc_name);
-#endif
- return (error);
-}
-
-static int
-zfs_ioc_pool_import(zfs_cmd_t *zc)
-{
- nvlist_t *config, *props = NULL;
- uint64_t guid;
- int error;
-
- if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- zc->zc_iflags, &config)) != 0)
- return (error);
-
- if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &props))) {
- nvlist_free(config);
- return (error);
- }
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
- guid != zc->zc_guid)
- error = SET_ERROR(EINVAL);
- else
- error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
-
- if (zc->zc_nvlist_dst != 0) {
- int err;
-
- if ((err = put_nvlist(zc, config)) != 0)
- error = err;
- }
-
- nvlist_free(config);
-
- nvlist_free(props);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_export(zfs_cmd_t *zc)
-{
- int error;
- boolean_t force = (boolean_t)zc->zc_cookie;
- boolean_t hardforce = (boolean_t)zc->zc_guid;
-
- zfs_log_history(zc);
- error = spa_export(zc->zc_name, NULL, force, hardforce);
-#ifndef __FreeBSD__
- if (error == 0)
- zvol_remove_minors(zc->zc_name);
-#endif
- return (error);
-}
-
-static int
-zfs_ioc_pool_configs(zfs_cmd_t *zc)
-{
- nvlist_t *configs;
- int error;
-
- if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
- return (SET_ERROR(EEXIST));
-
- error = put_nvlist(zc, configs);
-
- nvlist_free(configs);
-
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of the pool
- *
- * outputs:
- * zc_cookie real errno
- * zc_nvlist_dst config nvlist
- * zc_nvlist_dst_size size of config nvlist
- */
-static int
-zfs_ioc_pool_stats(zfs_cmd_t *zc)
-{
- nvlist_t *config;
- int error;
- int ret = 0;
-
- error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
- sizeof (zc->zc_value));
-
- if (config != NULL) {
- ret = put_nvlist(zc, config);
- nvlist_free(config);
-
- /*
- * The config may be present even if 'error' is non-zero.
- * In this case we return success, and preserve the real errno
- * in 'zc_cookie'.
- */
- zc->zc_cookie = error;
- } else {
- ret = error;
- }
-
- return (ret);
-}
-
-/*
- * Try to import the given pool, returning pool stats as appropriate so that
- * user land knows which devices are available and overall pool health.
- */
-static int
-zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
-{
- nvlist_t *tryconfig, *config;
- int error;
-
- if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- zc->zc_iflags, &tryconfig)) != 0)
- return (error);
-
- config = spa_tryimport(tryconfig);
-
- nvlist_free(tryconfig);
-
- if (config == NULL)
- return (SET_ERROR(EINVAL));
-
- error = put_nvlist(zc, config);
- nvlist_free(config);
-
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of the pool
- * zc_cookie scan func (pool_scan_func_t)
- * zc_flags scrub pause/resume flag (pool_scrub_cmd_t)
- */
-static int
-zfs_ioc_pool_scan(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (zc->zc_flags >= POOL_SCRUB_FLAGS_END)
- return (SET_ERROR(EINVAL));
-
- if (zc->zc_flags == POOL_SCRUB_PAUSE)
- error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
- else if (zc->zc_cookie == POOL_SCAN_NONE)
- error = spa_scan_stop(spa);
- else
- error = spa_scan(spa, zc->zc_cookie);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_freeze(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error == 0) {
- spa_freeze(spa);
- spa_close(spa, FTAG);
- }
- return (error);
-}
-
-static int
-zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (zc->zc_cookie < spa_version(spa) ||
- !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
- spa_close(spa, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- spa_upgrade(spa, zc->zc_cookie);
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_get_history(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *hist_buf;
- uint64_t size;
- int error;
-
- if ((size = zc->zc_history_len) == 0)
- return (SET_ERROR(EINVAL));
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
- spa_close(spa, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
-
- hist_buf = kmem_alloc(size, KM_SLEEP);
- if ((error = spa_history_get(spa, &zc->zc_history_offset,
- &zc->zc_history_len, hist_buf)) == 0) {
- error = ddi_copyout(hist_buf,
- (void *)(uintptr_t)zc->zc_history,
- zc->zc_history_len, zc->zc_iflags);
- }
-
- spa_close(spa, FTAG);
- kmem_free(hist_buf, size);
- return (error);
-}
-
-static int
-zfs_ioc_pool_reguid(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error == 0) {
- error = spa_change_guid(spa);
- spa_close(spa, FTAG);
- }
- return (error);
-}
-
-static int
-zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
-{
- return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_obj object to find
- *
- * outputs:
- * zc_value name of object
- */
-static int
-zfs_ioc_obj_to_path(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
-
- /* XXX reading from objset not owned */
- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
- return (error);
- if (dmu_objset_type(os) != DMU_OST_ZFS) {
- dmu_objset_rele(os, FTAG);
- return (SET_ERROR(EINVAL));
- }
- error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
- sizeof (zc->zc_value));
- dmu_objset_rele(os, FTAG);
-
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_obj object to find
- *
- * outputs:
- * zc_stat stats on object
- * zc_value path to object
- */
-static int
-zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
-
- /* XXX reading from objset not owned */
- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
- return (error);
- if (dmu_objset_type(os) != DMU_OST_ZFS) {
- dmu_objset_rele(os, FTAG);
- return (SET_ERROR(EINVAL));
- }
- error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
- sizeof (zc->zc_value));
- dmu_objset_rele(os, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_vdev_add(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- nvlist_t *config, **l2cache, **spares;
- uint_t nl2cache = 0, nspares = 0;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
-
- error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- zc->zc_iflags, &config);
- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache);
-
- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
- &spares, &nspares);
-
-#ifdef illumos
- /*
- * A root pool with concatenated devices is not supported.
- * Thus, can not add a device to a root pool.
- *
- * Intent log device can not be added to a rootpool because
- * during mountroot, zil is replayed, a seperated log device
- * can not be accessed during the mountroot time.
- *
- * l2cache and spare devices are ok to be added to a rootpool.
- */
- if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
- nvlist_free(config);
- spa_close(spa, FTAG);
- return (SET_ERROR(EDOM));
- }
-#endif /* illumos */
-
- if (error == 0) {
- error = spa_vdev_add(spa, config);
- nvlist_free(config);
- }
- spa_close(spa, FTAG);
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of the pool
- * zc_guid guid of vdev to remove
- * zc_cookie cancel removal
- */
-static int
-zfs_ioc_vdev_remove(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
- if (zc->zc_cookie != 0) {
- error = spa_vdev_remove_cancel(spa);
- } else {
- error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
- }
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- vdev_state_t newstate = VDEV_STATE_UNKNOWN;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
- switch (zc->zc_cookie) {
- case VDEV_STATE_ONLINE:
- error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
- break;
-
- case VDEV_STATE_OFFLINE:
- error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
- break;
-
- case VDEV_STATE_FAULTED:
- if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
- zc->zc_obj != VDEV_AUX_EXTERNAL)
- zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
-
- error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
- break;
-
- case VDEV_STATE_DEGRADED:
- if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
- zc->zc_obj != VDEV_AUX_EXTERNAL)
- zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
-
- error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
- break;
-
- default:
- error = SET_ERROR(EINVAL);
- }
- zc->zc_cookie = newstate;
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_attach(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int replacing = zc->zc_cookie;
- nvlist_t *config;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- zc->zc_iflags, &config)) == 0) {
- error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
- nvlist_free(config);
- }
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_detach(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_split(zfs_cmd_t *zc)
-{
- spa_t *spa;
- nvlist_t *config, *props = NULL;
- int error;
- boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- zc->zc_iflags, &config)) {
- spa_close(spa, FTAG);
- return (error);
- }
-
- if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &props))) {
- spa_close(spa, FTAG);
- nvlist_free(config);
- return (error);
- }
-
- error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
-
- spa_close(spa, FTAG);
-
- nvlist_free(config);
- nvlist_free(props);
-
- return (error);
-}
-
-static int
-zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *path = zc->zc_value;
- uint64_t guid = zc->zc_guid;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
-
- error = spa_vdev_setpath(spa, guid, path);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *fru = zc->zc_value;
- uint64_t guid = zc->zc_guid;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
-
- error = spa_vdev_setfru(spa, guid, fru);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
-{
- int error = 0;
- nvlist_t *nv;
-
- dmu_objset_fast_stat(os, &zc->zc_objset_stats);
-
- if (zc->zc_nvlist_dst != 0 &&
- (error = dsl_prop_get_all(os, &nv)) == 0) {
- dmu_objset_stats(os, nv);
- /*
- * NB: zvol_get_stats() will read the objset contents,
- * which we aren't supposed to do with a
- * DS_MODE_USER hold, because it could be
- * inconsistent. So this is a bit of a workaround...
- * XXX reading with out owning
- */
- if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL) {
- error = zvol_get_stats(os, nv);
- if (error == EIO)
- return (error);
- VERIFY0(error);
- }
- error = put_nvlist(zc, nv);
- nvlist_free(nv);
- }
-
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_nvlist_dst_size size of buffer for property nvlist
- *
- * outputs:
- * zc_objset_stats stats
- * zc_nvlist_dst property nvlist
- * zc_nvlist_dst_size size of property nvlist
- */
-static int
-zfs_ioc_objset_stats(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
-
- error = dmu_objset_hold(zc->zc_name, FTAG, &os);
- if (error == 0) {
- error = zfs_ioc_objset_stats_impl(zc, os);
- dmu_objset_rele(os, FTAG);
- }
-
- if (error == ENOMEM)
- error = 0;
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_nvlist_dst_size size of buffer for property nvlist
- *
- * outputs:
- * zc_nvlist_dst received property nvlist
- * zc_nvlist_dst_size size of received property nvlist
- *
- * Gets received properties (distinct from local properties on or after
- * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
- * local property values.
- */
-static int
-zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
-{
- int error = 0;
- nvlist_t *nv;
-
- /*
- * Without this check, we would return local property values if the
- * caller has not already received properties on or after
- * SPA_VERSION_RECVD_PROPS.
- */
- if (!dsl_prop_get_hasrecvd(zc->zc_name))
- return (SET_ERROR(ENOTSUP));
-
- if (zc->zc_nvlist_dst != 0 &&
- (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
- error = put_nvlist(zc, nv);
- nvlist_free(nv);
- }
-
- return (error);
-}
-
-static int
-nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
-{
- uint64_t value;
- int error;
-
- /*
- * zfs_get_zplprop() will either find a value or give us
- * the default value (if there is one).
- */
- if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
- return (error);
- VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
- return (0);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_nvlist_dst_size size of buffer for zpl property nvlist
- *
- * outputs:
- * zc_nvlist_dst zpl property nvlist
- * zc_nvlist_dst_size size of zpl property nvlist
- */
-static int
-zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
-{
- objset_t *os;
- int err;
-
- /* XXX reading without owning */
- if (err = dmu_objset_hold(zc->zc_name, FTAG, &os))
- return (err);
-
- dmu_objset_fast_stat(os, &zc->zc_objset_stats);
-
- /*
- * NB: nvl_add_zplprop() will read the objset contents,
- * which we aren't supposed to do with a DS_MODE_USER
- * hold, because it could be inconsistent.
- */
- if (zc->zc_nvlist_dst != 0 &&
- !zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZFS) {
- nvlist_t *nv;
-
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
- (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
- (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
- (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
- err = put_nvlist(zc, nv);
- nvlist_free(nv);
- } else {
- err = SET_ERROR(ENOENT);
- }
- dmu_objset_rele(os, FTAG);
- return (err);
-}
-
-boolean_t
-dataset_name_hidden(const char *name)
-{
- /*
- * Skip over datasets that are not visible in this zone,
- * internal datasets (which have a $ in their name), and
- * temporary datasets (which have a % in their name).
- */
- if (strchr(name, '$') != NULL)
- return (B_TRUE);
- if (strchr(name, '%') != NULL)
- return (B_TRUE);
- if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL))
- return (B_TRUE);
- return (B_FALSE);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_cookie zap cursor
- * zc_nvlist_src iteration range nvlist
- * zc_nvlist_src_size size of iteration range nvlist
- *
- * outputs:
- * zc_name name of next filesystem
- * zc_cookie zap cursor
- * zc_objset_stats stats
- * zc_nvlist_dst property nvlist
- * zc_nvlist_dst_size size of property nvlist
- */
-static int
-zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
- char *p;
- size_t orig_len = strlen(zc->zc_name);
-
-top:
- if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) {
- if (error == ENOENT)
- error = SET_ERROR(ESRCH);
- return (error);
- }
-
- p = strrchr(zc->zc_name, '/');
- if (p == NULL || p[1] != '\0')
- (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
- p = zc->zc_name + strlen(zc->zc_name);
-
- do {
- error = dmu_dir_list_next(os,
- sizeof (zc->zc_name) - (p - zc->zc_name), p,
- NULL, &zc->zc_cookie);
- if (error == ENOENT)
- error = SET_ERROR(ESRCH);
- } while (error == 0 && dataset_name_hidden(zc->zc_name));
- dmu_objset_rele(os, FTAG);
-
- /*
- * If it's an internal dataset (ie. with a '$' in its name),
- * don't try to get stats for it, otherwise we'll return ENOENT.
- */
- if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
- if (error == ENOENT) {
- /* We lost a race with destroy, get the next one. */
- zc->zc_name[orig_len] = '\0';
- goto top;
- }
- }
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_cookie zap cursor
- * zc_nvlist_dst_size size of buffer for property nvlist
- * zc_simple when set, only name is requested
- *
- * outputs:
- * zc_name name of next snapshot
- * zc_objset_stats stats
- * zc_nvlist_dst property nvlist
- * zc_nvlist_dst_size size of property nvlist
- */
-static int
-zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
-{
- int error;
- objset_t *os, *ossnap;
- dsl_dataset_t *ds;
- uint64_t min_txg = 0, max_txg = 0;
-
- if (zc->zc_nvlist_src_size != 0) {
- nvlist_t *props = NULL;
- error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &props);
- if (error != 0)
- return (error);
- (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG,
- &min_txg);
- (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG,
- &max_txg);
- nvlist_free(props);
- }
-
- error = dmu_objset_hold(zc->zc_name, FTAG, &os);
- if (error != 0) {
- return (error == ENOENT ? ESRCH : error);
- }
-
- /*
- * A dataset name of maximum length cannot have any snapshots,
- * so exit immediately.
- */
- if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
- ZFS_MAX_DATASET_NAME_LEN) {
- dmu_objset_rele(os, FTAG);
- return (SET_ERROR(ESRCH));
- }
-
- while (error == 0) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
- error = SET_ERROR(EINTR);
- break;
- }
-
- error = dmu_snapshot_list_next(os,
- sizeof (zc->zc_name) - strlen(zc->zc_name),
- zc->zc_name + strlen(zc->zc_name), &zc->zc_obj,
- &zc->zc_cookie, NULL);
- if (error == ENOENT) {
- error = SET_ERROR(ESRCH);
- break;
- } else if (error != 0) {
- break;
- }
-
- error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj,
- FTAG, &ds);
- if (error != 0)
- break;
-
- if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) ||
- (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) {
- dsl_dataset_rele(ds, FTAG);
- /* undo snapshot name append */
- *(strchr(zc->zc_name, '@') + 1) = '\0';
- /* skip snapshot */
- continue;
- }
-
- if (zc->zc_simple) {
- dsl_dataset_rele(ds, FTAG);
- break;
- }
-
- if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) {
- dsl_dataset_rele(ds, FTAG);
- break;
- }
- if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) {
- dsl_dataset_rele(ds, FTAG);
- break;
- }
- dsl_dataset_rele(ds, FTAG);
- break;
- }
-
- dmu_objset_rele(os, FTAG);
- /* if we failed, undo the @ that we tacked on to zc_name */
- if (error != 0)
- *strchr(zc->zc_name, '@') = '\0';
- return (error);
-}
-
-static int
-zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
-{
- const char *propname = nvpair_name(pair);
- uint64_t *valary;
- unsigned int vallen;
- const char *domain;
- char *dash;
- zfs_userquota_prop_t type;
- uint64_t rid;
- uint64_t quota;
- zfsvfs_t *zfsvfs;
- int err;
-
- if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
- nvlist_t *attrs;
- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
- if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
- &pair) != 0)
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * A correctly constructed propname is encoded as
- * userquota@<rid>-<domain>.
- */
- if ((dash = strchr(propname, '-')) == NULL ||
- nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
- vallen != 3)
- return (SET_ERROR(EINVAL));
-
- domain = dash + 1;
- type = valary[0];
- rid = valary[1];
- quota = valary[2];
-
- err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
- if (err == 0) {
- err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
- zfsvfs_rele(zfsvfs, FTAG);
- }
-
- return (err);
-}
-
-/*
- * If the named property is one that has a special function to set its value,
- * return 0 on success and a positive error code on failure; otherwise if it is
- * not one of the special properties handled by this function, return -1.
- *
- * XXX: It would be better for callers of the property interface if we handled
- * these special cases in dsl_prop.c (in the dsl layer).
- */
-static int
-zfs_prop_set_special(const char *dsname, zprop_source_t source,
- nvpair_t *pair)
-{
- const char *propname = nvpair_name(pair);
- zfs_prop_t prop = zfs_name_to_prop(propname);
- uint64_t intval;
- int err = -1;
-
- if (prop == ZPROP_INVAL) {
- if (zfs_prop_userquota(propname))
- return (zfs_prop_set_userquota(dsname, pair));
- return (-1);
- }
-
- if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
- nvlist_t *attrs;
- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
- &pair) == 0);
- }
-
- if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
- return (-1);
-
- VERIFY(0 == nvpair_value_uint64(pair, &intval));
-
- switch (prop) {
- case ZFS_PROP_QUOTA:
- err = dsl_dir_set_quota(dsname, source, intval);
- break;
- case ZFS_PROP_REFQUOTA:
- err = dsl_dataset_set_refquota(dsname, source, intval);
- break;
- case ZFS_PROP_FILESYSTEM_LIMIT:
- case ZFS_PROP_SNAPSHOT_LIMIT:
- if (intval == UINT64_MAX) {
- /* clearing the limit, just do it */
- err = 0;
- } else {
- err = dsl_dir_activate_fs_ss_limit(dsname);
- }
- /*
- * Set err to -1 to force the zfs_set_prop_nvlist code down the
- * default path to set the value in the nvlist.
- */
- if (err == 0)
- err = -1;
- break;
- case ZFS_PROP_RESERVATION:
- err = dsl_dir_set_reservation(dsname, source, intval);
- break;
- case ZFS_PROP_REFRESERVATION:
- err = dsl_dataset_set_refreservation(dsname, source, intval);
- break;
- case ZFS_PROP_VOLSIZE:
- err = zvol_set_volsize(dsname, intval);
- break;
- case ZFS_PROP_VERSION:
- {
- zfsvfs_t *zfsvfs;
-
- if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
- break;
-
- err = zfs_set_version(zfsvfs, intval);
- zfsvfs_rele(zfsvfs, FTAG);
-
- if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
- zfs_cmd_t *zc;
-
- zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
- (void) strcpy(zc->zc_name, dsname);
- (void) zfs_ioc_userspace_upgrade(zc);
- kmem_free(zc, sizeof (zfs_cmd_t));
- }
- break;
- }
- default:
- err = -1;
- }
-
- return (err);
-}
-
-/*
- * This function is best effort. If it fails to set any of the given properties,
- * it continues to set as many as it can and returns the last error
- * encountered. If the caller provides a non-NULL errlist, it will be filled in
- * with the list of names of all the properties that failed along with the
- * corresponding error numbers.
- *
- * If every property is set successfully, zero is returned and errlist is not
- * modified.
- */
-int
-zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
- nvlist_t *errlist)
-{
- nvpair_t *pair;
- nvpair_t *propval;
- int rv = 0;
- uint64_t intval;
- char *strval;
- nvlist_t *genericnvl = fnvlist_alloc();
- nvlist_t *retrynvl = fnvlist_alloc();
-
-retry:
- pair = NULL;
- while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
- const char *propname = nvpair_name(pair);
- zfs_prop_t prop = zfs_name_to_prop(propname);
- int err = 0;
-
- /* decode the property value */
- propval = pair;
- if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
- nvlist_t *attrs;
- attrs = fnvpair_value_nvlist(pair);
- if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
- &propval) != 0)
- err = SET_ERROR(EINVAL);
- }
-
- /* Validate value type */
- if (err == 0 && prop == ZPROP_INVAL) {
- if (zfs_prop_user(propname)) {
- if (nvpair_type(propval) != DATA_TYPE_STRING)
- err = SET_ERROR(EINVAL);
- } else if (zfs_prop_userquota(propname)) {
- if (nvpair_type(propval) !=
- DATA_TYPE_UINT64_ARRAY)
- err = SET_ERROR(EINVAL);
- } else {
- err = SET_ERROR(EINVAL);
- }
- } else if (err == 0) {
- if (nvpair_type(propval) == DATA_TYPE_STRING) {
- if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
- err = SET_ERROR(EINVAL);
- } else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
- const char *unused;
-
- intval = fnvpair_value_uint64(propval);
-
- switch (zfs_prop_get_type(prop)) {
- case PROP_TYPE_NUMBER:
- break;
- case PROP_TYPE_STRING:
- err = SET_ERROR(EINVAL);
- break;
- case PROP_TYPE_INDEX:
- if (zfs_prop_index_to_string(prop,
- intval, &unused) != 0)
- err = SET_ERROR(EINVAL);
- break;
- default:
- cmn_err(CE_PANIC,
- "unknown property type");
- }
- } else {
- err = SET_ERROR(EINVAL);
- }
- }
-
- /* Validate permissions */
- if (err == 0)
- err = zfs_check_settable(dsname, pair, CRED());
-
- if (err == 0) {
- err = zfs_prop_set_special(dsname, source, pair);
- if (err == -1) {
- /*
- * For better performance we build up a list of
- * properties to set in a single transaction.
- */
- err = nvlist_add_nvpair(genericnvl, pair);
- } else if (err != 0 && nvl != retrynvl) {
- /*
- * This may be a spurious error caused by
- * receiving quota and reservation out of order.
- * Try again in a second pass.
- */
- err = nvlist_add_nvpair(retrynvl, pair);
- }
- }
-
- if (err != 0) {
- if (errlist != NULL)
- fnvlist_add_int32(errlist, propname, err);
- rv = err;
- }
- }
-
- if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
- nvl = retrynvl;
- goto retry;
- }
-
- if (!nvlist_empty(genericnvl) &&
- dsl_props_set(dsname, source, genericnvl) != 0) {
- /*
- * If this fails, we still want to set as many properties as we
- * can, so try setting them individually.
- */
- pair = NULL;
- while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
- const char *propname = nvpair_name(pair);
- int err = 0;
-
- propval = pair;
- if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
- nvlist_t *attrs;
- attrs = fnvpair_value_nvlist(pair);
- propval = fnvlist_lookup_nvpair(attrs,
- ZPROP_VALUE);
- }
-
- if (nvpair_type(propval) == DATA_TYPE_STRING) {
- strval = fnvpair_value_string(propval);
- err = dsl_prop_set_string(dsname, propname,
- source, strval);
- } else {
- intval = fnvpair_value_uint64(propval);
- err = dsl_prop_set_int(dsname, propname, source,
- intval);
- }
-
- if (err != 0) {
- if (errlist != NULL) {
- fnvlist_add_int32(errlist, propname,
- err);
- }
- rv = err;
- }
- }
- }
- nvlist_free(genericnvl);
- nvlist_free(retrynvl);
-
- return (rv);
-}
-
-/*
- * Check that all the properties are valid user properties.
- */
-static int
-zfs_check_userprops(nvlist_t *nvl)
-{
- nvpair_t *pair = NULL;
-
- while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
- const char *propname = nvpair_name(pair);
-
- if (!zfs_prop_user(propname) ||
- nvpair_type(pair) != DATA_TYPE_STRING)
- return (SET_ERROR(EINVAL));
-
- if (strlen(propname) >= ZAP_MAXNAMELEN)
- return (SET_ERROR(ENAMETOOLONG));
-
- if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
- return (E2BIG);
- }
- return (0);
-}
-
-static void
-props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
-{
- nvpair_t *pair;
-
- VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- pair = NULL;
- while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
- if (nvlist_exists(skipped, nvpair_name(pair)))
- continue;
-
- VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
- }
-}
-
-static int
-clear_received_props(const char *dsname, nvlist_t *props,
- nvlist_t *skipped)
-{
- int err = 0;
- nvlist_t *cleared_props = NULL;
- props_skip(props, skipped, &cleared_props);
- if (!nvlist_empty(cleared_props)) {
- /*
- * Acts on local properties until the dataset has received
- * properties at least once on or after SPA_VERSION_RECVD_PROPS.
- */
- zprop_source_t flags = (ZPROP_SRC_NONE |
- (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
- err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
- }
- nvlist_free(cleared_props);
- return (err);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_value name of property to set
- * zc_nvlist_src{_size} nvlist of properties to apply
- * zc_cookie received properties flag
- *
- * outputs:
- * zc_nvlist_dst{_size} error for each unapplied received property
- */
-static int
-zfs_ioc_set_prop(zfs_cmd_t *zc)
-{
- nvlist_t *nvl;
- boolean_t received = zc->zc_cookie;
- zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
- ZPROP_SRC_LOCAL);
- nvlist_t *errors;
- int error;
-
- if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &nvl)) != 0)
- return (error);
-
- if (received) {
- nvlist_t *origprops;
-
- if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
- (void) clear_received_props(zc->zc_name,
- origprops, nvl);
- nvlist_free(origprops);
- }
-
- error = dsl_prop_set_hasrecvd(zc->zc_name);
- }
-
- errors = fnvlist_alloc();
- if (error == 0)
- error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
-
- if (zc->zc_nvlist_dst != 0 && errors != NULL) {
- (void) put_nvlist(zc, errors);
- }
-
- nvlist_free(errors);
- nvlist_free(nvl);
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_value name of property to inherit
- * zc_cookie revert to received value if TRUE
- *
- * outputs: none
- */
-static int
-zfs_ioc_inherit_prop(zfs_cmd_t *zc)
-{
- const char *propname = zc->zc_value;
- zfs_prop_t prop = zfs_name_to_prop(propname);
- boolean_t received = zc->zc_cookie;
- zprop_source_t source = (received
- ? ZPROP_SRC_NONE /* revert to received value, if any */
- : ZPROP_SRC_INHERITED); /* explicitly inherit */
-
- if (received) {
- nvlist_t *dummy;
- nvpair_t *pair;
- zprop_type_t type;
- int err;
-
- /*
- * zfs_prop_set_special() expects properties in the form of an
- * nvpair with type info.
- */
- if (prop == ZPROP_INVAL) {
- if (!zfs_prop_user(propname))
- return (SET_ERROR(EINVAL));
-
- type = PROP_TYPE_STRING;
- } else if (prop == ZFS_PROP_VOLSIZE ||
- prop == ZFS_PROP_VERSION) {
- return (SET_ERROR(EINVAL));
- } else {
- type = zfs_prop_get_type(prop);
- }
-
- VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- switch (type) {
- case PROP_TYPE_STRING:
- VERIFY(0 == nvlist_add_string(dummy, propname, ""));
- break;
- case PROP_TYPE_NUMBER:
- case PROP_TYPE_INDEX:
- VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
- break;
- default:
- nvlist_free(dummy);
- return (SET_ERROR(EINVAL));
- }
-
- pair = nvlist_next_nvpair(dummy, NULL);
- err = zfs_prop_set_special(zc->zc_name, source, pair);
- nvlist_free(dummy);
- if (err != -1)
- return (err); /* special property already handled */
- } else {
- /*
- * Only check this in the non-received case. We want to allow
- * 'inherit -S' to revert non-inheritable properties like quota
- * and reservation to the received or default values even though
- * they are not considered inheritable.
- */
- if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
- return (SET_ERROR(EINVAL));
- }
-
- /* property name has been validated by zfs_secpolicy_inherit_prop() */
- return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
-}
-
-static int
-zfs_ioc_pool_set_props(zfs_cmd_t *zc)
-{
- nvlist_t *props;
- spa_t *spa;
- int error;
- nvpair_t *pair;
-
- if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &props))
- return (error);
-
- /*
- * If the only property is the configfile, then just do a spa_lookup()
- * to handle the faulted case.
- */
- pair = nvlist_next_nvpair(props, NULL);
- if (pair != NULL && strcmp(nvpair_name(pair),
- zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
- nvlist_next_nvpair(props, pair) == NULL) {
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(zc->zc_name)) != NULL) {
- spa_configfile_set(spa, props, B_FALSE);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
- }
- mutex_exit(&spa_namespace_lock);
- if (spa != NULL) {
- nvlist_free(props);
- return (0);
- }
- }
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
- nvlist_free(props);
- return (error);
- }
-
- error = spa_prop_set(spa, props);
-
- nvlist_free(props);
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_get_props(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- nvlist_t *nvp = NULL;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
- /*
- * If the pool is faulted, there may be properties we can still
- * get (such as altroot and cachefile), so attempt to get them
- * anyway.
- */
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(zc->zc_name)) != NULL)
- error = spa_prop_get(spa, &nvp);
- mutex_exit(&spa_namespace_lock);
- } else {
- error = spa_prop_get(spa, &nvp);
- spa_close(spa, FTAG);
- }
-
- if (error == 0 && zc->zc_nvlist_dst != 0)
- error = put_nvlist(zc, nvp);
- else
- error = SET_ERROR(EFAULT);
-
- nvlist_free(nvp);
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_nvlist_src{_size} nvlist of delegated permissions
- * zc_perm_action allow/unallow flag
- *
- * outputs: none
- */
-static int
-zfs_ioc_set_fsacl(zfs_cmd_t *zc)
-{
- int error;
- nvlist_t *fsaclnv = NULL;
-
- if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &fsaclnv)) != 0)
- return (error);
-
- /*
- * Verify nvlist is constructed correctly
- */
- if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
- nvlist_free(fsaclnv);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * If we don't have PRIV_SYS_MOUNT, then validate
- * that user is allowed to hand out each permission in
- * the nvlist(s)
- */
-
- error = secpolicy_zfs(CRED());
- if (error != 0) {
- if (zc->zc_perm_action == B_FALSE) {
- error = dsl_deleg_can_allow(zc->zc_name,
- fsaclnv, CRED());
- } else {
- error = dsl_deleg_can_unallow(zc->zc_name,
- fsaclnv, CRED());
- }
- }
-
- if (error == 0)
- error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);
-
- nvlist_free(fsaclnv);
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- *
- * outputs:
- * zc_nvlist_src{_size} nvlist of delegated permissions
- */
-static int
-zfs_ioc_get_fsacl(zfs_cmd_t *zc)
-{
- nvlist_t *nvp;
- int error;
-
- if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) {
- error = put_nvlist(zc, nvp);
- nvlist_free(nvp);
- }
-
- return (error);
-}
-
-/* ARGSUSED */
-static void
-zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
-{
- zfs_creat_t *zct = arg;
-
- zfs_create_fs(os, cr, zct->zct_zplprops, tx);
-}
-
-#define ZFS_PROP_UNDEFINED ((uint64_t)-1)
-
-/*
- * inputs:
- * os parent objset pointer (NULL if root fs)
- * fuids_ok fuids allowed in this version of the spa?
- * sa_ok SAs allowed in this version of the spa?
- * createprops list of properties requested by creator
- *
- * outputs:
- * zplprops values for the zplprops we attach to the master node object
- * is_ci true if requested file system will be purely case-insensitive
- *
- * Determine the settings for utf8only, normalization and
- * casesensitivity. Specific values may have been requested by the
- * creator and/or we can inherit values from the parent dataset. If
- * the file system is of too early a vintage, a creator can not
- * request settings for these properties, even if the requested
- * setting is the default value. We don't actually want to create dsl
- * properties for these, so remove them from the source nvlist after
- * processing.
- */
-static int
-zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
- boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
- nvlist_t *zplprops, boolean_t *is_ci)
-{
- uint64_t sense = ZFS_PROP_UNDEFINED;
- uint64_t norm = ZFS_PROP_UNDEFINED;
- uint64_t u8 = ZFS_PROP_UNDEFINED;
-
- ASSERT(zplprops != NULL);
-
- /* parent dataset must be a filesystem */
- if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
- return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
-
- /*
- * Pull out creator prop choices, if any.
- */
- if (createprops) {
- (void) nvlist_lookup_uint64(createprops,
- zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
- (void) nvlist_lookup_uint64(createprops,
- zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
- (void) nvlist_remove_all(createprops,
- zfs_prop_to_name(ZFS_PROP_NORMALIZE));
- (void) nvlist_lookup_uint64(createprops,
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
- (void) nvlist_remove_all(createprops,
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
- (void) nvlist_lookup_uint64(createprops,
- zfs_prop_to_name(ZFS_PROP_CASE), &sense);
- (void) nvlist_remove_all(createprops,
- zfs_prop_to_name(ZFS_PROP_CASE));
- }
-
- /*
- * If the zpl version requested is whacky or the file system
- * or pool is version is too "young" to support normalization
- * and the creator tried to set a value for one of the props,
- * error out.
- */
- if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
- (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
- (zplver >= ZPL_VERSION_SA && !sa_ok) ||
- (zplver < ZPL_VERSION_NORMALIZATION &&
- (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
- sense != ZFS_PROP_UNDEFINED)))
- return (SET_ERROR(ENOTSUP));
-
- /*
- * Put the version in the zplprops
- */
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
-
- if (norm == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
-
- /*
- * If we're normalizing, names must always be valid UTF-8 strings.
- */
- if (norm)
- u8 = 1;
- if (u8 == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
-
- if (sense == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
- VERIFY(nvlist_add_uint64(zplprops,
- zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
-
- if (is_ci)
- *is_ci = (sense == ZFS_CASE_INSENSITIVE);
-
- return (0);
-}
-
-static int
-zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
- nvlist_t *zplprops, boolean_t *is_ci)
-{
- boolean_t fuids_ok, sa_ok;
- uint64_t zplver = ZPL_VERSION;
- objset_t *os = NULL;
- char parentname[ZFS_MAX_DATASET_NAME_LEN];
- spa_t *spa;
- uint64_t spa_vers;
- int error;
-
- zfs_get_parent(dataset, parentname, sizeof (parentname));
-
- if ((error = spa_open(dataset, &spa, FTAG)) != 0)
- return (error);
-
- spa_vers = spa_version(spa);
- spa_close(spa, FTAG);
-
- zplver = zfs_zpl_version_map(spa_vers);
- fuids_ok = (zplver >= ZPL_VERSION_FUID);
- sa_ok = (zplver >= ZPL_VERSION_SA);
-
- /*
- * Open parent object set so we can inherit zplprop values.
- */
- if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
- return (error);
-
- error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
- zplprops, is_ci);
- dmu_objset_rele(os, FTAG);
- return (error);
-}
-
-static int
-zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
- nvlist_t *zplprops, boolean_t *is_ci)
-{
- boolean_t fuids_ok;
- boolean_t sa_ok;
- uint64_t zplver = ZPL_VERSION;
- int error;
-
- zplver = zfs_zpl_version_map(spa_vers);
- fuids_ok = (zplver >= ZPL_VERSION_FUID);
- sa_ok = (zplver >= ZPL_VERSION_SA);
-
- error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
- createprops, zplprops, is_ci);
- return (error);
-}
-
-/*
- * innvl: {
- * "type" -> dmu_objset_type_t (int32)
- * (optional) "props" -> { prop -> value }
- * }
- *
- * outnvl: propname -> error code (int32)
- */
-
-static const zfs_ioc_key_t zfs_keys_create[] = {
- {"type", DATA_TYPE_INT32, 0},
- {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
- {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
-};
-
-static int
-zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- int error = 0;
- zfs_creat_t zct = { 0 };
- nvlist_t *nvprops = NULL;
- void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
- dmu_objset_type_t type;
- boolean_t is_insensitive = B_FALSE;
-
- type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type");
- (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
-
- switch (type) {
- case DMU_OST_ZFS:
- cbfunc = zfs_create_cb;
- break;
-
- case DMU_OST_ZVOL:
- cbfunc = zvol_create_cb;
- break;
-
- default:
- cbfunc = NULL;
- break;
- }
- if (strchr(fsname, '@') ||
- strchr(fsname, '%'))
- return (SET_ERROR(EINVAL));
-
- zct.zct_props = nvprops;
-
- if (cbfunc == NULL)
- return (SET_ERROR(EINVAL));
-
- if (type == DMU_OST_ZVOL) {
- uint64_t volsize, volblocksize;
-
- if (nvprops == NULL)
- return (SET_ERROR(EINVAL));
- if (nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
- return (SET_ERROR(EINVAL));
-
- if ((error = nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- &volblocksize)) != 0 && error != ENOENT)
- return (SET_ERROR(EINVAL));
-
- if (error != 0)
- volblocksize = zfs_prop_default_numeric(
- ZFS_PROP_VOLBLOCKSIZE);
-
- if ((error = zvol_check_volblocksize(
- volblocksize)) != 0 ||
- (error = zvol_check_volsize(volsize,
- volblocksize)) != 0)
- return (error);
- } else if (type == DMU_OST_ZFS) {
- int error;
-
- /*
- * We have to have normalization and
- * case-folding flags correct when we do the
- * file system creation, so go figure them out
- * now.
- */
- VERIFY(nvlist_alloc(&zct.zct_zplprops,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- error = zfs_fill_zplprops(fsname, nvprops,
- zct.zct_zplprops, &is_insensitive);
- if (error != 0) {
- nvlist_free(zct.zct_zplprops);
- return (error);
- }
- }
-
- error = dmu_objset_create(fsname, type,
- is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
- nvlist_free(zct.zct_zplprops);
-
- /*
- * It would be nice to do this atomically.
- */
- if (error == 0) {
- error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
- nvprops, outnvl);
-#if defined(__FreeBSD__) && defined(_KERNEL)
- /*
- * Wait for ZVOL operations to settle down before destroying.
- */
- if (error != 0) {
- spa_t *spa;
-
- if (spa_open(fsname, &spa, FTAG) == 0) {
- taskqueue_drain_all(
- spa->spa_zvol_taskq->tq_queue);
- spa_close(spa, FTAG);
- }
- }
-#endif
- if (error != 0)
- (void) dsl_destroy_head(fsname);
- }
- return (error);
-}
-
-/*
- * innvl: {
- * "origin" -> name of origin snapshot
- * (optional) "props" -> { prop -> value }
- * }
- *
- * outnvl: propname -> error code (int32)
- */
-static const zfs_ioc_key_t zfs_keys_clone[] = {
- {"origin", DATA_TYPE_STRING, 0},
- {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
- {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
-};
-
-static int
-zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- int error = 0;
- nvlist_t *nvprops = NULL;
- char *origin_name;
-
- origin_name = fnvlist_lookup_string(innvl, "origin");
- (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
-
- if (strchr(fsname, '@') ||
- strchr(fsname, '%'))
- return (SET_ERROR(EINVAL));
-
- if (dataset_namecheck(origin_name, NULL, NULL) != 0)
- return (SET_ERROR(EINVAL));
- error = dmu_objset_clone(fsname, origin_name);
- if (error != 0)
- return (error);
-
- /*
- * It would be nice to do this atomically.
- */
- if (error == 0) {
- error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
- nvprops, outnvl);
- if (error != 0)
- (void) dsl_destroy_head(fsname);
- }
- return (error);
-}
-
-static const zfs_ioc_key_t zfs_keys_remap[] = {
- /* no nvl keys */
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- if (strchr(fsname, '@') ||
- strchr(fsname, '%'))
- return (SET_ERROR(EINVAL));
-
- return (dmu_objset_remap_indirects(fsname));
-}
-
-/*
- * innvl: {
- * "snaps" -> { snapshot1, snapshot2 }
- * (optional) "props" -> { prop -> value (string) }
- * }
- *
- * outnvl: snapshot -> error code (int32)
- */
-static const zfs_ioc_key_t zfs_keys_snapshot[] = {
- {"snaps", DATA_TYPE_NVLIST, 0},
- {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
-};
-
-static int
-zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- nvlist_t *snaps;
- nvlist_t *props = NULL;
- int error, poollen;
- nvpair_t *pair;
-
- (void) nvlist_lookup_nvlist(innvl, "props", &props);
- if (!nvlist_empty(props) &&
- zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
- return (SET_ERROR(ENOTSUP));
- if ((error = zfs_check_userprops(props)) != 0)
- return (error);
-
- snaps = fnvlist_lookup_nvlist(innvl, "snaps");
- poollen = strlen(poolname);
- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
- pair = nvlist_next_nvpair(snaps, pair)) {
- const char *name = nvpair_name(pair);
- char *cp = strchr(name, '@');
-
- /*
- * The snap name must contain an @, and the part after it must
- * contain only valid characters.
- */
- if (cp == NULL ||
- zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
- return (SET_ERROR(EINVAL));
-
- /*
- * The snap must be in the specified pool.
- */
- if (strncmp(name, poolname, poollen) != 0 ||
- (name[poollen] != '/' && name[poollen] != '@'))
- return (SET_ERROR(EXDEV));
-
- /*
- * Check for permission to set the properties on the fs.
- */
- if (!nvlist_empty(props)) {
- *cp = '\0';
- error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_USERPROP, CRED());
- *cp = '@';
- if (error != 0)
- return (error);
- }
-
- /* This must be the only snap of this fs. */
- for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
- pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
- if (strncmp(name, nvpair_name(pair2), cp - name + 1)
- == 0) {
- return (SET_ERROR(EXDEV));
- }
- }
- }
-
- error = dsl_dataset_snapshot(snaps, props, outnvl);
- return (error);
-}
-
-/*
- * innvl: "message" -> string
- */
-static const zfs_ioc_key_t zfs_keys_log_history[] = {
- {"message", DATA_TYPE_STRING, 0},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
-{
- char *message;
- spa_t *spa;
- int error;
- char *poolname;
-
- /*
- * The poolname in the ioctl is not set, we get it from the TSD,
- * which was set at the end of the last successful ioctl that allows
- * logging. The secpolicy func already checked that it is set.
- * Only one log ioctl is allowed after each successful ioctl, so
- * we clear the TSD here.
- */
- poolname = tsd_get(zfs_allow_log_key);
- (void) tsd_set(zfs_allow_log_key, NULL);
- error = spa_open(poolname, &spa, FTAG);
- strfree(poolname);
- if (error != 0)
- return (error);
-
- message = fnvlist_lookup_string(innvl, "message");
-
- if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
- spa_close(spa, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
-
- error = spa_history_log(spa, message);
- spa_close(spa, FTAG);
- return (error);
-}
-
-/*
- * This ioctl is used to set the bootenv configuration on the current
- * pool. This configuration is stored in the second padding area of the label,
- * and it is used by the GRUB bootloader used on Linux to store the contents
- * of the grubenv file. The file is stored as raw ASCII, and is protected by
- * an embedded checksum. By default, GRUB will check if the boot filesystem
- * supports storing the environment data in a special location, and if so,
- * will invoke filesystem specific logic to retrieve it. This can be overriden
- * by a variable, should the user so desire.
- */
-/* ARGSUSED */
-static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
- {"envmap", DATA_TYPE_STRING, 0},
-};
-
-static int
-zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
-{
- char *envmap;
- int error;
- spa_t *spa;
-
- envmap = fnvlist_lookup_string(innvl, "envmap");
- if ((error = spa_open(name, &spa, FTAG)) != 0)
- return (error);
- spa_vdev_state_enter(spa, SCL_ALL);
- error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
- /* no nvl keys */
-};
-
- /* ARGSUSED */
-static int
-zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(name, &spa, FTAG)) != 0)
- return (error);
- spa_vdev_state_enter(spa, SCL_ALL);
- error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- spa_close(spa, FTAG);
- return (error);
-}
-
-#ifdef __FreeBSD__
-static const zfs_ioc_key_t zfs_keys_nextboot[] = {
- {"command", DATA_TYPE_STRING, 0},
- {ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0},
- {ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0}
-};
-
-static int
-zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
-{
- char name[MAXNAMELEN];
- spa_t *spa;
- vdev_t *vd;
- char *command;
- uint64_t pool_guid;
- uint64_t vdev_guid;
- int error;
-
- if (nvlist_lookup_uint64(innvl,
- ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
- return (EINVAL);
- if (nvlist_lookup_uint64(innvl,
- ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
- return (EINVAL);
- command = fnvlist_lookup_string(innvl, "command");
-
- mutex_enter(&spa_namespace_lock);
- spa = spa_by_guid(pool_guid, vdev_guid);
- if (spa != NULL)
- strcpy(name, spa_name(spa));
- mutex_exit(&spa_namespace_lock);
- if (spa == NULL)
- return (ENOENT);
-
- if ((error = spa_open(name, &spa, FTAG)) != 0)
- return (error);
- spa_vdev_state_enter(spa, SCL_ALL);
- vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
- if (vd == NULL) {
- (void) spa_vdev_state_exit(spa, NULL, ENXIO);
- spa_close(spa, FTAG);
- return (ENODEV);
- }
- error = vdev_label_write_pad2(vd, command, strlen(command));
- (void) spa_vdev_state_exit(spa, NULL, 0);
- txg_wait_synced(spa->spa_dsl_pool, 0);
- spa_close(spa, FTAG);
- return (error);
-}
-#endif
-
-/*
- * The dp_config_rwlock must not be held when calling this, because the
- * unmount may need to write out data.
- *
- * This function is best-effort. Callers must deal gracefully if it
- * remains mounted (or is remounted after this call).
- *
- * Returns 0 if the argument is not a snapshot, or it is not currently a
- * filesystem, or we were able to unmount it. Returns error code otherwise.
- */
-void
-zfs_unmount_snap(const char *snapname)
-{
- vfs_t *vfsp = NULL;
- zfsvfs_t *zfsvfs = NULL;
-
- if (strchr(snapname, '@') == NULL)
- return;
-
- int err = getzfsvfs(snapname, &zfsvfs);
- if (err != 0) {
- ASSERT3P(zfsvfs, ==, NULL);
- return;
- }
- vfsp = zfsvfs->z_vfs;
-
- ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
-
-#ifdef illumos
- err = vn_vfswlock(vfsp->vfs_vnodecovered);
- VFS_RELE(vfsp);
- if (err != 0)
- return;
-#endif
-
- /*
- * Always force the unmount for snapshots.
- */
-#ifdef illumos
- (void) dounmount(vfsp, MS_FORCE, kcred);
-#else
- vfs_ref(vfsp);
- vfs_unbusy(vfsp);
- (void) dounmount(vfsp, MS_FORCE, curthread);
-#endif
-}
-
-/* ARGSUSED */
-static int
-zfs_unmount_snap_cb(const char *snapname, void *arg)
-{
- zfs_unmount_snap(snapname);
- return (0);
-}
-
-/*
- * When a clone is destroyed, its origin may also need to be destroyed,
- * in which case it must be unmounted. This routine will do that unmount
- * if necessary.
- */
-void
-zfs_destroy_unmount_origin(const char *fsname)
-{
- int error;
- objset_t *os;
- dsl_dataset_t *ds;
-
- error = dmu_objset_hold(fsname, FTAG, &os);
- if (error != 0)
- return;
- ds = dmu_objset_ds(os);
- if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
- char originname[ZFS_MAX_DATASET_NAME_LEN];
- dsl_dataset_name(ds->ds_prev, originname);
- dmu_objset_rele(os, FTAG);
- zfs_unmount_snap(originname);
- } else {
- dmu_objset_rele(os, FTAG);
- }
-}
-
-/*
- * innvl: {
- * "snaps" -> { snapshot1, snapshot2 }
- * (optional boolean) "defer"
- * }
- *
- * outnvl: snapshot -> error code (int32)
- *
- */
-static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = {
- {"snaps", DATA_TYPE_NVLIST, 0},
- {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- int error, poollen;
- nvlist_t *snaps;
- nvpair_t *pair;
- boolean_t defer;
-
- snaps = fnvlist_lookup_nvlist(innvl, "snaps");
- defer = nvlist_exists(innvl, "defer");
-
- poollen = strlen(poolname);
- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
- pair = nvlist_next_nvpair(snaps, pair)) {
- const char *name = nvpair_name(pair);
-
- /*
- * The snap must be in the specified pool to prevent the
- * invalid removal of zvol minors below.
- */
- if (strncmp(name, poolname, poollen) != 0 ||
- (name[poollen] != '/' && name[poollen] != '@'))
- return (SET_ERROR(EXDEV));
-
- zfs_unmount_snap(nvpair_name(pair));
- }
-
- return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
-}
-
-/*
- * Create bookmarks. Bookmark names are of the form <fs>#<bmark>.
- * All bookmarks must be in the same pool.
- *
- * innvl: {
- * bookmark1 -> snapshot1, bookmark2 -> snapshot2
- * }
- *
- * outnvl: bookmark -> error code (int32)
- *
- */
-static const zfs_ioc_key_t zfs_keys_bookmark[] = {
- {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
- pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
- char *snap_name;
-
- /*
- * Verify the snapshot argument.
- */
- if (nvpair_value_string(pair, &snap_name) != 0)
- return (SET_ERROR(EINVAL));
-
-
- /* Verify that the keys (bookmarks) are unique */
- for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair);
- pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) {
- if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
- return (SET_ERROR(EINVAL));
- }
- }
-
- return (dsl_bookmark_create(innvl, outnvl));
-}
-
-/*
- * innvl: {
- * property 1, property 2, ...
- * }
- *
- * outnvl: {
- * bookmark name 1 -> { property 1, property 2, ... },
- * bookmark name 2 -> { property 1, property 2, ... }
- * }
- *
- */
-static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = {
- {"<property>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL},
-};
-
-static int
-zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- return (dsl_get_bookmarks(fsname, innvl, outnvl));
-}
-
-/*
- * innvl: {
- * bookmark name 1, bookmark name 2
- * }
- *
- * outnvl: bookmark -> error code (int32)
- *
- */
-static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = {
- {"<bookmark>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST},
-};
-
-static int
-zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
- nvlist_t *outnvl)
-{
- int error, poollen;
-
- poollen = strlen(poolname);
- for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
- pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
- const char *name = nvpair_name(pair);
- const char *cp = strchr(name, '#');
-
- /*
- * The bookmark name must contain an #, and the part after it
- * must contain only valid characters.
- */
- if (cp == NULL ||
- zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
- return (SET_ERROR(EINVAL));
-
- /*
- * The bookmark must be in the specified pool.
- */
- if (strncmp(name, poolname, poollen) != 0 ||
- (name[poollen] != '/' && name[poollen] != '#'))
- return (SET_ERROR(EXDEV));
- }
-
- error = dsl_bookmark_destroy(innvl, outnvl);
- return (error);
-}
-
-static const zfs_ioc_key_t zfs_keys_channel_program[] = {
- {"program", DATA_TYPE_STRING, 0},
- {"arg", DATA_TYPE_ANY, 0},
- {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
- {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
- {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
-};
-
-static int
-zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
- nvlist_t *outnvl)
-{
- char *program;
- uint64_t instrlimit, memlimit;
- boolean_t sync_flag;
- nvpair_t *nvarg = NULL;
-
- program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM);
- if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
- sync_flag = B_TRUE;
- }
- if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) {
- instrlimit = ZCP_DEFAULT_INSTRLIMIT;
- }
- if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) {
- memlimit = ZCP_DEFAULT_MEMLIMIT;
- }
- nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST);
-
- if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit)
- return (EINVAL);
- if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
- return (EINVAL);
-
- return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
- nvarg, outnvl));
-}
-
-/*
- * innvl: unused
- * outnvl: empty
- */
-static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = {
- /* no nvl keys */
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- return (spa_checkpoint(poolname));
-}
-
-/*
- * innvl: unused
- * outnvl: empty
- */
-static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = {
- /* no nvl keys */
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
- nvlist_t *outnvl)
-{
- return (spa_checkpoint_discard(poolname));
-}
-
-/*
- * inputs:
- * zc_name name of dataset to destroy
- * zc_defer_destroy mark for deferred destroy
- *
- * outputs: none
- */
-static int
-zfs_ioc_destroy(zfs_cmd_t *zc)
-{
- objset_t *os;
- dmu_objset_type_t ost;
- int err;
-
- err = dmu_objset_hold(zc->zc_name, FTAG, &os);
- if (err != 0)
- return (err);
- ost = dmu_objset_type(os);
- dmu_objset_rele(os, FTAG);
-
- if (ost == DMU_OST_ZFS)
- zfs_unmount_snap(zc->zc_name);
-
- if (strchr(zc->zc_name, '@'))
- err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
- else
- err = dsl_destroy_head(zc->zc_name);
-#ifndef __FreeBSD__
- if (ost == DMU_OST_ZVOL && err == 0)
- (void) zvol_remove_minor(zc->zc_name);
-#endif
- return (err);
-}
-
-/*
- * innvl: {
- * vdevs: {
- * guid 1, guid 2, ...
- * },
- * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
- * }
- *
- * outnvl: {
- * [func: EINVAL (if provided command type didn't make sense)],
- * [vdevs: {
- * guid1: errno, (see function body for possible errnos)
- * ...
- * }]
- * }
- *
- */
-static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
- {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
- {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
-};
-
-static int
-zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(poolname, &spa, FTAG);
- if (error != 0)
- return (error);
-
- uint64_t cmd_type;
- if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
- &cmd_type) != 0) {
- spa_close(spa, FTAG);
- return (SET_ERROR(EINVAL));
- }
- if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
- cmd_type == POOL_INITIALIZE_DO ||
- cmd_type == POOL_INITIALIZE_SUSPEND)) {
- spa_close(spa, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- nvlist_t *vdev_guids;
- if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
- &vdev_guids) != 0) {
- spa_close(spa, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- nvlist_t *vdev_errlist = fnvlist_alloc();
- int total_errors = 0;
-
- for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
- pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
- uint64_t vdev_guid = fnvpair_value_uint64(pair);
-
- error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
- if (error != 0) {
- char guid_as_str[MAXNAMELEN];
-
- (void) snprintf(guid_as_str, sizeof (guid_as_str),
- "%llu", (unsigned long long)vdev_guid);
- fnvlist_add_int64(vdev_errlist, guid_as_str, error);
- total_errors++;
- }
- }
- if (fnvlist_size(vdev_errlist) > 0) {
- fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
- vdev_errlist);
- }
- fnvlist_free(vdev_errlist);
-
- spa_close(spa, FTAG);
- return (total_errors > 0 ? EINVAL : 0);
-}
-
-/*
- * fsname is name of dataset to rollback (to most recent snapshot)
- *
- * innvl may contain name of expected target snapshot
- *
- * outnvl: "target" -> name of most recent snapshot
- * }
- */
-static const zfs_ioc_key_t zfs_keys_rollback[] = {
- {"target", DATA_TYPE_STRING, ZK_OPTIONAL},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- zfsvfs_t *zfsvfs;
- char *target = NULL;
- int error;
-
- (void) nvlist_lookup_string(innvl, "target", &target);
- if (target != NULL) {
- const char *cp = strchr(target, '@');
-
- /*
- * The snap name must contain an @, and the part after it must
- * contain only valid characters.
- */
- if (cp == NULL ||
- zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
- return (SET_ERROR(EINVAL));
- }
-
- if (getzfsvfs(fsname, &zfsvfs) == 0) {
- dsl_dataset_t *ds;
-
- ds = dmu_objset_ds(zfsvfs->z_os);
- error = zfs_suspend_fs(zfsvfs);
- if (error == 0) {
- int resume_err;
-
- error = dsl_dataset_rollback(fsname, target, zfsvfs,
- outnvl);
- resume_err = zfs_resume_fs(zfsvfs, ds);
- error = error ? error : resume_err;
- }
-#ifdef illumos
- VFS_RELE(zfsvfs->z_vfs);
-#else
- vfs_unbusy(zfsvfs->z_vfs);
-#endif
- } else {
- error = dsl_dataset_rollback(fsname, target, NULL, outnvl);
- }
- return (error);
-}
-
-static int
-recursive_unmount(const char *fsname, void *arg)
-{
- const char *snapname = arg;
- char fullname[ZFS_MAX_DATASET_NAME_LEN];
-
- (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
- zfs_unmount_snap(fullname);
-
- return (0);
-}
-
-/*
- * inputs:
- * zc_name old name of dataset or bookmark
- * zc_value new name of dataset or bookmark
- * zc_cookie recursive flag (only valid for snapshots)
- *
- * outputs: none
- */
-static int
-zfs_ioc_rename(zfs_cmd_t *zc)
-{
- objset_t *os;
- dmu_objset_type_t ost;
- boolean_t recursive = zc->zc_cookie & 1;
- char *pos, *pos2;
- boolean_t allow_mounted = B_TRUE;
- int err;
-
-#ifdef __FreeBSD__
- allow_mounted = (zc->zc_cookie & 2) != 0;
-#endif
-
- zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
-
- pos = strchr(zc->zc_name, '#');
- if (pos != NULL) {
- /* Bookmarks must be in same fs. */
- pos2 = strchr(zc->zc_value, '#');
- if (pos2 == NULL)
- return (SET_ERROR(EINVAL));
-
- /* Recursive flag is not supported yet. */
- if (recursive)
- return (SET_ERROR(ENOTSUP));
-
- *pos = '\0';
- *pos2 = '\0';
- if (strcmp(zc->zc_name, zc->zc_value) == 0) {
- err = dsl_bookmark_rename(zc->zc_name,
- pos + 1, pos2 + 1);
- } else {
- err = SET_ERROR(EXDEV);
- }
- *pos = '#';
- *pos2 = '#';
- return (err);
- }
-
- /* "zfs rename" from and to ...%recv datasets should both fail */
- if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
- dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
- strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%'))
- return (SET_ERROR(EINVAL));
-
- err = dmu_objset_hold(zc->zc_name, FTAG, &os);
- if (err != 0)
- return (err);
- ost = dmu_objset_type(os);
- dmu_objset_rele(os, FTAG);
-
- pos = strchr(zc->zc_name, '@');
- if (pos != NULL) {
- /* Snapshots must be in same fs. */
- pos2 = strchr(zc->zc_value, '@');
- if (pos2 == NULL)
- return (SET_ERROR(EINVAL));
- *pos = '\0';
- *pos2 = '\0';
- if (strcmp(zc->zc_name, zc->zc_value) != 0) {
- err = SET_ERROR(EXDEV);
- } else {
- if (ost == DMU_OST_ZFS && !allow_mounted) {
- err = dmu_objset_find(zc->zc_name,
- recursive_unmount, pos + 1,
- recursive ? DS_FIND_CHILDREN : 0);
- }
- if (err == 0) {
- err = dsl_dataset_rename_snapshot(zc->zc_name,
- pos + 1, pos2 + 1, recursive);
- }
- }
- *pos = '@';
- *pos2 = '@';
- return (err);
- } else {
-#ifdef illumos
- if (ost == DMU_OST_ZVOL)
- (void) zvol_remove_minor(zc->zc_name);
-#endif
- return (dsl_dir_rename(zc->zc_name, zc->zc_value));
- }
-}
-
-static int
-zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
-{
- const char *propname = nvpair_name(pair);
- boolean_t issnap = (strchr(dsname, '@') != NULL);
- zfs_prop_t prop = zfs_name_to_prop(propname);
- uint64_t intval;
- int err;
-
- if (prop == ZPROP_INVAL) {
- if (zfs_prop_user(propname)) {
- if (err = zfs_secpolicy_write_perms(dsname,
- ZFS_DELEG_PERM_USERPROP, cr))
- return (err);
- return (0);
- }
-
- if (!issnap && zfs_prop_userquota(propname)) {
- const char *perm = NULL;
- const char *uq_prefix =
- zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
- const char *gq_prefix =
- zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
-
- if (strncmp(propname, uq_prefix,
- strlen(uq_prefix)) == 0) {
- perm = ZFS_DELEG_PERM_USERQUOTA;
- } else if (strncmp(propname, gq_prefix,
- strlen(gq_prefix)) == 0) {
- perm = ZFS_DELEG_PERM_GROUPQUOTA;
- } else {
- /* USERUSED and GROUPUSED are read-only */
- return (SET_ERROR(EINVAL));
- }
-
- if (err = zfs_secpolicy_write_perms(dsname, perm, cr))
- return (err);
- return (0);
- }
-
- return (SET_ERROR(EINVAL));
- }
-
- if (issnap)
- return (SET_ERROR(EINVAL));
-
- if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
- /*
- * dsl_prop_get_all_impl() returns properties in this
- * format.
- */
- nvlist_t *attrs;
- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
- &pair) == 0);
- }
-
- /*
- * Check that this value is valid for this pool version
- */
- switch (prop) {
- case ZFS_PROP_COMPRESSION:
- /*
- * If the user specified gzip compression, make sure
- * the SPA supports it. We ignore any errors here since
- * we'll catch them later.
- */
- if (nvpair_value_uint64(pair, &intval) == 0) {
- if (intval >= ZIO_COMPRESS_GZIP_1 &&
- intval <= ZIO_COMPRESS_GZIP_9 &&
- zfs_earlier_version(dsname,
- SPA_VERSION_GZIP_COMPRESSION)) {
- return (SET_ERROR(ENOTSUP));
- }
-
- if (intval == ZIO_COMPRESS_ZLE &&
- zfs_earlier_version(dsname,
- SPA_VERSION_ZLE_COMPRESSION))
- return (SET_ERROR(ENOTSUP));
-
- if (intval == ZIO_COMPRESS_LZ4) {
- spa_t *spa;
-
- if ((err = spa_open(dsname, &spa, FTAG)) != 0)
- return (err);
-
- if (!spa_feature_is_enabled(spa,
- SPA_FEATURE_LZ4_COMPRESS)) {
- spa_close(spa, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- spa_close(spa, FTAG);
- }
-
- /*
- * If this is a bootable dataset then
- * verify that the compression algorithm
- * is supported for booting. We must return
- * something other than ENOTSUP since it
- * implies a downrev pool version.
- */
- if (zfs_is_bootfs(dsname) &&
- !BOOTFS_COMPRESS_VALID(intval)) {
- return (SET_ERROR(ERANGE));
- }
- }
- break;
-
- case ZFS_PROP_COPIES:
- if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
- return (SET_ERROR(ENOTSUP));
- break;
-
- case ZFS_PROP_RECORDSIZE:
- /* Record sizes above 128k need the feature to be enabled */
- if (nvpair_value_uint64(pair, &intval) == 0 &&
- intval > SPA_OLD_MAXBLOCKSIZE) {
- spa_t *spa;
-
- /*
- * We don't allow setting the property above 1MB,
- * unless the tunable has been changed.
- */
- if (intval > zfs_max_recordsize ||
- intval > SPA_MAXBLOCKSIZE)
- return (SET_ERROR(ERANGE));
-
- if ((err = spa_open(dsname, &spa, FTAG)) != 0)
- return (err);
-
- if (!spa_feature_is_enabled(spa,
- SPA_FEATURE_LARGE_BLOCKS)) {
- spa_close(spa, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- spa_close(spa, FTAG);
- }
- break;
-
- case ZFS_PROP_DNODESIZE:
- /* Dnode sizes above 512 need the feature to be enabled */
- if (nvpair_value_uint64(pair, &intval) == 0 &&
- intval != ZFS_DNSIZE_LEGACY) {
- spa_t *spa;
-
- if ((err = spa_open(dsname, &spa, FTAG)) != 0)
- return (err);
-
- if (!spa_feature_is_enabled(spa,
- SPA_FEATURE_LARGE_DNODE)) {
- spa_close(spa, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- spa_close(spa, FTAG);
- }
- break;
-
- case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
- /*
- * This property could require the allocation classes
- * feature to be active for setting, however we allow
- * it so that tests of settable properties succeed.
- * The CLI will issue a warning in this case.
- */
- break;
-
- case ZFS_PROP_SHARESMB:
- if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
- return (SET_ERROR(ENOTSUP));
- break;
-
- case ZFS_PROP_ACLINHERIT:
- if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(pair, &intval) == 0) {
- if (intval == ZFS_ACL_PASSTHROUGH_X &&
- zfs_earlier_version(dsname,
- SPA_VERSION_PASSTHROUGH_X))
- return (SET_ERROR(ENOTSUP));
- }
- break;
-
- case ZFS_PROP_CHECKSUM:
- case ZFS_PROP_DEDUP:
- {
- spa_feature_t feature;
- spa_t *spa;
-
- /* dedup feature version checks */
- if (prop == ZFS_PROP_DEDUP &&
- zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
- return (SET_ERROR(ENOTSUP));
-
- if (nvpair_value_uint64(pair, &intval) != 0)
- return (SET_ERROR(EINVAL));
-
- /* check prop value is enabled in features */
- feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
- if (feature == SPA_FEATURE_NONE)
- break;
-
- if ((err = spa_open(dsname, &spa, FTAG)) != 0)
- return (err);
-
- if (!spa_feature_is_enabled(spa, feature)) {
- spa_close(spa, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- spa_close(spa, FTAG);
- break;
- }
- }
-
- return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
-}
-
-/*
- * Checks for a race condition to make sure we don't increment a feature flag
- * multiple times.
- */
-static int
-zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- spa_feature_t *featurep = arg;
-
- if (!spa_feature_is_active(spa, *featurep))
- return (0);
- else
- return (SET_ERROR(EBUSY));
-}
-
-/*
- * The callback invoked on feature activation in the sync task caused by
- * zfs_prop_activate_feature.
- */
-static void
-zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- spa_feature_t *featurep = arg;
-
- spa_feature_incr(spa, *featurep, tx);
-}
-
-/*
- * Activates a feature on a pool in response to a property setting. This
- * creates a new sync task which modifies the pool to reflect the feature
- * as being active.
- */
-static int
-zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
-{
- int err;
-
- /* EBUSY here indicates that the feature is already active */
- err = dsl_sync_task(spa_name(spa),
- zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
- &feature, 2, ZFS_SPACE_CHECK_RESERVED);
-
- if (err != 0 && err != EBUSY)
- return (err);
- else
- return (0);
-}
-
-/*
- * Removes properties from the given props list that fail permission checks
- * needed to clear them and to restore them in case of a receive error. For each
- * property, make sure we have both set and inherit permissions.
- *
- * Returns the first error encountered if any permission checks fail. If the
- * caller provides a non-NULL errlist, it also gives the complete list of names
- * of all the properties that failed a permission check along with the
- * corresponding error numbers. The caller is responsible for freeing the
- * returned errlist.
- *
- * If every property checks out successfully, zero is returned and the list
- * pointed at by errlist is NULL.
- */
-static int
-zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
-{
- zfs_cmd_t *zc;
- nvpair_t *pair, *next_pair;
- nvlist_t *errors;
- int err, rv = 0;
-
- if (props == NULL)
- return (0);
-
- VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
- (void) strcpy(zc->zc_name, dataset);
- pair = nvlist_next_nvpair(props, NULL);
- while (pair != NULL) {
- next_pair = nvlist_next_nvpair(props, pair);
-
- (void) strcpy(zc->zc_value, nvpair_name(pair));
- if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
- (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
- VERIFY(nvlist_remove_nvpair(props, pair) == 0);
- VERIFY(nvlist_add_int32(errors,
- zc->zc_value, err) == 0);
- }
- pair = next_pair;
- }
- kmem_free(zc, sizeof (zfs_cmd_t));
-
- if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
- nvlist_free(errors);
- errors = NULL;
- } else {
- VERIFY(nvpair_value_int32(pair, &rv) == 0);
- }
-
- if (errlist == NULL)
- nvlist_free(errors);
- else
- *errlist = errors;
-
- return (rv);
-}
-
-static boolean_t
-propval_equals(nvpair_t *p1, nvpair_t *p2)
-{
- if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
- /* dsl_prop_get_all_impl() format */
- nvlist_t *attrs;
- VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
- &p1) == 0);
- }
-
- if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
- nvlist_t *attrs;
- VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
- &p2) == 0);
- }
-
- if (nvpair_type(p1) != nvpair_type(p2))
- return (B_FALSE);
-
- if (nvpair_type(p1) == DATA_TYPE_STRING) {
- char *valstr1, *valstr2;
-
- VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
- VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
- return (strcmp(valstr1, valstr2) == 0);
- } else {
- uint64_t intval1, intval2;
-
- VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
- VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
- return (intval1 == intval2);
- }
-}
-
-/*
- * Remove properties from props if they are not going to change (as determined
- * by comparison with origprops). Remove them from origprops as well, since we
- * do not need to clear or restore properties that won't change.
- */
-static void
-props_reduce(nvlist_t *props, nvlist_t *origprops)
-{
- nvpair_t *pair, *next_pair;
-
- if (origprops == NULL)
- return; /* all props need to be received */
-
- pair = nvlist_next_nvpair(props, NULL);
- while (pair != NULL) {
- const char *propname = nvpair_name(pair);
- nvpair_t *match;
-
- next_pair = nvlist_next_nvpair(props, pair);
-
- if ((nvlist_lookup_nvpair(origprops, propname,
- &match) != 0) || !propval_equals(pair, match))
- goto next; /* need to set received value */
-
- /* don't clear the existing received value */
- (void) nvlist_remove_nvpair(origprops, match);
- /* don't bother receiving the property */
- (void) nvlist_remove_nvpair(props, pair);
-next:
- pair = next_pair;
- }
-}
-
-/*
- * Extract properties that cannot be set PRIOR to the receipt of a dataset.
- * For example, refquota cannot be set until after the receipt of a dataset,
- * because in replication streams, an older/earlier snapshot may exceed the
- * refquota. We want to receive the older/earlier snapshot, but setting
- * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
- * the older/earlier snapshot from being received (with EDQUOT).
- *
- * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
- *
- * libzfs will need to be judicious handling errors encountered by props
- * extracted by this function.
- */
-static nvlist_t *
-extract_delay_props(nvlist_t *props)
-{
- nvlist_t *delayprops;
- nvpair_t *nvp, *tmp;
- static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
- int i;
-
- VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
- nvp = nvlist_next_nvpair(props, nvp)) {
- /*
- * strcmp() is safe because zfs_prop_to_name() always returns
- * a bounded string.
- */
- for (i = 0; delayable[i] != 0; i++) {
- if (strcmp(zfs_prop_to_name(delayable[i]),
- nvpair_name(nvp)) == 0) {
- break;
- }
- }
- if (delayable[i] != 0) {
- tmp = nvlist_prev_nvpair(props, nvp);
- VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
- VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
- nvp = tmp;
- }
- }
-
- if (nvlist_empty(delayprops)) {
- nvlist_free(delayprops);
- delayprops = NULL;
- }
- return (delayprops);
-}
-
-#ifdef DEBUG
-static boolean_t zfs_ioc_recv_inject_err;
-#endif
-
-/*
- * inputs:
- * zc_name name of containing filesystem
- * zc_nvlist_src{_size} nvlist of properties to apply
- * zc_value name of snapshot to create
- * zc_string name of clone origin (if DRR_FLAG_CLONE)
- * zc_cookie file descriptor to recv from
- * zc_begin_record the BEGIN record of the stream (not byteswapped)
- * zc_guid force flag
- * zc_cleanup_fd cleanup-on-exit file descriptor
- * zc_action_handle handle for this guid/ds mapping (or zero on first call)
- * zc_resumable if data is incomplete assume sender will resume
- *
- * outputs:
- * zc_cookie number of bytes read
- * zc_nvlist_dst{_size} error for each unapplied received property
- * zc_obj zprop_errflags_t
- * zc_action_handle handle for this guid/ds mapping
- */
-static int
-zfs_ioc_recv(zfs_cmd_t *zc)
-{
- file_t *fp;
- dmu_recv_cookie_t drc;
- boolean_t force = (boolean_t)zc->zc_guid;
- int fd;
- int error = 0;
- int props_error = 0;
- nvlist_t *errors;
- offset_t off;
- nvlist_t *props = NULL; /* sent properties */
- nvlist_t *origprops = NULL; /* existing properties */
- nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
- char *origin = NULL;
- char *tosnap;
- char tofs[ZFS_MAX_DATASET_NAME_LEN];
- boolean_t first_recvd_props = B_FALSE;
-
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
- strchr(zc->zc_value, '@') == NULL ||
- strchr(zc->zc_value, '%'))
- return (SET_ERROR(EINVAL));
-
- (void) strcpy(tofs, zc->zc_value);
- tosnap = strchr(tofs, '@');
- *tosnap++ = '\0';
-
- if (zc->zc_nvlist_src != 0 &&
- (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &props)) != 0)
- return (error);
-
- fd = zc->zc_cookie;
-#ifdef illumos
- fp = getf(fd);
-#else
- fget_read(curthread, fd, &cap_pread_rights, &fp);
-#endif
- if (fp == NULL) {
- nvlist_free(props);
- return (SET_ERROR(EBADF));
- }
-
- errors = fnvlist_alloc();
-
- if (zc->zc_string[0])
- origin = zc->zc_string;
-
- error = dmu_recv_begin(tofs, tosnap,
- &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
- if (error != 0)
- goto out;
-
- /*
- * Set properties before we receive the stream so that they are applied
- * to the new data. Note that we must call dmu_recv_stream() if
- * dmu_recv_begin() succeeds.
- */
- if (props != NULL && !drc.drc_newfs) {
- if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
- SPA_VERSION_RECVD_PROPS &&
- !dsl_prop_get_hasrecvd(tofs))
- first_recvd_props = B_TRUE;
-
- /*
- * If new received properties are supplied, they are to
- * completely replace the existing received properties, so stash
- * away the existing ones.
- */
- if (dsl_prop_get_received(tofs, &origprops) == 0) {
- nvlist_t *errlist = NULL;
- /*
- * Don't bother writing a property if its value won't
- * change (and avoid the unnecessary security checks).
- *
- * The first receive after SPA_VERSION_RECVD_PROPS is a
- * special case where we blow away all local properties
- * regardless.
- */
- if (!first_recvd_props)
- props_reduce(props, origprops);
- if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
- (void) nvlist_merge(errors, errlist, 0);
- nvlist_free(errlist);
-
- if (clear_received_props(tofs, origprops,
- first_recvd_props ? NULL : props) != 0)
- zc->zc_obj |= ZPROP_ERR_NOCLEAR;
- } else {
- zc->zc_obj |= ZPROP_ERR_NOCLEAR;
- }
- }
-
- if (props != NULL) {
- props_error = dsl_prop_set_hasrecvd(tofs);
-
- if (props_error == 0) {
- delayprops = extract_delay_props(props);
- (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
- props, errors);
- }
- }
-
- off = fp->f_offset;
- error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
- &zc->zc_action_handle);
-
- if (error == 0) {
- zfsvfs_t *zfsvfs = NULL;
-
- if (getzfsvfs(tofs, &zfsvfs) == 0) {
- /* online recv */
- dsl_dataset_t *ds;
- int end_err;
-
- ds = dmu_objset_ds(zfsvfs->z_os);
- error = zfs_suspend_fs(zfsvfs);
- /*
- * If the suspend fails, then the recv_end will
- * likely also fail, and clean up after itself.
- */
- end_err = dmu_recv_end(&drc, zfsvfs);
- if (error == 0)
- error = zfs_resume_fs(zfsvfs, ds);
- error = error ? error : end_err;
-#ifdef illumos
- VFS_RELE(zfsvfs->z_vfs);
-#else
- vfs_unbusy(zfsvfs->z_vfs);
-#endif
- } else {
- error = dmu_recv_end(&drc, NULL);
- }
-
- /* Set delayed properties now, after we're done receiving. */
- if (delayprops != NULL && error == 0) {
- (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
- delayprops, errors);
- }
- }
-
- if (delayprops != NULL) {
- /*
- * Merge delayed props back in with initial props, in case
- * we're DEBUG and zfs_ioc_recv_inject_err is set (which means
- * we have to make sure clear_received_props() includes
- * the delayed properties).
- *
- * Since zfs_ioc_recv_inject_err is only in DEBUG kernels,
- * using ASSERT() will be just like a VERIFY.
- */
- ASSERT(nvlist_merge(props, delayprops, 0) == 0);
- nvlist_free(delayprops);
- }
-
- /*
- * Now that all props, initial and delayed, are set, report the prop
- * errors to the caller.
- */
- if (zc->zc_nvlist_dst_size != 0 &&
- (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
- put_nvlist(zc, errors) != 0)) {
- /*
- * Caller made zc->zc_nvlist_dst less than the minimum expected
- * size or supplied an invalid address.
- */
- props_error = SET_ERROR(EINVAL);
- }
-
- zc->zc_cookie = off - fp->f_offset;
- if (off >= 0 && off <= MAXOFFSET_T)
- fp->f_offset = off;
-
-#ifdef DEBUG
- if (zfs_ioc_recv_inject_err) {
- zfs_ioc_recv_inject_err = B_FALSE;
- error = 1;
- }
-#endif
-
- /*
- * On error, restore the original props.
- */
- if (error != 0 && props != NULL && !drc.drc_newfs) {
- if (clear_received_props(tofs, props, NULL) != 0) {
- /*
- * We failed to clear the received properties.
- * Since we may have left a $recvd value on the
- * system, we can't clear the $hasrecvd flag.
- */
- zc->zc_obj |= ZPROP_ERR_NORESTORE;
- } else if (first_recvd_props) {
- dsl_prop_unset_hasrecvd(tofs);
- }
-
- if (origprops == NULL && !drc.drc_newfs) {
- /* We failed to stash the original properties. */
- zc->zc_obj |= ZPROP_ERR_NORESTORE;
- }
-
- /*
- * dsl_props_set() will not convert RECEIVED to LOCAL on or
- * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
- * explictly if we're restoring local properties cleared in the
- * first new-style receive.
- */
- if (origprops != NULL &&
- zfs_set_prop_nvlist(tofs, (first_recvd_props ?
- ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
- origprops, NULL) != 0) {
- /*
- * We stashed the original properties but failed to
- * restore them.
- */
- zc->zc_obj |= ZPROP_ERR_NORESTORE;
- }
- }
-out:
- nvlist_free(props);
- nvlist_free(origprops);
- nvlist_free(errors);
- releasef(fd);
-
- if (error == 0)
- error = props_error;
-
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of snapshot to send
- * zc_cookie file descriptor to send stream to
- * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
- * zc_sendobj objsetid of snapshot to send
- * zc_fromobj objsetid of incremental fromsnap (may be zero)
- * zc_guid if set, estimate size of stream only. zc_cookie is ignored.
- * output size in zc_objset_type.
- * zc_flags lzc_send_flags
- *
- * outputs:
- * zc_objset_type estimated size, if zc_guid is set
- *
- * NOTE: This is no longer the preferred interface, any new functionality
- * should be added to zfs_ioc_send_new() instead.
- */
-static int
-zfs_ioc_send(zfs_cmd_t *zc)
-{
- int error;
- offset_t off;
- boolean_t estimate = (zc->zc_guid != 0);
- boolean_t embedok = (zc->zc_flags & 0x1);
- boolean_t large_block_ok = (zc->zc_flags & 0x2);
- boolean_t compressok = (zc->zc_flags & 0x4);
-
- if (zc->zc_obj != 0) {
- dsl_pool_t *dp;
- dsl_dataset_t *tosnap;
-
- error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- if (dsl_dir_is_clone(tosnap->ds_dir))
- zc->zc_fromobj =
- dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
- dsl_dataset_rele(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
- }
-
- if (estimate) {
- dsl_pool_t *dp;
- dsl_dataset_t *tosnap;
- dsl_dataset_t *fromsnap = NULL;
-
- error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- if (zc->zc_fromobj != 0) {
- error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
- FTAG, &fromsnap);
- if (error != 0) {
- dsl_dataset_rele(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
- }
-
- error = dmu_send_estimate(tosnap, fromsnap, compressok,
- &zc->zc_objset_type);
-
- if (fromsnap != NULL)
- dsl_dataset_rele(fromsnap, FTAG);
- dsl_dataset_rele(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
- } else {
- file_t *fp;
-
-#ifdef illumos
- fp = getf(zc->zc_cookie);
-#else
- fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp);
-#endif
- if (fp == NULL)
- return (SET_ERROR(EBADF));
-
- off = fp->f_offset;
- error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, embedok, large_block_ok, compressok,
-#ifdef illumos
- zc->zc_cookie, fp->f_vnode, &off);
-#else
- zc->zc_cookie, fp, &off);
-#endif
-
- if (off >= 0 && off <= MAXOFFSET_T)
- fp->f_offset = off;
- releasef(zc->zc_cookie);
- }
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of snapshot on which to report progress
- * zc_cookie file descriptor of send stream
- *
- * outputs:
- * zc_cookie number of bytes written in send stream thus far
- */
-static int
-zfs_ioc_send_progress(zfs_cmd_t *zc)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
- dmu_sendarg_t *dsp = NULL;
- int error;
-
- error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- mutex_enter(&ds->ds_sendstream_lock);
-
- /*
- * Iterate over all the send streams currently active on this dataset.
- * If there's one which matches the specified file descriptor _and_ the
- * stream was started by the current process, return the progress of
- * that stream.
- */
- for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL;
- dsp = list_next(&ds->ds_sendstreams, dsp)) {
- if (dsp->dsa_outfd == zc->zc_cookie &&
- dsp->dsa_proc == curproc)
- break;
- }
-
- if (dsp != NULL)
- zc->zc_cookie = *(dsp->dsa_off);
- else
- error = SET_ERROR(ENOENT);
-
- mutex_exit(&ds->ds_sendstream_lock);
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_inject_fault(zfs_cmd_t *zc)
-{
- int id, error;
-
- error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
- &zc->zc_inject_record);
-
- if (error == 0)
- zc->zc_guid = (uint64_t)id;
-
- return (error);
-}
-
-static int
-zfs_ioc_clear_fault(zfs_cmd_t *zc)
-{
- return (zio_clear_fault((int)zc->zc_guid));
-}
-
-static int
-zfs_ioc_inject_list_next(zfs_cmd_t *zc)
-{
- int id = (int)zc->zc_guid;
- int error;
-
- error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
- &zc->zc_inject_record);
-
- zc->zc_guid = id;
-
- return (error);
-}
-
-static int
-zfs_ioc_error_log(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- size_t count = (size_t)zc->zc_nvlist_dst_size;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
- &count);
- if (error == 0)
- zc->zc_nvlist_dst_size = count;
- else
- zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_clear(zfs_cmd_t *zc)
-{
- spa_t *spa;
- vdev_t *vd;
- int error;
-
- /*
- * On zpool clear we also fix up missing slogs
- */
- mutex_enter(&spa_namespace_lock);
- spa = spa_lookup(zc->zc_name);
- if (spa == NULL) {
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(EIO));
- }
- if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
- /* we need to let spa_open/spa_load clear the chains */
- spa_set_log_state(spa, SPA_LOG_CLEAR);
- }
- spa->spa_last_open_failed = 0;
- mutex_exit(&spa_namespace_lock);
-
- if (zc->zc_cookie & ZPOOL_NO_REWIND) {
- error = spa_open(zc->zc_name, &spa, FTAG);
- } else {
- nvlist_t *policy;
- nvlist_t *config = NULL;
-
- if (zc->zc_nvlist_src == 0)
- return (SET_ERROR(EINVAL));
-
- if ((error = get_nvlist(zc->zc_nvlist_src,
- zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
- error = spa_open_rewind(zc->zc_name, &spa, FTAG,
- policy, &config);
- if (config != NULL) {
- int err;
-
- if ((err = put_nvlist(zc, config)) != 0)
- error = err;
- nvlist_free(config);
- }
- nvlist_free(policy);
- }
- }
-
- if (error != 0)
- return (error);
-
- /*
- * If multihost is enabled, resuming I/O is unsafe as another
- * host may have imported the pool.
- */
- if (spa_multihost(spa) && spa_suspended(spa))
- return (SET_ERROR(EINVAL));
-
- spa_vdev_state_enter(spa, SCL_NONE);
-
- if (zc->zc_guid == 0) {
- vd = NULL;
- } else {
- vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
- if (vd == NULL) {
- (void) spa_vdev_state_exit(spa, NULL, ENODEV);
- spa_close(spa, FTAG);
- return (SET_ERROR(ENODEV));
- }
- }
-
- vdev_clear(spa, vd);
-
- (void) spa_vdev_state_exit(spa, NULL, 0);
-
- /*
- * Resume any suspended I/Os.
- */
- if (zio_resume(spa) != 0)
- error = SET_ERROR(EIO);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-/*
- * Reopen all the vdevs associated with the pool.
- *
- * innvl: {
- * "scrub_restart" -> when true and scrub is running, allow to restart
- * scrub as the side effect of the reopen (boolean).
- * }
- *
- * outnvl is unused
- */
-static const zfs_ioc_key_t zfs_keys_pool_reopen[] = {
- {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
-};
-
-static int
-zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
-{
- spa_t *spa;
- int error;
- boolean_t scrub_restart = B_TRUE;
-
- if (innvl) {
- scrub_restart = fnvlist_lookup_boolean_value(innvl,
- "scrub_restart");
- }
-
- error = spa_open(pool, &spa, FTAG);
- if (error != 0)
- return (error);
-
- spa_vdev_state_enter(spa, SCL_NONE);
-
- /*
- * If a resilver is already in progress then set the
- * spa_scrub_reopen flag to B_TRUE so that we don't restart
- * the scan as a side effect of the reopen. Otherwise, let
- * vdev_open() decided if a resilver is required.
- */
- spa->spa_scrub_reopen = (!scrub_restart &&
- dsl_scan_resilvering(spa->spa_dsl_pool));
- vdev_reopen(spa->spa_root_vdev);
- spa->spa_scrub_reopen = B_FALSE;
-
- (void) spa_vdev_state_exit(spa, NULL, 0);
- spa_close(spa, FTAG);
- return (0);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- *
- * outputs:
- * zc_string name of conflicting snapshot, if there is one
- */
-static int
-zfs_ioc_promote(zfs_cmd_t *zc)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *ds, *ods;
- char origin[ZFS_MAX_DATASET_NAME_LEN];
- char *cp;
- int error;
-
- zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
- if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
- strchr(zc->zc_name, '%'))
- return (SET_ERROR(EINVAL));
-
- error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- if (!dsl_dir_is_clone(ds->ds_dir)) {
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (SET_ERROR(EINVAL));
- }
-
- error = dsl_dataset_hold_obj(dp,
- dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
- if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- dsl_dataset_name(ods, origin);
- dsl_dataset_rele(ods, FTAG);
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
-
- /*
- * We don't need to unmount *all* the origin fs's snapshots, but
- * it's easier.
- */
- cp = strchr(origin, '@');
- if (cp)
- *cp = '\0';
- (void) dmu_objset_find(origin,
- zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
- return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
-}
-
-/*
- * Retrieve a single {user|group}{used|quota}@... property.
- *
- * inputs:
- * zc_name name of filesystem
- * zc_objset_type zfs_userquota_prop_t
- * zc_value domain name (eg. "S-1-234-567-89")
- * zc_guid RID/UID/GID
- *
- * outputs:
- * zc_cookie property value
- */
-static int
-zfs_ioc_userspace_one(zfs_cmd_t *zc)
-{
- zfsvfs_t *zfsvfs;
- int error;
-
- if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
- return (SET_ERROR(EINVAL));
-
- error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
- if (error != 0)
- return (error);
-
- error = zfs_userspace_one(zfsvfs,
- zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
- zfsvfs_rele(zfsvfs, FTAG);
-
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_cookie zap cursor
- * zc_objset_type zfs_userquota_prop_t
- * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
- *
- * outputs:
- * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t)
- * zc_cookie zap cursor
- */
-static int
-zfs_ioc_userspace_many(zfs_cmd_t *zc)
-{
- zfsvfs_t *zfsvfs;
- int bufsize = zc->zc_nvlist_dst_size;
-
- if (bufsize <= 0)
- return (SET_ERROR(ENOMEM));
-
- int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
- if (error != 0)
- return (error);
-
- void *buf = kmem_alloc(bufsize, KM_SLEEP);
-
- error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
- buf, &zc->zc_nvlist_dst_size);
-
- if (error == 0) {
- error = ddi_copyout(buf,
- (void *)(uintptr_t)zc->zc_nvlist_dst,
- zc->zc_nvlist_dst_size, zc->zc_iflags);
- }
- kmem_free(buf, bufsize);
- zfsvfs_rele(zfsvfs, FTAG);
-
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- *
- * outputs:
- * none
- */
-static int
-zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error = 0;
- zfsvfs_t *zfsvfs;
-
- if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
- if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
- /*
- * If userused is not enabled, it may be because the
- * objset needs to be closed & reopened (to grow the
- * objset_phys_t). Suspend/resume the fs will do that.
- */
- dsl_dataset_t *ds, *newds;
-
- ds = dmu_objset_ds(zfsvfs->z_os);
- error = zfs_suspend_fs(zfsvfs);
- if (error == 0) {
- dmu_objset_refresh_ownership(ds, &newds,
- zfsvfs);
- error = zfs_resume_fs(zfsvfs, newds);
- }
- }
- if (error == 0)
- error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
-#ifdef illumos
- VFS_RELE(zfsvfs->z_vfs);
-#else
- vfs_unbusy(zfsvfs->z_vfs);
-#endif
- } else {
- /* XXX kind of reading contents without owning */
- error = dmu_objset_hold(zc->zc_name, FTAG, &os);
- if (error != 0)
- return (error);
-
- error = dmu_objset_userspace_upgrade(os);
- dmu_objset_rele(os, FTAG);
- }
-
- return (error);
-}
-
-#ifdef illumos
-/*
- * We don't want to have a hard dependency
- * against some special symbols in sharefs
- * nfs, and smbsrv. Determine them if needed when
- * the first file system is shared.
- * Neither sharefs, nfs or smbsrv are unloadable modules.
- */
-int (*znfsexport_fs)(void *arg);
-int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
-int (*zsmbexport_fs)(void *arg, boolean_t add_share);
-
-int zfs_nfsshare_inited;
-int zfs_smbshare_inited;
-
-ddi_modhandle_t nfs_mod;
-ddi_modhandle_t sharefs_mod;
-ddi_modhandle_t smbsrv_mod;
-#endif /* illumos */
-kmutex_t zfs_share_lock;
-
-#ifdef illumos
-static int
-zfs_init_sharefs()
-{
- int error;
-
- ASSERT(MUTEX_HELD(&zfs_share_lock));
- /* Both NFS and SMB shares also require sharetab support. */
- if (sharefs_mod == NULL && ((sharefs_mod =
- ddi_modopen("fs/sharefs",
- KRTLD_MODE_FIRST, &error)) == NULL)) {
- return (SET_ERROR(ENOSYS));
- }
- if (zshare_fs == NULL && ((zshare_fs =
- (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
- ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) {
- return (SET_ERROR(ENOSYS));
- }
- return (0);
-}
-#endif /* illumos */
-
-static int
-zfs_ioc_share(zfs_cmd_t *zc)
-{
-#ifdef illumos
- int error;
- int opcode;
-
- switch (zc->zc_share.z_sharetype) {
- case ZFS_SHARE_NFS:
- case ZFS_UNSHARE_NFS:
- if (zfs_nfsshare_inited == 0) {
- mutex_enter(&zfs_share_lock);
- if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs",
- KRTLD_MODE_FIRST, &error)) == NULL)) {
- mutex_exit(&zfs_share_lock);
- return (SET_ERROR(ENOSYS));
- }
- if (znfsexport_fs == NULL &&
- ((znfsexport_fs = (int (*)(void *))
- ddi_modsym(nfs_mod,
- "nfs_export", &error)) == NULL)) {
- mutex_exit(&zfs_share_lock);
- return (SET_ERROR(ENOSYS));
- }
- error = zfs_init_sharefs();
- if (error != 0) {
- mutex_exit(&zfs_share_lock);
- return (SET_ERROR(ENOSYS));
- }
- zfs_nfsshare_inited = 1;
- mutex_exit(&zfs_share_lock);
- }
- break;
- case ZFS_SHARE_SMB:
- case ZFS_UNSHARE_SMB:
- if (zfs_smbshare_inited == 0) {
- mutex_enter(&zfs_share_lock);
- if (smbsrv_mod == NULL && ((smbsrv_mod =
- ddi_modopen("drv/smbsrv",
- KRTLD_MODE_FIRST, &error)) == NULL)) {
- mutex_exit(&zfs_share_lock);
- return (SET_ERROR(ENOSYS));
- }
- if (zsmbexport_fs == NULL && ((zsmbexport_fs =
- (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod,
- "smb_server_share", &error)) == NULL)) {
- mutex_exit(&zfs_share_lock);
- return (SET_ERROR(ENOSYS));
- }
- error = zfs_init_sharefs();
- if (error != 0) {
- mutex_exit(&zfs_share_lock);
- return (SET_ERROR(ENOSYS));
- }
- zfs_smbshare_inited = 1;
- mutex_exit(&zfs_share_lock);
- }
- break;
- default:
- return (SET_ERROR(EINVAL));
- }
-
- switch (zc->zc_share.z_sharetype) {
- case ZFS_SHARE_NFS:
- case ZFS_UNSHARE_NFS:
- if (error =
- znfsexport_fs((void *)
- (uintptr_t)zc->zc_share.z_exportdata))
- return (error);
- break;
- case ZFS_SHARE_SMB:
- case ZFS_UNSHARE_SMB:
- if (error = zsmbexport_fs((void *)
- (uintptr_t)zc->zc_share.z_exportdata,
- zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
- B_TRUE: B_FALSE)) {
- return (error);
- }
- break;
- }
-
- opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
- zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ?
- SHAREFS_ADD : SHAREFS_REMOVE;
-
- /*
- * Add or remove share from sharetab
- */
- error = zshare_fs(opcode,
- (void *)(uintptr_t)zc->zc_share.z_sharedata,
- zc->zc_share.z_sharemax);
-
- return (error);
-
-#else /* !illumos */
- return (ENOSYS);
-#endif /* illumos */
-}
-
-ace_t full_access[] = {
- {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
-};
-
-/*
- * inputs:
- * zc_name name of containing filesystem
- * zc_obj object # beyond which we want next in-use object #
- *
- * outputs:
- * zc_obj next in-use object #
- */
-static int
-zfs_ioc_next_obj(zfs_cmd_t *zc)
-{
- objset_t *os = NULL;
- int error;
-
- error = dmu_objset_hold(zc->zc_name, FTAG, &os);
- if (error != 0)
- return (error);
-
- error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0);
-
- dmu_objset_rele(os, FTAG);
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_value prefix name for snapshot
- * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
- *
- * outputs:
- * zc_value short name of new snapshot
- */
-static int
-zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
-{
- char *snap_name;
- char *hold_name;
- int error;
- minor_t minor;
-
- error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
- if (error != 0)
- return (error);
-
- snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
- (u_longlong_t)ddi_get_lbolt64());
- hold_name = kmem_asprintf("%%%s", zc->zc_value);
-
- error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
- hold_name);
- if (error == 0)
- (void) strcpy(zc->zc_value, snap_name);
- strfree(snap_name);
- strfree(hold_name);
- zfs_onexit_fd_rele(zc->zc_cleanup_fd);
- return (error);
-}
-
-/*
- * inputs:
- * zc_name name of "to" snapshot
- * zc_value name of "from" snapshot
- * zc_cookie file descriptor to write diff data on
- *
- * outputs:
- * dmu_diff_record_t's to the file descriptor
- */
-static int
-zfs_ioc_diff(zfs_cmd_t *zc)
-{
- file_t *fp;
- offset_t off;
- int error;
-
-#ifdef illumos
- fp = getf(zc->zc_cookie);
-#else
- fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp);
-#endif
- if (fp == NULL)
- return (SET_ERROR(EBADF));
-
- off = fp->f_offset;
-
-#ifdef illumos
- error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off);
-#else
- error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
-#endif
-
- if (off >= 0 && off <= MAXOFFSET_T)
- fp->f_offset = off;
- releasef(zc->zc_cookie);
-
- return (error);
-}
-
-#ifdef illumos
-/*
- * Remove all ACL files in shares dir
- */
-static int
-zfs_smb_acl_purge(znode_t *dzp)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- int error;
-
- for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
- (error = zap_cursor_retrieve(&zc, &zap)) == 0;
- zap_cursor_advance(&zc)) {
- if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred,
- NULL, 0)) != 0)
- break;
- }
- zap_cursor_fini(&zc);
- return (error);
-}
-#endif /* illumos */
-
-static int
-zfs_ioc_smb_acl(zfs_cmd_t *zc)
-{
-#ifdef illumos
- vnode_t *vp;
- znode_t *dzp;
- vnode_t *resourcevp = NULL;
- znode_t *sharedir;
- zfsvfs_t *zfsvfs;
- nvlist_t *nvlist;
- char *src, *target;
- vattr_t vattr;
- vsecattr_t vsec;
- int error = 0;
-
- if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
- NO_FOLLOW, NULL, &vp)) != 0)
- return (error);
-
- /* Now make sure mntpnt and dataset are ZFS */
-
- if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
- (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
- zc->zc_name) != 0)) {
- VN_RELE(vp);
- return (SET_ERROR(EINVAL));
- }
-
- dzp = VTOZ(vp);
- zfsvfs = dzp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
-
- /*
- * Create share dir if its missing.
- */
- mutex_enter(&zfsvfs->z_lock);
- if (zfsvfs->z_shares_dir == 0) {
- dmu_tx_t *tx;
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
- ZFS_SHARES_DIR);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0) {
- dmu_tx_abort(tx);
- } else {
- error = zfs_create_share_dir(zfsvfs, tx);
- dmu_tx_commit(tx);
- }
- if (error != 0) {
- mutex_exit(&zfsvfs->z_lock);
- VN_RELE(vp);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
- mutex_exit(&zfsvfs->z_lock);
-
- ASSERT(zfsvfs->z_shares_dir);
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
- VN_RELE(vp);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- switch (zc->zc_cookie) {
- case ZFS_SMB_ACL_ADD:
- vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
- vattr.va_type = VREG;
- vattr.va_mode = S_IFREG|0777;
- vattr.va_uid = 0;
- vattr.va_gid = 0;
-
- vsec.vsa_mask = VSA_ACE;
- vsec.vsa_aclentp = &full_access;
- vsec.vsa_aclentsz = sizeof (full_access);
- vsec.vsa_aclcnt = 1;
-
- error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
- &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
- if (resourcevp)
- VN_RELE(resourcevp);
- break;
-
- case ZFS_SMB_ACL_REMOVE:
- error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
- NULL, 0);
- break;
-
- case ZFS_SMB_ACL_RENAME:
- if ((error = get_nvlist(zc->zc_nvlist_src,
- zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
- VN_RELE(vp);
- VN_RELE(ZTOV(sharedir));
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
- nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
- &target)) {
- VN_RELE(vp);
- VN_RELE(ZTOV(sharedir));
- ZFS_EXIT(zfsvfs);
- nvlist_free(nvlist);
- return (error);
- }
- error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
- kcred, NULL, 0);
- nvlist_free(nvlist);
- break;
-
- case ZFS_SMB_ACL_PURGE:
- error = zfs_smb_acl_purge(sharedir);
- break;
-
- default:
- error = SET_ERROR(EINVAL);
- break;
- }
-
- VN_RELE(vp);
- VN_RELE(ZTOV(sharedir));
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-#else /* !illumos */
- return (EOPNOTSUPP);
-#endif /* illumos */
-}
-
-/*
- * innvl: {
- * "holds" -> { snapname -> holdname (string), ... }
- * (optional) "cleanup_fd" -> fd (int32)
- * }
- *
- * outnvl: {
- * snapname -> error value (int32)
- * ...
- * }
- */
-static const zfs_ioc_key_t zfs_keys_hold[] = {
- {"holds", DATA_TYPE_NVLIST, 0},
- {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
-{
- nvpair_t *pair;
- nvlist_t *holds;
- int cleanup_fd = -1;
- int error;
- minor_t minor = 0;
-
- holds = fnvlist_lookup_nvlist(args, "holds");
-
- /* make sure the user didn't pass us any invalid (empty) tags */
- for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
- pair = nvlist_next_nvpair(holds, pair)) {
- char *htag;
-
- error = nvpair_value_string(pair, &htag);
- if (error != 0)
- return (SET_ERROR(error));
-
- if (strlen(htag) == 0)
- return (SET_ERROR(EINVAL));
- }
-
- if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
- error = zfs_onexit_fd_hold(cleanup_fd, &minor);
- if (error != 0)
- return (error);
- }
-
- error = dsl_dataset_user_hold(holds, minor, errlist);
- if (minor != 0)
- zfs_onexit_fd_rele(cleanup_fd);
- return (error);
-}
-
-/*
- * innvl is not used.
- *
- * outnvl: {
- * holdname -> time added (uint64 seconds since epoch)
- * ...
- * }
- */
-static const zfs_ioc_key_t zfs_keys_get_holds[] = {
- /* no nvl keys */
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
-{
- return (dsl_dataset_get_holds(snapname, outnvl));
-}
-
-/*
- * innvl: {
- * snapname -> { holdname, ... }
- * ...
- * }
- *
- * outnvl: {
- * snapname -> error value (int32)
- * ...
- * }
- */
-static const zfs_ioc_key_t zfs_keys_release[] = {
- {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
-{
- return (dsl_dataset_user_release(holds, errlist));
-}
-
-/*
- * inputs:
- * zc_name name of new filesystem or snapshot
- * zc_value full name of old snapshot
- *
- * outputs:
- * zc_cookie space in bytes
- * zc_objset_type compressed space in bytes
- * zc_perm_action uncompressed space in bytes
- */
-static int
-zfs_ioc_space_written(zfs_cmd_t *zc)
-{
- int error;
- dsl_pool_t *dp;
- dsl_dataset_t *new, *old;
-
- error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
- if (error != 0)
- return (error);
- error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
- error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
- if (error != 0) {
- dsl_dataset_rele(new, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- error = dsl_dataset_space_written(old, new, &zc->zc_cookie,
- &zc->zc_objset_type, &zc->zc_perm_action);
- dsl_dataset_rele(old, FTAG);
- dsl_dataset_rele(new, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
-}
-
-/*
- * innvl: {
- * "firstsnap" -> snapshot name
- * }
- *
- * outnvl: {
- * "used" -> space in bytes
- * "compressed" -> compressed space in bytes
- * "uncompressed" -> uncompressed space in bytes
- * }
- */
-static const zfs_ioc_key_t zfs_keys_space_snaps[] = {
- {"firstsnap", DATA_TYPE_STRING, 0},
-};
-
-static int
-zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
-{
- int error;
- dsl_pool_t *dp;
- dsl_dataset_t *new, *old;
- char *firstsnap;
- uint64_t used, comp, uncomp;
-
- firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
-
- error = dsl_pool_hold(lastsnap, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
- if (error == 0 && !new->ds_is_snapshot) {
- dsl_dataset_rele(new, FTAG);
- error = SET_ERROR(EINVAL);
- }
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
- error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
- if (error == 0 && !old->ds_is_snapshot) {
- dsl_dataset_rele(old, FTAG);
- error = SET_ERROR(EINVAL);
- }
- if (error != 0) {
- dsl_dataset_rele(new, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
- dsl_dataset_rele(old, FTAG);
- dsl_dataset_rele(new, FTAG);
- dsl_pool_rele(dp, FTAG);
- fnvlist_add_uint64(outnvl, "used", used);
- fnvlist_add_uint64(outnvl, "compressed", comp);
- fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
- return (error);
-}
-
-static int
-zfs_ioc_jail(zfs_cmd_t *zc)
-{
-
- return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
- (int)zc->zc_jailid));
-}
-
-static int
-zfs_ioc_unjail(zfs_cmd_t *zc)
-{
-
- return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
- (int)zc->zc_jailid));
-}
-
-/*
- * innvl: {
- * "fd" -> file descriptor to write stream to (int32)
- * (optional) "fromsnap" -> full snap name to send an incremental from
- * (optional) "largeblockok" -> (value ignored)
- * indicates that blocks > 128KB are permitted
- * (optional) "embedok" -> (value ignored)
- * presence indicates DRR_WRITE_EMBEDDED records are permitted
- * (optional) "compressok" -> (value ignored)
- * presence indicates compressed DRR_WRITE records are permitted
- * (optional) "resume_object" and "resume_offset" -> (uint64)
- * if present, resume send stream from specified object and offset.
- * }
- *
- * outnvl is unused
- */
-static const zfs_ioc_key_t zfs_keys_send_new[] = {
- {"fd", DATA_TYPE_INT32, 0},
- {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
- {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
- {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
- {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
- {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
- {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL},
- {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- file_t *fp;
- int error;
- offset_t off;
- char *fromname = NULL;
- int fd;
- boolean_t largeblockok;
- boolean_t embedok;
- boolean_t compressok;
- uint64_t resumeobj = 0;
- uint64_t resumeoff = 0;
-
- fd = fnvlist_lookup_int32(innvl, "fd");
-
- (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
-
- largeblockok = nvlist_exists(innvl, "largeblockok");
- embedok = nvlist_exists(innvl, "embedok");
- compressok = nvlist_exists(innvl, "compressok");
-
- (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
- (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
-
-#ifdef illumos
- file_t *fp = getf(fd);
-#else
- fget_write(curthread, fd, &cap_write_rights, &fp);
-#endif
- if (fp == NULL)
- return (SET_ERROR(EBADF));
-
- off = fp->f_offset;
- error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
-#ifdef illumos
- fd, resumeobj, resumeoff, fp->f_vnode, &off);
-#else
- fd, resumeobj, resumeoff, fp, &off);
-#endif
-
-#ifdef illumos
- if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
- fp->f_offset = off;
-#else
- fp->f_offset = off;
-#endif
-
- releasef(fd);
- return (error);
-}
-
-/*
- * Determine approximately how large a zfs send stream will be -- the number
- * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
- *
- * innvl: {
- * (optional) "from" -> full snap or bookmark name to send an incremental
- * from
- * (optional) "largeblockok" -> (value ignored)
- * indicates that blocks > 128KB are permitted
- * (optional) "embedok" -> (value ignored)
- * presence indicates DRR_WRITE_EMBEDDED records are permitted
- * (optional) "compressok" -> (value ignored)
- * presence indicates compressed DRR_WRITE records are permitted
- * }
- *
- * outnvl: {
- * "space" -> bytes of space (uint64)
- * }
- */
-static const zfs_ioc_key_t zfs_keys_send_space[] = {
- {"from", DATA_TYPE_STRING, ZK_OPTIONAL},
- {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
- {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
- {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
- {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
- {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
-};
-
-static int
-zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
-{
- dsl_pool_t *dp;
- dsl_dataset_t *tosnap;
- int error;
- char *fromname;
- boolean_t compressok;
- uint64_t space;
-
- error = dsl_pool_hold(snapname, FTAG, &dp);
- if (error != 0)
- return (error);
-
- error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
- if (error != 0) {
- dsl_pool_rele(dp, FTAG);
- return (error);
- }
-
- compressok = nvlist_exists(innvl, "compressok");
-
- error = nvlist_lookup_string(innvl, "from", &fromname);
- if (error == 0) {
- if (strchr(fromname, '@') != NULL) {
- /*
- * If from is a snapshot, hold it and use the more
- * efficient dmu_send_estimate to estimate send space
- * size using deadlists.
- */
- dsl_dataset_t *fromsnap;
- error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
- if (error != 0)
- goto out;
- error = dmu_send_estimate(tosnap, fromsnap, compressok,
- &space);
- dsl_dataset_rele(fromsnap, FTAG);
- } else if (strchr(fromname, '#') != NULL) {
- /*
- * If from is a bookmark, fetch the creation TXG of the
- * snapshot it was created from and use that to find
- * blocks that were born after it.
- */
- zfs_bookmark_phys_t frombm;
-
- error = dsl_bookmark_lookup(dp, fromname, tosnap,
- &frombm);
- if (error != 0)
- goto out;
- error = dmu_send_estimate_from_txg(tosnap,
- frombm.zbm_creation_txg, compressok, &space);
- } else {
- /*
- * from is not properly formatted as a snapshot or
- * bookmark
- */
- error = SET_ERROR(EINVAL);
- goto out;
- }
- } else {
- /*
- * If estimating the size of a full send, use dmu_send_estimate.
- */
- error = dmu_send_estimate(tosnap, NULL, compressok, &space);
- }
-
- fnvlist_add_uint64(outnvl, "space", space);
-
-out:
- dsl_dataset_rele(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
-}
-
-/*
- * Sync the currently open TXG to disk for the specified pool.
- * This is somewhat similar to 'zfs_sync()'.
- * For cases that do not result in error this ioctl will wait for
- * the currently open TXG to commit before returning back to the caller.
- *
- * innvl: {
- * "force" -> when true, force uberblock update even if there is no dirty data.
- * In addition this will cause the vdev configuration to be written
- * out including updating the zpool cache file. (boolean_t)
- * }
- *
- * onvl is unused
- */
-static const zfs_ioc_key_t zfs_keys_pool_sync[] = {
- {"force", DATA_TYPE_BOOLEAN_VALUE, 0},
-};
-
-/* ARGSUSED */
-static int
-zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
-{
- int err;
- boolean_t force;
- spa_t *spa;
-
- if ((err = spa_open(pool, &spa, FTAG)) != 0)
- return (err);
-
- force = fnvlist_lookup_boolean_value(innvl, "force");
- if (force) {
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
- vdev_config_dirty(spa->spa_root_vdev);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- }
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- spa_close(spa, FTAG);
-
- return (err);
-}
-
-static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
-
-static void
-zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
- boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
-{
- zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
-
- ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
- ASSERT3U(ioc, <, ZFS_IOC_LAST);
- ASSERT3P(vec->zvec_legacy_func, ==, NULL);
- ASSERT3P(vec->zvec_func, ==, NULL);
-
- vec->zvec_legacy_func = func;
- vec->zvec_secpolicy = secpolicy;
- vec->zvec_namecheck = namecheck;
- vec->zvec_allow_log = log_history;
- vec->zvec_pool_check = pool_check;
-}
-
-/*
- * See the block comment at the beginning of this file for details on
- * each argument to this function.
- */
-static void
-zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
- zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
- zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
- boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys)
-{
- zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
-
- ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
- ASSERT3U(ioc, <, ZFS_IOC_LAST);
- ASSERT3P(vec->zvec_legacy_func, ==, NULL);
- ASSERT3P(vec->zvec_func, ==, NULL);
-
- /* if we are logging, the name must be valid */
- ASSERT(!allow_log || namecheck != NO_NAME);
-
- vec->zvec_name = name;
- vec->zvec_func = func;
- vec->zvec_secpolicy = secpolicy;
- vec->zvec_namecheck = namecheck;
- vec->zvec_pool_check = pool_check;
- vec->zvec_smush_outnvlist = smush_outnvlist;
- vec->zvec_allow_log = allow_log;
- vec->zvec_nvl_keys = nvl_keys;
- vec->zvec_nvl_key_count = num_keys;
-}
-
-static void
-zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
- zfs_ioc_poolcheck_t pool_check)
-{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
- POOL_NAME, log_history, pool_check);
-}
-
-static void
-zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
-{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
- DATASET_NAME, B_FALSE, pool_check);
-}
-
-static void
-zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
-{
- zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
- POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
-}
-
-static void
-zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy)
-{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
- NO_NAME, B_FALSE, POOL_CHECK_NONE);
-}
-
-static void
-zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
- zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
-{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
- DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
-}
-
-static void
-zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
-{
- zfs_ioctl_register_dataset_read_secpolicy(ioc, func,
- zfs_secpolicy_read);
-}
-
-static void
-zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy)
-{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
- DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
-}
-
-static void
-zfs_ioctl_init(void)
-{
- zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
- zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));
-
- zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
- zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
- zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history));
-
- zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
- zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
- zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps));
-
- zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
- zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
- zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new));
-
- zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
- zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
- zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space));
-
- zfs_ioctl_register("create", ZFS_IOC_CREATE,
- zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_create, ARRAY_SIZE(zfs_keys_create));
-
- zfs_ioctl_register("clone", ZFS_IOC_CLONE,
- zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone));
-
- zfs_ioctl_register("remap", ZFS_IOC_REMAP,
- zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
- zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap));
-
- zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
- zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps));
-
- zfs_ioctl_register("hold", ZFS_IOC_HOLD,
- zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold));
- zfs_ioctl_register("release", ZFS_IOC_RELEASE,
- zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_release, ARRAY_SIZE(zfs_keys_release));
-
- zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
- zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
- zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds));
-
- zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
- zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
- zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback));
-
- zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
- zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark));
-
- zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
- zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
- zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks));
-
- zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
- zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
- POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_destroy_bookmarks,
- ARRAY_SIZE(zfs_keys_destroy_bookmarks));
-
- zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
- zfs_ioc_channel_program, zfs_secpolicy_config,
- POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
- B_TRUE, zfs_keys_channel_program,
- ARRAY_SIZE(zfs_keys_channel_program));
-
- zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
- zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint));
-
- zfs_ioctl_register("zpool_discard_checkpoint",
- ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
- zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_pool_discard_checkpoint,
- ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));
-
- zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
- zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
- zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
-
- zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
- zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
- zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync));
- zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
- zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE,
- B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen));
-
- zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV,
- zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
- zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv));
-
- zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV,
- zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE,
- zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv));
-
- /* IOCTLS that use the legacy function signature */
-
- zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
- zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
-
- zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
- zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
- zfs_ioc_pool_scan);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
- zfs_ioc_pool_upgrade);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
- zfs_ioc_vdev_add);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
- zfs_ioc_vdev_remove);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
- zfs_ioc_vdev_set_state);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
- zfs_ioc_vdev_attach);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
- zfs_ioc_vdev_detach);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
- zfs_ioc_vdev_setpath);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
- zfs_ioc_vdev_setfru);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
- zfs_ioc_pool_set_props);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
- zfs_ioc_vdev_split);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
- zfs_ioc_pool_reguid);
-
- zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
- zfs_ioc_pool_configs, zfs_secpolicy_none);
- zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
- zfs_ioc_pool_tryimport, zfs_secpolicy_config);
- zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
- zfs_ioc_inject_fault, zfs_secpolicy_inject);
- zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
- zfs_ioc_clear_fault, zfs_secpolicy_inject);
- zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
- zfs_ioc_inject_list_next, zfs_secpolicy_inject);
-
- /*
- * pool destroy, and export don't log the history as part of
- * zfsdev_ioctl, but rather zfs_ioc_pool_export
- * does the logging of those commands.
- */
- zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
- zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
- zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
- zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
-
- zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
- zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
- zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
- zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
-
- zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
- zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
- zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
- zfs_ioc_dsobj_to_dsname,
- zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
- zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
- zfs_ioc_pool_get_history,
- zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
-
- zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
- zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
-
- zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
- zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY);
-
- zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
- zfs_ioc_space_written);
- zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
- zfs_ioc_objset_recvd_props);
- zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
- zfs_ioc_next_obj);
- zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
- zfs_ioc_get_fsacl);
- zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
- zfs_ioc_objset_stats);
- zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
- zfs_ioc_objset_zplprops);
- zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
- zfs_ioc_dataset_list_next);
- zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
- zfs_ioc_snapshot_list_next);
- zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
- zfs_ioc_send_progress);
-
- zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
- zfs_ioc_diff, zfs_secpolicy_diff);
- zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
- zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
- zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
- zfs_ioc_obj_to_path, zfs_secpolicy_diff);
- zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
- zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
- zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
- zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
- zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
- zfs_ioc_send, zfs_secpolicy_send);
-
- zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
- zfs_secpolicy_none);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
- zfs_secpolicy_destroy);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
- zfs_secpolicy_recv);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
- zfs_secpolicy_promote);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
- zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
- zfs_secpolicy_set_fsacl);
-
- /*
- * Not using zfs_ioctl_register_dataset_modify as DATASET_NAME check
- * won't allow a bookmark name.
- */
- zfs_ioctl_register_legacy(ZFS_IOC_RENAME, zfs_ioc_rename,
- zfs_secpolicy_rename, ENTITY_NAME, B_TRUE,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
-
- zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
- zfs_secpolicy_share, POOL_CHECK_NONE);
- zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
- zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
- zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
- zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
- zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
- zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
-
-#ifdef __FreeBSD__
- zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
- zfs_secpolicy_config, POOL_CHECK_NONE);
- zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
- zfs_secpolicy_config, POOL_CHECK_NONE);
- zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
- zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
- POOL_CHECK_NONE, B_FALSE, B_FALSE,
- zfs_keys_nextboot, ARRAY_SIZE(zfs_keys_nextboot));
-#endif
-}
-
-/*
- * Verify that for non-legacy ioctls the input nvlist
- * pairs match against the expected input.
- *
- * Possible errors are:
- * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered
- * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing
- * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair
- */
-static int
-zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec)
-{
- const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys;
- boolean_t required_keys_found = B_FALSE;
-
- /*
- * examine each input pair
- */
- for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
- pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
- char *name = nvpair_name(pair);
- data_type_t type = nvpair_type(pair);
- boolean_t identified = B_FALSE;
-
- /*
- * check pair against the documented names and type
- */
- for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
- /* if not a wild card name, check for an exact match */
- if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 &&
- strcmp(nvl_keys[k].zkey_name, name) != 0)
- continue;
-
- identified = B_TRUE;
-
- if (nvl_keys[k].zkey_type != DATA_TYPE_ANY &&
- nvl_keys[k].zkey_type != type) {
- return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE));
- }
-
- if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
- continue;
-
- required_keys_found = B_TRUE;
- break;
- }
-
- /* allow an 'optional' key, everything else is invalid */
- if (!identified &&
- (strcmp(name, "optional") != 0 ||
- type != DATA_TYPE_NVLIST)) {
- return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL));
- }
- }
-
- /* verify that all required keys were found */
- for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
- if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
- continue;
-
- if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) {
- /* at least one non-optionial key is expected here */
- if (!required_keys_found)
- return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
- continue;
- }
-
- if (!nvlist_exists(innvl, nvl_keys[k].zkey_name))
- return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
- }
-
- return (0);
-}
-
-int
-pool_status_check(const char *name, zfs_ioc_namecheck_t type,
- zfs_ioc_poolcheck_t check)
-{
- spa_t *spa;
- int error;
-
- ASSERT(type == POOL_NAME || type == DATASET_NAME ||
- type == ENTITY_NAME);
-
- if (check & POOL_CHECK_NONE)
- return (0);
-
- error = spa_open(name, &spa, FTAG);
- if (error == 0) {
- if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
- error = SET_ERROR(EAGAIN);
- else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
- error = SET_ERROR(EROFS);
- spa_close(spa, FTAG);
- }
- return (error);
-}
-
-/*
- * Find a free minor number.
- */
-minor_t
-zfsdev_minor_alloc(void)
-{
- static minor_t last_minor;
- minor_t m;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- for (m = last_minor + 1; m != last_minor; m++) {
- if (m > ZFSDEV_MAX_MINOR)
- m = 1;
- if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
- last_minor = m;
- return (m);
- }
- }
-
- return (0);
-}
-
-static int
-zfs_ctldev_init(struct cdev *devp)
-{
- minor_t minor;
- zfs_soft_state_t *zs;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- minor = zfsdev_minor_alloc();
- if (minor == 0)
- return (SET_ERROR(ENXIO));
-
- if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
- return (SET_ERROR(EAGAIN));
-
- devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
-
- zs = ddi_get_soft_state(zfsdev_state, minor);
- zs->zss_type = ZSST_CTLDEV;
- zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
-
- return (0);
-}
-
-static void
-zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- zfs_onexit_destroy(zo);
- ddi_soft_state_free(zfsdev_state, minor);
-}
-
-void *
-zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
-{
- zfs_soft_state_t *zp;
-
- zp = ddi_get_soft_state(zfsdev_state, minor);
- if (zp == NULL || zp->zss_type != which)
- return (NULL);
-
- return (zp->zss_data);
-}
-
-static int
-zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
-{
- int error = 0;
-
-#ifdef illumos
- if (getminor(*devp) != 0)
- return (zvol_open(devp, flag, otyp, cr));
-#endif
-
- /* This is the control device. Allocate a new minor if requested. */
- if (flag & FEXCL) {
- mutex_enter(&spa_namespace_lock);
- error = zfs_ctldev_init(devp);
- mutex_exit(&spa_namespace_lock);
- }
-
- return (error);
-}
-
-static void
-zfsdev_close(void *data)
-{
- zfs_onexit_t *zo;
- minor_t minor = (minor_t)(uintptr_t)data;
-
- if (minor == 0)
- return;
-
- mutex_enter(&spa_namespace_lock);
- zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
- if (zo == NULL) {
- mutex_exit(&spa_namespace_lock);
- return;
- }
- zfs_ctldev_destroy(zo, minor);
- mutex_exit(&spa_namespace_lock);
-}
-
-static int
-zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
- struct thread *td)
-{
- zfs_cmd_t *zc;
- uint_t vecnum;
- int error, rc, len;
-#ifdef illumos
- minor_t minor = getminor(dev);
-#else
- zfs_iocparm_t *zc_iocparm;
- int cflag, cmd, oldvecnum;
- boolean_t newioc, compat;
- void *compat_zc = NULL;
- cred_t *cr = td->td_ucred;
-#endif
- const zfs_ioc_vec_t *vec;
- char *saved_poolname = NULL;
- nvlist_t *innvl = NULL;
-
- cflag = ZFS_CMD_COMPAT_NONE;
- compat = B_FALSE;
- newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */
-
- len = IOCPARM_LEN(zcmd);
- vecnum = cmd = zcmd & 0xff;
-
- /*
- * Check if we are talking to supported older binaries
- * and translate zfs_cmd if necessary
- */
- if (len != sizeof(zfs_iocparm_t)) {
- newioc = B_FALSE;
- compat = B_TRUE;
-
- vecnum = cmd;
-
- switch (len) {
- case sizeof(zfs_cmd_zcmd_t):
- cflag = ZFS_CMD_COMPAT_LZC;
- break;
- case sizeof(zfs_cmd_deadman_t):
- cflag = ZFS_CMD_COMPAT_DEADMAN;
- break;
- case sizeof(zfs_cmd_v28_t):
- cflag = ZFS_CMD_COMPAT_V28;
- break;
- case sizeof(zfs_cmd_v15_t):
- if (cmd >= sizeof(zfs_ioctl_v15_to_v28) /
- sizeof(zfs_ioctl_v15_to_v28[0]))
- return (EINVAL);
-
- cflag = ZFS_CMD_COMPAT_V15;
- vecnum = zfs_ioctl_v15_to_v28[cmd];
-
- /*
- * Return without further handling
- * if the command is blacklisted.
- */
- if (vecnum == ZFS_IOC_COMPAT_PASS)
- return (0);
- else if (vecnum == ZFS_IOC_COMPAT_FAIL)
- return (ENOTSUP);
- break;
- default:
- return (EINVAL);
- }
- }
-
-#ifdef illumos
- vecnum = cmd - ZFS_IOC_FIRST;
- ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
-#endif
-
- if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
- return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
- vec = &zfs_ioc_vec[vecnum];
-
- zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
-
-#ifdef illumos
- error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
- if (error != 0) {
- error = SET_ERROR(EFAULT);
- goto out;
- }
-#else /* !illumos */
- bzero(zc, sizeof(zfs_cmd_t));
-
- if (newioc) {
- zc_iocparm = (void *)arg;
-
- switch (zc_iocparm->zfs_ioctl_version) {
- case ZFS_IOCVER_CURRENT:
- if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
- error = SET_ERROR(EINVAL);
- goto out;
- }
- break;
- case ZFS_IOCVER_INLANES:
- if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
- error = SET_ERROR(EFAULT);
- goto out;
- }
- compat = B_TRUE;
- cflag = ZFS_CMD_COMPAT_INLANES;
- break;
- case ZFS_IOCVER_RESUME:
- if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
- error = SET_ERROR(EFAULT);
- goto out;
- }
- compat = B_TRUE;
- cflag = ZFS_CMD_COMPAT_RESUME;
- break;
- case ZFS_IOCVER_EDBP:
- if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
- error = SET_ERROR(EFAULT);
- goto out;
- }
- compat = B_TRUE;
- cflag = ZFS_CMD_COMPAT_EDBP;
- break;
- case ZFS_IOCVER_ZCMD:
- if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
- zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
- error = SET_ERROR(EFAULT);
- goto out;
- }
- compat = B_TRUE;
- cflag = ZFS_CMD_COMPAT_ZCMD;
- break;
- default:
- error = SET_ERROR(EINVAL);
- goto out;
- /* NOTREACHED */
- }
-
- if (compat) {
- ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
- compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
- bzero(compat_zc, sizeof(zfs_cmd_t));
-
- error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
- compat_zc, zc_iocparm->zfs_cmd_size, flag);
- if (error != 0) {
- error = SET_ERROR(EFAULT);
- goto out;
- }
- } else {
- error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
- zc, zc_iocparm->zfs_cmd_size, flag);
- if (error != 0) {
- error = SET_ERROR(EFAULT);
- goto out;
- }
- }
- }
-
- if (compat) {
- if (newioc) {
- ASSERT(compat_zc != NULL);
- zfs_cmd_compat_get(zc, compat_zc, cflag);
- } else {
- ASSERT(compat_zc == NULL);
- zfs_cmd_compat_get(zc, arg, cflag);
- }
- oldvecnum = vecnum;
- error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
- if (error != 0)
- goto out;
- if (oldvecnum != vecnum)
- vec = &zfs_ioc_vec[vecnum];
- }
-#endif /* !illumos */
-
- zc->zc_iflags = flag & FKIOCTL;
- if (zc->zc_nvlist_src_size != 0) {
- error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &innvl);
- if (error != 0)
- goto out;
- }
-
- /* rewrite innvl for backwards compatibility */
- if (compat)
- innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);
-
- /*
- * Ensure that all pool/dataset names are valid before we pass down to
- * the lower layers.
- */
- zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
- switch (vec->zvec_namecheck) {
- case POOL_NAME:
- if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
- error = SET_ERROR(EINVAL);
- else
- error = pool_status_check(zc->zc_name,
- vec->zvec_namecheck, vec->zvec_pool_check);
- break;
-
- case DATASET_NAME:
- if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
- error = SET_ERROR(EINVAL);
- else
- error = pool_status_check(zc->zc_name,
- vec->zvec_namecheck, vec->zvec_pool_check);
- break;
-
- case ENTITY_NAME:
- if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) {
- error = SET_ERROR(EINVAL);
- } else {
- error = pool_status_check(zc->zc_name,
- vec->zvec_namecheck, vec->zvec_pool_check);
- }
- break;
-
- case NO_NAME:
- break;
- }
-
- /*
- * Ensure that all input pairs are valid before we pass them down
- * to the lower layers.
- *
- * The vectored functions can use fnvlist_lookup_{type} for any
- * required pairs since zfs_check_input_nvpairs() confirmed that
- * they exist and are of the correct type.
- */
- if (error == 0 && vec->zvec_func != NULL) {
- error = zfs_check_input_nvpairs(innvl, vec);
- if (error != 0)
- goto out;
- }
-
- if (error == 0)
- error = vec->zvec_secpolicy(zc, innvl, cr);
-
- if (error != 0)
- goto out;
-
- /* legacy ioctls can modify zc_name */
- len = strcspn(zc->zc_name, "/@#") + 1;
- saved_poolname = kmem_alloc(len, KM_SLEEP);
- (void) strlcpy(saved_poolname, zc->zc_name, len);
-
- if (vec->zvec_func != NULL) {
- nvlist_t *outnvl;
- int puterror = 0;
- spa_t *spa;
- nvlist_t *lognv = NULL;
-
- ASSERT(vec->zvec_legacy_func == NULL);
-
- /*
- * Add the innvl to the lognv before calling the func,
- * in case the func changes the innvl.
- */
- if (vec->zvec_allow_log) {
- lognv = fnvlist_alloc();
- fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
- vec->zvec_name);
- if (!nvlist_empty(innvl)) {
- fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
- innvl);
- }
- }
-
- outnvl = fnvlist_alloc();
- error = vec->zvec_func(zc->zc_name, innvl, outnvl);
-
- /*
- * Some commands can partially execute, modify state, and still
- * return an error. In these cases, attempt to record what
- * was modified.
- */
- if ((error == 0 ||
- (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
- vec->zvec_allow_log &&
- spa_open(zc->zc_name, &spa, FTAG) == 0) {
- if (!nvlist_empty(outnvl)) {
- fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
- outnvl);
- }
- if (error != 0) {
- fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
- error);
- }
- (void) spa_history_log_nvl(spa, lognv);
- spa_close(spa, FTAG);
- }
- fnvlist_free(lognv);
-
- /* rewrite outnvl for backwards compatibility */
- if (compat)
- outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
- cflag);
-
- if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
- int smusherror = 0;
- if (vec->zvec_smush_outnvlist) {
- smusherror = nvlist_smush(outnvl,
- zc->zc_nvlist_dst_size);
- }
- if (smusherror == 0)
- puterror = put_nvlist(zc, outnvl);
- }
-
- if (puterror != 0)
- error = puterror;
-
- nvlist_free(outnvl);
- } else {
- error = vec->zvec_legacy_func(zc);
- }
-
-out:
- nvlist_free(innvl);
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
- /*
- * Wait for ZVOL changes to get applied.
- * NB: taskqueue_drain_all() does less than taskq_wait(),
- * but enough for what we want.
- * And there is no equivalent illumos API.
- */
- if (error == 0) {
- spa_t *spa;
-
- if (spa_open(saved_poolname, &spa, FTAG) == 0) {
- taskqueue_drain_all(
- spa->spa_zvol_taskq->tq_queue);
- spa_close(spa, FTAG);
- }
- }
-#endif
-
-#ifdef illumos
- rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
- if (error == 0 && rc != 0)
- error = SET_ERROR(EFAULT);
-#else
- if (compat) {
- zfs_ioctl_compat_post(zc, cmd, cflag);
- if (newioc) {
- ASSERT(compat_zc != NULL);
- ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
-
- zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
- rc = ddi_copyout(compat_zc,
- (void *)(uintptr_t)zc_iocparm->zfs_cmd,
- zc_iocparm->zfs_cmd_size, flag);
- if (error == 0 && rc != 0)
- error = SET_ERROR(EFAULT);
- kmem_free(compat_zc, sizeof (zfs_cmd_t));
- } else {
- zfs_cmd_compat_put(zc, arg, vecnum, cflag);
- }
- } else {
- ASSERT(newioc);
-
- rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
- sizeof (zfs_cmd_t), flag);
- if (error == 0 && rc != 0)
- error = SET_ERROR(EFAULT);
- }
-#endif
- if (error == 0 && vec->zvec_allow_log) {
- char *s = tsd_get(zfs_allow_log_key);
- if (s != NULL)
- strfree(s);
- (void) tsd_set(zfs_allow_log_key, saved_poolname);
- } else {
- if (saved_poolname != NULL)
- strfree(saved_poolname);
- }
-
- kmem_free(zc, sizeof (zfs_cmd_t));
- return (error);
-}
-
-#ifdef illumos
-static int
-zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
-{
- if (cmd != DDI_ATTACH)
- return (DDI_FAILURE);
-
- if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
- DDI_PSEUDO, 0) == DDI_FAILURE)
- return (DDI_FAILURE);
-
- zfs_dip = dip;
-
- ddi_report_dev(dip);
-
- return (DDI_SUCCESS);
-}
-
-static int
-zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
-{
- if (spa_busy() || zfs_busy() || zvol_busy())
- return (DDI_FAILURE);
-
- if (cmd != DDI_DETACH)
- return (DDI_FAILURE);
-
- zfs_dip = NULL;
-
- ddi_prop_remove_all(dip);
- ddi_remove_minor_node(dip, NULL);
-
- return (DDI_SUCCESS);
-}
-
-/*ARGSUSED*/
-static int
-zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
-{
- switch (infocmd) {
- case DDI_INFO_DEVT2DEVINFO:
- *result = zfs_dip;
- return (DDI_SUCCESS);
-
- case DDI_INFO_DEVT2INSTANCE:
- *result = (void *)0;
- return (DDI_SUCCESS);
- }
-
- return (DDI_FAILURE);
-}
-#endif /* illumos */
-
-/*
- * OK, so this is a little weird.
- *
- * /dev/zfs is the control node, i.e. minor 0.
- * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
- *
- * /dev/zfs has basically nothing to do except serve up ioctls,
- * so most of the standard driver entry points are in zvol.c.
- */
-#ifdef illumos
-static struct cb_ops zfs_cb_ops = {
- zfsdev_open, /* open */
- zfsdev_close, /* close */
- zvol_strategy, /* strategy */
- nodev, /* print */
- zvol_dump, /* dump */
- zvol_read, /* read */
- zvol_write, /* write */
- zfsdev_ioctl, /* ioctl */
- nodev, /* devmap */
- nodev, /* mmap */
- nodev, /* segmap */
- nochpoll, /* poll */
- ddi_prop_op, /* prop_op */
- NULL, /* streamtab */
- D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
- CB_REV, /* version */
- nodev, /* async read */
- nodev, /* async write */
-};
-
-static struct dev_ops zfs_dev_ops = {
- DEVO_REV, /* version */
- 0, /* refcnt */
- zfs_info, /* info */
- nulldev, /* identify */
- nulldev, /* probe */
- zfs_attach, /* attach */
- zfs_detach, /* detach */
- nodev, /* reset */
- &zfs_cb_ops, /* driver operations */
- NULL, /* no bus operations */
- NULL, /* power */
- ddi_quiesce_not_needed, /* quiesce */
-};
-
-static struct modldrv zfs_modldrv = {
- &mod_driverops,
- "ZFS storage pool",
- &zfs_dev_ops
-};
-
-static struct modlinkage modlinkage = {
- MODREV_1,
- (void *)&zfs_modlfs,
- (void *)&zfs_modldrv,
- NULL
-};
-#endif /* illumos */
-
-static struct cdevsw zfs_cdevsw = {
- .d_version = D_VERSION,
- .d_open = zfsdev_open,
- .d_ioctl = zfsdev_ioctl,
- .d_name = ZFS_DEV_NAME
-};
-
-static void
-zfs_allow_log_destroy(void *arg)
-{
- char *poolname = arg;
- strfree(poolname);
-}
-
-static void
-zfsdev_init(void)
-{
- zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
- ZFS_DEV_NAME);
-}
-
-static void
-zfsdev_fini(void)
-{
- if (zfsdev != NULL)
- destroy_dev(zfsdev);
-}
-
-static struct root_hold_token *zfs_root_token;
-
-#ifdef illumos
-int
-_init(void)
-{
- int error;
-
- spa_init(FREAD | FWRITE);
- zfs_init();
- zvol_init();
- zfs_ioctl_init();
-
- if ((error = mod_install(&modlinkage)) != 0) {
- zvol_fini();
- zfs_fini();
- spa_fini();
- return (error);
- }
-
- tsd_create(&zfs_fsyncer_key, NULL);
- tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
- tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
-
- error = ldi_ident_from_mod(&modlinkage, &zfs_li);
- ASSERT(error == 0);
- mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
-
- return (0);
-}
-
-int
-_fini(void)
-{
- int error;
-
- if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
- return (SET_ERROR(EBUSY));
-
- if ((error = mod_remove(&modlinkage)) != 0)
- return (error);
-
- zvol_fini();
- zfs_fini();
- spa_fini();
- if (zfs_nfsshare_inited)
- (void) ddi_modclose(nfs_mod);
- if (zfs_smbshare_inited)
- (void) ddi_modclose(smbsrv_mod);
- if (zfs_nfsshare_inited || zfs_smbshare_inited)
- (void) ddi_modclose(sharefs_mod);
-
- tsd_destroy(&zfs_fsyncer_key);
- ldi_ident_release(zfs_li);
- zfs_li = NULL;
- mutex_destroy(&zfs_share_lock);
-
- return (error);
-}
-
-int
-_info(struct modinfo *modinfop)
-{
- return (mod_info(&modlinkage, modinfop));
-}
-#endif /* illumos */
-
-static int zfs__init(void);
-static int zfs__fini(void);
-static void zfs_shutdown(void *, int);
-
-static eventhandler_tag zfs_shutdown_event_tag;
-
-#ifdef __FreeBSD__
-#define ZFS_MIN_KSTACK_PAGES 4
-#endif
-
-int
-zfs__init(void)
-{
-
-#ifdef __FreeBSD__
-#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
- printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
- "overflow panic!\nPlease consider adding "
- "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
- ZFS_MIN_KSTACK_PAGES);
-#endif
-#endif
- zfs_root_token = root_mount_hold("ZFS");
-
- mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
-
- spa_init(FREAD | FWRITE);
- zfs_init();
- zvol_init();
- zfs_ioctl_init();
-
- tsd_create(&zfs_fsyncer_key, NULL);
- tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
- tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
- tsd_create(&zfs_geom_probe_vdev_key, NULL);
-
- printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
- root_mount_rel(zfs_root_token);
-
- zfsdev_init();
-
- return (0);
-}
-
-int
-zfs__fini(void)
-{
- if (spa_busy() || zfs_busy() || zvol_busy() ||
- zio_injection_enabled) {
- return (EBUSY);
- }
-
- zfsdev_fini();
- zvol_fini();
- zfs_fini();
- spa_fini();
-
- tsd_destroy(&zfs_fsyncer_key);
- tsd_destroy(&rrw_tsd_key);
- tsd_destroy(&zfs_allow_log_key);
-
- mutex_destroy(&zfs_share_lock);
-
- return (0);
-}
-
-static void
-zfs_shutdown(void *arg __unused, int howto __unused)
-{
-
- /*
- * ZFS fini routines can not properly work in a panic-ed system.
- */
- if (!KERNEL_PANICKED())
- (void)zfs__fini();
-}
-
-
-static int
-zfs_modevent(module_t mod, int type, void *unused __unused)
-{
- int err;
-
- switch (type) {
- case MOD_LOAD:
- err = zfs__init();
- if (err == 0)
- zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
- shutdown_post_sync, zfs_shutdown, NULL,
- SHUTDOWN_PRI_FIRST);
- return (err);
- case MOD_UNLOAD:
- err = zfs__fini();
- if (err == 0 && zfs_shutdown_event_tag != NULL)
- EVENTHANDLER_DEREGISTER(shutdown_post_sync,
- zfs_shutdown_event_tag);
- return (err);
- case MOD_SHUTDOWN:
- return (0);
- default:
- break;
- }
- return (EOPNOTSUPP);
-}
-
-static moduledata_t zfs_mod = {
- "zfsctrl",
- zfs_modevent,
- 0
-};
-DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
-MODULE_VERSION(zfsctrl, 1);
-MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
-MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1);
-MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
-MODULE_DEPEND(zfsctrl, zlib, 1, 1, 1);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -1,688 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/file.h>
-#include <sys/vfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/byteorder.h>
-#include <sys/policy.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/zfs_fuid.h>
-#include <sys/dsl_dataset.h>
-
-/*
- * These zfs_log_* functions must be called within a dmu tx, in one
- * of 2 contexts depending on zilog->z_replay:
- *
- * Non replay mode
- * ---------------
- * We need to record the transaction so that if it is committed to
- * the Intent Log then it can be replayed. An intent log transaction
- * structure (itx_t) is allocated and all the information necessary to
- * possibly replay the transaction is saved in it. The itx is then assigned
- * a sequence number and inserted in the in-memory list anchored in the zilog.
- *
- * Replay mode
- * -----------
- * We need to mark the intent log record as replayed in the log header.
- * This is done in the same transaction as the replay so that they
- * commit atomically.
- */
-
-int
-zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
-{
- int isxvattr = (vap->va_mask & AT_XVATTR);
- switch (type) {
- case Z_FILE:
- if (vsecp == NULL && !isxvattr)
- return (TX_CREATE);
- if (vsecp && isxvattr)
-#ifdef TODO
- return (TX_CREATE_ACL_ATTR);
-#else
- panic("%s:%u: unsupported condition", __func__, __LINE__);
-#endif
- if (vsecp)
- return (TX_CREATE_ACL);
- else
- return (TX_CREATE_ATTR);
- /*NOTREACHED*/
- case Z_DIR:
- if (vsecp == NULL && !isxvattr)
- return (TX_MKDIR);
- if (vsecp && isxvattr)
-#ifdef TODO
- return (TX_MKDIR_ACL_ATTR);
-#else
- panic("%s:%u: unsupported condition", __func__, __LINE__);
-#endif
- if (vsecp)
- return (TX_MKDIR_ACL);
- else
- return (TX_MKDIR_ATTR);
- case Z_XATTRDIR:
- return (TX_MKXATTR);
- }
- ASSERT(0);
- return (TX_MAX_TYPE);
-}
-
-/*
- * build up the log data necessary for logging xvattr_t
- * First lr_attr_t is initialized. following the lr_attr_t
- * is the mapsize and attribute bitmap copied from the xvattr_t.
- * Following the bitmap and bitmapsize two 64 bit words are reserved
- * for the create time which may be set. Following the create time
- * records a single 64 bit integer which has the bits to set on
- * replay for the xvattr.
- */
-static void
-zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
-{
- uint32_t *bitmap;
- uint64_t *attrs;
- uint64_t *crtime;
- xoptattr_t *xoap;
- void *scanstamp;
- int i;
-
- xoap = xva_getxoptattr(xvap);
- ASSERT(xoap);
-
- lrattr->lr_attr_masksize = xvap->xva_mapsize;
- bitmap = &lrattr->lr_attr_bitmap;
- for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
- *bitmap = xvap->xva_reqattrmap[i];
- }
-
- /* Now pack the attributes up in a single uint64_t */
- attrs = (uint64_t *)bitmap;
- crtime = attrs + 1;
- scanstamp = (caddr_t)(crtime + 2);
- *attrs = 0;
- if (XVA_ISSET_REQ(xvap, XAT_READONLY))
- *attrs |= (xoap->xoa_readonly == 0) ? 0 :
- XAT0_READONLY;
- if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
- *attrs |= (xoap->xoa_hidden == 0) ? 0 :
- XAT0_HIDDEN;
- if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
- *attrs |= (xoap->xoa_system == 0) ? 0 :
- XAT0_SYSTEM;
- if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
- *attrs |= (xoap->xoa_archive == 0) ? 0 :
- XAT0_ARCHIVE;
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
- *attrs |= (xoap->xoa_immutable == 0) ? 0 :
- XAT0_IMMUTABLE;
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
- *attrs |= (xoap->xoa_nounlink == 0) ? 0 :
- XAT0_NOUNLINK;
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
- *attrs |= (xoap->xoa_appendonly == 0) ? 0 :
- XAT0_APPENDONLY;
- if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
- *attrs |= (xoap->xoa_opaque == 0) ? 0 :
- XAT0_APPENDONLY;
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
- *attrs |= (xoap->xoa_nodump == 0) ? 0 :
- XAT0_NODUMP;
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
- *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
- XAT0_AV_QUARANTINED;
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
- *attrs |= (xoap->xoa_av_modified == 0) ? 0 :
- XAT0_AV_MODIFIED;
- if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
- ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
- bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
- *attrs |= (xoap->xoa_reparse == 0) ? 0 :
- XAT0_REPARSE;
- if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
- *attrs |= (xoap->xoa_offline == 0) ? 0 :
- XAT0_OFFLINE;
- if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
- *attrs |= (xoap->xoa_sparse == 0) ? 0 :
- XAT0_SPARSE;
-}
-
-static void *
-zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
-{
- zfs_fuid_t *zfuid;
- uint64_t *fuidloc = start;
-
- /* First copy in the ACE FUIDs */
- for (zfuid = list_head(&fuidp->z_fuids); zfuid;
- zfuid = list_next(&fuidp->z_fuids, zfuid)) {
- *fuidloc++ = zfuid->z_logfuid;
- }
- return (fuidloc);
-}
-
-
-static void *
-zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
-{
- zfs_fuid_domain_t *zdomain;
-
- /* now copy in the domain info, if any */
- if (fuidp->z_domain_str_sz != 0) {
- for (zdomain = list_head(&fuidp->z_domains); zdomain;
- zdomain = list_next(&fuidp->z_domains, zdomain)) {
- bcopy((void *)zdomain->z_domain, start,
- strlen(zdomain->z_domain) + 1);
- start = (caddr_t)start +
- strlen(zdomain->z_domain) + 1;
- }
- }
- return (start);
-}
-
-/*
- * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
- * TK_MKXATTR transactions.
- *
- * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
- * domain information appended prior to the name. In this case the
- * uid/gid in the log record will be a log centric FUID.
- *
- * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
- * may contain attributes, ACL and optional fuid information.
- *
- * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
- * and ACL and normal users/groups in the ACEs.
- *
- * There may be an optional xvattr attribute information similar
- * to zfs_log_setattr.
- *
- * Also, after the file name "domain" strings may be appended.
- */
-void
-zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
- zfs_fuid_info_t *fuidp, vattr_t *vap)
-{
- itx_t *itx;
- lr_create_t *lr;
- lr_acl_create_t *lracl;
- size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
- size_t xvatsize = 0;
- size_t txsize;
- xvattr_t *xvap = (xvattr_t *)vap;
- void *end;
- size_t lrsize;
- size_t namesize = strlen(name) + 1;
- size_t fuidsz = 0;
-
- if (zil_replaying(zilog, tx))
- return;
-
- /*
- * If we have FUIDs present then add in space for
- * domains and ACE fuid's if any.
- */
- if (fuidp) {
- fuidsz += fuidp->z_domain_str_sz;
- fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
- }
-
- if (vap->va_mask & AT_XVATTR)
- xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
-
- if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
- (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
- (int)txtype == TX_MKXATTR) {
- txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
- lrsize = sizeof (*lr);
- } else {
- txsize =
- sizeof (lr_acl_create_t) + namesize + fuidsz +
- ZIL_ACE_LENGTH(aclsize) + xvatsize;
- lrsize = sizeof (lr_acl_create_t);
- }
-
- itx = zil_itx_create(txtype, txsize);
-
- lr = (lr_create_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_foid = zp->z_id;
- /* Store dnode slot count in 8 bits above object id. */
- LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
- lr->lr_mode = zp->z_mode;
- if (!IS_EPHEMERAL(zp->z_uid)) {
- lr->lr_uid = (uint64_t)zp->z_uid;
- } else {
- lr->lr_uid = fuidp->z_fuid_owner;
- }
- if (!IS_EPHEMERAL(zp->z_gid)) {
- lr->lr_gid = (uint64_t)zp->z_gid;
- } else {
- lr->lr_gid = fuidp->z_fuid_group;
- }
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
- sizeof (uint64_t));
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
- lr->lr_crtime, sizeof (uint64_t) * 2);
-
- if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
- sizeof (lr->lr_rdev)) != 0)
- lr->lr_rdev = 0;
-
- /*
- * Fill in xvattr info if any
- */
- if (vap->va_mask & AT_XVATTR) {
- zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
- end = (caddr_t)lr + lrsize + xvatsize;
- } else {
- end = (caddr_t)lr + lrsize;
- }
-
- /* Now fill in any ACL info */
-
- if (vsecp) {
- lracl = (lr_acl_create_t *)&itx->itx_lr;
- lracl->lr_aclcnt = vsecp->vsa_aclcnt;
- lracl->lr_acl_bytes = aclsize;
- lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
- lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
- if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
- lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
- else
- lracl->lr_acl_flags = 0;
-
- bcopy(vsecp->vsa_aclentp, end, aclsize);
- end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
- }
-
- /* drop in FUID info */
- if (fuidp) {
- end = zfs_log_fuid_ids(fuidp, end);
- end = zfs_log_fuid_domains(fuidp, end);
- }
- /*
- * Now place file name in log record
- */
- bcopy(name, end, namesize);
-
- zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * Handles both TX_REMOVE and TX_RMDIR transactions.
- */
-void
-zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name, uint64_t foid)
-{
- itx_t *itx;
- lr_remove_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zil_replaying(zilog, tx))
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_remove_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
-
- itx->itx_oid = foid;
-
- zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * Handles TX_LINK transactions.
- */
-void
-zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name)
-{
- itx_t *itx;
- lr_link_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zil_replaying(zilog, tx))
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_link_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_link_obj = zp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
-
- zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * Handles TX_SYMLINK transactions.
- */
-void
-zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name, char *link)
-{
- itx_t *itx;
- lr_create_t *lr;
- size_t namesize = strlen(name) + 1;
- size_t linksize = strlen(link) + 1;
-
- if (zil_replaying(zilog, tx))
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
- lr = (lr_create_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_foid = zp->z_id;
- lr->lr_uid = zp->z_uid;
- lr->lr_gid = zp->z_gid;
- lr->lr_mode = zp->z_mode;
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
- sizeof (uint64_t));
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
- lr->lr_crtime, sizeof (uint64_t) * 2);
- bcopy(name, (char *)(lr + 1), namesize);
- bcopy(link, (char *)(lr + 1) + namesize, linksize);
-
- zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * Handles TX_RENAME transactions.
- */
-void
-zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
-{
- itx_t *itx;
- lr_rename_t *lr;
- size_t snamesize = strlen(sname) + 1;
- size_t dnamesize = strlen(dname) + 1;
-
- if (zil_replaying(zilog, tx))
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
- lr = (lr_rename_t *)&itx->itx_lr;
- lr->lr_sdoid = sdzp->z_id;
- lr->lr_tdoid = tdzp->z_id;
- bcopy(sname, (char *)(lr + 1), snamesize);
- bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
- itx->itx_oid = szp->z_id;
-
- zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * Handles TX_WRITE transactions.
- */
-ssize_t zfs_immediate_write_sz = 32768;
-#ifdef _KERNEL
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_LONG(_vfs_zfs, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
- &zfs_immediate_write_sz, 0, "Minimal size for indirect log write");
-#endif
-
-void
-zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t resid, int ioflag)
-{
- uint32_t blocksize = zp->z_blksz;
- itx_wr_state_t write_state;
- uintptr_t fsync_cnt;
-
- if (zil_replaying(zilog, tx) || zp->z_unlinked)
- return;
-
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- write_state = WR_INDIRECT;
- else if (!spa_has_slogs(zilog->zl_spa) &&
- resid >= zfs_immediate_write_sz)
- write_state = WR_INDIRECT;
- else if (ioflag & (FSYNC | FDSYNC))
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
-
- if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
- (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
- }
-
- while (resid) {
- itx_t *itx;
- lr_write_t *lr;
- itx_wr_state_t wr_state = write_state;
- ssize_t len = resid;
-
- /*
- * A WR_COPIED record must fit entirely in one log block.
- * Large writes can use WR_NEED_COPY, which the ZIL will
- * split into multiple records across several log blocks
- * if necessary.
- */
- if (wr_state == WR_COPIED &&
- resid > zil_max_copied_data(zilog))
- wr_state = WR_NEED_COPY;
- else if (wr_state == WR_INDIRECT)
- len = MIN(blocksize - P2PHASE(off, blocksize), resid);
-
- itx = zil_itx_create(txtype, sizeof (*lr) +
- (wr_state == WR_COPIED ? len : 0));
- lr = (lr_write_t *)&itx->itx_lr;
- if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
- zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
- zil_itx_destroy(itx);
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_write_t *)&itx->itx_lr;
- wr_state = WR_NEED_COPY;
- }
-
- itx->itx_wr_state = wr_state;
- lr->lr_foid = zp->z_id;
- lr->lr_offset = off;
- lr->lr_length = len;
- lr->lr_blkoff = 0;
- BP_ZERO(&lr->lr_blkptr);
-
- itx->itx_private = zp->z_zfsvfs;
-
- if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
- (fsync_cnt == 0))
- itx->itx_sync = B_FALSE;
-
- zil_itx_assign(zilog, itx, tx);
-
- off += len;
- resid -= len;
- }
-}
-
-/*
- * Handles TX_TRUNCATE transactions.
- */
-void
-zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len)
-{
- itx_t *itx;
- lr_truncate_t *lr;
-
- if (zil_replaying(zilog, tx) || zp->z_unlinked)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_truncate_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_offset = off;
- lr->lr_length = len;
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * Handles TX_SETATTR transactions.
- */
-void
-zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
-{
- itx_t *itx;
- lr_setattr_t *lr;
- xvattr_t *xvap = (xvattr_t *)vap;
- size_t recsize = sizeof (lr_setattr_t);
- void *start;
-
- if (zil_replaying(zilog, tx) || zp->z_unlinked)
- return;
-
- /*
- * If XVATTR set, then log record size needs to allow
- * for lr_attr_t + xvattr mask, mapsize and create time
- * plus actual attribute values
- */
- if (vap->va_mask & AT_XVATTR)
- recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
-
- if (fuidp)
- recsize += fuidp->z_domain_str_sz;
-
- itx = zil_itx_create(txtype, recsize);
- lr = (lr_setattr_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_mask = (uint64_t)mask_applied;
- lr->lr_mode = (uint64_t)vap->va_mode;
- if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
- lr->lr_uid = fuidp->z_fuid_owner;
- else
- lr->lr_uid = (uint64_t)vap->va_uid;
-
- if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
- lr->lr_gid = fuidp->z_fuid_group;
- else
- lr->lr_gid = (uint64_t)vap->va_gid;
-
- lr->lr_size = (uint64_t)vap->va_size;
- ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
- ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
- start = (lr_setattr_t *)(lr + 1);
- if (vap->va_mask & AT_XVATTR) {
- zfs_log_xvattr((lr_attr_t *)start, xvap);
- start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
- }
-
- /*
- * Now stick on domain information if any on end
- */
-
- if (fuidp)
- (void) zfs_log_fuid_domains(fuidp, start);
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- zil_itx_assign(zilog, itx, tx);
-}
-
-/*
- * Handles TX_ACL transactions.
- */
-void
-zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
- vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
-{
- itx_t *itx;
- lr_acl_v0_t *lrv0;
- lr_acl_t *lr;
- int txtype;
- int lrsize;
- size_t txsize;
- size_t aclbytes = vsecp->vsa_aclentsz;
-
- if (zil_replaying(zilog, tx) || zp->z_unlinked)
- return;
-
- txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
- TX_ACL_V0 : TX_ACL;
-
- if (txtype == TX_ACL)
- lrsize = sizeof (*lr);
- else
- lrsize = sizeof (*lrv0);
-
- txsize = lrsize +
- ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
- (fuidp ? fuidp->z_domain_str_sz : 0) +
- sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
-
- itx = zil_itx_create(txtype, txsize);
-
- lr = (lr_acl_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- if (txtype == TX_ACL) {
- lr->lr_acl_bytes = aclbytes;
- lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
- lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
- if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
- lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
- else
- lr->lr_acl_flags = 0;
- }
- lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
-
- if (txtype == TX_ACL_V0) {
- lrv0 = (lr_acl_v0_t *)lr;
- bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
- } else {
- void *start = (ace_t *)(lr + 1);
-
- bcopy(vsecp->vsa_aclentp, start, aclbytes);
-
- start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
-
- if (fuidp) {
- start = zfs_log_fuid_ids(fuidp, start);
- (void) zfs_log_fuid_domains(fuidp, start);
- }
- }
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- zil_itx_assign(zilog, itx, tx);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
@@ -1,254 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/errno.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/sunddi.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_onexit.h>
-#include <sys/zvol.h>
-
-/*
- * ZFS kernel routines may add/delete callback routines to be invoked
- * upon process exit (triggered via the close operation from the /dev/zfs
- * driver).
- *
- * These cleanup callbacks are intended to allow for the accumulation
- * of kernel state across multiple ioctls. User processes participate
- * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a
- * clone-open, generating a unique minor number. The process then passes
- * along that file descriptor to each ioctl that might have a cleanup operation.
- *
- * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
- * on to validate the given fd and add a reference to its file table entry.
- * This allows the consumer to do its work and then add a callback, knowing
- * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers
- * should call zfs_onexit_fd_rele().
- *
- * A simple example is zfs_ioc_recv(), where we might create an AVL tree
- * with dataset/GUID mappings and then reuse that tree on subsequent
- * zfs_ioc_recv() calls.
- *
- * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
- * the AVL tree and pass it along with a callback function to
- * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
- * callback and return an action handle.
- *
- * The action handle is then passed from user space to subsequent
- * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
- * by calling zfs_onexit_cb_data() with the device minor number and
- * action handle.
- *
- * If the user process exits abnormally, the callback is invoked implicitly
- * as part of the driver close operation. Once the user space process is
- * finished with the accumulated kernel state, it can also just call close(2)
- * on the cleanup fd to trigger the cleanup callback.
- */
-
-void
-zfs_onexit_init(zfs_onexit_t **zop)
-{
- zfs_onexit_t *zo;
-
- zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
- mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
- offsetof(zfs_onexit_action_node_t, za_link));
-}
-
-void
-zfs_onexit_destroy(zfs_onexit_t *zo)
-{
- zfs_onexit_action_node_t *ap;
-
- mutex_enter(&zo->zo_lock);
- while ((ap = list_head(&zo->zo_actions)) != NULL) {
- list_remove(&zo->zo_actions, ap);
- mutex_exit(&zo->zo_lock);
- ap->za_func(ap->za_data);
- kmem_free(ap, sizeof (zfs_onexit_action_node_t));
- mutex_enter(&zo->zo_lock);
- }
- mutex_exit(&zo->zo_lock);
-
- list_destroy(&zo->zo_actions);
- mutex_destroy(&zo->zo_lock);
- kmem_free(zo, sizeof (zfs_onexit_t));
-}
-
-static int
-zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
-{
- *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
- if (*zo == NULL)
- return (SET_ERROR(EBADF));
-
- return (0);
-}
-
-/*
- * Consumers might need to operate by minor number instead of fd, since
- * they might be running in another thread (e.g. txg_sync_thread). Callers
- * of this function must call zfs_onexit_fd_rele() when they're finished
- * using the minor number.
- */
-int
-zfs_onexit_fd_hold(int fd, minor_t *minorp)
-{
- file_t *fp, *tmpfp;
- zfs_onexit_t *zo;
- cap_rights_t rights;
- void *data;
- int error;
-
- fp = getf(fd, &cap_no_rights);
- if (fp == NULL)
- return (SET_ERROR(EBADF));
-
- tmpfp = curthread->td_fpop;
- curthread->td_fpop = fp;
- error = devfs_get_cdevpriv(&data);
- if (error == 0)
- *minorp = (minor_t)(uintptr_t)data;
- curthread->td_fpop = tmpfp;
- if (error != 0)
- return (SET_ERROR(EBADF));
- return (zfs_onexit_minor_to_state(*minorp, &zo));
-}
-
-void
-zfs_onexit_fd_rele(int fd)
-{
- releasef(fd);
-}
-
-/*
- * Add a callback to be invoked when the calling process exits.
- */
-int
-zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
- uint64_t *action_handle)
-{
- zfs_onexit_t *zo;
- zfs_onexit_action_node_t *ap;
- int error;
-
- error = zfs_onexit_minor_to_state(minor, &zo);
- if (error)
- return (error);
-
- ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
- list_link_init(&ap->za_link);
- ap->za_func = func;
- ap->za_data = data;
-
- mutex_enter(&zo->zo_lock);
- list_insert_tail(&zo->zo_actions, ap);
- mutex_exit(&zo->zo_lock);
- if (action_handle)
- *action_handle = (uint64_t)(uintptr_t)ap;
-
- return (0);
-}
-
-static zfs_onexit_action_node_t *
-zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
-{
- zfs_onexit_action_node_t *match;
- zfs_onexit_action_node_t *ap;
- list_t *l;
-
- ASSERT(MUTEX_HELD(&zo->zo_lock));
-
- match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
- l = &zo->zo_actions;
- for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) {
- if (match == ap)
- break;
- }
- return (ap);
-}
-
-/*
- * Delete the callback, triggering it first if 'fire' is set.
- */
-int
-zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
-{
- zfs_onexit_t *zo;
- zfs_onexit_action_node_t *ap;
- int error;
-
- error = zfs_onexit_minor_to_state(minor, &zo);
- if (error)
- return (error);
-
- mutex_enter(&zo->zo_lock);
- ap = zfs_onexit_find_cb(zo, action_handle);
- if (ap != NULL) {
- list_remove(&zo->zo_actions, ap);
- mutex_exit(&zo->zo_lock);
- if (fire)
- ap->za_func(ap->za_data);
- kmem_free(ap, sizeof (zfs_onexit_action_node_t));
- } else {
- mutex_exit(&zo->zo_lock);
- error = SET_ERROR(ENOENT);
- }
-
- return (error);
-}
-
-/*
- * Return the data associated with this callback. This allows consumers
- * of the cleanup-on-exit interfaces to stash kernel data across system
- * calls, knowing that it will be cleaned up if the calling process exits.
- */
-int
-zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
-{
- zfs_onexit_t *zo;
- zfs_onexit_action_node_t *ap;
- int error;
-
- *data = NULL;
-
- error = zfs_onexit_minor_to_state(minor, &zo);
- if (error)
- return (error);
-
- mutex_enter(&zo->zo_lock);
- ap = zfs_onexit_find_cb(zo, action_handle);
- if (ap != NULL)
- *data = ap->za_data;
- else
- error = SET_ERROR(ENOENT);
- mutex_exit(&zo->zo_lock);
-
- return (error);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -1,1069 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/file.h>
-#include <sys/fcntl.h>
-#include <sys/vfs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_fuid.h>
-#include <sys/spa.h>
-#include <sys/zil.h>
-#include <sys/byteorder.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-#include <sys/atomic.h>
-#include <sys/cred.h>
-#include <sys/namei.h>
-
-/*
- * Functions to replay ZFS intent log (ZIL) records
- * The functions are called through a function vector (zfs_replay_vector)
- * which is indexed by the transaction type.
- */
-
-static void
-zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
- uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
-{
- VATTR_NULL(vap);
- vap->va_mask = (uint_t)mask;
- if (mask & AT_TYPE)
- vap->va_type = IFTOVT(mode);
- if (mask & AT_MODE)
- vap->va_mode = mode & MODEMASK;
- if (mask & AT_UID)
- vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid;
- if (mask & AT_GID)
- vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid;
- vap->va_rdev = zfs_cmpldev(rdev);
- vap->va_nodeid = nodeid;
-}
-
-/* ARGSUSED */
-static int
-zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap)
-{
- return (SET_ERROR(ENOTSUP));
-}
-
-static void
-zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
-{
- xoptattr_t *xoap = NULL;
- uint64_t *attrs;
- uint64_t *crtime;
- uint32_t *bitmap;
- void *scanstamp;
- int i;
-
- xvap->xva_vattr.va_mask |= AT_XVATTR;
- if ((xoap = xva_getxoptattr(xvap)) == NULL) {
- xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
- return;
- }
-
- ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
-
- bitmap = &lrattr->lr_attr_bitmap;
- for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
- xvap->xva_reqattrmap[i] = *bitmap;
-
- attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
- crtime = attrs + 1;
- scanstamp = (caddr_t)(crtime + 2);
-
- if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
- xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
- xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
- xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_READONLY))
- xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
- xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
- xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
- xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
- xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
- xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
- xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
- xoap->xoa_av_quarantined =
- ((*attrs & XAT0_AV_QUARANTINED) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
- ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
- bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
- xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
- xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
- if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
- xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
-}
-
-static int
-zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
-{
- uint64_t uid_idx;
- uint64_t gid_idx;
- int domcnt = 0;
-
- uid_idx = FUID_INDEX(uid);
- gid_idx = FUID_INDEX(gid);
- if (uid_idx)
- domcnt++;
- if (gid_idx > 0 && gid_idx != uid_idx)
- domcnt++;
-
- return (domcnt);
-}
-
-static void *
-zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
- int domcnt)
-{
- int i;
-
- for (i = 0; i != domcnt; i++) {
- fuid_infop->z_domain_table[i] = start;
- start = (caddr_t)start + strlen(start) + 1;
- }
-
- return (start);
-}
-
-/*
- * Set the uid/gid in the fuid_info structure.
- */
-static void
-zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
-{
- /*
- * If owner or group are log specific FUIDs then slurp up
- * domain information and build zfs_fuid_info_t
- */
- if (IS_EPHEMERAL(uid))
- fuid_infop->z_fuid_owner = uid;
-
- if (IS_EPHEMERAL(gid))
- fuid_infop->z_fuid_group = gid;
-}
-
-/*
- * Load fuid domains into fuid_info_t
- */
-static zfs_fuid_info_t *
-zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
-{
- int domcnt;
-
- zfs_fuid_info_t *fuid_infop;
-
- fuid_infop = zfs_fuid_info_alloc();
-
- domcnt = zfs_replay_domain_cnt(uid, gid);
-
- if (domcnt == 0)
- return (fuid_infop);
-
- fuid_infop->z_domain_table =
- kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
-
- zfs_replay_fuid_ugid(fuid_infop, uid, gid);
-
- fuid_infop->z_domain_cnt = domcnt;
- *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
- return (fuid_infop);
-}
-
-/*
- * load zfs_fuid_t's and fuid_domains into fuid_info_t
- */
-static zfs_fuid_info_t *
-zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
- uint64_t gid)
-{
- uint64_t *log_fuid = (uint64_t *)start;
- zfs_fuid_info_t *fuid_infop;
- int i;
-
- fuid_infop = zfs_fuid_info_alloc();
- fuid_infop->z_domain_cnt = domcnt;
-
- fuid_infop->z_domain_table =
- kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
-
- for (i = 0; i != idcnt; i++) {
- zfs_fuid_t *zfuid;
-
- zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
- zfuid->z_logfuid = *log_fuid;
- zfuid->z_id = -1;
- zfuid->z_domidx = 0;
- list_insert_tail(&fuid_infop->z_fuids, zfuid);
- log_fuid++;
- }
-
- zfs_replay_fuid_ugid(fuid_infop, uid, gid);
-
- *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
- return (fuid_infop);
-}
-
-static void
-zfs_replay_swap_attrs(lr_attr_t *lrattr)
-{
- /* swap the lr_attr structure */
- byteswap_uint32_array(lrattr, sizeof (*lrattr));
- /* swap the bitmap */
- byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
- sizeof (uint32_t));
- /* swap the attributes, create time + 64 bit word for attributes */
- byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
- (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
-}
-
-/*
- * Replay file create with optional ACL, xvattr information as well
- * as option FUID information.
- */
-static int
-zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_acl_create_t *lracl = arg2;
- char *name = NULL; /* location determined later */
- lr_create_t *lr = (lr_create_t *)lracl;
- znode_t *dzp;
- vnode_t *vp = NULL;
- xvattr_t xva;
- int vflg = 0;
- vsecattr_t vsec = { 0 };
- lr_attr_t *lrattr;
- void *aclstart;
- void *fuidstart;
- size_t xvatlen = 0;
- uint64_t txtype;
- uint64_t objid;
- uint64_t dnodesize;
- int error;
-
- txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
- if (byteswap) {
- byteswap_uint64_array(lracl, sizeof (*lracl));
- if (txtype == TX_CREATE_ACL_ATTR ||
- txtype == TX_MKDIR_ACL_ATTR) {
- lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
- zfs_replay_swap_attrs(lrattr);
- xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- }
-
- aclstart = (caddr_t)(lracl + 1) + xvatlen;
- zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
- /* swap fuids */
- if (lracl->lr_fuidcnt) {
- byteswap_uint64_array((caddr_t)aclstart +
- ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
- lracl->lr_fuidcnt * sizeof (uint64_t));
- }
- }
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- objid = LR_FOID_GET_OBJ(lr->lr_foid);
- dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
-
- xva_init(&xva);
- zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
-
- /*
- * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
- * eventually end up in zfs_mknode(), which assigns the object's
- * creation time, generation number, and dnode size. The generic
- * zfs_create() has no concept of these attributes, so we smuggle
- * the values inside the vattr's otherwise unused va_ctime,
- * va_nblocks, and va_fsid fields.
- */
- ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
- xva.xva_vattr.va_nblocks = lr->lr_gen;
- xva.xva_vattr.va_fsid = dnodesize;
-
- error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
- if (error != ENOENT)
- goto bail;
-
- if (lr->lr_common.lrc_txtype & TX_CI)
- vflg |= FIGNORECASE;
- switch (txtype) {
- case TX_CREATE_ACL:
- aclstart = (caddr_t)(lracl + 1);
- fuidstart = (caddr_t)aclstart +
- ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
- zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
- (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
- lr->lr_uid, lr->lr_gid);
- /*FALLTHROUGH*/
- case TX_CREATE_ACL_ATTR:
- if (name == NULL) {
- lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
- xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- xva.xva_vattr.va_mask |= AT_XVATTR;
- zfs_replay_xvattr(lrattr, &xva);
- }
- vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
- vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
- vsec.vsa_aclcnt = lracl->lr_aclcnt;
- vsec.vsa_aclentsz = lracl->lr_acl_bytes;
- vsec.vsa_aclflags = lracl->lr_acl_flags;
- if (zfsvfs->z_fuid_replay == NULL) {
- fuidstart = (caddr_t)(lracl + 1) + xvatlen +
- ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
- zfsvfs->z_fuid_replay =
- zfs_replay_fuids(fuidstart,
- (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
- lr->lr_uid, lr->lr_gid);
- }
-
-#ifdef TODO
- error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
- 0, 0, &vp, kcred, vflg, NULL, &vsec);
-#else
- panic("%s:%u: unsupported condition", __func__, __LINE__);
-#endif
- break;
- case TX_MKDIR_ACL:
- aclstart = (caddr_t)(lracl + 1);
- fuidstart = (caddr_t)aclstart +
- ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
- zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
- (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
- lr->lr_uid, lr->lr_gid);
- /*FALLTHROUGH*/
- case TX_MKDIR_ACL_ATTR:
- if (name == NULL) {
- lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
- xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- zfs_replay_xvattr(lrattr, &xva);
- }
- vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
- vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
- vsec.vsa_aclcnt = lracl->lr_aclcnt;
- vsec.vsa_aclentsz = lracl->lr_acl_bytes;
- vsec.vsa_aclflags = lracl->lr_acl_flags;
- if (zfsvfs->z_fuid_replay == NULL) {
- fuidstart = (caddr_t)(lracl + 1) + xvatlen +
- ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
- zfsvfs->z_fuid_replay =
- zfs_replay_fuids(fuidstart,
- (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
- lr->lr_uid, lr->lr_gid);
- }
-#ifdef TODO
- error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
- &vp, kcred, NULL, vflg, &vsec);
-#else
- panic("%s:%u: unsupported condition", __func__, __LINE__);
-#endif
- break;
- default:
- error = SET_ERROR(ENOTSUP);
- }
-
-bail:
- if (error == 0 && vp != NULL)
- VN_RELE(vp);
-
- VN_RELE(ZTOV(dzp));
-
- if (zfsvfs->z_fuid_replay)
- zfs_fuid_info_free(zfsvfs->z_fuid_replay);
- zfsvfs->z_fuid_replay = NULL;
-
- return (error);
-}
-
-static int
-zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_create_t *lr = arg2;
- char *name = NULL; /* location determined later */
- char *link; /* symlink content follows name */
- znode_t *dzp;
- vnode_t *vp = NULL;
- xvattr_t xva;
- int vflg = 0;
- size_t lrsize = sizeof (lr_create_t);
- lr_attr_t *lrattr;
- void *start;
- size_t xvatlen;
- uint64_t txtype;
- struct componentname cn;
- int error;
-
- txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
- if (byteswap) {
- byteswap_uint64_array(lr, sizeof (*lr));
- if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
- zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
- }
-
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid);
- int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
-
- xva_init(&xva);
- zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
-
- /*
- * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
- * eventually end up in zfs_mknode(), which assigns the object's
- * creation time, generation number, and dnode slot count. The
- * generic zfs_create() has no concept of these attributes, so
- * we smuggle the values inside the vattr's otherwise unused
- * va_ctime, va_nblocks and va_fsid fields.
- */
- ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
- xva.xva_vattr.va_nblocks = lr->lr_gen;
- xva.xva_vattr.va_fsid = dnodesize;
-
- error = dmu_object_info(zfsvfs->z_os, objid, NULL);
- if (error != ENOENT)
- goto out;
-
- if (lr->lr_common.lrc_txtype & TX_CI)
- vflg |= FIGNORECASE;
-
- /*
- * Symlinks don't have fuid info, and CIFS never creates
- * symlinks.
- *
- * The _ATTR versions will grab the fuid info in their subcases.
- */
- if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
- (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
- (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
- start = (lr + 1);
- zfsvfs->z_fuid_replay =
- zfs_replay_fuid_domain(start, &start,
- lr->lr_uid, lr->lr_gid);
- }
-
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- cn.cn_flags = SAVENAME;
-
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- switch (txtype) {
- case TX_CREATE_ATTR:
- lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
- xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
- start = (caddr_t)(lr + 1) + xvatlen;
- zfsvfs->z_fuid_replay =
- zfs_replay_fuid_domain(start, &start,
- lr->lr_uid, lr->lr_gid);
- name = (char *)start;
-
- /*FALLTHROUGH*/
- case TX_CREATE:
- if (name == NULL)
- name = (char *)start;
-
- cn.cn_nameptr = name;
- error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
- break;
- case TX_MKDIR_ATTR:
- lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
- xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
- zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
- start = (caddr_t)(lr + 1) + xvatlen;
- zfsvfs->z_fuid_replay =
- zfs_replay_fuid_domain(start, &start,
- lr->lr_uid, lr->lr_gid);
- name = (char *)start;
-
- /*FALLTHROUGH*/
- case TX_MKDIR:
- if (name == NULL)
- name = (char *)(lr + 1);
-
- cn.cn_nameptr = name;
- error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
- break;
- case TX_MKXATTR:
- error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
- break;
- case TX_SYMLINK:
- name = (char *)(lr + 1);
- link = name + strlen(name) + 1;
- cn.cn_nameptr = name;
- error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/);
- break;
- default:
- error = SET_ERROR(ENOTSUP);
- }
- VOP_UNLOCK(ZTOV(dzp));
-
-out:
- if (error == 0 && vp != NULL)
- VN_URELE(vp);
-
- VN_RELE(ZTOV(dzp));
-
- if (zfsvfs->z_fuid_replay)
- zfs_fuid_info_free(zfsvfs->z_fuid_replay);
- zfsvfs->z_fuid_replay = NULL;
- return (error);
-}
-
-static int
-zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_remove_t *lr = arg2;
- char *name = (char *)(lr + 1); /* name follows lr_remove_t */
- znode_t *dzp;
- struct componentname cn;
- vnode_t *vp;
- int error;
- int vflg = 0;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- if (lr->lr_common.lrc_txtype & TX_CI)
- vflg |= FIGNORECASE;
- cn.cn_nameptr = name;
- cn.cn_namelen = strlen(name);
- cn.cn_nameiop = DELETE;
- cn.cn_flags = ISLASTCN | SAVENAME;
- cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
- if (error != 0) {
- VOP_UNLOCK(ZTOV(dzp));
- goto fail;
- }
-
- switch ((int)lr->lr_common.lrc_txtype) {
- case TX_REMOVE:
- error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/);
- break;
- case TX_RMDIR:
- error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/);
- break;
- default:
- error = SET_ERROR(ENOTSUP);
- }
- vput(vp);
- VOP_UNLOCK(ZTOV(dzp));
-
-fail:
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_link_t *lr = arg2;
- char *name = (char *)(lr + 1); /* name follows lr_link_t */
- znode_t *dzp, *zp;
- struct componentname cn;
- int error;
- int vflg = 0;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
- VN_RELE(ZTOV(dzp));
- return (error);
- }
-
- if (lr->lr_common.lrc_txtype & TX_CI)
- vflg |= FIGNORECASE;
-
- cn.cn_nameptr = name;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- cn.cn_flags = SAVENAME;
-
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/);
- VOP_UNLOCK(ZTOV(zp));
- VOP_UNLOCK(ZTOV(dzp));
-
- VN_RELE(ZTOV(zp));
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_rename_t *lr = arg2;
- char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
- char *tname = sname + strlen(sname) + 1;
- znode_t *sdzp, *tdzp;
- struct componentname scn, tcn;
- vnode_t *svp, *tvp;
- kthread_t *td = curthread;
- int error;
- int vflg = 0;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
- return (error);
-
- if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
- VN_RELE(ZTOV(sdzp));
- return (error);
- }
-
- if (lr->lr_common.lrc_txtype & TX_CI)
- vflg |= FIGNORECASE;
- svp = tvp = NULL;
-
- scn.cn_nameptr = sname;
- scn.cn_namelen = strlen(sname);
- scn.cn_nameiop = DELETE;
- scn.cn_flags = ISLASTCN | SAVENAME;
- scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- scn.cn_cred = kcred;
- scn.cn_thread = td;
- vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
- VOP_UNLOCK(ZTOV(sdzp));
- if (error != 0)
- goto fail;
- VOP_UNLOCK(svp);
-
- tcn.cn_nameptr = tname;
- tcn.cn_namelen = strlen(tname);
- tcn.cn_nameiop = RENAME;
- tcn.cn_flags = ISLASTCN | SAVENAME;
- tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- tcn.cn_cred = kcred;
- tcn.cn_thread = td;
- vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
- if (error == EJUSTRETURN)
- tvp = NULL;
- else if (error != 0) {
- VOP_UNLOCK(ZTOV(tdzp));
- goto fail;
- }
-
- error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/);
- return (error);
-fail:
- if (svp != NULL)
- vrele(svp);
- if (tvp != NULL)
- vrele(tvp);
- VN_RELE(ZTOV(tdzp));
- VN_RELE(ZTOV(sdzp));
-
- return (error);
-}
-
-static int
-zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_write_t *lr = arg2;
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- znode_t *zp;
- int error;
- ssize_t resid;
- uint64_t eod, offset, length;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log writes out of order, it's possible the
- * file has been removed. In this case just drop the write
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
- return (error);
- }
-
- offset = lr->lr_offset;
- length = lr->lr_length;
- eod = offset + length; /* end of data for this write */
-
- /*
- * This may be a write from a dmu_sync() for a whole block,
- * and may extend beyond the current end of the file.
- * We can't just replay what was written for this TX_WRITE as
- * a future TX_WRITE2 may extend the eof and the data for that
- * write needs to be there. So we write the whole block and
- * reduce the eof. This needs to be done within the single dmu
- * transaction created within vn_rdwr -> zfs_write. So a possible
- * new end of file is passed through in zfsvfs->z_replay_eof
- */
-
- zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
-
- /* If it's a dmu_sync() block, write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
- uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
- if (length < blocksize) {
- offset -= offset % blocksize;
- length = blocksize;
- }
- if (zp->z_size < eod)
- zfsvfs->z_replay_eof = eod;
- }
-
- error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
- UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
-
- VN_RELE(ZTOV(zp));
- zfsvfs->z_replay_eof = 0; /* safety */
-
- return (error);
-}
-
-/*
- * TX_WRITE2 are only generated when dmu_sync() returns EALREADY
- * meaning the pool block is already being synced. So now that we always write
- * out full blocks, all we have to do is expand the eof if
- * the file is grown.
- */
-static int
-zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_write_t *lr = arg2;
- znode_t *zp;
- int error;
- uint64_t end;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
- return (error);
-
-top:
- end = lr->lr_offset + lr->lr_length;
- if (end > zp->z_size) {
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
-
- zp->z_size = end;
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- VN_RELE(ZTOV(zp));
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- return (error);
- }
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
- (void *)&zp->z_size, sizeof (uint64_t), tx);
-
- /* Ensure the replayed seq is updated */
- (void) zil_replaying(zfsvfs->z_log, tx);
-
- dmu_tx_commit(tx);
- }
-
- VN_RELE(ZTOV(zp));
-
- return (error);
-}
-
-static int
-zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
-{
-#ifdef illumos
- zfsvfs_t *zfsvfs = arg1;
- lr_truncate_t *lr = arg2;
- znode_t *zp;
- flock64_t fl;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
- return (error);
-
- bzero(&fl, sizeof (fl));
- fl.l_type = F_WRLCK;
- fl.l_whence = 0;
- fl.l_start = lr->lr_offset;
- fl.l_len = lr->lr_length;
-
- error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
- lr->lr_offset, kcred, NULL);
-
- VN_RELE(ZTOV(zp));
-
- return (error);
-#else
- ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
- return (EOPNOTSUPP);
-#endif
-}
-
-static int
-zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_setattr_t *lr = arg2;
- znode_t *zp;
- xvattr_t xva;
- vattr_t *vap = &xva.xva_vattr;
- vnode_t *vp;
- int error;
- void *start;
-
- xva_init(&xva);
- if (byteswap) {
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((lr->lr_mask & AT_XVATTR) &&
- zfsvfs->z_version >= ZPL_VERSION_INITIAL)
- zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
- }
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
- return (error);
-
- zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
- lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
-
- vap->va_size = lr->lr_size;
- ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
-
- /*
- * Fill in xvattr_t portions if necessary.
- */
-
- start = (lr_setattr_t *)(lr + 1);
- if (vap->va_mask & AT_XVATTR) {
- zfs_replay_xvattr((lr_attr_t *)start, &xva);
- start = (caddr_t)start +
- ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
- } else
- xva.xva_vattr.va_mask &= ~AT_XVATTR;
-
- zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
- lr->lr_uid, lr->lr_gid);
-
- vp = ZTOV(zp);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- error = VOP_SETATTR(vp, vap, kcred);
- VOP_UNLOCK(vp);
-
- zfs_fuid_info_free(zfsvfs->z_fuid_replay);
- zfsvfs->z_fuid_replay = NULL;
- VN_RELE(vp);
-
- return (error);
-}
-
-extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
- caller_context_t *ct);
-
-static int
-zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_acl_v0_t *lr = arg2;
- ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
- vsecattr_t vsa;
- vnode_t *vp;
- znode_t *zp;
- int error;
-
- if (byteswap) {
- byteswap_uint64_array(lr, sizeof (*lr));
- zfs_oldace_byteswap(ace, lr->lr_aclcnt);
- }
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
- return (error);
-
- bzero(&vsa, sizeof (vsa));
- vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
- vsa.vsa_aclcnt = lr->lr_aclcnt;
- vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
- vsa.vsa_aclflags = 0;
- vsa.vsa_aclentp = ace;
-
- vp = ZTOV(zp);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
- VOP_UNLOCK(vp);
-
- VN_RELE(vp);
-
- return (error);
-}
-
-/*
- * Replaying ACLs is complicated by FUID support.
- * The log record may contain some optional data
- * to be used for replaying FUID's. These pieces
- * are the actual FUIDs that were created initially.
- * The FUID table index may no longer be valid and
- * during zfs_create() a new index may be assigned.
- * Because of this the log will contain the original
- * doman+rid in order to create a new FUID.
- *
- * The individual ACEs may contain an ephemeral uid/gid which is no
- * longer valid and will need to be replaced with an actual FUID.
- *
- */
-static int
-zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
-{
- zfsvfs_t *zfsvfs = arg1;
- lr_acl_t *lr = arg2;
- ace_t *ace = (ace_t *)(lr + 1);
- vsecattr_t vsa;
- znode_t *zp;
- vnode_t *vp;
- int error;
-
- if (byteswap) {
- byteswap_uint64_array(lr, sizeof (*lr));
- zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
- if (lr->lr_fuidcnt) {
- byteswap_uint64_array((caddr_t)ace +
- ZIL_ACE_LENGTH(lr->lr_acl_bytes),
- lr->lr_fuidcnt * sizeof (uint64_t));
- }
- }
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
- return (error);
-
- bzero(&vsa, sizeof (vsa));
- vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
- vsa.vsa_aclcnt = lr->lr_aclcnt;
- vsa.vsa_aclentp = ace;
- vsa.vsa_aclentsz = lr->lr_acl_bytes;
- vsa.vsa_aclflags = lr->lr_acl_flags;
-
- if (lr->lr_fuidcnt) {
- void *fuidstart = (caddr_t)ace +
- ZIL_ACE_LENGTH(lr->lr_acl_bytes);
-
- zfsvfs->z_fuid_replay =
- zfs_replay_fuids(fuidstart, &fuidstart,
- lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
- }
-
- vp = ZTOV(zp);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
- VOP_UNLOCK(vp);
-
- if (zfsvfs->z_fuid_replay)
- zfs_fuid_info_free(zfsvfs->z_fuid_replay);
-
- zfsvfs->z_fuid_replay = NULL;
- VN_RELE(vp);
-
- return (error);
-}
-
-/*
- * Callback vectors for replaying records
- */
-zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
- zfs_replay_error, /* 0 no such transaction type */
- zfs_replay_create, /* TX_CREATE */
- zfs_replay_create, /* TX_MKDIR */
- zfs_replay_create, /* TX_MKXATTR */
- zfs_replay_create, /* TX_SYMLINK */
- zfs_replay_remove, /* TX_REMOVE */
- zfs_replay_remove, /* TX_RMDIR */
- zfs_replay_link, /* TX_LINK */
- zfs_replay_rename, /* TX_RENAME */
- zfs_replay_write, /* TX_WRITE */
- zfs_replay_truncate, /* TX_TRUNCATE */
- zfs_replay_setattr, /* TX_SETATTR */
- zfs_replay_acl_v0, /* TX_ACL_V0 */
- zfs_replay_acl, /* TX_ACL */
- zfs_replay_create_acl, /* TX_CREATE_ACL */
- zfs_replay_create, /* TX_CREATE_ATTR */
- zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */
- zfs_replay_create_acl, /* TX_MKDIR_ACL */
- zfs_replay_create, /* TX_MKDIR_ATTR */
- zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
- zfs_replay_write2, /* TX_WRITE2 */
-};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -1,641 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- */
-
-/*
- * This file contains the code to implement file range locking in
- * ZFS, although there isn't much specific to ZFS (all that comes to mind is
- * support for growing the blocksize).
- *
- * Interface
- * ---------
- * Defined in zfs_rlock.h but essentially:
- * lr = rangelock_enter(zp, off, len, lock_type);
- * rangelock_reduce(lr, off, len); // optional
- * rangelock_exit(lr);
- *
- * AVL tree
- * --------
- * An AVL tree is used to maintain the state of the existing ranges
- * that are locked for exclusive (writer) or shared (reader) use.
- * The starting range offset is used for searching and sorting the tree.
- *
- * Common case
- * -----------
- * The (hopefully) usual case is of no overlaps or contention for locks. On
- * entry to rangelock_enter(), a locked_range_t is allocated; the tree
- * searched that finds no overlap, and *this* locked_range_t is placed in the
- * tree.
- *
- * Overlaps/Reference counting/Proxy locks
- * ---------------------------------------
- * The avl code only allows one node at a particular offset. Also it's very
- * inefficient to search through all previous entries looking for overlaps
- * (because the very 1st in the ordered list might be at offset 0 but
- * cover the whole file).
- * So this implementation uses reference counts and proxy range locks.
- * Firstly, only reader locks use reference counts and proxy locks,
- * because writer locks are exclusive.
- * When a reader lock overlaps with another then a proxy lock is created
- * for that range and replaces the original lock. If the overlap
- * is exact then the reference count of the proxy is simply incremented.
- * Otherwise, the proxy lock is split into smaller lock ranges and
- * new proxy locks created for non overlapping ranges.
- * The reference counts are adjusted accordingly.
- * Meanwhile, the orginal lock is kept around (this is the callers handle)
- * and its offset and length are used when releasing the lock.
- *
- * Thread coordination
- * -------------------
- * In order to make wakeups efficient and to ensure multiple continuous
- * readers on a range don't starve a writer for the same range lock,
- * two condition variables are allocated in each rl_t.
- * If a writer (or reader) can't get a range it initialises the writer
- * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
- * and waits on that cv. When a thread unlocks that range it wakes up all
- * writers then all readers before destroying the lock.
- *
- * Append mode writes
- * ------------------
- * Append mode writes need to lock a range at the end of a file.
- * The offset of the end of the file is determined under the
- * range locking mutex, and the lock type converted from RL_APPEND to
- * RL_WRITER and the range locked.
- *
- * Grow block handling
- * -------------------
- * ZFS supports multiple block sizes, up to 16MB. The smallest
- * block size is used for the file which is grown as needed. During this
- * growth all other writers and readers must be excluded.
- * So if the block size needs to be grown then the whole file is
- * exclusively locked, then later the caller will reduce the lock
- * range to just the range to be written using rangelock_reduce().
- */
-
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/zfs_rlock.h>
-
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-static int
-rangelock_compare(const void *arg1, const void *arg2)
-{
- const locked_range_t *rl1 = (const locked_range_t *)arg1;
- const locked_range_t *rl2 = (const locked_range_t *)arg2;
-
- return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
-}
-
-/*
- * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
- * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
- * and may increase the range that's locked for RL_WRITER.
- */
-void
-rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
-{
- mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&rl->rl_tree, rangelock_compare,
- sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
- rl->rl_cb = cb;
- rl->rl_arg = arg;
-}
-
-void
-rangelock_fini(rangelock_t *rl)
-{
- mutex_destroy(&rl->rl_lock);
- avl_destroy(&rl->rl_tree);
-}
-
-/*
- * Check if a write lock can be grabbed. If not, fail immediately or sleep and
- * recheck until available, depending on the value of the "nonblock" parameter.
- */
-static boolean_t
-rangelock_enter_writer(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
-{
- avl_tree_t *tree = &rl->rl_tree;
- locked_range_t *lr;
- avl_index_t where;
- uint64_t orig_off = new->lr_offset;
- uint64_t orig_len = new->lr_length;
- rangelock_type_t orig_type = new->lr_type;
-
- for (;;) {
- /*
- * Call callback which can modify new->r_off,len,type.
- * Note, the callback is used by the ZPL to handle appending
- * and changing blocksizes. It isn't needed for zvols.
- */
- if (rl->rl_cb != NULL) {
- rl->rl_cb(new, rl->rl_arg);
- }
-
- /*
- * If the type was APPEND, the callback must convert it to
- * WRITER.
- */
- ASSERT3U(new->lr_type, ==, RL_WRITER);
-
- /*
- * First check for the usual case of no locks
- */
- if (avl_numnodes(tree) == 0) {
- avl_add(tree, new);
- return (B_TRUE);
- }
-
- /*
- * Look for any locks in the range.
- */
- lr = avl_find(tree, new, &where);
- if (lr != NULL)
- goto wait; /* already locked at same offset */
-
- lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
- if (lr != NULL &&
- lr->lr_offset < new->lr_offset + new->lr_length)
- goto wait;
-
- lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
- if (lr != NULL &&
- lr->lr_offset + lr->lr_length > new->lr_offset)
- goto wait;
-
- avl_insert(tree, new, where);
- return (B_TRUE);
-wait:
- if (nonblock)
- return (B_FALSE);
- if (!lr->lr_write_wanted) {
- cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
- lr->lr_write_wanted = B_TRUE;
- }
- cv_wait(&lr->lr_write_cv, &rl->rl_lock);
-
- /* reset to original */
- new->lr_offset = orig_off;
- new->lr_length = orig_len;
- new->lr_type = orig_type;
- }
-}
-
-/*
- * If this is an original (non-proxy) lock then replace it by
- * a proxy and return the proxy.
- */
-static locked_range_t *
-rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
-{
- locked_range_t *proxy;
-
- if (lr->lr_proxy)
- return (lr); /* already a proxy */
-
- ASSERT3U(lr->lr_count, ==, 1);
- ASSERT(lr->lr_write_wanted == B_FALSE);
- ASSERT(lr->lr_read_wanted == B_FALSE);
- avl_remove(tree, lr);
- lr->lr_count = 0;
-
- /* create a proxy range lock */
- proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
- proxy->lr_offset = lr->lr_offset;
- proxy->lr_length = lr->lr_length;
- proxy->lr_count = 1;
- proxy->lr_type = RL_READER;
- proxy->lr_proxy = B_TRUE;
- proxy->lr_write_wanted = B_FALSE;
- proxy->lr_read_wanted = B_FALSE;
- avl_add(tree, proxy);
-
- return (proxy);
-}
-
-/*
- * Split the range lock at the supplied offset
- * returning the *front* proxy.
- */
-static locked_range_t *
-rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
-{
- ASSERT3U(lr->lr_length, >, 1);
- ASSERT3U(off, >, lr->lr_offset);
- ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
- ASSERT(lr->lr_write_wanted == B_FALSE);
- ASSERT(lr->lr_read_wanted == B_FALSE);
-
- /* create the rear proxy range lock */
- locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
- rear->lr_offset = off;
- rear->lr_length = lr->lr_offset + lr->lr_length - off;
- rear->lr_count = lr->lr_count;
- rear->lr_type = RL_READER;
- rear->lr_proxy = B_TRUE;
- rear->lr_write_wanted = B_FALSE;
- rear->lr_read_wanted = B_FALSE;
-
- locked_range_t *front = rangelock_proxify(tree, lr);
- front->lr_length = off - lr->lr_offset;
-
- avl_insert_here(tree, rear, front, AVL_AFTER);
- return (front);
-}
-
-/*
- * Create and add a new proxy range lock for the supplied range.
- */
-static void
-rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
-{
- ASSERT(len != 0);
- locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
- lr->lr_offset = off;
- lr->lr_length = len;
- lr->lr_count = 1;
- lr->lr_type = RL_READER;
- lr->lr_proxy = B_TRUE;
- lr->lr_write_wanted = B_FALSE;
- lr->lr_read_wanted = B_FALSE;
- avl_add(tree, lr);
-}
-
-static void
-rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
- locked_range_t *prev, avl_index_t where)
-{
- locked_range_t *next;
- uint64_t off = new->lr_offset;
- uint64_t len = new->lr_length;
-
- /*
- * prev arrives either:
- * - pointing to an entry at the same offset
- * - pointing to the entry with the closest previous offset whose
- * range may overlap with the new range
- * - null, if there were no ranges starting before the new one
- */
- if (prev != NULL) {
- if (prev->lr_offset + prev->lr_length <= off) {
- prev = NULL;
- } else if (prev->lr_offset != off) {
- /*
- * convert to proxy if needed then
- * split this entry and bump ref count
- */
- prev = rangelock_split(tree, prev, off);
- prev = AVL_NEXT(tree, prev); /* move to rear range */
- }
- }
- ASSERT((prev == NULL) || (prev->lr_offset == off));
-
- if (prev != NULL)
- next = prev;
- else
- next = avl_nearest(tree, where, AVL_AFTER);
-
- if (next == NULL || off + len <= next->lr_offset) {
- /* no overlaps, use the original new rl_t in the tree */
- avl_insert(tree, new, where);
- return;
- }
-
- if (off < next->lr_offset) {
- /* Add a proxy for initial range before the overlap */
- rangelock_new_proxy(tree, off, next->lr_offset - off);
- }
-
- new->lr_count = 0; /* will use proxies in tree */
- /*
- * We now search forward through the ranges, until we go past the end
- * of the new range. For each entry we make it a proxy if it
- * isn't already, then bump its reference count. If there's any
- * gaps between the ranges then we create a new proxy range.
- */
- for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
- if (off + len <= next->lr_offset)
- break;
- if (prev != NULL && prev->lr_offset + prev->lr_length <
- next->lr_offset) {
- /* there's a gap */
- ASSERT3U(next->lr_offset, >,
- prev->lr_offset + prev->lr_length);
- rangelock_new_proxy(tree,
- prev->lr_offset + prev->lr_length,
- next->lr_offset -
- (prev->lr_offset + prev->lr_length));
- }
- if (off + len == next->lr_offset + next->lr_length) {
- /* exact overlap with end */
- next = rangelock_proxify(tree, next);
- next->lr_count++;
- return;
- }
- if (off + len < next->lr_offset + next->lr_length) {
- /* new range ends in the middle of this block */
- next = rangelock_split(tree, next, off + len);
- next->lr_count++;
- return;
- }
- ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
- next = rangelock_proxify(tree, next);
- next->lr_count++;
- }
-
- /* Add the remaining end range. */
- rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
- (off + len) - (prev->lr_offset + prev->lr_length));
-}
-
-/*
- * Check if a reader lock can be grabbed. If not, fail immediately or sleep and
- * recheck until available, depending on the value of the "nonblock" parameter.
- */
-static boolean_t
-rangelock_enter_reader(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
-{
- avl_tree_t *tree = &rl->rl_tree;
- locked_range_t *prev, *next;
- avl_index_t where;
- uint64_t off = new->lr_offset;
- uint64_t len = new->lr_length;
-
- /*
- * Look for any writer locks in the range.
- */
-retry:
- prev = avl_find(tree, new, &where);
- if (prev == NULL)
- prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
-
- /*
- * Check the previous range for a writer lock overlap.
- */
- if (prev && (off < prev->lr_offset + prev->lr_length)) {
- if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
- if (nonblock)
- return (B_FALSE);
- if (!prev->lr_read_wanted) {
- cv_init(&prev->lr_read_cv,
- NULL, CV_DEFAULT, NULL);
- prev->lr_read_wanted = B_TRUE;
- }
- cv_wait(&prev->lr_read_cv, &rl->rl_lock);
- goto retry;
- }
- if (off + len < prev->lr_offset + prev->lr_length)
- goto got_lock;
- }
-
- /*
- * Search through the following ranges to see if there's
- * write lock any overlap.
- */
- if (prev != NULL)
- next = AVL_NEXT(tree, prev);
- else
- next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
- for (; next != NULL; next = AVL_NEXT(tree, next)) {
- if (off + len <= next->lr_offset)
- goto got_lock;
- if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
- if (nonblock)
- return (B_FALSE);
- if (!next->lr_read_wanted) {
- cv_init(&next->lr_read_cv,
- NULL, CV_DEFAULT, NULL);
- next->lr_read_wanted = B_TRUE;
- }
- cv_wait(&next->lr_read_cv, &rl->rl_lock);
- goto retry;
- }
- if (off + len <= next->lr_offset + next->lr_length)
- goto got_lock;
- }
-
-got_lock:
- /*
- * Add the read lock, which may involve splitting existing
- * locks and bumping ref counts (r_count).
- */
- rangelock_add_reader(tree, new, prev, where);
- return (B_TRUE);
-}
-
-/*
- * Lock a range (offset, length) as either shared (RL_READER) or exclusive
- * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
- * it to a RL_WRITER lock (with the offset at the end of the file). Returns
- * the range lock structure for later unlocking (or reduce range if the
- * entire file is locked as RL_WRITER).
- */
-static locked_range_t *
-_rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
- rangelock_type_t type, boolean_t nonblock)
-{
- ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
-
- locked_range_t *new = kmem_alloc(sizeof (*new), KM_SLEEP);
- new->lr_rangelock = rl;
- new->lr_offset = off;
- if (len + off < off) /* overflow */
- len = UINT64_MAX - off;
- new->lr_length = len;
- new->lr_count = 1; /* assume it's going to be in the tree */
- new->lr_type = type;
- new->lr_proxy = B_FALSE;
- new->lr_write_wanted = B_FALSE;
- new->lr_read_wanted = B_FALSE;
-
- mutex_enter(&rl->rl_lock);
- if (type == RL_READER) {
- /*
- * First check for the usual case of no locks
- */
- if (avl_numnodes(&rl->rl_tree) == 0) {
- avl_add(&rl->rl_tree, new);
- } else if (!rangelock_enter_reader(rl, new, nonblock)) {
- kmem_free(new, sizeof (*new));
- new = NULL;
- }
- } else if (!rangelock_enter_writer(rl, new, nonblock)) {
- kmem_free(new, sizeof (*new));
- new = NULL;
- }
- mutex_exit(&rl->rl_lock);
- return (new);
-}
-
-locked_range_t *
-rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
- rangelock_type_t type)
-{
- return (_rangelock_enter(rl, off, len, type, B_FALSE));
-}
-
-locked_range_t *
-rangelock_tryenter(rangelock_t *rl, uint64_t off, uint64_t len,
- rangelock_type_t type)
-{
- return (_rangelock_enter(rl, off, len, type, B_TRUE));
-}
-
-/*
- * Unlock a reader lock
- */
-static void
-rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove)
-{
- avl_tree_t *tree = &rl->rl_tree;
- uint64_t len;
-
- /*
- * The common case is when the remove entry is in the tree
- * (cnt == 1) meaning there's been no other reader locks overlapping
- * with this one. Otherwise the remove entry will have been
- * removed from the tree and replaced by proxies (one or
- * more ranges mapping to the entire range).
- */
- if (remove->lr_count == 1) {
- avl_remove(tree, remove);
- if (remove->lr_write_wanted) {
- cv_broadcast(&remove->lr_write_cv);
- cv_destroy(&remove->lr_write_cv);
- }
- if (remove->lr_read_wanted) {
- cv_broadcast(&remove->lr_read_cv);
- cv_destroy(&remove->lr_read_cv);
- }
- } else {
- ASSERT0(remove->lr_count);
- ASSERT0(remove->lr_write_wanted);
- ASSERT0(remove->lr_read_wanted);
- /*
- * Find start proxy representing this reader lock,
- * then decrement ref count on all proxies
- * that make up this range, freeing them as needed.
- */
- locked_range_t *lr = avl_find(tree, remove, NULL);
- ASSERT3P(lr, !=, NULL);
- ASSERT3U(lr->lr_count, !=, 0);
- ASSERT3U(lr->lr_type, ==, RL_READER);
- locked_range_t *next = NULL;
- for (len = remove->lr_length; len != 0; lr = next) {
- len -= lr->lr_length;
- if (len != 0) {
- next = AVL_NEXT(tree, lr);
- ASSERT3P(next, !=, NULL);
- ASSERT3U(lr->lr_offset + lr->lr_length, ==,
- next->lr_offset);
- ASSERT3U(next->lr_count, !=, 0);
- ASSERT3U(next->lr_type, ==, RL_READER);
- }
- lr->lr_count--;
- if (lr->lr_count == 0) {
- avl_remove(tree, lr);
- if (lr->lr_write_wanted) {
- cv_broadcast(&lr->lr_write_cv);
- cv_destroy(&lr->lr_write_cv);
- }
- if (lr->lr_read_wanted) {
- cv_broadcast(&lr->lr_read_cv);
- cv_destroy(&lr->lr_read_cv);
- }
- kmem_free(lr, sizeof (locked_range_t));
- }
- }
- }
- kmem_free(remove, sizeof (locked_range_t));
-}
-
-/*
- * Unlock range and destroy range lock structure.
- */
-void
-rangelock_exit(locked_range_t *lr)
-{
- rangelock_t *rl = lr->lr_rangelock;
-
- ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
- ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
- ASSERT(!lr->lr_proxy);
-
- mutex_enter(&rl->rl_lock);
- if (lr->lr_type == RL_WRITER) {
- /* writer locks can't be shared or split */
- avl_remove(&rl->rl_tree, lr);
- mutex_exit(&rl->rl_lock);
- if (lr->lr_write_wanted) {
- cv_broadcast(&lr->lr_write_cv);
- cv_destroy(&lr->lr_write_cv);
- }
- if (lr->lr_read_wanted) {
- cv_broadcast(&lr->lr_read_cv);
- cv_destroy(&lr->lr_read_cv);
- }
- kmem_free(lr, sizeof (locked_range_t));
- } else {
- /*
- * lock may be shared, let rangelock_exit_reader()
- * release the lock and free the rl_t
- */
- rangelock_exit_reader(rl, lr);
- mutex_exit(&rl->rl_lock);
- }
-}
-
-/*
- * Reduce range locked as RL_WRITER from whole file to specified range.
- * Asserts the whole file is exclusively locked and so there's only one
- * entry in the tree.
- */
-void
-rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
-{
- rangelock_t *rl = lr->lr_rangelock;
-
- /* Ensure there are no other locks */
- ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
- ASSERT3U(lr->lr_offset, ==, 0);
- ASSERT3U(lr->lr_type, ==, RL_WRITER);
- ASSERT(!lr->lr_proxy);
- ASSERT3U(lr->lr_length, ==, UINT64_MAX);
- ASSERT3U(lr->lr_count, ==, 1);
-
- mutex_enter(&rl->rl_lock);
- lr->lr_offset = off;
- lr->lr_length = len;
- mutex_exit(&rl->rl_lock);
- if (lr->lr_write_wanted)
- cv_broadcast(&lr->lr_write_cv);
- if (lr->lr_read_wanted)
- cv_broadcast(&lr->lr_read_cv);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
@@ -1,326 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/vnode.h>
-#include <sys/sa.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_sa.h>
-
-/*
- * ZPL attribute registration table.
- * Order of attributes doesn't matter
- * a unique value will be assigned for each
- * attribute that is file system specific
- *
- * This is just the set of ZPL attributes that this
- * version of ZFS deals with natively. The file system
- * could have other attributes stored in files, but they will be
- * ignored. The SA framework will preserve them, just that
- * this version of ZFS won't change or delete them.
- */
-
-sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
- {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
- {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
- {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
- {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
- {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
- {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
- {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
- {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
- {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
- {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
- {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
- {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
- {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
- {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
- {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
- {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
- {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
- {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
- {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
- {"ZPL_DACL_ACES", 0, SA_ACL, 0},
- {NULL, 0, 0, 0}
-};
-
-#ifdef _KERNEL
-
-int
-zfs_sa_readlink(znode_t *zp, uio_t *uio)
-{
- dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
- size_t bufsz;
- int error;
-
- bufsz = zp->z_size;
- if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
- error = uiomove((caddr_t)db->db_data +
- ZFS_OLD_ZNODE_PHYS_SIZE,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- } else {
- dmu_buf_t *dbp;
- if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id,
- 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
- error = uiomove(dbp->db_data,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp, FTAG);
- }
- }
- return (error);
-}
-
-void
-zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
-{
- dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
-
- if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
- VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
- if (len) {
- bcopy(link, (caddr_t)db->db_data +
- ZFS_OLD_ZNODE_PHYS_SIZE, len);
- }
- } else {
- dmu_buf_t *dbp;
-
- zfs_grow_blocksize(zp, len, tx);
- VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os,
- zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
-
- dmu_buf_will_dirty(dbp, tx);
-
- ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp, FTAG);
- }
-}
-
-void
-zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- xoptattr_t *xoap;
-
- ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
- VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
- if (zp->z_is_sa) {
- if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
- &xoap->xoa_av_scanstamp,
- sizeof (xoap->xoa_av_scanstamp)) != 0)
- return;
- } else {
- dmu_object_info_t doi;
- dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
- int len;
-
- if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
- return;
-
- sa_object_info(zp->z_sa_hdl, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- ZFS_OLD_ZNODE_PHYS_SIZE;
-
- if (len <= doi.doi_bonus_size) {
- (void) memcpy(xoap->xoa_av_scanstamp,
- (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
- sizeof (xoap->xoa_av_scanstamp));
- }
- }
- XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
-}
-
-void
-zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- xoptattr_t *xoap;
-
- ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
- VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
- if (zp->z_is_sa)
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
- &xoap->xoa_av_scanstamp,
- sizeof (xoap->xoa_av_scanstamp), tx));
- else {
- dmu_object_info_t doi;
- dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
- int len;
-
- sa_object_info(zp->z_sa_hdl, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- ZFS_OLD_ZNODE_PHYS_SIZE;
- if (len > doi.doi_bonus_size)
- VERIFY(dmu_set_bonus(db, len, tx) == 0);
- (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
- xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
-
- zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
- &zp->z_pflags, sizeof (uint64_t), tx));
- }
-}
-
-/*
- * I'm not convinced we should do any of this upgrade.
- * since the SA code can read both old/new znode formats
- * with probably little to no performance difference.
- *
- * All new files will be created with the new format.
- */
-
-void
-zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
-{
- dmu_buf_t *db = sa_get_db(hdl);
- znode_t *zp = sa_get_userdata(hdl);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- sa_bulk_attr_t bulk[20];
- int count = 0;
- sa_bulk_attr_t sa_attrs[20] = { 0 };
- zfs_acl_locator_cb_t locate = { 0 };
- uint64_t uid, gid, mode, rdev, xattr, parent;
- uint64_t crtime[2], mtime[2], ctime[2];
- zfs_acl_phys_t znode_acl;
- char scanstamp[AV_SCANSTAMP_SZ];
-
- /*
- * No upgrade if ACL isn't cached
- * since we won't know which locks are held
- * and ready the ACL would require special "locked"
- * interfaces that would be messy
- */
- if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
- return;
-
- /*
- * If the vnode lock is held and we aren't the owner
- * then just return since we don't want to deadlock
- * trying to update the status of z_is_sa. This
- * file can then be upgraded at a later time.
- *
- * Otherwise, we know we are doing the
- * sa_update() that caused us to enter this function.
- */
- if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
- return;
-
- /* First do a bulk query of the attributes that aren't cached */
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
- &znode_acl, 88);
-
- if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
- goto done;
-
-
- /*
- * While the order here doesn't matter its best to try and organize
- * it is such a way to pick up an already existing layout number
- */
- count = 0;
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
- NULL, &zp->z_gen, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
- NULL, &parent, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
- zp->z_atime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
- &mtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
- &crtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
- &zp->z_links, 8);
- if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR)
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
- &rdev, 8);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
- &zp->z_acl_cached->z_acl_count, 8);
-
- if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
- zfs_acl_xform(zp, zp->z_acl_cached, CRED());
-
- locate.cb_aclp = zp->z_acl_cached;
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
- zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
-
- if (xattr)
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
- NULL, &xattr, 8);
-
- /* if scanstamp then add scanstamp */
-
- if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
- bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
- scanstamp, AV_SCANSTAMP_SZ);
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
- NULL, scanstamp, AV_SCANSTAMP_SZ);
- zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
- }
-
- VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
- VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
- count, tx) == 0);
- if (znode_acl.z_acl_extern_obj)
- VERIFY(0 == dmu_object_free(zfsvfs->z_os,
- znode_acl.z_acl_extern_obj, tx));
-
- zp->z_is_sa = B_TRUE;
-done:
- VOP_UNLOCK(ZTOV(zp));
-}
-
-void
-zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
-{
- if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
- return;
-
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
-
- if (zfs_external_acl(zp)) {
- dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
- DMU_OBJECT_END);
- }
-}
-
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -1,2813 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/sysmacros.h>
-#include <sys/kmem.h>
-#include <sys/acl.h>
-#include <sys/vnode.h>
-#include <sys/vfs.h>
-#include <sys/mntent.h>
-#include <sys/mount.h>
-#include <sys/cmn_err.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_deleg.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/varargs.h>
-#include <sys/policy.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/sunddi.h>
-#include <sys/dnlc.h>
-#include <sys/dmu_objset.h>
-#include <sys/spa_boot.h>
-#include <sys/jail.h>
-#include <ufs/ufs/quota.h>
-
-#include "zfs_comutil.h"
-
-struct mtx zfs_debug_mtx;
-MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
-
-SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS file system");
-
-int zfs_super_owner;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
- "File system owner can perform privileged operation on his file systems");
-
-int zfs_debug_level;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
- "Debug level");
-
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "ZFS versions");
-static int zfs_version_acl = ZFS_ACL_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
- "ZFS_ACL_VERSION");
-static int zfs_version_spa = SPA_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
- "SPA_VERSION");
-static int zfs_version_zpl = ZPL_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
- "ZPL_VERSION");
-
-static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
-static int zfs_mount(vfs_t *vfsp);
-static int zfs_umount(vfs_t *vfsp, int fflag);
-static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
-static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
-static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
-static int zfs_sync(vfs_t *vfsp, int waitfor);
-static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
- struct ucred **credanonp, int *numsecflavors, int *secflavors);
-static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
-static void zfs_objset_close(zfsvfs_t *zfsvfs);
-static void zfs_freevfs(vfs_t *vfsp);
-
-struct vfsops zfs_vfsops = {
- .vfs_mount = zfs_mount,
- .vfs_unmount = zfs_umount,
- .vfs_root = vfs_cache_root,
- .vfs_cachedroot = zfs_root,
- .vfs_statfs = zfs_statfs,
- .vfs_vget = zfs_vget,
- .vfs_sync = zfs_sync,
- .vfs_checkexp = zfs_checkexp,
- .vfs_fhtovp = zfs_fhtovp,
- .vfs_quotactl = zfs_quotactl,
-};
-
-VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
-
-/*
- * We need to keep a count of active fs's.
- * This is necessary to prevent our module
- * from being unloaded after a umount -f
- */
-static uint32_t zfs_active_fs_count = 0;
-
-static int
-zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
-{
- int error = 0;
- char buf[32];
- int err;
- uint64_t usedobj, quotaobj;
- uint64_t quota, used = 0;
- timespec_t now;
-
- usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
- quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
-
- if (quotaobj == 0 || zfsvfs->z_replay) {
- error = EINVAL;
- goto done;
- }
- (void)sprintf(buf, "%llx", (longlong_t)id);
- if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
- buf, sizeof(quota), 1, &quota)) != 0) {
- dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__);
- goto done;
- }
- /*
- * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
- * So we set them to be the same.
- */
- dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
- error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof(used), 1, &used);
- if (error && error != ENOENT) {
- dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error);
- goto done;
- }
- dqp->dqb_curblocks = btodb(used);
- dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
- vfs_timestamp(&now);
- /*
- * Setting this to 0 causes FreeBSD quota(8) to print
- * the number of days since the epoch, which isn't
- * particularly useful.
- */
- dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
-done:
- return (error);
-}
-
-static int
-zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- struct thread *td;
- int cmd, type, error = 0;
- int bitsize;
- uint64_t fuid;
- zfs_userquota_prop_t quota_type;
- struct dqblk64 dqblk = { 0 };
-
- td = curthread;
- cmd = cmds >> SUBCMDSHIFT;
- type = cmds & SUBCMDMASK;
-
- ZFS_ENTER(zfsvfs);
- if (id == -1) {
- switch (type) {
- case USRQUOTA:
- id = td->td_ucred->cr_ruid;
- break;
- case GRPQUOTA:
- id = td->td_ucred->cr_rgid;
- break;
- default:
- error = EINVAL;
- if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
- vfs_unbusy(vfsp);
- goto done;
- }
- }
- /*
- * Map BSD type to:
- * ZFS_PROP_USERUSED,
- * ZFS_PROP_USERQUOTA,
- * ZFS_PROP_GROUPUSED,
- * ZFS_PROP_GROUPQUOTA
- */
- switch (cmd) {
- case Q_SETQUOTA:
- case Q_SETQUOTA32:
- if (type == USRQUOTA)
- quota_type = ZFS_PROP_USERQUOTA;
- else if (type == GRPQUOTA)
- quota_type = ZFS_PROP_GROUPQUOTA;
- else
- error = EINVAL;
- break;
- case Q_GETQUOTA:
- case Q_GETQUOTA32:
- if (type == USRQUOTA)
- quota_type = ZFS_PROP_USERUSED;
- else if (type == GRPQUOTA)
- quota_type = ZFS_PROP_GROUPUSED;
- else
- error = EINVAL;
- break;
- }
-
- /*
- * Depending on the cmd, we may need to get
- * the ruid and domain (see fuidstr_to_sid?),
- * the fuid (how?), or other information.
- * Create fuid using zfs_fuid_create(zfsvfs, id,
- * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
- * I think I can use just the id?
- *
- * Look at zfs_fuid_overquota() to look up a quota.
- * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, &quota)
- *
- * See zfs_set_userquota() to set a quota.
- */
- if ((u_int)type >= MAXQUOTAS) {
- error = EINVAL;
- goto done;
- }
-
- switch (cmd) {
- case Q_GETQUOTASIZE:
- bitsize = 64;
- error = copyout(&bitsize, arg, sizeof(int));
- break;
- case Q_QUOTAON:
- // As far as I can tell, you can't turn quotas on or off on zfs
- error = 0;
- vfs_unbusy(vfsp);
- break;
- case Q_QUOTAOFF:
- error = ENOTSUP;
- vfs_unbusy(vfsp);
- break;
- case Q_SETQUOTA:
- error = copyin(arg, &dqblk, sizeof(dqblk));
- if (error == 0)
- error = zfs_set_userquota(zfsvfs, quota_type,
- "", id, dbtob(dqblk.dqb_bhardlimit));
- break;
- case Q_GETQUOTA:
- error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
- if (error == 0)
- error = copyout(&dqblk, arg, sizeof(dqblk));
- break;
- default:
- error = EINVAL;
- break;
- }
-done:
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*ARGSUSED*/
-static int
-zfs_sync(vfs_t *vfsp, int waitfor)
-{
-
- /*
- * Data integrity is job one. We don't want a compromised kernel
- * writing to the storage pool, so we never sync during panic.
- */
- if (KERNEL_PANICKED())
- return (0);
-
- /*
- * Ignore the system syncher. ZFS already commits async data
- * at zfs_txg_timeout intervals.
- */
- if (waitfor == MNT_LAZY)
- return (0);
-
- if (vfsp != NULL) {
- /*
- * Sync a specific filesystem.
- */
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- dsl_pool_t *dp;
- int error;
-
- error = vfs_stdsync(vfsp, waitfor);
- if (error != 0)
- return (error);
-
- ZFS_ENTER(zfsvfs);
- dp = dmu_objset_pool(zfsvfs->z_os);
-
- /*
- * If the system is shutting down, then skip any
- * filesystems which may exist on a suspended pool.
- */
- if (sys_shutdown && spa_suspended(dp->dp_spa)) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, 0);
-
- ZFS_EXIT(zfsvfs);
- } else {
- /*
- * Sync all ZFS filesystems. This is what happens when you
- * run sync(1M). Unlike other filesystems, ZFS honors the
- * request by waiting for all pools to commit all dirty data.
- */
- spa_sync_allpools();
- }
-
- return (0);
-}
-
-#ifndef __FreeBSD_kernel__
-static int
-zfs_create_unique_device(dev_t *dev)
-{
- major_t new_major;
-
- do {
- ASSERT3U(zfs_minor, <=, MAXMIN32);
- minor_t start = zfs_minor;
- do {
- mutex_enter(&zfs_dev_mtx);
- if (zfs_minor >= MAXMIN32) {
- /*
- * If we're still using the real major
- * keep out of /dev/zfs and /dev/zvol minor
- * number space. If we're using a getudev()'ed
- * major number, we can use all of its minors.
- */
- if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
- zfs_minor = ZFS_MIN_MINOR;
- else
- zfs_minor = 0;
- } else {
- zfs_minor++;
- }
- *dev = makedevice(zfs_major, zfs_minor);
- mutex_exit(&zfs_dev_mtx);
- } while (vfs_devismounted(*dev) && zfs_minor != start);
- if (zfs_minor == start) {
- /*
- * We are using all ~262,000 minor numbers for the
- * current major number. Create a new major number.
- */
- if ((new_major = getudev()) == (major_t)-1) {
- cmn_err(CE_WARN,
- "zfs_mount: Can't get unique major "
- "device number.");
- return (-1);
- }
- mutex_enter(&zfs_dev_mtx);
- zfs_major = new_major;
- zfs_minor = 0;
-
- mutex_exit(&zfs_dev_mtx);
- } else {
- break;
- }
- /* CONSTANTCONDITION */
- } while (1);
-
- return (0);
-}
-#endif /* !__FreeBSD_kernel__ */
-
-static void
-atime_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == TRUE) {
- zfsvfs->z_atime = TRUE;
- zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
- } else {
- zfsvfs->z_atime = FALSE;
- zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
- }
-}
-
-static void
-xattr_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == TRUE) {
- /* XXX locking on vfs_flag? */
-#ifdef TODO
- zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
-#endif
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
- } else {
- /* XXX locking on vfs_flag? */
-#ifdef TODO
- zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
-#endif
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
- }
-}
-
-static void
-blksz_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
- ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
- ASSERT(ISP2(newval));
-
- zfsvfs->z_max_blksz = newval;
- zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
-}
-
-static void
-readonly_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval) {
- /* XXX locking on vfs_flag? */
- zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
- } else {
- /* XXX locking on vfs_flag? */
- zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
- }
-}
-
-static void
-setuid_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == FALSE) {
- zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
- } else {
- zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
- }
-}
-
-static void
-exec_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == FALSE) {
- zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
- } else {
- zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
- }
-}
-
-/*
- * The nbmand mount option can be changed at mount time.
- * We can't allow it to be toggled on live file systems or incorrect
- * behavior may be seen from cifs clients
- *
- * This property isn't registered via dsl_prop_register(), but this callback
- * will be called when a file system is first mounted
- */
-static void
-nbmand_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- if (newval == FALSE) {
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
- } else {
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
- }
-}
-
-static void
-snapdir_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_show_ctldir = newval;
-}
-
-static void
-vscan_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_vscan = newval;
-}
-
-static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_acl_mode = newval;
-}
-
-static void
-acl_inherit_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_acl_inherit = newval;
-}
-
-static int
-zfs_register_callbacks(vfs_t *vfsp)
-{
- struct dsl_dataset *ds = NULL;
- objset_t *os = NULL;
- zfsvfs_t *zfsvfs = NULL;
- uint64_t nbmand;
- boolean_t readonly = B_FALSE;
- boolean_t do_readonly = B_FALSE;
- boolean_t setuid = B_FALSE;
- boolean_t do_setuid = B_FALSE;
- boolean_t exec = B_FALSE;
- boolean_t do_exec = B_FALSE;
-#ifdef illumos
- boolean_t devices = B_FALSE;
- boolean_t do_devices = B_FALSE;
-#endif
- boolean_t xattr = B_FALSE;
- boolean_t do_xattr = B_FALSE;
- boolean_t atime = B_FALSE;
- boolean_t do_atime = B_FALSE;
- int error = 0;
-
- ASSERT(vfsp);
- zfsvfs = vfsp->vfs_data;
- ASSERT(zfsvfs);
- os = zfsvfs->z_os;
-
- /*
- * This function can be called for a snapshot when we update snapshot's
- * mount point, which isn't really supported.
- */
- if (dmu_objset_is_snapshot(os))
- return (EOPNOTSUPP);
-
- /*
- * The act of registering our callbacks will destroy any mount
- * options we may have. In order to enable temporary overrides
- * of mount options, we stash away the current values and
- * restore them after we register the callbacks.
- */
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
- !spa_writeable(dmu_objset_spa(os))) {
- readonly = B_TRUE;
- do_readonly = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- readonly = B_FALSE;
- do_readonly = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
- setuid = B_TRUE;
- do_setuid = B_TRUE;
- }
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
- exec = B_FALSE;
- do_exec = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
- exec = B_TRUE;
- do_exec = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
- xattr = B_FALSE;
- do_xattr = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
- xattr = B_TRUE;
- do_xattr = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
- atime = B_FALSE;
- do_atime = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
- atime = B_TRUE;
- do_atime = B_TRUE;
- }
-
- /*
- * We need to enter pool configuration here, so that we can use
- * dsl_prop_get_int_ds() to handle the special nbmand property below.
- * dsl_prop_get_integer() can not be used, because it has to acquire
- * spa_namespace_lock and we can not do that because we already hold
- * z_teardown_lock. The problem is that spa_write_cachefile() is called
- * with spa_namespace_lock held and the function calls ZFS vnode
- * operations to write the cache file and thus z_teardown_lock is
- * acquired after spa_namespace_lock.
- */
- ds = dmu_objset_ds(os);
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
-
- /*
- * nbmand is a special property. It can only be changed at
- * mount time.
- *
- * This is weird, but it is documented to only be changeable
- * at mount time.
- */
- if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
- nbmand = B_FALSE;
- } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
- nbmand = B_TRUE;
- } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
- return (error);
- }
-
- /*
- * Register property callbacks.
- *
- * It would probably be fine to just check for i/o error from
- * the first prop_register(), but I guess I like to go
- * overboard...
- */
- error = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
-#ifdef illumos
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
-#endif
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
- zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
- if (error)
- goto unregister;
-
- /*
- * Invoke our callbacks to restore temporary mount options.
- */
- if (do_readonly)
- readonly_changed_cb(zfsvfs, readonly);
- if (do_setuid)
- setuid_changed_cb(zfsvfs, setuid);
- if (do_exec)
- exec_changed_cb(zfsvfs, exec);
- if (do_xattr)
- xattr_changed_cb(zfsvfs, xattr);
- if (do_atime)
- atime_changed_cb(zfsvfs, atime);
-
- nbmand_changed_cb(zfsvfs, nbmand);
-
- return (0);
-
-unregister:
- dsl_prop_unregister_all(ds, zfsvfs);
- return (error);
-}
-
-static int
-zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
- uint64_t *userp, uint64_t *groupp)
-{
- /*
- * Is it a valid type of object to track?
- */
- if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
- return (SET_ERROR(ENOENT));
-
- /*
- * If we have a NULL data pointer
- * then assume the id's aren't changing and
- * return EEXIST to the dmu to let it know to
- * use the same ids
- */
- if (data == NULL)
- return (SET_ERROR(EEXIST));
-
- if (bonustype == DMU_OT_ZNODE) {
- znode_phys_t *znp = data;
- *userp = znp->zp_uid;
- *groupp = znp->zp_gid;
- } else {
- int hdrsize;
- sa_hdr_phys_t *sap = data;
- sa_hdr_phys_t sa = *sap;
- boolean_t swap = B_FALSE;
-
- ASSERT(bonustype == DMU_OT_SA);
-
- if (sa.sa_magic == 0) {
- /*
- * This should only happen for newly created
- * files that haven't had the znode data filled
- * in yet.
- */
- *userp = 0;
- *groupp = 0;
- return (0);
- }
- if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
- sa.sa_magic = SA_MAGIC;
- sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
- swap = B_TRUE;
- } else {
- VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
- }
-
- hdrsize = sa_hdrsize(&sa);
- VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
- *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
- SA_UID_OFFSET));
- *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
- SA_GID_OFFSET));
- if (swap) {
- *userp = BSWAP_64(*userp);
- *groupp = BSWAP_64(*groupp);
- }
- }
- return (0);
-}
-
-static void
-fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
- char *domainbuf, int buflen, uid_t *ridp)
-{
- uint64_t fuid;
- const char *domain;
-
- fuid = zfs_strtonum(fuidstr, NULL);
-
- domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
- if (domain)
- (void) strlcpy(domainbuf, domain, buflen);
- else
- domainbuf[0] = '\0';
- *ridp = FUID_RID(fuid);
-}
-
-static uint64_t
-zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
-{
- switch (type) {
- case ZFS_PROP_USERUSED:
- return (DMU_USERUSED_OBJECT);
- case ZFS_PROP_GROUPUSED:
- return (DMU_GROUPUSED_OBJECT);
- case ZFS_PROP_USERQUOTA:
- return (zfsvfs->z_userquota_obj);
- case ZFS_PROP_GROUPQUOTA:
- return (zfsvfs->z_groupquota_obj);
- }
- return (0);
-}
-
-int
-zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
-{
- int error;
- zap_cursor_t zc;
- zap_attribute_t za;
- zfs_useracct_t *buf = vbuf;
- uint64_t obj;
-
- if (!dmu_objset_userspace_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
-
- obj = zfs_userquota_prop_to_obj(zfsvfs, type);
- if (obj == 0) {
- *bufsizep = 0;
- return (0);
- }
-
- for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
- (error = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
- *bufsizep)
- break;
-
- fuidstr_to_sid(zfsvfs, za.za_name,
- buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
-
- buf->zu_space = za.za_first_integer;
- buf++;
- }
- if (error == ENOENT)
- error = 0;
-
- ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
- *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
- *cookiep = zap_cursor_serialize(&zc);
- zap_cursor_fini(&zc);
- return (error);
-}
-
-/*
- * buf must be big enough (eg, 32 bytes)
- */
-static int
-id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
- char *buf, boolean_t addok)
-{
- uint64_t fuid;
- int domainid = 0;
-
- if (domain && domain[0]) {
- domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
- if (domainid == -1)
- return (SET_ERROR(ENOENT));
- }
- fuid = FUID_ENCODE(domainid, rid);
- (void) sprintf(buf, "%llx", (longlong_t)fuid);
- return (0);
-}
-
-int
-zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- const char *domain, uint64_t rid, uint64_t *valp)
-{
- char buf[32];
- int err;
- uint64_t obj;
-
- *valp = 0;
-
- if (!dmu_objset_userspace_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
-
- obj = zfs_userquota_prop_to_obj(zfsvfs, type);
- if (obj == 0)
- return (0);
-
- err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
- if (err)
- return (err);
-
- err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
- if (err == ENOENT)
- err = 0;
- return (err);
-}
-
-int
-zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- const char *domain, uint64_t rid, uint64_t quota)
-{
- char buf[32];
- int err;
- dmu_tx_t *tx;
- uint64_t *objp;
- boolean_t fuid_dirtied;
-
- if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
- return (SET_ERROR(EINVAL));
-
- if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
- return (SET_ERROR(ENOTSUP));
-
- objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
- &zfsvfs->z_groupquota_obj;
-
- err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
- if (err)
- return (err);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
- if (*objp == 0) {
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
- zfs_userquota_prop_prefixes[type]);
- }
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- mutex_enter(&zfsvfs->z_lock);
- if (*objp == 0) {
- *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
- DMU_OT_NONE, 0, tx);
- VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
- }
- mutex_exit(&zfsvfs->z_lock);
-
- if (quota == 0) {
- err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
- if (err == ENOENT)
- err = 0;
- } else {
- err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
- }
- ASSERT(err == 0);
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
- dmu_tx_commit(tx);
- return (err);
-}
-
-boolean_t
-zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
-{
- char buf[32];
- uint64_t used, quota, usedobj, quotaobj;
- int err;
-
- usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
- quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
-
- if (quotaobj == 0 || zfsvfs->z_replay)
- return (B_FALSE);
-
- (void) sprintf(buf, "%llx", (longlong_t)fuid);
- err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
- if (err != 0)
- return (B_FALSE);
-
- err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
- if (err != 0)
- return (B_FALSE);
- return (used >= quota);
-}
-
-boolean_t
-zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
-{
- uint64_t fuid;
- uint64_t quotaobj;
-
- quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
-
- fuid = isgroup ? zp->z_gid : zp->z_uid;
-
- if (quotaobj == 0 || zfsvfs->z_replay)
- return (B_FALSE);
-
- return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
-}
-
-/*
- * Associate this zfsvfs with the given objset, which must be owned.
- * This will cache a bunch of on-disk state from the objset in the
- * zfsvfs.
- */
-static int
-zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
-{
- int error;
- uint64_t val;
-
- zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
- zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
- zfsvfs->z_os = os;
-
- error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
- if (error != 0)
- return (error);
- if (zfsvfs->z_version >
- zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
- (void) printf("Can't mount a version %lld file system "
- "on a version %lld pool\n. Pool must be upgraded to mount "
- "this file system.", (u_longlong_t)zfsvfs->z_version,
- (u_longlong_t)spa_version(dmu_objset_spa(os)));
- return (SET_ERROR(ENOTSUP));
- }
- error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
- if (error != 0)
- return (error);
- zfsvfs->z_norm = (int)val;
-
- error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
- if (error != 0)
- return (error);
- zfsvfs->z_utf8 = (val != 0);
-
- error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
- if (error != 0)
- return (error);
- zfsvfs->z_case = (uint_t)val;
-
- /*
- * Fold case on file systems that are always or sometimes case
- * insensitive.
- */
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- zfsvfs->z_case == ZFS_CASE_MIXED)
- zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
-
- zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
-
- uint64_t sa_obj = 0;
- if (zfsvfs->z_use_sa) {
- /* should either have both of these objects or none */
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
- &sa_obj);
- if (error != 0)
- return (error);
- }
-
- error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
- &zfsvfs->z_attr_table);
- if (error != 0)
- return (error);
-
- if (zfsvfs->z_version >= ZPL_VERSION_SA)
- sa_register_update_callback(os, zfs_sa_upgrade);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
- &zfsvfs->z_root);
- if (error != 0)
- return (error);
- ASSERT(zfsvfs->z_root != 0);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
- &zfsvfs->z_unlinkedobj);
- if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
- 8, 1, &zfsvfs->z_userquota_obj);
- if (error == ENOENT)
- zfsvfs->z_userquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
- 8, 1, &zfsvfs->z_groupquota_obj);
- if (error == ENOENT)
- zfsvfs->z_groupquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
- &zfsvfs->z_fuid_obj);
- if (error == ENOENT)
- zfsvfs->z_fuid_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
- &zfsvfs->z_shares_dir);
- if (error == ENOENT)
- zfsvfs->z_shares_dir = 0;
- else if (error != 0)
- return (error);
-
- /*
- * Only use the name cache if we are looking for a
- * name on a file system that does not require normalization
- * or case folding. We can also look there if we happen to be
- * on a non-normalizing, mixed sensitivity file system IF we
- * are looking for the exact name (which is always the case on
- * FreeBSD).
- */
- zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
-
- return (0);
-}
-
-#if defined(__FreeBSD__)
-taskq_t *zfsvfs_taskq;
-
-static void
-zfsvfs_task_unlinked_drain(void *context, int pending __unused)
-{
-
- zfs_unlinked_drain((zfsvfs_t *)context);
-}
-#endif
-
-int
-zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
-{
- objset_t *os;
- zfsvfs_t *zfsvfs;
- int error;
-
- /*
- * XXX: Fix struct statfs so this isn't necessary!
- *
- * The 'osname' is used as the filesystem's special node, which means
- * it must fit in statfs.f_mntfromname, or else it can't be
- * enumerated, so libzfs_mnttab_find() returns NULL, which causes
- * 'zfs unmount' to think it's not mounted when it is.
- */
- if (strlen(osname) >= MNAMELEN)
- return (SET_ERROR(ENAMETOOLONG));
-
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
-
- /*
- * We claim to always be readonly so we can open snapshots;
- * other ZPL code will prevent us from writing to snapshots.
- */
-
- error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
- if (error != 0) {
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- return (error);
- }
-
- error = zfsvfs_create_impl(zfvp, zfsvfs, os);
- if (error != 0) {
- dmu_objset_disown(os, zfsvfs);
- }
- return (error);
-}
-
-
-int
-zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
-{
- int error;
-
- zfsvfs->z_vfs = NULL;
- zfsvfs->z_parent = zfsvfs;
-
- mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
-#if defined(__FreeBSD__)
- TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
- zfsvfs_task_unlinked_drain, zfsvfs);
-#endif
-#ifdef DIAGNOSTIC
- rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
-#else
- rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
-#endif
- rms_init(&zfsvfs->z_teardown_inactive_lock, "zfs teardown inactive");
- rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
- for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
-
- error = zfsvfs_init(zfsvfs, os);
- if (error != 0) {
- *zfvp = NULL;
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- return (error);
- }
-
- *zfvp = zfsvfs;
- return (0);
-}
-
-static int
-zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
-{
- int error;
-
- error = zfs_register_callbacks(zfsvfs->z_vfs);
- if (error)
- return (error);
-
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-
- /*
- * If we are not mounting (ie: online recv), then we don't
- * have to worry about replaying the log as we blocked all
- * operations out since we closed the ZIL.
- */
- if (mounting) {
- boolean_t readonly;
-
- /*
- * During replay we remove the read only flag to
- * allow replays to succeed.
- */
- readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
- if (readonly != 0)
- zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
- else
- zfs_unlinked_drain(zfsvfs);
-
- /*
- * Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest
- * doesn't use readonly mounts, where
- * zfs_unlinked_drain() isn't called.) This is because
- * ziltest causes spa_sync() to think it's committed,
- * but actually it is not, so the intent log contains
- * many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated
- * in a yet later txg. This would write a "create
- * object N" record to the intent log. Normally, this
- * would be fine because the spa_sync() would have
- * written out the fact that object N is free, before
- * we could write the "create object N" intent log
- * record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
- */
- if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
- if (zil_replay_disable) {
- zil_destroy(zfsvfs->z_log, B_FALSE);
- } else {
- zfsvfs->z_replay = B_TRUE;
- zil_replay(zfsvfs->z_os, zfsvfs,
- zfs_replay_vector);
- zfsvfs->z_replay = B_FALSE;
- }
- }
- zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
- }
-
- /*
- * Set the objset user_ptr to track its zfsvfs.
- */
- mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
- dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
-
- return (0);
-}
-
-extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
-
-void
-zfsvfs_free(zfsvfs_t *zfsvfs)
-{
- int i;
-
- /*
- * This is a barrier to prevent the filesystem from going away in
- * zfs_znode_move() until we can safely ensure that the filesystem is
- * not unmounted. We consider the filesystem valid before the barrier
- * and invalid after the barrier.
- */
- rw_enter(&zfsvfs_lock, RW_READER);
- rw_exit(&zfsvfs_lock);
-
- zfs_fuid_destroy(zfsvfs);
-
- mutex_destroy(&zfsvfs->z_znodes_lock);
- mutex_destroy(&zfsvfs->z_lock);
- list_destroy(&zfsvfs->z_all_znodes);
- rrm_destroy(&zfsvfs->z_teardown_lock);
- rms_destroy(&zfsvfs->z_teardown_inactive_lock);
- rw_destroy(&zfsvfs->z_fuid_lock);
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_destroy(&zfsvfs->z_hold_mtx[i]);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
-}
-
-static void
-zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
-{
- zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- if (zfsvfs->z_vfs) {
- if (zfsvfs->z_use_fuids) {
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
- } else {
- vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
- vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
- vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
- vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
- vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
- vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
- }
- }
- zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
-}
-
-static int
-zfs_domount(vfs_t *vfsp, char *osname)
-{
- uint64_t recordsize, fsid_guid;
- int error = 0;
- zfsvfs_t *zfsvfs;
- vnode_t *vp;
-
- ASSERT(vfsp);
- ASSERT(osname);
-
- error = zfsvfs_create(osname, &zfsvfs);
- if (error)
- return (error);
- zfsvfs->z_vfs = vfsp;
-
-#ifdef illumos
- /* Initialize the generic filesystem structure. */
- vfsp->vfs_bcount = 0;
- vfsp->vfs_data = NULL;
-
- if (zfs_create_unique_device(&mount_dev) == -1) {
- error = SET_ERROR(ENODEV);
- goto out;
- }
- ASSERT(vfs_devismounted(mount_dev) == 0);
-#endif
-
- if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
- NULL))
- goto out;
- zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
- zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
-
- vfsp->vfs_data = zfsvfs;
- vfsp->mnt_flag |= MNT_LOCAL;
-#if defined(_KERNEL) && !defined(KMEM_DEBUG)
- vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
-#endif
- vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
- vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
- vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
- vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
- vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
- vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
-
- /*
- * The fsid is 64 bits, composed of an 8-bit fs type, which
- * separates our fsid from any other filesystem types, and a
- * 56-bit objset unique ID. The objset unique ID is unique to
- * all objsets open on this system, provided by unique_create().
- * The 8-bit fs type must be put in the low bits of fsid[1]
- * because that's where other Solaris filesystems put it.
- */
- fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
- ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
- vfsp->vfs_fsid.val[0] = fsid_guid;
- vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
- vfsp->mnt_vfc->vfc_typenum & 0xFF;
-
- /*
- * Set features for file system.
- */
- zfs_set_fuid_feature(zfsvfs);
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
- vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
- vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
- vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
- } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
- vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
- vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
- }
- vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
-
- if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
- uint64_t pval;
-
- atime_changed_cb(zfsvfs, B_FALSE);
- readonly_changed_cb(zfsvfs, B_TRUE);
- if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
- goto out;
- xattr_changed_cb(zfsvfs, pval);
- zfsvfs->z_issnap = B_TRUE;
- zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
-
- mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
- dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
- } else {
- error = zfsvfs_setup(zfsvfs, B_TRUE);
- }
-
- vfs_mountedfrom(vfsp, osname);
-
- if (!zfsvfs->z_issnap)
- zfsctl_create(zfsvfs);
-out:
- if (error) {
- dmu_objset_disown(zfsvfs->z_os, zfsvfs);
- zfsvfs_free(zfsvfs);
- } else {
- atomic_inc_32(&zfs_active_fs_count);
- }
-
- return (error);
-}
-
-void
-zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
-{
- objset_t *os = zfsvfs->z_os;
-
- if (!dmu_objset_is_snapshot(os))
- dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
-}
-
-#ifdef SECLABEL
-/*
- * Convert a decimal digit string to a uint64_t integer.
- */
-static int
-str_to_uint64(char *str, uint64_t *objnum)
-{
- uint64_t num = 0;
-
- while (*str) {
- if (*str < '0' || *str > '9')
- return (SET_ERROR(EINVAL));
-
- num = num*10 + *str++ - '0';
- }
-
- *objnum = num;
- return (0);
-}
-
-/*
- * The boot path passed from the boot loader is in the form of
- * "rootpool-name/root-filesystem-object-number'. Convert this
- * string to a dataset name: "rootpool-name/root-filesystem-name".
- */
-static int
-zfs_parse_bootfs(char *bpath, char *outpath)
-{
- char *slashp;
- uint64_t objnum;
- int error;
-
- if (*bpath == 0 || *bpath == '/')
- return (SET_ERROR(EINVAL));
-
- (void) strcpy(outpath, bpath);
-
- slashp = strchr(bpath, '/');
-
- /* if no '/', just return the pool name */
- if (slashp == NULL) {
- return (0);
- }
-
- /* if not a number, just return the root dataset name */
- if (str_to_uint64(slashp+1, &objnum)) {
- return (0);
- }
-
- *slashp = '\0';
- error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
- *slashp = '/';
-
- return (error);
-}
-
-/*
- * Check that the hex label string is appropriate for the dataset being
- * mounted into the global_zone proper.
- *
- * Return an error if the hex label string is not default or
- * admin_low/admin_high. For admin_low labels, the corresponding
- * dataset must be readonly.
- */
-int
-zfs_check_global_label(const char *dsname, const char *hexsl)
-{
- if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
- return (0);
- if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
- return (0);
- if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
- /* must be readonly */
- uint64_t rdonly;
-
- if (dsl_prop_get_integer(dsname,
- zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
- return (SET_ERROR(EACCES));
- return (rdonly ? 0 : EACCES);
- }
- return (SET_ERROR(EACCES));
-}
-
-/*
- * Determine whether the mount is allowed according to MAC check.
- * by comparing (where appropriate) label of the dataset against
- * the label of the zone being mounted into. If the dataset has
- * no label, create one.
- *
- * Returns 0 if access allowed, error otherwise (e.g. EACCES)
- */
-static int
-zfs_mount_label_policy(vfs_t *vfsp, char *osname)
-{
- int error, retv;
- zone_t *mntzone = NULL;
- ts_label_t *mnt_tsl;
- bslabel_t *mnt_sl;
- bslabel_t ds_sl;
- char ds_hexsl[MAXNAMELEN];
-
- retv = EACCES; /* assume the worst */
-
- /*
- * Start by getting the dataset label if it exists.
- */
- error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
- 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
- if (error)
- return (SET_ERROR(EACCES));
-
- /*
- * If labeling is NOT enabled, then disallow the mount of datasets
- * which have a non-default label already. No other label checks
- * are needed.
- */
- if (!is_system_labeled()) {
- if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
- return (0);
- return (SET_ERROR(EACCES));
- }
-
- /*
- * Get the label of the mountpoint. If mounting into the global
- * zone (i.e. mountpoint is not within an active zone and the
- * zoned property is off), the label must be default or
- * admin_low/admin_high only; no other checks are needed.
- */
- mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
- if (mntzone->zone_id == GLOBAL_ZONEID) {
- uint64_t zoned;
-
- zone_rele(mntzone);
-
- if (dsl_prop_get_integer(osname,
- zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
- return (SET_ERROR(EACCES));
- if (!zoned)
- return (zfs_check_global_label(osname, ds_hexsl));
- else
- /*
- * This is the case of a zone dataset being mounted
- * initially, before the zone has been fully created;
- * allow this mount into global zone.
- */
- return (0);
- }
-
- mnt_tsl = mntzone->zone_slabel;
- ASSERT(mnt_tsl != NULL);
- label_hold(mnt_tsl);
- mnt_sl = label2bslabel(mnt_tsl);
-
- if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
- /*
- * The dataset doesn't have a real label, so fabricate one.
- */
- char *str = NULL;
-
- if (l_to_str_internal(mnt_sl, &str) == 0 &&
- dsl_prop_set_string(osname,
- zfs_prop_to_name(ZFS_PROP_MLSLABEL),
- ZPROP_SRC_LOCAL, str) == 0)
- retv = 0;
- if (str != NULL)
- kmem_free(str, strlen(str) + 1);
- } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
- /*
- * Now compare labels to complete the MAC check. If the
- * labels are equal then allow access. If the mountpoint
- * label dominates the dataset label, allow readonly access.
- * Otherwise, access is denied.
- */
- if (blequal(mnt_sl, &ds_sl))
- retv = 0;
- else if (bldominates(mnt_sl, &ds_sl)) {
- vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
- retv = 0;
- }
- }
-
- label_rele(mnt_tsl);
- zone_rele(mntzone);
- return (retv);
-}
-#endif /* SECLABEL */
-
-#ifdef OPENSOLARIS_MOUNTROOT
-static int
-zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
-{
- int error = 0;
- static int zfsrootdone = 0;
- zfsvfs_t *zfsvfs = NULL;
- znode_t *zp = NULL;
- vnode_t *vp = NULL;
- char *zfs_bootfs;
- char *zfs_devid;
-
- ASSERT(vfsp);
-
- /*
- * The filesystem that we mount as root is defined in the
- * boot property "zfs-bootfs" with a format of
- * "poolname/root-dataset-objnum".
- */
- if (why == ROOT_INIT) {
- if (zfsrootdone++)
- return (SET_ERROR(EBUSY));
- /*
- * the process of doing a spa_load will require the
- * clock to be set before we could (for example) do
- * something better by looking at the timestamp on
- * an uberblock, so just set it to -1.
- */
- clkset(-1);
-
- if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
- cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
- "bootfs name");
- return (SET_ERROR(EINVAL));
- }
- zfs_devid = spa_get_bootprop("diskdevid");
- error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
- if (zfs_devid)
- spa_free_bootprop(zfs_devid);
- if (error) {
- spa_free_bootprop(zfs_bootfs);
- cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
- error);
- return (error);
- }
- if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
- spa_free_bootprop(zfs_bootfs);
- cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
- error);
- return (error);
- }
-
- spa_free_bootprop(zfs_bootfs);
-
- if (error = vfs_lock(vfsp))
- return (error);
-
- if (error = zfs_domount(vfsp, rootfs.bo_name)) {
- cmn_err(CE_NOTE, "zfs_domount: error %d", error);
- goto out;
- }
-
- zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
- ASSERT(zfsvfs);
- if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
- cmn_err(CE_NOTE, "zfs_zget: error %d", error);
- goto out;
- }
-
- vp = ZTOV(zp);
- mutex_enter(&vp->v_lock);
- vp->v_flag |= VROOT;
- mutex_exit(&vp->v_lock);
- rootvp = vp;
-
- /*
- * Leave rootvp held. The root file system is never unmounted.
- */
-
- vfs_add((struct vnode *)0, vfsp,
- (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
-out:
- vfs_unlock(vfsp);
- return (error);
- } else if (why == ROOT_REMOUNT) {
- readonly_changed_cb(vfsp->vfs_data, B_FALSE);
- vfsp->vfs_flag |= VFS_REMOUNT;
-
- /* refresh mount options */
- zfs_unregister_callbacks(vfsp->vfs_data);
- return (zfs_register_callbacks(vfsp));
-
- } else if (why == ROOT_UNMOUNT) {
- zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
- (void) zfs_sync(vfsp, 0, 0);
- return (0);
- }
-
- /*
- * if "why" is equal to anything else other than ROOT_INIT,
- * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
- */
- return (SET_ERROR(ENOTSUP));
-}
-#endif /* OPENSOLARIS_MOUNTROOT */
-
-static int
-getpoolname(const char *osname, char *poolname)
-{
- char *p;
-
- p = strchr(osname, '/');
- if (p == NULL) {
- if (strlen(osname) >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strcpy(poolname, osname);
- } else {
- if (p - osname >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strncpy(poolname, osname, p - osname);
- poolname[p - osname] = '\0';
- }
- return (0);
-}
-
-static void
-fetch_osname_options(char *name, bool *checkpointrewind)
-{
-
- if (name[0] == '!') {
- *checkpointrewind = true;
- memmove(name, name + 1, strlen(name));
- } else {
- *checkpointrewind = false;
- }
-}
-
-/*ARGSUSED*/
-static int
-zfs_mount(vfs_t *vfsp)
-{
- kthread_t *td = curthread;
- vnode_t *mvp = vfsp->mnt_vnodecovered;
- cred_t *cr = td->td_ucred;
- char *osname;
- int error = 0;
- int canwrite;
- bool checkpointrewind;
-
-#ifdef illumos
- if (mvp->v_type != VDIR)
- return (SET_ERROR(ENOTDIR));
-
- mutex_enter(&mvp->v_lock);
- if ((uap->flags & MS_REMOUNT) == 0 &&
- (uap->flags & MS_OVERLAY) == 0 &&
- (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
- mutex_exit(&mvp->v_lock);
- return (SET_ERROR(EBUSY));
- }
- mutex_exit(&mvp->v_lock);
-
- /*
- * ZFS does not support passing unparsed data in via MS_DATA.
- * Users should use the MS_OPTIONSTR interface; this means
- * that all option parsing is already done and the options struct
- * can be interrogated.
- */
- if ((uap->flags & MS_DATA) && uap->datalen > 0)
- return (SET_ERROR(EINVAL));
-
- /*
- * Get the objset name (the "special" mount argument).
- */
- if (error = pn_get(uap->spec, fromspace, &spn))
- return (error);
-
- osname = spn.pn_path;
-#else /* !illumos */
- if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
- return (SET_ERROR(EINVAL));
-
- /*
- * If full-owner-access is enabled and delegated administration is
- * turned on, we must set nosuid.
- */
- if (zfs_super_owner &&
- dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
- secpolicy_fs_mount_clearopts(cr, vfsp);
- }
-#endif /* illumos */
- fetch_osname_options(osname, &checkpointrewind);
-
- /*
- * Check for mount privilege?
- *
- * If we don't have privilege then see if
- * we have local permission to allow it
- */
- error = secpolicy_fs_mount(cr, mvp, vfsp);
- if (error) {
- if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
- goto out;
-
- if (!(vfsp->vfs_flag & MS_REMOUNT)) {
- vattr_t vattr;
-
- /*
- * Make sure user is the owner of the mount point
- * or has sufficient privileges.
- */
-
- vattr.va_mask = AT_UID;
-
- vn_lock(mvp, LK_SHARED | LK_RETRY);
- if (VOP_GETATTR(mvp, &vattr, cr)) {
- VOP_UNLOCK(mvp);
- goto out;
- }
-
- if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
- VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
- VOP_UNLOCK(mvp);
- goto out;
- }
- VOP_UNLOCK(mvp);
- }
-
- secpolicy_fs_mount_clearopts(cr, vfsp);
- }
-
- /*
- * Refuse to mount a filesystem if we are in a local zone and the
- * dataset is not visible.
- */
- if (!INGLOBALZONE(curthread) &&
- (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
- error = SET_ERROR(EPERM);
- goto out;
- }
-
-#ifdef SECLABEL
- error = zfs_mount_label_policy(vfsp, osname);
- if (error)
- goto out;
-#endif
-
- vfsp->vfs_flag |= MNT_NFS4ACLS;
-
- /*
- * When doing a remount, we simply refresh our temporary properties
- * according to those options set in the current VFS options.
- */
- if (vfsp->vfs_flag & MS_REMOUNT) {
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
-
- /*
- * Refresh mount options with z_teardown_lock blocking I/O while
- * the filesystem is in an inconsistent state.
- * The lock also serializes this code with filesystem
- * manipulations between entry to zfs_suspend_fs() and return
- * from zfs_resume_fs().
- */
- rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
- zfs_unregister_callbacks(zfsvfs);
- error = zfs_register_callbacks(vfsp);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- goto out;
- }
-
- /* Initial root mount: try hard to import the requested root pool. */
- if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
- (vfsp->vfs_flag & MNT_UPDATE) == 0) {
- char pname[MAXNAMELEN];
-
- error = getpoolname(osname, pname);
- if (error == 0)
- error = spa_import_rootpool(pname, checkpointrewind);
- if (error)
- goto out;
- }
- DROP_GIANT();
- error = zfs_domount(vfsp, osname);
- PICKUP_GIANT();
-
-#ifdef illumos
- /*
- * Add an extra VFS_HOLD on our parent vfs so that it can't
- * disappear due to a forced unmount.
- */
- if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
- VFS_HOLD(mvp->v_vfsp);
-#endif
-
-out:
- return (error);
-}
-
-static int
-zfs_statfs(vfs_t *vfsp, struct statfs *statp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- uint64_t refdbytes, availbytes, usedobjs, availobjs;
-
- statp->f_version = STATFS_VERSION;
-
- ZFS_ENTER(zfsvfs);
-
- dmu_objset_space(zfsvfs->z_os,
- &refdbytes, &availbytes, &usedobjs, &availobjs);
-
- /*
- * The underlying storage pool actually uses multiple block sizes.
- * We report the fragsize as the smallest block size we support,
- * and we report our blocksize as the filesystem's maximum blocksize.
- */
- statp->f_bsize = SPA_MINBLOCKSIZE;
- statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
-
- /*
- * The following report "total" blocks of various kinds in the
- * file system, but reported in terms of f_frsize - the
- * "fragment" size.
- */
-
- statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
- statp->f_bfree = availbytes / statp->f_bsize;
- statp->f_bavail = statp->f_bfree; /* no root reservation */
-
- /*
- * statvfs() should really be called statufs(), because it assumes
- * static metadata. ZFS doesn't preallocate files, so the best
- * we can do is report the max that could possibly fit in f_files,
- * and that minus the number actually used in f_ffree.
- * For f_ffree, report the smaller of the number of object available
- * and the number of blocks (each object will take at least a block).
- */
- statp->f_ffree = MIN(availobjs, statp->f_bfree);
- statp->f_files = statp->f_ffree + usedobjs;
-
- /*
- * We're a zfs filesystem.
- */
- (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
-
- strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
- sizeof(statp->f_mntfromname));
- strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
- sizeof(statp->f_mntonname));
-
- statp->f_namemax = MAXNAMELEN - 1;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-static int
-zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *rootzp;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
- if (error == 0)
- *vpp = ZTOV(rootzp);
-
- ZFS_EXIT(zfsvfs);
-
- if (error == 0) {
- error = vn_lock(*vpp, flags);
- if (error != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- }
- }
- return (error);
-}
-
-/*
- * Teardown the zfsvfs::z_os.
- *
- * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
- * and 'z_teardown_inactive_lock' held.
- */
-static int
-zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
-{
- znode_t *zp;
-
- rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
-
- if (!unmounting) {
- /*
- * We purge the parent filesystem's vfsp as the parent
- * filesystem and all of its snapshots have their vnode's
- * v_vfsp set to the parent's filesystem's vfsp. Note,
- * 'z_parent' is self referential for non-snapshots.
- */
- (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
-#ifdef FREEBSD_NAMECACHE
- cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
-#endif
- }
-
- /*
- * Close the zil. NB: Can't close the zil while zfs_inactive
- * threads are blocked as zil_close can call zfs_inactive.
- */
- if (zfsvfs->z_log) {
- zil_close(zfsvfs->z_log);
- zfsvfs->z_log = NULL;
- }
-
- ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs);
-
- /*
- * If we are not unmounting (ie: online recv) and someone already
- * unmounted this file system while we were doing the switcheroo,
- * or a reopen of z_os failed then just bail out now.
- */
- if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
- ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- return (SET_ERROR(EIO));
- }
-
- /*
- * At this point there are no vops active, and any new vops will
- * fail with EIO since we have z_teardown_lock for writer (only
- * relavent for forced unmount).
- *
- * Release all holds on dbufs.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
- zp = list_next(&zfsvfs->z_all_znodes, zp))
- if (zp->z_sa_hdl) {
- ASSERT(ZTOV(zp)->v_count >= 0);
- zfs_znode_dmu_fini(zp);
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- /*
- * If we are unmounting, set the unmounted flag and let new vops
- * unblock. zfs_inactive will have the unmounted behavior, and all
- * other vops will fail with EIO.
- */
- if (unmounting) {
- zfsvfs->z_unmounted = B_TRUE;
- ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- }
-
- /*
- * z_os will be NULL if there was an error in attempting to reopen
- * zfsvfs, so just return as the properties had already been
- * unregistered and cached data had been evicted before.
- */
- if (zfsvfs->z_os == NULL)
- return (0);
-
- /*
- * Unregister properties.
- */
- zfs_unregister_callbacks(zfsvfs);
-
- /*
- * Evict cached data
- */
- if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
- !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- dmu_objset_evict_dbufs(zfsvfs->z_os);
-
- return (0);
-}
-
-/*ARGSUSED*/
-static int
-zfs_umount(vfs_t *vfsp, int fflag)
-{
- kthread_t *td = curthread;
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- objset_t *os;
- cred_t *cr = td->td_ucred;
- int ret;
-
- ret = secpolicy_fs_unmount(cr, vfsp);
- if (ret) {
- if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
- ZFS_DELEG_PERM_MOUNT, cr))
- return (ret);
- }
-
- /*
- * We purge the parent filesystem's vfsp as the parent filesystem
- * and all of its snapshots have their vnode's v_vfsp set to the
- * parent's filesystem's vfsp. Note, 'z_parent' is self
- * referential for non-snapshots.
- */
- (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
-
- /*
- * Unmount any snapshots mounted under .zfs before unmounting the
- * dataset itself.
- */
- if (zfsvfs->z_ctldir != NULL) {
- if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
- return (ret);
- }
-
- if (fflag & MS_FORCE) {
- /*
- * Mark file system as unmounted before calling
- * vflush(FORCECLOSE). This way we ensure no future vnops
- * will be called and risk operating on DOOMED vnodes.
- */
- rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
- zfsvfs->z_unmounted = B_TRUE;
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- }
-
- /*
- * Flush all the files.
- */
- ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
- if (ret != 0)
- return (ret);
-
-#ifdef illumos
- if (!(fflag & MS_FORCE)) {
- /*
- * Check the number of active vnodes in the file system.
- * Our count is maintained in the vfs structure, but the
- * number is off by 1 to indicate a hold on the vfs
- * structure itself.
- *
- * The '.zfs' directory maintains a reference of its
- * own, and any active references underneath are
- * reflected in the vnode count.
- */
- if (zfsvfs->z_ctldir == NULL) {
- if (vfsp->vfs_count > 1)
- return (SET_ERROR(EBUSY));
- } else {
- if (vfsp->vfs_count > 2 ||
- zfsvfs->z_ctldir->v_count > 1)
- return (SET_ERROR(EBUSY));
- }
- }
-#endif
-
- while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
- &zfsvfs->z_unlinked_drain_task, NULL) != 0)
- taskqueue_drain(zfsvfs_taskq->tq_queue,
- &zfsvfs->z_unlinked_drain_task);
-
- VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
- os = zfsvfs->z_os;
-
- /*
- * z_os will be NULL if there was an error in
- * attempting to reopen zfsvfs.
- */
- if (os != NULL) {
- /*
- * Unset the objset user_ptr.
- */
- mutex_enter(&os->os_user_ptr_lock);
- dmu_objset_set_user(os, NULL);
- mutex_exit(&os->os_user_ptr_lock);
-
- /*
- * Finally release the objset
- */
- dmu_objset_disown(os, zfsvfs);
- }
-
- /*
- * We can now safely destroy the '.zfs' directory node.
- */
- if (zfsvfs->z_ctldir != NULL)
- zfsctl_destroy(zfsvfs);
- zfs_freevfs(vfsp);
-
- return (0);
-}
-
-static int
-zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *zp;
- int err;
-
- /*
- * zfs_zget() can't operate on virtual entries like .zfs/ or
- * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
- * This will make NFS to switch to LOOKUP instead of using VGET.
- */
- if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
- (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
- return (EOPNOTSUPP);
-
- ZFS_ENTER(zfsvfs);
- err = zfs_zget(zfsvfs, ino, &zp);
- if (err == 0 && zp->z_unlinked) {
- vrele(ZTOV(zp));
- err = EINVAL;
- }
- if (err == 0)
- *vpp = ZTOV(zp);
- ZFS_EXIT(zfsvfs);
- if (err == 0) {
- err = vn_lock(*vpp, flags);
- if (err != 0)
- vrele(*vpp);
- }
- if (err != 0)
- *vpp = NULL;
- return (err);
-}
-
-static int
-zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
- struct ucred **credanonp, int *numsecflavors, int *secflavors)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
-
- /*
- * If this is regular file system vfsp is the same as
- * zfsvfs->z_parent->z_vfs, but if it is snapshot,
- * zfsvfs->z_parent->z_vfs represents parent file system
- * which we have to use here, because only this file system
- * has mnt_export configured.
- */
- return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
- credanonp, numsecflavors, secflavors));
-}
-
-CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
-CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
-
-static int
-zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
-{
- struct componentname cn;
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *zp;
- vnode_t *dvp;
- uint64_t object = 0;
- uint64_t fid_gen = 0;
- uint64_t gen_mask;
- uint64_t zp_gen;
- int i, err;
-
- *vpp = NULL;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * On FreeBSD we can get snapshot's mount point or its parent file
- * system mount point depending if snapshot is already mounted or not.
- */
- if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
- zfid_long_t *zlfid = (zfid_long_t *)fidp;
- uint64_t objsetid = 0;
- uint64_t setgen = 0;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
-
- ZFS_EXIT(zfsvfs);
-
- err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
- if (err)
- return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- }
-
- if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
- zfid_short_t *zfid = (zfid_short_t *)fidp;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
- } else {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * A zero fid_gen means we are in .zfs or the .zfs/snapshot
- * directory tree. If the object == zfsvfs->z_shares_dir, then
- * we are in the .zfs/shares directory tree.
- */
- if ((fid_gen == 0 &&
- (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
- (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
- ZFS_EXIT(zfsvfs);
- VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
- if (object == ZFSCTL_INO_SNAPDIR) {
- cn.cn_nameptr = "snapshot";
- cn.cn_namelen = strlen(cn.cn_nameptr);
- cn.cn_nameiop = LOOKUP;
- cn.cn_flags = ISLASTCN | LOCKLEAF;
- cn.cn_lkflags = flags;
- VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
- vput(dvp);
- } else if (object == zfsvfs->z_shares_dir) {
- /*
- * XXX This branch must not be taken,
- * if it is, then the lookup below will
- * explode.
- */
- cn.cn_nameptr = "shares";
- cn.cn_namelen = strlen(cn.cn_nameptr);
- cn.cn_nameiop = LOOKUP;
- cn.cn_flags = ISLASTCN;
- cn.cn_lkflags = flags;
- VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
- vput(dvp);
- } else {
- *vpp = dvp;
- }
- return (err);
- }
-
- gen_mask = -1ULL >> (64 - 8 * i);
-
- dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
- if (err = zfs_zget(zfsvfs, object, &zp)) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
- sizeof (uint64_t));
- zp_gen = zp_gen & gen_mask;
- if (zp_gen == 0)
- zp_gen = 1;
- if (zp->z_unlinked || zp_gen != fid_gen) {
- dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
- vrele(ZTOV(zp));
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- *vpp = ZTOV(zp);
- ZFS_EXIT(zfsvfs);
- err = vn_lock(*vpp, flags);
- if (err == 0)
- vnode_create_vobject(*vpp, zp->z_size, curthread);
- else
- *vpp = NULL;
- return (err);
-}
-
-/*
- * Block out VOPs and close zfsvfs_t::z_os
- *
- * Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
- * dataset and objset intact so that they can be atomically handed off during
- * a subsequent rollback or recv operation and the resume thereafter.
- */
-int
-zfs_suspend_fs(zfsvfs_t *zfsvfs)
-{
- int error;
-
- if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
- return (error);
-
- return (0);
-}
-
-/*
- * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
- * is an invariant across any of the operations that can be performed while the
- * filesystem was suspended. Whether it succeeded or failed, the preconditions
- * are the same: the relevant objset and associated dataset are owned by
- * zfsvfs, held, and long held on entry.
- */
-int
-zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
-{
- int err;
- znode_t *zp;
-
- ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
- ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs));
-
- /*
- * We already own this, so just update the objset_t, as the one we
- * had before may have been evicted.
- */
- objset_t *os;
- VERIFY3P(ds->ds_owner, ==, zfsvfs);
- VERIFY(dsl_dataset_long_held(ds));
- VERIFY0(dmu_objset_from_ds(ds, &os));
-
- err = zfsvfs_init(zfsvfs, os);
- if (err != 0)
- goto bail;
-
- VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
-
- zfs_set_fuid_feature(zfsvfs);
-
- /*
- * Attempt to re-establish all the active znodes with
- * their dbufs. If a zfs_rezget() fails, then we'll let
- * any potential callers discover that via ZFS_ENTER_VERIFY_VP
- * when they try to use their znode.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp;
- zp = list_next(&zfsvfs->z_all_znodes, zp)) {
- (void) zfs_rezget(zp);
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
-bail:
- /* release the VOPs */
- ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
-
- if (err) {
- /*
- * Since we couldn't setup the sa framework, try to force
- * unmount this file system.
- */
- if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
- vfs_ref(zfsvfs->z_vfs);
- (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
- }
- }
- return (err);
-}
-
-static void
-zfs_freevfs(vfs_t *vfsp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
-
-#ifdef illumos
- /*
- * If this is a snapshot, we have an extra VFS_HOLD on our parent
- * from zfs_mount(). Release it here. If we came through
- * zfs_mountroot() instead, we didn't grab an extra hold, so
- * skip the VFS_RELE for rootvfs.
- */
- if (zfsvfs->z_issnap && (vfsp != rootvfs))
- VFS_RELE(zfsvfs->z_parent->z_vfs);
-#endif
-
- zfsvfs_free(zfsvfs);
-
- atomic_dec_32(&zfs_active_fs_count);
-}
-
-#ifdef __i386__
-static int desiredvnodes_backup;
-#endif
-
-static void
-zfs_vnodes_adjust(void)
-{
-#ifdef __i386__
- int newdesiredvnodes;
-
- desiredvnodes_backup = desiredvnodes;
-
- /*
- * We calculate newdesiredvnodes the same way it is done in
- * vntblinit(). If it is equal to desiredvnodes, it means that
- * it wasn't tuned by the administrator and we can tune it down.
- */
- newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
- vm_kmem_size / (5 * (sizeof(struct vm_object) +
- sizeof(struct vnode))));
- if (newdesiredvnodes == desiredvnodes)
- desiredvnodes = (3 * newdesiredvnodes) / 4;
-#endif
-}
-
-static void
-zfs_vnodes_adjust_back(void)
-{
-
-#ifdef __i386__
- desiredvnodes = desiredvnodes_backup;
-#endif
-}
-
-void
-zfs_init(void)
-{
-
- printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
-
- /*
- * Initialize .zfs directory structures
- */
- zfsctl_init();
-
- /*
- * Initialize znode cache, vnode ops, etc...
- */
- zfs_znode_init();
-
- /*
- * Reduce number of vnodes. Originally number of vnodes is calculated
- * with UFS inode in mind. We reduce it here, because it's too big for
- * ZFS/i386.
- */
- zfs_vnodes_adjust();
-
- dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
-#if defined(__FreeBSD__)
- zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
-#endif
-}
-
-void
-zfs_fini(void)
-{
-#if defined(__FreeBSD__)
- taskq_destroy(zfsvfs_taskq);
-#endif
- zfsctl_fini();
- zfs_znode_fini();
- zfs_vnodes_adjust_back();
-}
-
-int
-zfs_busy(void)
-{
- return (zfs_active_fs_count != 0);
-}
-
-int
-zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
-{
- int error;
- objset_t *os = zfsvfs->z_os;
- dmu_tx_t *tx;
-
- if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
- return (SET_ERROR(EINVAL));
-
- if (newvers < zfsvfs->z_version)
- return (SET_ERROR(EINVAL));
-
- if (zfs_spa_version_map(newvers) >
- spa_version(dmu_objset_spa(zfsvfs->z_os)))
- return (SET_ERROR(ENOTSUP));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
- if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
- ZFS_SA_ATTRS);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- }
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
-
- error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
- 8, 1, &newvers, tx);
-
- if (error) {
- dmu_tx_commit(tx);
- return (error);
- }
-
- if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
- uint64_t sa_obj;
-
- ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
- SPA_VERSION_SA);
- sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
- DMU_OT_NONE, 0, tx);
-
- error = zap_add(os, MASTER_NODE_OBJ,
- ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
- ASSERT0(error);
-
- VERIFY(0 == sa_set_sa_object(os, sa_obj));
- sa_register_update_callback(os, zfs_sa_upgrade);
- }
-
- spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
- "from %llu to %llu", zfsvfs->z_version, newvers);
-
- dmu_tx_commit(tx);
-
- zfsvfs->z_version = newvers;
- os->os_version = newvers;
-
- zfs_set_fuid_feature(zfsvfs);
-
- return (0);
-}
-
-/*
- * Read a property stored within the master node.
- */
-int
-zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
-{
- uint64_t *cached_copy = NULL;
-
- /*
- * Figure out where in the objset_t the cached copy would live, if it
- * is available for the requested property.
- */
- if (os != NULL) {
- switch (prop) {
- case ZFS_PROP_VERSION:
- cached_copy = &os->os_version;
- break;
- case ZFS_PROP_NORMALIZE:
- cached_copy = &os->os_normalization;
- break;
- case ZFS_PROP_UTF8ONLY:
- cached_copy = &os->os_utf8only;
- break;
- case ZFS_PROP_CASE:
- cached_copy = &os->os_casesensitivity;
- break;
- default:
- break;
- }
- }
- if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
- *value = *cached_copy;
- return (0);
- }
-
- /*
- * If the property wasn't cached, look up the file system's value for
- * the property. For the version property, we look up a slightly
- * different string.
- */
- const char *pname;
- int error = ENOENT;
- if (prop == ZFS_PROP_VERSION) {
- pname = ZPL_VERSION_STR;
- } else {
- pname = zfs_prop_to_name(prop);
- }
-
- if (os != NULL) {
- ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
- error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
- }
-
- if (error == ENOENT) {
- /* No value set, use the default value */
- switch (prop) {
- case ZFS_PROP_VERSION:
- *value = ZPL_VERSION;
- break;
- case ZFS_PROP_NORMALIZE:
- case ZFS_PROP_UTF8ONLY:
- *value = 0;
- break;
- case ZFS_PROP_CASE:
- *value = ZFS_CASE_SENSITIVE;
- break;
- default:
- return (error);
- }
- error = 0;
- }
-
- /*
- * If one of the methods for getting the property value above worked,
- * copy it into the objset_t's cache.
- */
- if (error == 0 && cached_copy != NULL) {
- *cached_copy = *value;
- }
-
- return (error);
-}
-
-/*
- * Return true if the coresponding vfs's unmounted flag is set.
- * Otherwise return false.
- * If this function returns true we know VFS unmount has been initiated.
- */
-boolean_t
-zfs_get_vfs_flag_unmounted(objset_t *os)
-{
- zfsvfs_t *zfvp;
- boolean_t unmounted = B_FALSE;
-
- ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
-
- mutex_enter(&os->os_user_ptr_lock);
- zfvp = dmu_objset_get_user(os);
- if (zfvp != NULL && zfvp->z_vfs != NULL &&
- (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
- unmounted = B_TRUE;
- mutex_exit(&os->os_user_ptr_lock);
-
- return (unmounted);
-}
-
-#ifdef _KERNEL
-void
-zfsvfs_update_fromname(const char *oldname, const char *newname)
-{
- char tmpbuf[MAXPATHLEN];
- struct mount *mp;
- char *fromname;
- size_t oldlen;
-
- oldlen = strlen(oldname);
-
- mtx_lock(&mountlist_mtx);
- TAILQ_FOREACH(mp, &mountlist, mnt_list) {
- fromname = mp->mnt_stat.f_mntfromname;
- if (strcmp(fromname, oldname) == 0) {
- (void)strlcpy(fromname, newname,
- sizeof(mp->mnt_stat.f_mntfromname));
- continue;
- }
- if (strncmp(fromname, oldname, oldlen) == 0 &&
- (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
- (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
- newname, fromname + oldlen);
- (void)strlcpy(fromname, tmpbuf,
- sizeof(mp->mnt_stat.f_mntfromname));
- continue;
- }
- }
- mtx_unlock(&mountlist_mtx);
-}
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -1,6124 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-/* Portions Copyright 2010 Robert Milkowski */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vm.h>
-#include <sys/vnode.h>
-#include <sys/smr.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/uio.h>
-#include <sys/atomic.h>
-#include <sys/namei.h>
-#include <sys/mman.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/dbuf.h>
-#include <sys/zap.h>
-#include <sys/sa.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/sunddi.h>
-#include <sys/filio.h>
-#include <sys/sid.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/zfs_sa.h>
-#include <sys/zfs_rlock.h>
-#include <sys/extdirent.h>
-#include <sys/kidmap.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/sched.h>
-#include <sys/acl.h>
-#include <sys/vmmeter.h>
-#include <vm/vm_param.h>
-#include <sys/zil.h>
-
-VFS_SMR_DECLARE;
-
-/*
- * Programming rules.
- *
- * Each vnode op performs some logical unit of work. To do this, the ZPL must
- * properly lock its in-core state, create a DMU transaction, do the work,
- * record this work in the intent log (ZIL), commit the DMU transaction,
- * and wait for the intent log to commit if it is a synchronous operation.
- * Moreover, the vnode ops must work in both normal and log replay context.
- * The ordering of events is important to avoid deadlocks and references
- * to freed memory. The example below illustrates the following Big Rules:
- *
- * (1) A check must be made in each zfs thread for a mounted file system.
- * This is done avoiding races using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
- * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
- * can return EIO from the calling function.
- *
- * (2) VN_RELE() should always be the last thing except for zil_commit()
- * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
- * First, if it's the last reference, the vnode/znode
- * can be freed, so the zp may point to freed memory. Second, the last
- * reference will call zfs_zinactive(), which may induce a lot of work --
- * pushing cached pages (which acquires range locks) and syncing out
- * cached atime changes. Third, zfs_zinactive() may require a new tx,
- * which could deadlock the system if you were already holding one.
- * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
- *
- * (3) All range locks must be grabbed before calling dmu_tx_assign(),
- * as they can span dmu_tx_assign() calls.
- *
- * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
- * dmu_tx_assign(). This is critical because we don't want to block
- * while holding locks.
- *
- * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
- * reduces lock contention and CPU usage when we must wait (note that if
- * throughput is constrained by the storage, nearly every transaction
- * must wait).
- *
- * Note, in particular, that if a lock is sometimes acquired before
- * the tx assigns, and sometimes after (e.g. z_lock), then failing
- * to use a non-blocking assign can deadlock the system. The scenario:
- *
- * Thread A has grabbed a lock before calling dmu_tx_assign().
- * Thread B is in an already-assigned tx, and blocks for this lock.
- * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
- * forever, because the previous txg can't quiesce until B's tx commits.
- *
- * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
- * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
- * to indicate that this operation has already called dmu_tx_wait().
- * This will ensure that we don't retry forever, waiting a short bit
- * each time.
- *
- * (5) If the operation succeeded, generate the intent log entry for it
- * before dropping locks. This ensures that the ordering of events
- * in the intent log matches the order in which they actually occurred.
- * During ZIL replay the zfs_log_* functions will update the sequence
- * number to indicate the zil transaction has replayed.
- *
- * (6) At the end of each vnode op, the DMU tx must always commit,
- * regardless of whether there were any errors.
- *
- * (7) After dropping all locks, invoke zil_commit(zilog, foid)
- * to ensure that synchronous semantics are provided when necessary.
- *
- * In general, this is how things should be ordered in each vnode op:
- *
- * ZFS_ENTER(zfsvfs); // exit if unmounted
- * top:
- * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
- * rw_enter(...); // grab any other locks you need
- * tx = dmu_tx_create(...); // get DMU tx
- * dmu_tx_hold_*(); // hold each object you might modify
- * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- * if (error) {
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * if (error == ERESTART) {
- * waited = B_TRUE;
- * dmu_tx_wait(tx);
- * dmu_tx_abort(tx);
- * goto top;
- * }
- * dmu_tx_abort(tx); // abort DMU tx
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // really out of space
- * }
- * error = do_real_work(); // do whatever this VOP does
- * if (error == 0)
- * zfs_log_*(...); // on success, make ZIL entry
- * dmu_tx_commit(tx); // commit DMU tx -- error or not
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * zil_commit(zilog, foid); // synchronous when necessary
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // done, report error
- */
-
-/* ARGSUSED */
-static int
-zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(*vpp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
- ((flag & FAPPEND) == 0)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
- ZTOV(zp)->v_type == VREG &&
- !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
- if (fs_vscan(*vpp, cr, 0) != 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
- }
-
- /* Keep a count of the synchronous opens in the znode */
- if (flag & (FSYNC | FDSYNC))
- atomic_inc_32(&zp->z_sync_cnt);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- /*
- * Clean up any locks held by this process on the vp.
- */
- cleanlocks(vp, ddi_get_pid(), 0);
- cleanshares(vp, ddi_get_pid());
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* Decrement the synchronous opens in the znode */
- if ((flag & (FSYNC | FDSYNC)) && (count == 1))
- atomic_dec_32(&zp->z_sync_cnt);
-
- if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
- ZTOV(zp)->v_type == VREG &&
- !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
- VERIFY(fs_vscan(vp, cr, 1) == 0);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
- * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
- */
-static int
-zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
-{
- znode_t *zp = VTOZ(vp);
- uint64_t noff = (uint64_t)*off; /* new offset */
- uint64_t file_sz;
- int error;
- boolean_t hole;
-
- file_sz = zp->z_size;
- if (noff >= file_sz) {
- return (SET_ERROR(ENXIO));
- }
-
- if (cmd == _FIO_SEEK_HOLE)
- hole = B_TRUE;
- else
- hole = B_FALSE;
-
- error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
-
- if (error == ESRCH)
- return (SET_ERROR(ENXIO));
-
- /*
- * We could find a hole that begins after the logical end-of-file,
- * because dmu_offset_next() only works on whole blocks. If the
- * EOF falls mid-block, then indicate that the "virtual hole"
- * at the end of the file begins at the logical EOF, rather than
- * at the end of the last block.
- */
- if (noff > file_sz) {
- ASSERT(hole);
- noff = file_sz;
- }
-
- if (noff < *off)
- return (error);
- *off = noff;
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
- int *rvalp, caller_context_t *ct)
-{
- offset_t off;
- offset_t ndata;
- dmu_object_info_t doi;
- int error;
- zfsvfs_t *zfsvfs;
- znode_t *zp;
-
- switch (com) {
- case _FIOFFS:
- {
- return (0);
-
- /*
- * The following two ioctls are used by bfu. Faking out,
- * necessary to avoid bfu errors.
- */
- }
- case _FIOGDIO:
- case _FIOSDIO:
- {
- return (0);
- }
-
- case _FIO_SEEK_DATA:
- case _FIO_SEEK_HOLE:
- {
-#ifdef illumos
- if (ddi_copyin((void *)data, &off, sizeof (off), flag))
- return (SET_ERROR(EFAULT));
-#else
- off = *(offset_t *)data;
-#endif
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* offset parameter is in/out */
- error = zfs_holey(vp, com, &off);
- ZFS_EXIT(zfsvfs);
- if (error)
- return (error);
-#ifdef illumos
- if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
- return (SET_ERROR(EFAULT));
-#else
- *(offset_t *)data = off;
-#endif
- return (0);
- }
-#ifdef illumos
- case _FIO_COUNT_FILLED:
- {
- /*
- * _FIO_COUNT_FILLED adds a new ioctl command which
- * exposes the number of filled blocks in a
- * ZFS object.
- */
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * Wait for all dirty blocks for this object
- * to get synced out to disk, and the DMU info
- * updated.
- */
- error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Retrieve fill count from DMU object.
- */
- error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- ndata = doi.doi_fill_count;
-
- ZFS_EXIT(zfsvfs);
- if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
- return (SET_ERROR(EFAULT));
- return (0);
- }
-#endif
- }
- return (SET_ERROR(ENOTTY));
-}
-
-static vm_page_t
-page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
-{
- vm_object_t obj;
- vm_page_t pp;
- int64_t end;
-
- /*
- * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
- * aligned boundaries, if the range is not aligned. As a result a
- * DEV_BSIZE subrange with partially dirty data may get marked as clean.
- * It may happen that all DEV_BSIZE subranges are marked clean and thus
- * the whole page would be considred clean despite have some dirty data.
- * For this reason we should shrink the range to DEV_BSIZE aligned
- * boundaries before calling vm_page_clear_dirty.
- */
- end = rounddown2(off + nbytes, DEV_BSIZE);
- off = roundup2(off, DEV_BSIZE);
- nbytes = end - off;
-
- obj = vp->v_object;
-
- vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
- VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
- VM_ALLOC_IGN_SBUSY);
- if (pp != NULL) {
- ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
- vm_object_pip_add(obj, 1);
- pmap_remove_write(pp);
- if (nbytes != 0)
- vm_page_clear_dirty(pp, off, nbytes);
- }
- return (pp);
-}
-
-static void
-page_unbusy(vm_page_t pp)
-{
-
- vm_page_sunbusy(pp);
- vm_object_pip_wakeup(pp->object);
-}
-
-static vm_page_t
-page_wire(vnode_t *vp, int64_t start)
-{
- vm_object_t obj;
- vm_page_t m;
-
- obj = vp->v_object;
- vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
- VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
- VM_ALLOC_NOBUSY);
- return (m);
-}
-
-static void
-page_unwire(vm_page_t pp)
-{
-
- vm_page_unwire(pp, PQ_ACTIVE);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Write: If we find a memory mapped page, we write to *both*
- * the page and the dmu buffer.
- */
-static void
-update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
- int segflg, dmu_tx_t *tx)
-{
- vm_object_t obj;
- struct sf_buf *sf;
- caddr_t va;
- int off;
-
- ASSERT(segflg != UIO_NOCOPY);
- ASSERT(vp->v_mount != NULL);
- obj = vp->v_object;
- ASSERT(obj != NULL);
-
- off = start & PAGEOFFSET;
- vm_object_pip_add(obj, 1);
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- vm_page_t pp;
- int nbytes = imin(PAGESIZE - off, len);
-
- if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
- va = zfs_map_page(pp, &sf);
- (void) dmu_read(os, oid, start+off, nbytes,
- va+off, DMU_READ_PREFETCH);;
- zfs_unmap_page(sf);
- page_unbusy(pp);
- }
- len -= nbytes;
- off = 0;
- }
- vm_object_pip_wakeup(obj);
-}
-
-/*
- * Read with UIO_NOCOPY flag means that sendfile(2) requests
- * ZFS to populate a range of page cache pages with data.
- *
- * NOTE: this function could be optimized to pre-allocate
- * all pages in advance, drain exclusive busy on all of them,
- * map them into contiguous KVA region and populate them
- * in one single dmu_read() call.
- */
-static int
-mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
-{
- znode_t *zp = VTOZ(vp);
- objset_t *os = zp->z_zfsvfs->z_os;
- struct sf_buf *sf;
- vm_object_t obj;
- vm_page_t pp;
- int64_t start;
- caddr_t va;
- int len = nbytes;
- int off;
- int error = 0;
-
- ASSERT(uio->uio_segflg == UIO_NOCOPY);
- ASSERT(vp->v_mount != NULL);
- obj = vp->v_object;
- ASSERT(obj != NULL);
- ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
-
- for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
- int bytes = MIN(PAGESIZE, len);
-
- pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
- VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
- if (vm_page_none_valid(pp)) {
- va = zfs_map_page(pp, &sf);
- error = dmu_read(os, zp->z_id, start, bytes, va,
- DMU_READ_PREFETCH);
- if (bytes != PAGESIZE && error == 0)
- bzero(va + bytes, PAGESIZE - bytes);
- zfs_unmap_page(sf);
- if (error == 0) {
- vm_page_valid(pp);
- vm_page_activate(pp);
- vm_page_sunbusy(pp);
- } else {
- zfs_vmobject_wlock(obj);
- if (!vm_page_wired(pp) && pp->valid == 0 &&
- vm_page_busy_tryupgrade(pp))
- vm_page_free(pp);
- else
- vm_page_sunbusy(pp);
- zfs_vmobject_wunlock(obj);
- }
- } else {
- ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
- vm_page_sunbusy(pp);
- }
- if (error)
- break;
- uio->uio_resid -= bytes;
- uio->uio_offset += bytes;
- len -= bytes;
- }
- return (error);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Read: We "read" preferentially from memory mapped pages,
- * else we default from the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- */
-static int
-mappedread(vnode_t *vp, int nbytes, uio_t *uio)
-{
- znode_t *zp = VTOZ(vp);
- vm_object_t obj;
- int64_t start;
- caddr_t va;
- int len = nbytes;
- int off;
- int error = 0;
-
- ASSERT(vp->v_mount != NULL);
- obj = vp->v_object;
- ASSERT(obj != NULL);
-
- start = uio->uio_loffset;
- off = start & PAGEOFFSET;
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- vm_page_t pp;
- uint64_t bytes = MIN(PAGESIZE - off, len);
-
- if (pp = page_wire(vp, start)) {
- struct sf_buf *sf;
- caddr_t va;
-
- va = zfs_map_page(pp, &sf);
-#ifdef illumos
- error = uiomove(va + off, bytes, UIO_READ, uio);
-#else
- error = vn_io_fault_uiomove(va + off, bytes, uio);
-#endif
- zfs_unmap_page(sf);
- page_unwire(pp);
- } else {
- error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, bytes);
- }
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- return (error);
-}
-
-offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
-
-/*
- * Read bytes from specified file into supplied buffer.
- *
- * IN: vp - vnode of file to be read from.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * ioflag - SYNC flags; used to provide FRSYNC semantics.
- * cr - credentials of caller.
- * ct - caller context
- *
- * OUT: uio - updated offset and range, buffer filled.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Side Effects:
- * vp - atime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ssize_t n, nbytes;
- int error = 0;
- xuio_t *xuio = NULL;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* We don't copy out anything useful for directories. */
- if (vp->v_type == VDIR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EISDIR));
- }
-
- if (zp->z_pflags & ZFS_AV_QUARANTINED) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- /*
- * Validate file offset
- */
- if (uio->uio_loffset < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Fasttrack empty reads
- */
- if (uio->uio_resid == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * Check for mandatory locks
- */
- if (MANDMODE(zp->z_mode)) {
- if (error = chklock(vp, FREAD,
- uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- /*
- * If we're in FRSYNC mode, sync out this znode before reading it.
- */
- if (zfsvfs->z_log &&
- (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
- zil_commit(zfsvfs->z_log, zp->z_id);
-
- /*
- * Lock the range against changes.
- */
- locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
- uio->uio_loffset, uio->uio_resid, RL_READER);
-
- /*
- * If we are reading past end-of-file we can skip
- * to the end; but we might still need to set atime.
- */
- if (uio->uio_loffset >= zp->z_size) {
- error = 0;
- goto out;
- }
-
- ASSERT(uio->uio_loffset < zp->z_size);
- n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
-
-#ifdef illumos
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
- int nblk;
- int blksz = zp->z_blksz;
- uint64_t offset = uio->uio_loffset;
-
- xuio = (xuio_t *)uio;
- if ((ISP2(blksz))) {
- nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
- blksz)) / blksz;
- } else {
- ASSERT(offset + n <= blksz);
- nblk = 1;
- }
- (void) dmu_xuio_init(xuio, nblk);
-
- if (vn_has_cached_data(vp)) {
- /*
- * For simplicity, we always allocate a full buffer
- * even if we only expect to read a portion of a block.
- */
- while (--nblk >= 0) {
- (void) dmu_xuio_add(xuio,
- dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz), 0, blksz);
- }
- }
- }
-#endif /* illumos */
-
- while (n > 0) {
- nbytes = MIN(n, zfs_read_chunk_size -
- P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
-
-#ifdef __FreeBSD__
- if (uio->uio_segflg == UIO_NOCOPY)
- error = mappedread_sf(vp, nbytes, uio);
- else
-#endif /* __FreeBSD__ */
- if (vn_has_cached_data(vp)) {
- error = mappedread(vp, nbytes, uio);
- } else {
- error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes);
- }
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- break;
- }
-
- n -= nbytes;
- }
-out:
- rangelock_exit(lr);
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Write the bytes to a file.
- *
- * IN: vp - vnode of file to be written to.
- * uio - structure supplying write location, range info,
- * and data buffer.
- * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
- * set if in append mode.
- * cr - credentials of caller.
- * ct - caller context (NFS/CIFS fem monitor only)
- *
- * OUT: uio - updated offset and range.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated if byte count > 0
- */
-
-/* ARGSUSED */
-static int
-zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- rlim64_t limit = MAXOFFSET_T;
- ssize_t start_resid = uio->uio_resid;
- ssize_t tx_bytes;
- uint64_t end_size;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog;
- offset_t woff;
- ssize_t n, nbytes;
- int max_blksz = zfsvfs->z_max_blksz;
- int error = 0;
- arc_buf_t *abuf;
- iovec_t *aiov = NULL;
- xuio_t *xuio = NULL;
- int i_iov = 0;
- int iovcnt = uio->uio_iovcnt;
- iovec_t *iovp = uio->uio_iov;
- int write_eof;
- int count = 0;
- sa_bulk_attr_t bulk[4];
- uint64_t mtime[2], ctime[2];
-
- /*
- * Fasttrack empty write
- */
- n = start_resid;
- if (n == 0)
- return (0);
-
- if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
- limit = MAXOFFSET_T;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
-
- /*
- * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
- * callers might not be able to detect properly that we are read-only,
- * so check it explicitly here.
- */
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- /*
- * If immutable or not appending then return EPERM.
- * Intentionally allow ZFS_READONLY through here.
- * See zfs_zaccess_common()
- */
- if ((zp->z_pflags & ZFS_IMMUTABLE) ||
- ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
- (uio->uio_loffset < zp->z_size))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- zilog = zfsvfs->z_log;
-
- /*
- * Validate file offset
- */
- woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Check for mandatory locks before calling rangelock_enter()
- * in order to prevent a deadlock with locks set via fcntl().
- */
- if (MANDMODE((mode_t)zp->z_mode) &&
- (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
-#ifdef illumos
- /*
- * Pre-fault the pages to ensure slow (eg NFS) pages
- * don't hold up txg.
- * Skip this if uio contains loaned arc_buf.
- */
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
- xuio = (xuio_t *)uio;
- else
- uio_prefaultpages(MIN(n, max_blksz), uio);
-#endif
-
- /*
- * If in append mode, set the io offset pointer to eof.
- */
- locked_range_t *lr;
- if (ioflag & FAPPEND) {
- /*
- * Obtain an appending range lock to guarantee file append
- * semantics. We reset the write offset once we have the lock.
- */
- lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
- woff = lr->lr_offset;
- if (lr->lr_length == UINT64_MAX) {
- /*
- * We overlocked the file because this write will cause
- * the file block size to increase.
- * Note that zp_size cannot change with this lock held.
- */
- woff = zp->z_size;
- }
- uio->uio_loffset = woff;
- } else {
- /*
- * Note that if the file block size will change as a result of
- * this write, then this range lock will lock the entire file
- * so that we can re-write the block safely.
- */
- lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
- }
-
- if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
- rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (EFBIG);
- }
-
- if (woff >= limit) {
- rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EFBIG));
- }
-
- if ((woff + n) > limit || woff > (limit - n))
- n = limit - woff;
-
- /* Will this write extend the file length? */
- write_eof = (woff + n > zp->z_size);
-
- end_size = MAX(zp->z_size, woff + n);
-
- /*
- * Write the file in reasonable size chunks. Each chunk is written
- * in a separate transaction; this keeps the intent log records small
- * and allows us to do more fine-grained space accounting.
- */
- while (n > 0) {
- abuf = NULL;
- woff = uio->uio_loffset;
- if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
- zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
- if (abuf != NULL)
- dmu_return_arcbuf(abuf);
- error = SET_ERROR(EDQUOT);
- break;
- }
-
- if (xuio && abuf == NULL) {
- ASSERT(i_iov < iovcnt);
- aiov = &iovp[i_iov];
- abuf = dmu_xuio_arcbuf(xuio, i_iov);
- dmu_xuio_clear(xuio, i_iov);
- DTRACE_PROBE3(zfs_cp_write, int, i_iov,
- iovec_t *, aiov, arc_buf_t *, abuf);
- ASSERT((aiov->iov_base == abuf->b_data) ||
- ((char *)aiov->iov_base - (char *)abuf->b_data +
- aiov->iov_len == arc_buf_size(abuf)));
- i_iov++;
- } else if (abuf == NULL && n >= max_blksz &&
- woff >= zp->z_size &&
- P2PHASE(woff, max_blksz) == 0 &&
- zp->z_blksz == max_blksz) {
- /*
- * This write covers a full block. "Borrow" a buffer
- * from the dmu so that we can fill it before we enter
- * a transaction. This avoids the possibility of
- * holding up the transaction if the data copy hangs
- * up on a pagefault (e.g., from an NFS server mapping).
- */
- size_t cbytes;
-
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- max_blksz);
- ASSERT(abuf != NULL);
- ASSERT(arc_buf_size(abuf) == max_blksz);
- if (error = uiocopy(abuf->b_data, max_blksz,
- UIO_WRITE, uio, &cbytes)) {
- dmu_return_arcbuf(abuf);
- break;
- }
- ASSERT(cbytes == max_blksz);
- }
-
- /*
- * Start a transaction.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- if (abuf != NULL)
- dmu_return_arcbuf(abuf);
- break;
- }
-
- /*
- * If rangelock_enter() over-locked we grow the blocksize
- * and then reduce the lock range. This will only happen
- * on the first iteration since rangelock_reduce() will
- * shrink down lr_length to the appropriate size.
- */
- if (lr->lr_length == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size,
- 1 << highbit64(zp->z_blksz));
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
- rangelock_reduce(lr, woff, n);
- }
-
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
- if (woff + nbytes > zp->z_size)
- vnode_pager_setsize(vp, woff + nbytes);
-
- if (abuf == NULL) {
- tx_bytes = uio->uio_resid;
- error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes, tx);
- tx_bytes -= uio->uio_resid;
- } else {
- tx_bytes = nbytes;
- ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
- /*
- * If this is not a full block write, but we are
- * extending the file past EOF and this data starts
- * block-aligned, use assign_arcbuf(). Otherwise,
- * write via dmu_write().
- */
- if (tx_bytes < max_blksz && (!write_eof ||
- aiov->iov_base != abuf->b_data)) {
- ASSERT(xuio);
- dmu_write(zfsvfs->z_os, zp->z_id, woff,
- aiov->iov_len, aiov->iov_base, tx);
- dmu_return_arcbuf(abuf);
- xuio_stat_wbuf_copied();
- } else {
- ASSERT(xuio || tx_bytes == max_blksz);
- dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
- woff, abuf, tx);
- }
- ASSERT(tx_bytes <= uio->uio_resid);
- uioskip(uio, tx_bytes);
- }
- if (tx_bytes && vn_has_cached_data(vp)) {
- update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
- zp->z_id, uio->uio_segflg, tx);
- }
-
- /*
- * If we made no progress, we're done. If we made even
- * partial progress, update the znode and ZIL accordingly.
- */
- if (tx_bytes == 0) {
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
- (void *)&zp->z_size, sizeof (uint64_t), tx);
- dmu_tx_commit(tx);
- ASSERT(error != 0);
- break;
- }
-
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the excute bits is set.
- *
- * It would be nice to to this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- *
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
- */
- mutex_enter(&zp->z_acl_lock);
- if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(vp, cr,
- (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
- uint64_t newmode;
- zp->z_mode &= ~(S_ISUID | S_ISGID);
- newmode = zp->z_mode;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
- (void *)&newmode, sizeof (uint64_t), tx);
- }
- mutex_exit(&zp->z_acl_lock);
-
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
- B_TRUE);
-
- /*
- * Update the file size (zp_size) if it has changed;
- * account for possible concurrent updates.
- */
- while ((end_size = zp->z_size) < uio->uio_loffset) {
- (void) atomic_cas_64(&zp->z_size, end_size,
- uio->uio_loffset);
-#ifdef illumos
- ASSERT(error == 0);
-#else
- ASSERT(error == 0 || error == EFAULT);
-#endif
- }
- /*
- * If we are replaying and eof is non zero then force
- * the file size to the specified eof. Note, there's no
- * concurrency during replay.
- */
- if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
- zp->z_size = zfsvfs->z_replay_eof;
-
- if (error == 0)
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- else
- (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
- dmu_tx_commit(tx);
-
- if (error != 0)
- break;
- ASSERT(tx_bytes == nbytes);
- n -= nbytes;
-
-#ifdef illumos
- if (!xuio && n > 0)
- uio_prefaultpages(MIN(n, max_blksz), uio);
-#endif
- }
-
- rangelock_exit(lr);
-
- /*
- * If we're in replay mode, or we made no progress, return error.
- * Otherwise, it's at least a partial write, so it's successful.
- */
- if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
-#ifdef __FreeBSD__
- /*
- * EFAULT means that at least one page of the source buffer was not
- * available. VFS will re-try remaining I/O upon this error.
- */
- if (error == EFAULT) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-#endif
-
- if (ioflag & (FSYNC | FDSYNC) ||
- zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, zp->z_id);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/* ARGSUSED */
-void
-zfs_get_done(zgd_t *zgd, int error)
-{
- znode_t *zp = zgd->zgd_private;
- objset_t *os = zp->z_zfsvfs->z_os;
-
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- rangelock_exit(zgd->zgd_lr);
-
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t object = lr->lr_foid;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error = 0;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, object, &zp) != 0)
- return (SET_ERROR(ENOENT));
- if (zp->z_unlinked) {
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp),
- dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
- return (SET_ERROR(ENOENT));
- }
-
- zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
- zgd->zgd_private = zp;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (offset >= zp->z_size) {
- error = SET_ERROR(ENOENT);
- } else {
- error = dmu_read(os, object, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- }
- ASSERT(error == 0 || error == ENOENT);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- uint64_t blkoff;
- size = zp->z_blksz;
- blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
- offset -= blkoff;
- zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- if (zp->z_blksz == size)
- break;
- offset += blkoff;
- rangelock_exit(zgd->zgd_lr);
- }
- /* test for truncation needs to be done while range locked */
- if (lr->lr_offset >= zp->z_size)
- error = SET_ERROR(ENOENT);
-#ifdef DEBUG
- if (zil_fault_io) {
- error = SET_ERROR(EIO);
- zil_fault_io = 0;
- }
-#endif
- if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
-
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zfs_get_done, zgd);
- ASSERT(error || lr->lr_length <= size);
-
- /*
- * On success, we need to wait for the write I/O
- * initiated by dmu_sync() to complete before we can
- * release this dbuf. We will finish everything up
- * in the zfs_get_done() callback.
- */
- if (error == 0)
- return (0);
-
- if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- /*
- * TX_WRITE2 relies on the data previously
- * written by the TX_WRITE that caused
- * EALREADY. We zero out the BP because
- * it is the old, currently-on-disk BP.
- */
- zgd->zgd_bp = NULL;
- BP_ZERO(bp);
- error = 0;
- }
- }
- }
-
- zfs_get_done(zgd, error);
-
- return (error);
-}
-
-/*ARGSUSED*/
-static int
-zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (flag & V_ACE_MASK)
- error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
- else
- error = zfs_zaccess_rwx(zp, mode, flag, cr);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-static int
-zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
-{
- int error;
-
- *vpp = arg;
- error = vn_lock(*vpp, lkflags);
- if (error != 0)
- vrele(*vpp);
- return (error);
-}
-
-static int
-zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
-{
- znode_t *zdp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
- int error;
- int ltype;
-
- ASSERT_VOP_LOCKED(dvp, __func__);
-#ifdef DIAGNOSTIC
- if ((zdp->z_pflags & ZFS_XATTR) == 0)
- VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
-#endif
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- ASSERT3P(dvp, ==, vp);
- vref(dvp);
- ltype = lkflags & LK_TYPE_MASK;
- if (ltype != VOP_ISLOCKED(dvp)) {
- if (ltype == LK_EXCLUSIVE)
- vn_lock(dvp, LK_UPGRADE | LK_RETRY);
- else /* if (ltype == LK_SHARED) */
- vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
-
- /*
- * Relock for the "." case could leave us with
- * reclaimed vnode.
- */
- if (VN_IS_DOOMED(dvp)) {
- vrele(dvp);
- return (SET_ERROR(ENOENT));
- }
- }
- return (0);
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- /*
- * Note that in this case, dvp is the child vnode, and we
- * are looking up the parent vnode - exactly reverse from
- * normal operation. Unlocking dvp requires some rather
- * tricky unlock/relock dance to prevent mp from being freed;
- * use vn_vget_ino_gen() which takes care of all that.
- *
- * XXX Note that there is a time window when both vnodes are
- * unlocked. It is possible, although highly unlikely, that
- * during that window the parent-child relationship between
- * the vnodes may change, for example, get reversed.
- * In that case we would have a wrong lock order for the vnodes.
- * All other filesystems seem to ignore this problem, so we
- * do the same here.
- * A potential solution could be implemented as follows:
- * - using LK_NOWAIT when locking the second vnode and retrying
- * if necessary
- * - checking that the parent-child relationship still holds
- * after locking both vnodes and retrying if it doesn't
- */
- error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
- return (error);
- } else {
- error = vn_lock(vp, lkflags);
- if (error != 0)
- vrele(vp);
- return (error);
- }
-}
-
-/*
- * Lookup an entry in a directory, or an extended attribute directory.
- * If it exists, return a held vnode reference for it.
- *
- * IN: dvp - vnode of directory to search.
- * nm - name of entry to lookup.
- * pnp - full pathname to lookup [UNUSED].
- * flags - LOOKUP_XATTR set if looking for an attribute.
- * rdir - root directory vnode [UNUSED].
- * cr - credentials of caller.
- * ct - caller context
- *
- * OUT: vpp - vnode of located entry, NULL if not found.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * NA
- */
-/* ARGSUSED */
-static int
-zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
- int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
-{
- znode_t *zdp = VTOZ(dvp);
- znode_t *zp;
- zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
- int error = 0;
-
- /*
- * Fast path lookup, however we must skip DNLC lookup
- * for case folding or normalizing lookups because the
- * DNLC code only stores the passed in name. This means
- * creating 'a' and removing 'A' on a case insensitive
- * file system would work, but DNLC still thinks 'a'
- * exists and won't let you create it again on the next
- * pass through fast path.
- */
- if (!(flags & LOOKUP_XATTR)) {
- if (dvp->v_type != VDIR) {
- return (SET_ERROR(ENOTDIR));
- } else if (zdp->z_sa_hdl == NULL) {
- return (SET_ERROR(EIO));
- }
- }
-
- DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zdp);
-
- *vpp = NULL;
-
- if (flags & LOOKUP_XATTR) {
-#ifdef TODO
- /*
- * If the xattr property is off, refuse the lookup request.
- */
- if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-#endif
-
- /*
- * We don't allow recursive attributes..
- * Maybe someday we will.
- */
- if (zdp->z_pflags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Do we have permission to get into attribute directory?
- */
- if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
- B_FALSE, cr)) {
- vrele(*vpp);
- *vpp = NULL;
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Check accessibility of directory.
- */
- if (!cached) {
- if ((cnp->cn_flags & NOEXECCHECK) != 0) {
- cnp->cn_flags &= ~NOEXECCHECK;
- } else {
- error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
- }
-
- if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
- NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
-
- /*
- * First handle the special cases.
- */
- if ((cnp->cn_flags & ISDOTDOT) != 0) {
- /*
- * If we are a snapshot mounted under .zfs, return
- * the vp for the snapshot directory.
- */
- if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
- struct componentname cn;
- vnode_t *zfsctl_vp;
- int ltype;
-
- ZFS_EXIT(zfsvfs);
- ltype = VOP_ISLOCKED(dvp);
- VOP_UNLOCK(dvp);
- error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
- &zfsctl_vp);
- if (error == 0) {
- cn.cn_nameptr = "snapshot";
- cn.cn_namelen = strlen(cn.cn_nameptr);
- cn.cn_nameiop = cnp->cn_nameiop;
- cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
- cn.cn_lkflags = cnp->cn_lkflags;
- error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
- vput(zfsctl_vp);
- }
- vn_lock(dvp, ltype | LK_RETRY);
- return (error);
- }
- }
- if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
- ZFS_EXIT(zfsvfs);
- if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
- return (SET_ERROR(ENOTSUP));
- error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
- return (error);
- }
-
- /*
- * The loop is retry the lookup if the parent-child relationship
- * changes during the dot-dot locking complexities.
- */
- for (;;) {
- uint64_t parent;
-
- error = zfs_dirlook(zdp, nm, &zp);
- if (error == 0)
- *vpp = ZTOV(zp);
-
- ZFS_EXIT(zfsvfs);
- if (error != 0)
- break;
-
- error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
- if (error != 0) {
- /*
- * If we've got a locking error, then the vnode
- * got reclaimed because of a force unmount.
- * We never enter doomed vnodes into the name cache.
- */
- *vpp = NULL;
- return (error);
- }
-
- if ((cnp->cn_flags & ISDOTDOT) == 0)
- break;
-
- ZFS_ENTER(zfsvfs);
- if (zdp->z_sa_hdl == NULL) {
- error = SET_ERROR(EIO);
- } else {
- error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
- &parent, sizeof (parent));
- }
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- vput(ZTOV(zp));
- break;
- }
- if (zp->z_id == parent) {
- ZFS_EXIT(zfsvfs);
- break;
- }
- vput(ZTOV(zp));
- }
-
-out:
- if (error != 0)
- *vpp = NULL;
-
- /* Translate errors and add SAVENAME when needed. */
- if (cnp->cn_flags & ISLASTCN) {
- switch (nameiop) {
- case CREATE:
- case RENAME:
- if (error == ENOENT) {
- error = EJUSTRETURN;
- cnp->cn_flags |= SAVENAME;
- break;
- }
- /* FALLTHROUGH */
- case DELETE:
- if (error == 0)
- cnp->cn_flags |= SAVENAME;
- break;
- }
- }
-
- /* Insert name into cache (as non-existent) if appropriate. */
- if (zfsvfs->z_use_namecache &&
- error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
- cache_enter(dvp, NULL, cnp);
-
- /* Insert name into cache if appropriate. */
- if (zfsvfs->z_use_namecache &&
- error == 0 && (cnp->cn_flags & MAKEENTRY)) {
- if (!(cnp->cn_flags & ISLASTCN) ||
- (nameiop != DELETE && nameiop != RENAME)) {
- cache_enter(dvp, *vpp, cnp);
- }
- }
-
- return (error);
-}
-
-/*
- * Attempt to create a new entry in a directory. If the entry
- * already exists, truncate the file if permissible, else return
- * an error. Return the vp of the created or trunc'd file.
- *
- * IN: dvp - vnode of directory to put new file entry in.
- * name - name of new file entry.
- * vap - attributes of new file.
- * excl - flag indicating exclusive or non-exclusive mode.
- * mode - mode to open file with.
- * cr - credentials of caller.
- * flag - large file flag [UNUSED].
- * ct - caller context
- * vsecp - ACL to be set
- *
- * OUT: vpp - vnode of created or trunc'd entry.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dvp - ctime|mtime updated if new entry created
- * vp - ctime|mtime always, atime if new
- */
-
-/* ARGSUSED */
-static int
-zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
- vnode_t **vpp, cred_t *cr, kthread_t *td)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- objset_t *os;
- dmu_tx_t *tx;
- int error;
- ksid_t *ksid;
- uid_t uid;
- gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
- void *vsecp = NULL;
- int flag = 0;
- uint64_t txtype;
-
- /*
- * If we have an ephemeral id, ACL, or XVATTR then
- * make sure file system is at proper version
- */
-
- ksid = crgetsid(cr, KSID_OWNER);
- if (ksid)
- uid = ksid_getid(ksid);
- else
- uid = crgetuid(cr);
-
- if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || (vap->va_mask & AT_XVATTR) ||
- IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- os = zfsvfs->z_os;
- zilog = zfsvfs->z_log;
-
- if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
- NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
- if (vap->va_mask & AT_XVATTR) {
- if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
- crgetuid(cr), cr, vap->va_type)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- *vpp = NULL;
-
- if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
- vap->va_mode &= ~S_ISVTX;
-
- error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- ASSERT3P(zp, ==, NULL);
-
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- goto out;
- }
-
- /*
- * We only support the creation of regular files in
- * extended attribute directories.
- */
-
- if ((dzp->z_pflags & ZFS_XATTR) &&
- (vap->va_type != VREG)) {
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
- if ((error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
- goto out;
-
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EDQUOT);
- goto out;
- }
-
- getnewvnode_reserve();
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- if (!zfsvfs->z_use_sa &&
- acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, acl_ids.z_aclp->z_acl_bytes);
- }
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- getnewvnode_drop_reserve();
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
- txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
- zfs_log_create(zilog, tx, txtype, dzp, zp, name,
- vsecp, acl_ids.z_fuidp, vap);
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
-
- getnewvnode_drop_reserve();
-
-out:
- if (error == 0) {
- *vpp = ZTOV(zp);
- }
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Remove an entry from a directory.
- *
- * IN: dvp - vnode of directory to remove entry from.
- * name - name of entry to remove.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dvp - ctime|mtime
- * vp - ctime (if nlink > 0)
- */
-
-/*ARGSUSED*/
-static int
-zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
-{
- znode_t *dzp = VTOZ(dvp);
- znode_t *zp = VTOZ(vp);
- znode_t *xzp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- uint64_t acl_obj, xattr_obj;
- uint64_t obj = 0;
- dmu_tx_t *tx;
- boolean_t unlinked, toobig = FALSE;
- uint64_t txtype;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- ZFS_VERIFY_ZP(zp);
- zilog = zfsvfs->z_log;
- zp = VTOZ(vp);
-
- xattr_obj = 0;
- xzp = NULL;
-
- if (error = zfs_zaccess_delete(dzp, zp, cr)) {
- goto out;
- }
-
- /*
- * Need to use rmdir for removing directories.
- */
- if (vp->v_type == VDIR) {
- error = SET_ERROR(EPERM);
- goto out;
- }
-
- vnevent_remove(vp, dvp, name, ct);
-
- obj = zp->z_id;
-
- /* are there any extended attributes? */
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
- if (error == 0 && xattr_obj) {
- error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT0(error);
- }
-
- /*
- * We may delete the znode now, or we may put it in the unlinked set;
- * it depends on whether we're the last link, and on whether there are
- * other holds on the vnode. So we dmu_tx_hold() the right things to
- * allow for either case.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- zfs_sa_upgrade_txholds(tx, dzp);
-
- if (xzp) {
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
- }
-
- /* charge as an update -- would be nice not to charge at all */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
- /*
- * Mark this transaction as typically resulting in a net free of space
- */
- dmu_tx_mark_netfree(tx);
-
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Remove the directory entry.
- */
- error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
-
- if (error) {
- dmu_tx_commit(tx);
- goto out;
- }
-
- if (unlinked) {
- zfs_unlinked_add(zp, tx);
- vp->v_vflag |= VV_NOSYNC;
- }
-
- txtype = TX_REMOVE;
- zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
-
- dmu_tx_commit(tx);
-out:
-
- if (xzp)
- vrele(ZTOV(xzp));
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Create a new directory and insert it into dvp using the name
- * provided. Return a pointer to the inserted directory.
- *
- * IN: dvp - vnode of directory to add subdir to.
- * dirname - name of new directory.
- * vap - attributes of new directory.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- * vsecp - ACL to be set
- *
- * OUT: vpp - vnode of created directory.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- * vp - ctime|mtime|atime updated
- */
-/*ARGSUSED*/
-static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- uint64_t txtype;
- dmu_tx_t *tx;
- int error;
- ksid_t *ksid;
- uid_t uid;
- gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
-
- ASSERT(vap->va_type == VDIR);
-
- /*
- * If we have an ephemeral id, ACL, or XVATTR then
- * make sure file system is at proper version
- */
-
- ksid = crgetsid(cr, KSID_OWNER);
- if (ksid)
- uid = ksid_getid(ksid);
- else
- uid = crgetuid(cr);
- if (zfsvfs->z_use_fuids == B_FALSE &&
- ((vap->va_mask & AT_XVATTR) ||
- IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- if (dzp->z_pflags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if (zfsvfs->z_utf8 && u8_validate(dirname,
- strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
- if (vap->va_mask & AT_XVATTR) {
- if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
- crgetuid(cr), cr, vap->va_type)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
- NULL, &acl_ids)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * First make sure the new directory doesn't exist.
- *
- * Existence is checked first to make sure we don't return
- * EACCES instead of EEXIST which can cause some applications
- * to fail.
- */
- *vpp = NULL;
-
- if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- ASSERT3P(zp, ==, NULL);
-
- if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EDQUOT));
- }
-
- /*
- * Add a new entry to the directory.
- */
- getnewvnode_reserve();
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- acl_ids.z_aclp->z_acl_bytes);
- }
-
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
-
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- getnewvnode_drop_reserve();
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Create new node.
- */
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- /*
- * Now put new name in parent dir.
- */
- (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
-
- *vpp = ZTOV(zp);
-
- txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
- zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
- acl_ids.z_fuidp, vap);
-
- zfs_acl_ids_free(&acl_ids);
-
- dmu_tx_commit(tx);
-
- getnewvnode_drop_reserve();
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Remove a directory subdir entry. If the current working
- * directory is the same as the subdir to be removed, the
- * remove will fail.
- *
- * IN: dvp - vnode of directory to remove from.
- * name - name of directory to be removed.
- * cwd - vnode of current working directory.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
-{
- znode_t *dzp = VTOZ(dvp);
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- dmu_tx_t *tx;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- ZFS_VERIFY_ZP(zp);
- zilog = zfsvfs->z_log;
-
-
- if (error = zfs_zaccess_delete(dzp, zp, cr)) {
- goto out;
- }
-
- if (vp->v_type != VDIR) {
- error = SET_ERROR(ENOTDIR);
- goto out;
- }
-
- vnevent_rmdir(vp, dvp, name, ct);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- zfs_sa_upgrade_txholds(tx, zp);
- zfs_sa_upgrade_txholds(tx, dzp);
- dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- cache_purge(dvp);
-
- error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
-
- if (error == 0) {
- uint64_t txtype = TX_RMDIR;
- zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
- }
-
- dmu_tx_commit(tx);
-
- cache_purge(vp);
-out:
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Read as many directory entries as will fit into the provided
- * buffer from the given directory cursor position (specified in
- * the uio structure).
- *
- * IN: vp - vnode of directory to read.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * OUT: uio - updated offset and range, buffer filled.
- * eofp - set to true if end-of-file detected.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - atime updated
- *
- * Note that the low 4 bits of the cookie returned by zap is always zero.
- * This allows us to use the low range for "special" directory entries:
- * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
- * we use the offset 2 for the '.zfs' directory.
- */
-/* ARGSUSED */
-static int
-zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
-{
- znode_t *zp = VTOZ(vp);
- iovec_t *iovp;
- edirent_t *eodp;
- dirent64_t *odp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os;
- caddr_t outbuf;
- size_t bufsize;
- zap_cursor_t zc;
- zap_attribute_t zap;
- uint_t bytes_wanted;
- uint64_t offset; /* must be unsigned; checks for < 1 */
- uint64_t parent;
- int local_eof;
- int outcount;
- int error;
- uint8_t prefetch;
- boolean_t check_sysattrs;
- uint8_t type;
- int ncooks;
- u_long *cooks = NULL;
- int flags = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
- &parent, sizeof (parent))) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * If we are not given an eof variable,
- * use a local one.
- */
- if (eofp == NULL)
- eofp = &local_eof;
-
- /*
- * Check for valid iov_len.
- */
- if (uio->uio_iov->iov_len <= 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Quit if directory has been removed (posix)
- */
- if ((*eofp = zp->z_unlinked) != 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- error = 0;
- os = zfsvfs->z_os;
- offset = uio->uio_loffset;
- prefetch = zp->z_zn_prefetch;
-
- /*
- * Initialize the iterator cursor.
- */
- if (offset <= 3) {
- /*
- * Start iteration from the beginning of the directory.
- */
- zap_cursor_init(&zc, os, zp->z_id);
- } else {
- /*
- * The offset is a serialized cursor.
- */
- zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
- }
-
- /*
- * Get space to change directory entries into fs independent format.
- */
- iovp = uio->uio_iov;
- bytes_wanted = iovp->iov_len;
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
- bufsize = bytes_wanted;
- outbuf = kmem_alloc(bufsize, KM_SLEEP);
- odp = (struct dirent64 *)outbuf;
- } else {
- bufsize = bytes_wanted;
- outbuf = NULL;
- odp = (struct dirent64 *)iovp->iov_base;
- }
- eodp = (struct edirent *)odp;
-
- if (ncookies != NULL) {
- /*
- * Minimum entry size is dirent size and 1 byte for a file name.
- */
- ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
- cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
- *cookies = cooks;
- *ncookies = ncooks;
- }
- /*
- * If this VFS supports the system attribute view interface; and
- * we're looking at an extended attribute directory; and we care
- * about normalization conflicts on this vfs; then we must check
- * for normalization conflicts with the sysattr name space.
- */
-#ifdef TODO
- check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
- (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
- (flags & V_RDDIR_ENTFLAGS);
-#else
- check_sysattrs = 0;
-#endif
-
- /*
- * Transform to file-system independent format
- */
- outcount = 0;
- while (outcount < bytes_wanted) {
- ino64_t objnum;
- ushort_t reclen;
- off64_t *next = NULL;
-
- /*
- * Special case `.', `..', and `.zfs'.
- */
- if (offset == 0) {
- (void) strcpy(zap.za_name, ".");
- zap.za_normalization_conflict = 0;
- objnum = zp->z_id;
- type = DT_DIR;
- } else if (offset == 1) {
- (void) strcpy(zap.za_name, "..");
- zap.za_normalization_conflict = 0;
- objnum = parent;
- type = DT_DIR;
- } else if (offset == 2 && zfs_show_ctldir(zp)) {
- (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
- zap.za_normalization_conflict = 0;
- objnum = ZFSCTL_INO_ROOT;
- type = DT_DIR;
- } else {
- /*
- * Grab next entry.
- */
- if (error = zap_cursor_retrieve(&zc, &zap)) {
- if ((*eofp = (error == ENOENT)) != 0)
- break;
- else
- goto update;
- }
-
- if (zap.za_integer_length != 8 ||
- zap.za_num_integers != 1) {
- cmn_err(CE_WARN, "zap_readdir: bad directory "
- "entry, obj = %lld, offset = %lld\n",
- (u_longlong_t)zp->z_id,
- (u_longlong_t)offset);
- error = SET_ERROR(ENXIO);
- goto update;
- }
-
- objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
- /*
- * MacOS X can extract the object type here such as:
- * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
- */
- type = ZFS_DIRENT_TYPE(zap.za_first_integer);
-
- if (check_sysattrs && !zap.za_normalization_conflict) {
-#ifdef TODO
- zap.za_normalization_conflict =
- xattr_sysattr_casechk(zap.za_name);
-#else
- panic("%s:%u: TODO", __func__, __LINE__);
-#endif
- }
- }
-
- if (flags & V_RDDIR_ACCFILTER) {
- /*
- * If we have no access at all, don't include
- * this entry in the returned information
- */
- znode_t *ezp;
- if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
- goto skip_entry;
- if (!zfs_has_access(ezp, cr)) {
- vrele(ZTOV(ezp));
- goto skip_entry;
- }
- vrele(ZTOV(ezp));
- }
-
- if (flags & V_RDDIR_ENTFLAGS)
- reclen = EDIRENT_RECLEN(strlen(zap.za_name));
- else
- reclen = DIRENT64_RECLEN(strlen(zap.za_name));
-
- /*
- * Will this entry fit in the buffer?
- */
- if (outcount + reclen > bufsize) {
- /*
- * Did we manage to fit anything in the buffer?
- */
- if (!outcount) {
- error = SET_ERROR(EINVAL);
- goto update;
- }
- break;
- }
- if (flags & V_RDDIR_ENTFLAGS) {
- /*
- * Add extended flag entry:
- */
- eodp->ed_ino = objnum;
- eodp->ed_reclen = reclen;
- /* NOTE: ed_off is the offset for the *next* entry. */
- next = &eodp->ed_off;
- eodp->ed_eflags = zap.za_normalization_conflict ?
- ED_CASE_CONFLICT : 0;
- (void) strncpy(eodp->ed_name, zap.za_name,
- EDIRENT_NAMELEN(reclen));
- eodp = (edirent_t *)((intptr_t)eodp + reclen);
- } else {
- /*
- * Add normal entry:
- */
- odp->d_ino = objnum;
- odp->d_reclen = reclen;
- odp->d_namlen = strlen(zap.za_name);
- /* NOTE: d_off is the offset for the *next* entry. */
- next = &odp->d_off;
- (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
- odp->d_type = type;
- dirent_terminate(odp);
- odp = (dirent64_t *)((intptr_t)odp + reclen);
- }
- outcount += reclen;
-
- ASSERT(outcount <= bufsize);
-
- /* Prefetch znode */
- if (prefetch)
- dmu_prefetch(os, objnum, 0, 0, 0,
- ZIO_PRIORITY_SYNC_READ);
-
- skip_entry:
- /*
- * Move to the next entry, fill in the previous offset.
- */
- if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
- zap_cursor_advance(&zc);
- offset = zap_cursor_serialize(&zc);
- } else {
- offset += 1;
- }
-
- /* Fill the offset right after advancing the cursor. */
- if (next != NULL)
- *next = offset;
- if (cooks != NULL) {
- *cooks++ = offset;
- ncooks--;
- KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
- }
- }
- zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
-
- /* Subtract unused cookies */
- if (ncookies != NULL)
- *ncookies -= ncooks;
-
- if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
- iovp->iov_base += outcount;
- iovp->iov_len -= outcount;
- uio->uio_resid -= outcount;
- } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
- /*
- * Reset the pointer.
- */
- offset = uio->uio_loffset;
- }
-
-update:
- zap_cursor_fini(&zc);
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
- kmem_free(outbuf, bufsize);
-
- if (error == ENOENT)
- error = 0;
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-
- uio->uio_loffset = offset;
- ZFS_EXIT(zfsvfs);
- if (error != 0 && cookies != NULL) {
- free(*cookies, M_TEMP);
- *cookies = NULL;
- *ncookies = 0;
- }
- return (error);
-}
-
-ulong_t zfs_fsync_sync_cnt = 4;
-
-static int
-zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
-
- if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- }
- return (0);
-}
-
-
-/*
- * Get the requested file attributes and place them in the provided
- * vattr structure.
- *
- * IN: vp - vnode of file.
- * vap - va_mask identifies requested attributes.
- * If AT_XVATTR set, then optional attrs are requested
- * flags - ATTR_NOACLCHECK (CIFS server context)
- * cr - credentials of caller.
- * ct - caller context
- *
- * OUT: vap - attribute values.
- *
- * RETURN: 0 (always succeeds).
- */
-/* ARGSUSED */
-static int
-zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error = 0;
- uint32_t blksize;
- u_longlong_t nblocks;
- uint64_t mtime[2], ctime[2], crtime[2], rdev;
- xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
- xoptattr_t *xoap = NULL;
- boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- sa_bulk_attr_t bulk[4];
- int count = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
- if (vp->v_type == VBLK || vp->v_type == VCHR)
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
- &rdev, 8);
-
- if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
- * Also, if we are the owner don't bother, since owner should
- * always be allowed to read basic attributes of file.
- */
- if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
- (vap->va_uid != crgetuid(cr))) {
- if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
- skipaclchk, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- /*
- * Return all attributes. It's cheaper to provide the answer
- * than to determine whether we were asked the question.
- */
-
- vap->va_type = IFTOVT(zp->z_mode);
- vap->va_mode = zp->z_mode & ~S_IFMT;
-#ifdef illumos
- vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
-#else
- vn_fsid(vp, vap);
-#endif
- vap->va_nodeid = zp->z_id;
- vap->va_nlink = zp->z_links;
- if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
- zp->z_links < ZFS_LINK_MAX)
- vap->va_nlink++;
- vap->va_size = zp->z_size;
-#ifdef illumos
- vap->va_rdev = vp->v_rdev;
-#else
- if (vp->v_type == VBLK || vp->v_type == VCHR)
- vap->va_rdev = zfs_cmpldev(rdev);
-#endif
- vap->va_seq = zp->z_seq;
- vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
- vap->va_filerev = zp->z_seq;
-
- /*
- * Add in any requested optional attributes and the create time.
- * Also set the corresponding bits in the returned attribute bitmap.
- */
- if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
- if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- xoap->xoa_archive =
- ((zp->z_pflags & ZFS_ARCHIVE) != 0);
- XVA_SET_RTN(xvap, XAT_ARCHIVE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- xoap->xoa_readonly =
- ((zp->z_pflags & ZFS_READONLY) != 0);
- XVA_SET_RTN(xvap, XAT_READONLY);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- xoap->xoa_system =
- ((zp->z_pflags & ZFS_SYSTEM) != 0);
- XVA_SET_RTN(xvap, XAT_SYSTEM);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- xoap->xoa_hidden =
- ((zp->z_pflags & ZFS_HIDDEN) != 0);
- XVA_SET_RTN(xvap, XAT_HIDDEN);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- xoap->xoa_nounlink =
- ((zp->z_pflags & ZFS_NOUNLINK) != 0);
- XVA_SET_RTN(xvap, XAT_NOUNLINK);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- xoap->xoa_immutable =
- ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
- XVA_SET_RTN(xvap, XAT_IMMUTABLE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- xoap->xoa_appendonly =
- ((zp->z_pflags & ZFS_APPENDONLY) != 0);
- XVA_SET_RTN(xvap, XAT_APPENDONLY);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- xoap->xoa_nodump =
- ((zp->z_pflags & ZFS_NODUMP) != 0);
- XVA_SET_RTN(xvap, XAT_NODUMP);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- xoap->xoa_opaque =
- ((zp->z_pflags & ZFS_OPAQUE) != 0);
- XVA_SET_RTN(xvap, XAT_OPAQUE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
- xoap->xoa_av_quarantined =
- ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
- XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- xoap->xoa_av_modified =
- ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
- XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
- vp->v_type == VREG) {
- zfs_sa_get_scanstamp(zp, xvap);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
- XVA_SET_RTN(xvap, XAT_REPARSE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
- xoap->xoa_generation = zp->z_gen;
- XVA_SET_RTN(xvap, XAT_GEN);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
- xoap->xoa_offline =
- ((zp->z_pflags & ZFS_OFFLINE) != 0);
- XVA_SET_RTN(xvap, XAT_OFFLINE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
- xoap->xoa_sparse =
- ((zp->z_pflags & ZFS_SPARSE) != 0);
- XVA_SET_RTN(xvap, XAT_SPARSE);
- }
- }
-
- ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, ctime);
- ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
-
-
- sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
- vap->va_blksize = blksize;
- vap->va_bytes = nblocks << 9; /* nblocks * 512 */
-
- if (zp->z_blksz == 0) {
- /*
- * Block size hasn't been set; suggest maximal I/O transfers.
- */
- vap->va_blksize = zfsvfs->z_max_blksz;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Set the file attributes to the values contained in the
- * vattr structure.
- *
- * IN: vp - vnode of file to be modified.
- * vap - new attribute values.
- * If AT_XVATTR set, then optional attrs are being set
- * flags - ATTR_UTIME set if non-default time values provided.
- * - ATTR_NOACLCHECK (CIFS context only).
- * cr - credentials of caller.
- * ct - caller context
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime updated, mtime updated if size changed.
- */
-/* ARGSUSED */
-static int
-zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog;
- dmu_tx_t *tx;
- vattr_t oldva;
- xvattr_t tmpxvattr;
- uint_t mask = vap->va_mask;
- uint_t saved_mask = 0;
- uint64_t saved_mode;
- int trim_mask = 0;
- uint64_t new_mode;
- uint64_t new_uid, new_gid;
- uint64_t xattr_obj;
- uint64_t mtime[2], ctime[2];
- znode_t *attrzp;
- int need_policy = FALSE;
- int err, err2;
- zfs_fuid_info_t *fuidp = NULL;
- xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
- xoptattr_t *xoap;
- zfs_acl_t *aclp;
- boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- boolean_t fuid_dirtied = B_FALSE;
- sa_bulk_attr_t bulk[7], xattr_bulk[7];
- int count = 0, xattr_count = 0;
-
- if (mask == 0)
- return (0);
-
- if (mask & AT_NOSET)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- zilog = zfsvfs->z_log;
-
- /*
- * Make sure that if we have ephemeral uid/gid or xvattr specified
- * that file system is at proper version level
- */
-
- if (zfsvfs->z_use_fuids == B_FALSE &&
- (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
- ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
- (mask & AT_XVATTR))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if (mask & AT_SIZE && vp->v_type == VDIR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EISDIR));
- }
-
- if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * If this is an xvattr_t, then get a pointer to the structure of
- * optional attributes. If this is NULL, then we have a vattr_t.
- */
- xoap = xva_getxoptattr(xvap);
-
- xva_init(&tmpxvattr);
-
- /*
- * Immutable files can only alter immutable bit and atime
- */
- if ((zp->z_pflags & ZFS_IMMUTABLE) &&
- ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
- ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- /*
- * Note: ZFS_READONLY is handled in zfs_zaccess_common.
- */
-
- /*
- * Verify timestamps doesn't overflow 32 bits.
- * ZFS can handle large timestamps, but 32bit syscalls can't
- * handle times greater than 2039. This check should be removed
- * once large timestamps are fully supported.
- */
- if (mask & (AT_ATIME | AT_MTIME)) {
- if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
- ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EOVERFLOW));
- }
- }
- if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
- TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EOVERFLOW));
- }
-
- attrzp = NULL;
- aclp = NULL;
-
- /* Can this be moved to before the top label? */
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- /*
- * First validate permissions
- */
-
- if (mask & AT_SIZE) {
- /*
- * XXX - Note, we are not providing any open
- * mode flags here (like FNDELAY), so we may
- * block if there are locks present... this
- * should be addressed in openat().
- */
- /* XXX - would it be OK to generate a log record here? */
- err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- if (mask & (AT_ATIME|AT_MTIME) ||
- ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
- XVA_ISSET_REQ(xvap, XAT_READONLY) ||
- XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
- XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
- XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
- XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
- XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
- need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
- skipaclchk, cr);
- }
-
- if (mask & (AT_UID|AT_GID)) {
- int idmask = (mask & (AT_UID|AT_GID));
- int take_owner;
- int take_group;
-
- /*
- * NOTE: even if a new mode is being set,
- * we may clear S_ISUID/S_ISGID bits.
- */
-
- if (!(mask & AT_MODE))
- vap->va_mode = zp->z_mode;
-
- /*
- * Take ownership or chgrp to group we are a member of
- */
-
- take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
- take_group = (mask & AT_GID) &&
- zfs_groupmember(zfsvfs, vap->va_gid, cr);
-
- /*
- * If both AT_UID and AT_GID are set then take_owner and
- * take_group must both be set in order to allow taking
- * ownership.
- *
- * Otherwise, send the check through secpolicy_vnode_setattr()
- *
- */
-
- if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
- ((idmask == AT_UID) && take_owner) ||
- ((idmask == AT_GID) && take_group)) {
- if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
- skipaclchk, cr) == 0) {
- /*
- * Remove setuid/setgid for non-privileged users
- */
- secpolicy_setid_clear(vap, vp, cr);
- trim_mask = (mask & (AT_UID|AT_GID));
- } else {
- need_policy = TRUE;
- }
- } else {
- need_policy = TRUE;
- }
- }
-
- oldva.va_mode = zp->z_mode;
- zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
- if (mask & AT_XVATTR) {
- /*
- * Update xvattr mask to include only those attributes
- * that are actually changing.
- *
- * the bits will be restored prior to actually setting
- * the attributes so the caller thinks they were set.
- */
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- if (xoap->xoa_appendonly !=
- ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_APPENDONLY);
- XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- if (xoap->xoa_nounlink !=
- ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_NOUNLINK);
- XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- if (xoap->xoa_immutable !=
- ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
- XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- if (xoap->xoa_nodump !=
- ((zp->z_pflags & ZFS_NODUMP) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_NODUMP);
- XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- if (xoap->xoa_av_modified !=
- ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
- XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
- if ((vp->v_type != VREG &&
- xoap->xoa_av_quarantined) ||
- xoap->xoa_av_quarantined !=
- ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
- XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if (need_policy == FALSE &&
- (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
- XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
- need_policy = TRUE;
- }
- }
-
- if (mask & AT_MODE) {
- if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
- err = secpolicy_setid_setsticky_clear(vp, vap,
- &oldva, cr);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- trim_mask |= AT_MODE;
- } else {
- need_policy = TRUE;
- }
- }
-
- if (need_policy) {
- /*
- * If trim_mask is set then take ownership
- * has been granted or write_acl is present and user
- * has the ability to modify mode. In that case remove
- * UID|GID and or MODE from mask so that
- * secpolicy_vnode_setattr() doesn't revoke it.
- */
-
- if (trim_mask) {
- saved_mask = vap->va_mask;
- vap->va_mask &= ~trim_mask;
- if (trim_mask & AT_MODE) {
- /*
- * Save the mode, as secpolicy_vnode_setattr()
- * will overwrite it with ova.va_mode.
- */
- saved_mode = vap->va_mode;
- }
- }
- err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
- (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- if (trim_mask) {
- vap->va_mask |= saved_mask;
- if (trim_mask & AT_MODE) {
- /*
- * Recover the mode after
- * secpolicy_vnode_setattr().
- */
- vap->va_mode = saved_mode;
- }
- }
- }
-
- /*
- * secpolicy_vnode_setattr, or take ownership may have
- * changed va_mask
- */
- mask = vap->va_mask;
-
- if ((mask & (AT_UID | AT_GID))) {
- err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
-
- if (err == 0 && xattr_obj) {
- err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
- if (err == 0) {
- err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
- if (err != 0)
- vrele(ZTOV(attrzp));
- }
- if (err)
- goto out2;
- }
- if (mask & AT_UID) {
- new_uid = zfs_fuid_create(zfsvfs,
- (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
- if (new_uid != zp->z_uid &&
- zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
- if (attrzp)
- vput(ZTOV(attrzp));
- err = SET_ERROR(EDQUOT);
- goto out2;
- }
- }
-
- if (mask & AT_GID) {
- new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
- cr, ZFS_GROUP, &fuidp);
- if (new_gid != zp->z_gid &&
- zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
- if (attrzp)
- vput(ZTOV(attrzp));
- err = SET_ERROR(EDQUOT);
- goto out2;
- }
- }
- }
- tx = dmu_tx_create(zfsvfs->z_os);
-
- if (mask & AT_MODE) {
- uint64_t pmode = zp->z_mode;
- uint64_t acl_obj;
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
-
- if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
- !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
- err = SET_ERROR(EPERM);
- goto out;
- }
-
- if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
- goto out;
-
- if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
- /*
- * Are we upgrading ACL from old V0 format
- * to V1 format?
- */
- if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
- zfs_znode_acl_version(zp) ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx, acl_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx, acl_obj, 0,
- aclp->z_acl_bytes);
- }
- } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- }
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- } else {
- if ((mask & AT_XVATTR) &&
- XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- else
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- }
-
- if (attrzp) {
- dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
- }
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
-
- zfs_sa_upgrade_txholds(tx, zp);
-
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err)
- goto out;
-
- count = 0;
- /*
- * Set each attribute requested.
- * We group settings according to the locks they need to acquire.
- *
- * Note: you cannot set ctime directly, although it will be
- * updated as a side-effect of calling this function.
- */
-
- if (mask & (AT_UID|AT_GID|AT_MODE))
- mutex_enter(&zp->z_acl_lock);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
-
- if (attrzp) {
- if (mask & (AT_UID|AT_GID|AT_MODE))
- mutex_enter(&attrzp->z_acl_lock);
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
- sizeof (attrzp->z_pflags));
- }
-
- if (mask & (AT_UID|AT_GID)) {
-
- if (mask & AT_UID) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &new_uid, sizeof (new_uid));
- zp->z_uid = new_uid;
- if (attrzp) {
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_UID(zfsvfs), NULL, &new_uid,
- sizeof (new_uid));
- attrzp->z_uid = new_uid;
- }
- }
-
- if (mask & AT_GID) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
- NULL, &new_gid, sizeof (new_gid));
- zp->z_gid = new_gid;
- if (attrzp) {
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_GID(zfsvfs), NULL, &new_gid,
- sizeof (new_gid));
- attrzp->z_gid = new_gid;
- }
- }
- if (!(mask & AT_MODE)) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
- NULL, &new_mode, sizeof (new_mode));
- new_mode = zp->z_mode;
- }
- err = zfs_acl_chown_setattr(zp);
- ASSERT(err == 0);
- if (attrzp) {
- err = zfs_acl_chown_setattr(attrzp);
- ASSERT(err == 0);
- }
- }
-
- if (mask & AT_MODE) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
- &new_mode, sizeof (new_mode));
- zp->z_mode = new_mode;
- ASSERT3U((uintptr_t)aclp, !=, 0);
- err = zfs_aclset_common(zp, aclp, cr, tx);
- ASSERT0(err);
- if (zp->z_acl_cached)
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = aclp;
- aclp = NULL;
- }
-
-
- if (mask & AT_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
- &zp->z_atime, sizeof (zp->z_atime));
- }
-
- if (mask & AT_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- mtime, sizeof (mtime));
- }
-
- /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
- if (mask & AT_SIZE && !(mask & AT_MTIME)) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
- NULL, mtime, sizeof (mtime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, sizeof (ctime));
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
- B_TRUE);
- } else if (mask != 0) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, sizeof (ctime));
- zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
- B_TRUE);
- if (attrzp) {
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, sizeof (ctime));
- zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
- mtime, ctime, B_TRUE);
- }
- }
- /*
- * Do this after setting timestamps to prevent timestamp
- * update from toggling bit
- */
-
- if (xoap && (mask & AT_XVATTR)) {
-
- if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
- xoap->xoa_createtime = vap->va_birthtime;
- /*
- * restore trimmed off masks
- * so that return masks can be set for caller.
- */
-
- if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
- XVA_SET_REQ(xvap, XAT_APPENDONLY);
- }
- if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
- XVA_SET_REQ(xvap, XAT_NOUNLINK);
- }
- if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
- XVA_SET_REQ(xvap, XAT_IMMUTABLE);
- }
- if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
- XVA_SET_REQ(xvap, XAT_NODUMP);
- }
- if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
- XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
- }
- if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
- XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
- ASSERT(vp->v_type == VREG);
-
- zfs_xvattr_set(zp, xvap, tx);
- }
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- if (mask != 0)
- zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
-
- if (mask & (AT_UID|AT_GID|AT_MODE))
- mutex_exit(&zp->z_acl_lock);
-
- if (attrzp) {
- if (mask & (AT_UID|AT_GID|AT_MODE))
- mutex_exit(&attrzp->z_acl_lock);
- }
-out:
- if (err == 0 && attrzp) {
- err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
- xattr_count, tx);
- ASSERT(err2 == 0);
- }
-
- if (attrzp)
- vput(ZTOV(attrzp));
-
- if (aclp)
- zfs_acl_free(aclp);
-
- if (fuidp) {
- zfs_fuid_info_free(fuidp);
- fuidp = NULL;
- }
-
- if (err) {
- dmu_tx_abort(tx);
- } else {
- err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- dmu_tx_commit(tx);
- }
-
-out2:
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * We acquire all but fdvp locks using non-blocking acquisitions. If we
- * fail to acquire any lock in the path we will drop all held locks,
- * acquire the new lock in a blocking fashion, and then release it and
- * restart the rename. This acquire/release step ensures that we do not
- * spin on a lock waiting for release. On error release all vnode locks
- * and decrement references the way tmpfs_rename() would do.
- */
-static int
-zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
- struct vnode *tdvp, struct vnode **tvpp,
- const struct componentname *scnp, const struct componentname *tcnp)
-{
- zfsvfs_t *zfsvfs;
- struct vnode *nvp, *svp, *tvp;
- znode_t *sdzp, *tdzp, *szp, *tzp;
- const char *snm = scnp->cn_nameptr;
- const char *tnm = tcnp->cn_nameptr;
- int error;
-
- VOP_UNLOCK(tdvp);
- if (*tvpp != NULL && *tvpp != tdvp)
- VOP_UNLOCK(*tvpp);
-
-relock:
- error = vn_lock(sdvp, LK_EXCLUSIVE);
- if (error)
- goto out;
- sdzp = VTOZ(sdvp);
-
- error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
- if (error != 0) {
- VOP_UNLOCK(sdvp);
- if (error != EBUSY)
- goto out;
- error = vn_lock(tdvp, LK_EXCLUSIVE);
- if (error)
- goto out;
- VOP_UNLOCK(tdvp);
- goto relock;
- }
- tdzp = VTOZ(tdvp);
-
- /*
- * Before using sdzp and tdzp we must ensure that they are live.
- * As a porting legacy from illumos we have two things to worry
- * about. One is typical for FreeBSD and it is that the vnode is
- * not reclaimed (doomed). The other is that the znode is live.
- * The current code can invalidate the znode without acquiring the
- * corresponding vnode lock if the object represented by the znode
- * and vnode is no longer valid after a rollback or receive operation.
- * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
- * that protects the znodes from the invalidation.
- */
- zfsvfs = sdzp->z_zfsvfs;
- ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
- ZFS_ENTER(zfsvfs);
-
- /*
- * We can not use ZFS_VERIFY_ZP() here because it could directly return
- * bypassing the cleanup code in the case of an error.
- */
- if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
- ZFS_EXIT(zfsvfs);
- VOP_UNLOCK(sdvp);
- VOP_UNLOCK(tdvp);
- error = SET_ERROR(EIO);
- goto out;
- }
-
- /*
- * Re-resolve svp to be certain it still exists and fetch the
- * correct vnode.
- */
- error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
- if (error != 0) {
- /* Source entry invalid or not there. */
- ZFS_EXIT(zfsvfs);
- VOP_UNLOCK(sdvp);
- VOP_UNLOCK(tdvp);
- if ((scnp->cn_flags & ISDOTDOT) != 0 ||
- (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
- error = SET_ERROR(EINVAL);
- goto out;
- }
- svp = ZTOV(szp);
-
- /*
- * Re-resolve tvp, if it disappeared we just carry on.
- */
- error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- VOP_UNLOCK(sdvp);
- VOP_UNLOCK(tdvp);
- vrele(svp);
- if ((tcnp->cn_flags & ISDOTDOT) != 0)
- error = SET_ERROR(EINVAL);
- goto out;
- }
- if (tzp != NULL)
- tvp = ZTOV(tzp);
- else
- tvp = NULL;
-
- /*
- * At present the vnode locks must be acquired before z_teardown_lock,
- * although it would be more logical to use the opposite order.
- */
- ZFS_EXIT(zfsvfs);
-
- /*
- * Now try acquire locks on svp and tvp.
- */
- nvp = svp;
- error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
- if (error != 0) {
- VOP_UNLOCK(sdvp);
- VOP_UNLOCK(tdvp);
- if (tvp != NULL)
- vrele(tvp);
- if (error != EBUSY) {
- vrele(nvp);
- goto out;
- }
- error = vn_lock(nvp, LK_EXCLUSIVE);
- if (error != 0) {
- vrele(nvp);
- goto out;
- }
- VOP_UNLOCK(nvp);
- /*
- * Concurrent rename race.
- * XXX ?
- */
- if (nvp == tdvp) {
- vrele(nvp);
- error = SET_ERROR(EINVAL);
- goto out;
- }
- vrele(*svpp);
- *svpp = nvp;
- goto relock;
- }
- vrele(*svpp);
- *svpp = nvp;
-
- if (*tvpp != NULL)
- vrele(*tvpp);
- *tvpp = NULL;
- if (tvp != NULL) {
- nvp = tvp;
- error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
- if (error != 0) {
- VOP_UNLOCK(sdvp);
- VOP_UNLOCK(tdvp);
- VOP_UNLOCK(*svpp);
- if (error != EBUSY) {
- vrele(nvp);
- goto out;
- }
- error = vn_lock(nvp, LK_EXCLUSIVE);
- if (error != 0) {
- vrele(nvp);
- goto out;
- }
- vput(nvp);
- goto relock;
- }
- *tvpp = nvp;
- }
-
- return (0);
-
-out:
- return (error);
-}
-
-/*
- * Note that we must use VRELE_ASYNC in this function as it walks
- * up the directory tree and vrele may need to acquire an exclusive
- * lock if a last reference to a vnode is dropped.
- */
-static int
-zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
-{
- zfsvfs_t *zfsvfs;
- znode_t *zp, *zp1;
- uint64_t parent;
- int error;
-
- zfsvfs = tdzp->z_zfsvfs;
- if (tdzp == szp)
- return (SET_ERROR(EINVAL));
- if (tdzp == sdzp)
- return (0);
- if (tdzp->z_id == zfsvfs->z_root)
- return (0);
- zp = tdzp;
- for (;;) {
- ASSERT(!zp->z_unlinked);
- if ((error = sa_lookup(zp->z_sa_hdl,
- SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
- break;
-
- if (parent == szp->z_id) {
- error = SET_ERROR(EINVAL);
- break;
- }
- if (parent == zfsvfs->z_root)
- break;
- if (parent == sdzp->z_id)
- break;
-
- error = zfs_zget(zfsvfs, parent, &zp1);
- if (error != 0)
- break;
-
- if (zp != tdzp)
- VN_RELE_ASYNC(ZTOV(zp),
- dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
- zp = zp1;
- }
-
- if (error == ENOTDIR)
- panic("checkpath: .. not a directory\n");
- if (zp != tdzp)
- VN_RELE_ASYNC(ZTOV(zp),
- dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
- return (error);
-}
-
-/*
- * Move an entry from the provided source directory to the target
- * directory. Change the entry name as indicated.
- *
- * IN: sdvp - Source directory containing the "old entry".
- * snm - Old entry name.
- * tdvp - Target directory to contain the "new entry".
- * tnm - New entry name.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * sdvp,tdvp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
- vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
- cred_t *cr)
-{
- zfsvfs_t *zfsvfs;
- znode_t *sdzp, *tdzp, *szp, *tzp;
- zilog_t *zilog = NULL;
- dmu_tx_t *tx;
- char *snm = scnp->cn_nameptr;
- char *tnm = tcnp->cn_nameptr;
- int error = 0;
- bool want_seqc_end = false;
-
- /* Reject renames across filesystems. */
- if ((*svpp)->v_mount != tdvp->v_mount ||
- ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
- error = SET_ERROR(EXDEV);
- goto out;
- }
-
- if (zfsctl_is_node(tdvp)) {
- error = SET_ERROR(EXDEV);
- goto out;
- }
-
- /*
- * Lock all four vnodes to ensure safety and semantics of renaming.
- */
- error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
- if (error != 0) {
- /* no vnodes are locked in the case of error here */
- return (error);
- }
-
- tdzp = VTOZ(tdvp);
- sdzp = VTOZ(sdvp);
- zfsvfs = tdzp->z_zfsvfs;
- zilog = zfsvfs->z_log;
-
- /*
- * After we re-enter ZFS_ENTER() we will have to revalidate all
- * znodes involved.
- */
- ZFS_ENTER(zfsvfs);
-
- if (zfsvfs->z_utf8 && u8_validate(tnm,
- strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- error = SET_ERROR(EILSEQ);
- goto unlockout;
- }
-
- /* If source and target are the same file, there is nothing to do. */
- if ((*svpp) == (*tvpp)) {
- error = 0;
- goto unlockout;
- }
-
- if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
- ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
- (*tvpp)->v_mountedhere != NULL)) {
- error = SET_ERROR(EXDEV);
- goto unlockout;
- }
-
- /*
- * We can not use ZFS_VERIFY_ZP() here because it could directly return
- * bypassing the cleanup code in the case of an error.
- */
- if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
- error = SET_ERROR(EIO);
- goto unlockout;
- }
-
- szp = VTOZ(*svpp);
- tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
- if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
- error = SET_ERROR(EIO);
- goto unlockout;
- }
-
- /*
- * This is to prevent the creation of links into attribute space
- * by renaming a linked file into/outof an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
- */
- if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
- error = SET_ERROR(EINVAL);
- goto unlockout;
- }
-
- /*
- * Must have write access at the source to remove the old entry
- * and write access at the target to create the new entry.
- * Note that if target and source are the same, this can be
- * done in a single check.
- */
- if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
- goto unlockout;
-
- if ((*svpp)->v_type == VDIR) {
- /*
- * Avoid ".", "..", and aliases of "." for obvious reasons.
- */
- if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
- sdzp == szp ||
- (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
- error = EINVAL;
- goto unlockout;
- }
-
- /*
- * Check to make sure rename is valid.
- * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
- */
- if (error = zfs_rename_check(szp, sdzp, tdzp))
- goto unlockout;
- }
-
- /*
- * Does target exist?
- */
- if (tzp) {
- /*
- * Source and target must be the same type.
- */
- if ((*svpp)->v_type == VDIR) {
- if ((*tvpp)->v_type != VDIR) {
- error = SET_ERROR(ENOTDIR);
- goto unlockout;
- } else {
- cache_purge(tdvp);
- if (sdvp != tdvp)
- cache_purge(sdvp);
- }
- } else {
- if ((*tvpp)->v_type == VDIR) {
- error = SET_ERROR(EISDIR);
- goto unlockout;
- }
- }
- }
-
- vn_seqc_write_begin(*svpp);
- vn_seqc_write_begin(sdvp);
- if (*tvpp != NULL)
- vn_seqc_write_begin(*tvpp);
- if (tdvp != *tvpp)
- vn_seqc_write_begin(tdvp);
- want_seqc_end = true;
-
- vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
- if (tzp)
- vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
-
- /*
- * notify the target directory if it is not the same
- * as source directory.
- */
- if (tdvp != sdvp) {
- vnevent_rename_dest_dir(tdvp, ct);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
- dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp) {
- dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, tdzp);
- }
- if (tzp) {
- dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, tzp);
- }
-
- zfs_sa_upgrade_txholds(tx, szp);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- goto unlockout;
- }
-
-
- if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
-
- if (error == 0) {
- error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
- if (error == 0) {
- szp->z_pflags |= ZFS_AV_MODIFIED;
-
- error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
- (void *)&szp->z_pflags, sizeof (uint64_t), tx);
- ASSERT0(error);
-
- error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
- NULL);
- if (error == 0) {
- zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
- snm, tdzp, tnm, szp);
-
- /*
- * Update path information for the target vnode
- */
- vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
- } else {
- /*
- * At this point, we have successfully created
- * the target name, but have failed to remove
- * the source name. Since the create was done
- * with the ZRENAMING flag, there are
- * complications; for one, the link count is
- * wrong. The easiest way to deal with this
- * is to remove the newly created target, and
- * return the original error. This must
- * succeed; fortunately, it is very unlikely to
- * fail, since we just created it.
- */
- VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
- ZRENAMING, NULL), ==, 0);
- }
- }
- if (error == 0) {
- cache_purge(*svpp);
- if (*tvpp != NULL)
- cache_purge(*tvpp);
- cache_purge_negative(tdvp);
- }
- }
-
- dmu_tx_commit(tx);
-
-unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
- ZFS_EXIT(zfsvfs);
- if (want_seqc_end) {
- vn_seqc_write_end(*svpp);
- vn_seqc_write_end(sdvp);
- if (*tvpp != NULL)
- vn_seqc_write_end(*tvpp);
- if (tdvp != *tvpp)
- vn_seqc_write_end(tdvp);
- want_seqc_end = false;
- }
- VOP_UNLOCK(*svpp);
- VOP_UNLOCK(sdvp);
-
-out: /* original two vnodes are locked */
- MPASS(!want_seqc_end);
- if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- if (*tvpp != NULL)
- VOP_UNLOCK(*tvpp);
- if (tdvp != *tvpp)
- VOP_UNLOCK(tdvp);
- return (error);
-}
-
-/*
- * Insert the indicated symbolic reference entry into the directory.
- *
- * IN: dvp - Directory to contain new symbolic link.
- * link - Name for new symlink entry.
- * vap - Attributes of new entry.
- * cr - credentials of caller.
- * ct - caller context
- * flags - case flags
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
- cred_t *cr, kthread_t *td)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- uint64_t len = strlen(link);
- int error;
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
- uint64_t txtype = TX_SYMLINK;
- int flags = 0;
-
- ASSERT(vap->va_type == VLNK);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
- NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
- if (len > MAXPATHLEN) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENAMETOOLONG));
- }
-
- if ((error = zfs_acl_ids_create(dzp, 0,
- vap, cr, NULL, &acl_ids)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
- if (error) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EDQUOT));
- }
-
- getnewvnode_reserve();
- tx = dmu_tx_create(zfsvfs->z_os);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE + len);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- acl_ids.z_aclp->z_acl_bytes);
- }
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- getnewvnode_drop_reserve();
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Create a new object for the symlink.
- * for version 4 ZPL datsets the symlink will be an SA attribute
- */
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- if (zp->z_is_sa)
- error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
- link, len, tx);
- else
- zfs_sa_symlink(zp, link, len, tx);
-
- zp->z_size = len;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
- &zp->z_size, sizeof (zp->z_size), tx);
- /*
- * Insert the new object into the directory.
- */
- (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
-
- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
- *vpp = ZTOV(zp);
-
- zfs_acl_ids_free(&acl_ids);
-
- dmu_tx_commit(tx);
-
- getnewvnode_drop_reserve();
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Return, in the buffer contained in the provided uio structure,
- * the symbolic path referred to by vp.
- *
- * IN: vp - vnode of symbolic link.
- * uio - structure to contain the link path.
- * cr - credentials of caller.
- * ct - caller context
- *
- * OUT: uio - structure containing the link path.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (zp->z_is_sa)
- error = sa_lookup_uio(zp->z_sa_hdl,
- SA_ZPL_SYMLINK(zfsvfs), uio);
- else
- error = zfs_sa_readlink(zp, uio);
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Insert a new entry into directory tdvp referencing svp.
- *
- * IN: tdvp - Directory to contain new entry.
- * svp - vnode of new entry.
- * name - name of new entry.
- * cr - credentials of caller.
- * ct - caller context
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * tdvp - ctime|mtime updated
- * svp - ctime updated
- */
-/* ARGSUSED */
-static int
-zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
- caller_context_t *ct, int flags)
-{
- znode_t *dzp = VTOZ(tdvp);
- znode_t *tzp, *szp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog;
- dmu_tx_t *tx;
- int error;
- uint64_t parent;
- uid_t owner;
-
- ASSERT(tdvp->v_type == VDIR);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- /*
- * POSIX dictates that we return EPERM here.
- * Better choices include ENOTSUP or EISDIR.
- */
- if (svp->v_type == VDIR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- szp = VTOZ(svp);
- ZFS_VERIFY_ZP(szp);
-
- if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- /* Prevent links to .zfs/shares files */
-
- if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
- &parent, sizeof (uint64_t))) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- if (parent == zfsvfs->z_shares_dir) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if (zfsvfs->z_utf8 && u8_validate(name,
- strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
- /*
- * We do not support links between attributes and non-attributes
- * because of the potential security risk of creating links
- * into "normal" file space in order to circumvent restrictions
- * imposed in attribute space.
- */
- if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
-
- owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
- if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- zfs_sa_upgrade_txholds(tx, szp);
- zfs_sa_upgrade_txholds(tx, dzp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- error = zfs_link_create(dzp, name, szp, tx, 0);
-
- if (error == 0) {
- uint64_t txtype = TX_LINK;
- zfs_log_link(zilog, tx, txtype, dzp, szp, name);
- }
-
- dmu_tx_commit(tx);
-
- if (error == 0) {
- vnevent_link(svp, ct);
- }
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-
-/*ARGSUSED*/
-void
-zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs);
- if (zp->z_sa_hdl == NULL) {
- /*
- * The fs has been unmounted, or we did a
- * suspend/resume and this file no longer exists.
- */
- ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
- vrecycle(vp);
- return;
- }
-
- if (zp->z_unlinked) {
- /*
- * Fast path to recycle a vnode of a removed file.
- */
- ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
- vrecycle(vp);
- return;
- }
-
- if (zp->z_atime_dirty && zp->z_unlinked == 0) {
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
- (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
- zp->z_atime_dirty = 0;
- dmu_tx_commit(tx);
- }
- }
- ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
-}
-
-
-CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
-CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
-
-/*ARGSUSED*/
-static int
-zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint32_t gen;
- uint64_t gen64;
- uint64_t object = zp->z_id;
- zfid_short_t *zfid;
- int size, i, error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
- &gen64, sizeof (uint64_t))) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- gen = (uint32_t)gen64;
-
- size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
-
-#ifdef illumos
- if (fidp->fid_len < size) {
- fidp->fid_len = size;
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOSPC));
- }
-#else
- fidp->fid_len = size;
-#endif
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = size;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* Must have a non-zero generation number to distinguish from .zfs */
- if (gen == 0)
- gen = 1;
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
-
- if (size == LONG_FID_LEN) {
- uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
- zfid_long_t *zlfid;
-
- zlfid = (zfid_long_t *)fidp;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
-
- /* XXX - this should be the generation number for the objset */
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- zlfid->zf_setgen[i] = 0;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-static int
-zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp, *xzp;
- zfsvfs_t *zfsvfs;
- int error;
-
- switch (cmd) {
- case _PC_LINK_MAX:
- *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
- return (0);
-
- case _PC_FILESIZEBITS:
- *valp = 64;
- return (0);
-#ifdef illumos
- case _PC_XATTR_EXISTS:
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- *valp = 0;
- error = zfs_dirent_lookup(zp, "", &xzp,
- ZXATTR | ZEXISTS | ZSHARED);
- if (error == 0) {
- if (!zfs_dirempty(xzp))
- *valp = 1;
- vrele(ZTOV(xzp));
- } else if (error == ENOENT) {
- /*
- * If there aren't extended attributes, it's the
- * same as having zero of them.
- */
- error = 0;
- }
- ZFS_EXIT(zfsvfs);
- return (error);
-
- case _PC_SATTR_ENABLED:
- case _PC_SATTR_EXISTS:
- *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
- (vp->v_type == VREG || vp->v_type == VDIR);
- return (0);
-
- case _PC_ACCESS_FILTERING:
- *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
- vp->v_type == VDIR;
- return (0);
-
- case _PC_ACL_ENABLED:
- *valp = _ACL_ACE_ENABLED;
- return (0);
-#endif /* illumos */
- case _PC_MIN_HOLE_SIZE:
- *valp = (int)SPA_MINBLOCKSIZE;
- return (0);
-#ifdef illumos
- case _PC_TIMESTAMP_RESOLUTION:
- /* nanosecond timestamp resolution */
- *valp = 1L;
- return (0);
-#endif
- case _PC_ACL_EXTENDED:
- *valp = 0;
- return (0);
-
- case _PC_ACL_NFS4:
- *valp = 1;
- return (0);
-
- case _PC_ACL_PATH_MAX:
- *valp = ACL_MAX_ENTRIES;
- return (0);
-
- default:
- return (EOPNOTSUPP);
- }
-}
-
-/*ARGSUSED*/
-static int
-zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- error = zfs_getacl(zp, vsecp, skipaclchk, cr);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- zilog_t *zilog = zfsvfs->z_log;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- error = zfs_setacl(zp, vsecp, skipaclchk, cr);
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-static int
-zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
- int *rahead)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zp->z_zfsvfs->z_os;
- locked_range_t *lr;
- vm_object_t object;
- off_t start, end, obj_size;
- uint_t blksz;
- int pgsin_b, pgsin_a;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- start = IDX_TO_OFF(ma[0]->pindex);
- end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
-
- /*
- * Try to lock a range covering all required and optional pages, to
- * handle the case of the block size growing. It is not safe to block
- * on the range lock since the owner may be waiting for the fault page
- * to be unbusied.
- */
- for (;;) {
- blksz = zp->z_blksz;
- lr = rangelock_tryenter(&zp->z_rangelock,
- rounddown(start, blksz),
- roundup(end, blksz) - rounddown(start, blksz), RL_READER);
- if (lr == NULL) {
- if (rahead != NULL) {
- *rahead = 0;
- rahead = NULL;
- }
- if (rbehind != NULL) {
- *rbehind = 0;
- rbehind = NULL;
- }
- break;
- }
- if (blksz == zp->z_blksz)
- break;
- rangelock_exit(lr);
- }
-
- object = ma[0]->object;
- zfs_vmobject_wlock(object);
- obj_size = object->un_pager.vnp.vnp_size;
- zfs_vmobject_wunlock(object);
- if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
- if (lr != NULL)
- rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (zfs_vm_pagerret_bad);
- }
-
- pgsin_b = 0;
- if (rbehind != NULL) {
- pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
- pgsin_b = MIN(*rbehind, pgsin_b);
- }
-
- pgsin_a = 0;
- if (rahead != NULL) {
- pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
- if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
- pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
- pgsin_a = MIN(*rahead, pgsin_a);
- }
-
- /*
- * NB: we need to pass the exact byte size of the data that we expect
- * to read after accounting for the file size. This is required because
- * ZFS will panic if we request DMU to read beyond the end of the last
- * allocated block.
- */
- error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
- MIN(end, obj_size) - (end - PAGE_SIZE));
-
- if (lr != NULL)
- rangelock_exit(lr);
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
-
- if (error != 0)
- return (zfs_vm_pagerret_error);
-
- VM_CNT_INC(v_vnodein);
- VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
- if (rbehind != NULL)
- *rbehind = pgsin_b;
- if (rahead != NULL)
- *rahead = pgsin_a;
- return (zfs_vm_pagerret_ok);
-}
-
-static int
-zfs_freebsd_getpages(ap)
- struct vop_getpages_args /* {
- struct vnode *a_vp;
- vm_page_t *a_m;
- int a_count;
- int *a_rbehind;
- int *a_rahead;
- } */ *ap;
-{
-
- return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
- ap->a_rahead));
-}
-
-static int
-zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
- int *rtvals)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- locked_range_t *lr;
- dmu_tx_t *tx;
- struct sf_buf *sf;
- vm_object_t object;
- vm_page_t m;
- caddr_t va;
- size_t tocopy;
- size_t lo_len;
- vm_ooffset_t lo_off;
- vm_ooffset_t off;
- uint_t blksz;
- int ncount;
- int pcount;
- int err;
- int i;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- object = vp->v_object;
- pcount = btoc(len);
- ncount = pcount;
-
- KASSERT(ma[0]->object == object, ("mismatching object"));
- KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
-
- for (i = 0; i < pcount; i++)
- rtvals[i] = zfs_vm_pagerret_error;
-
- off = IDX_TO_OFF(ma[0]->pindex);
- blksz = zp->z_blksz;
- lo_off = rounddown(off, blksz);
- lo_len = roundup(len + (off - lo_off), blksz);
- lr = rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
-
- zfs_vmobject_wlock(object);
- if (len + off > object->un_pager.vnp.vnp_size) {
- if (object->un_pager.vnp.vnp_size > off) {
- int pgoff;
-
- len = object->un_pager.vnp.vnp_size - off;
- ncount = btoc(len);
- if ((pgoff = (int)len & PAGE_MASK) != 0) {
- /*
- * If the object is locked and the following
- * conditions hold, then the page's dirty
- * field cannot be concurrently changed by a
- * pmap operation.
- */
- m = ma[ncount - 1];
- vm_page_assert_sbusied(m);
- KASSERT(!pmap_page_is_write_mapped(m),
- ("zfs_putpages: page %p is not read-only", m));
- vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
- pgoff);
- }
- } else {
- len = 0;
- ncount = 0;
- }
- if (ncount < pcount) {
- for (i = ncount; i < pcount; i++) {
- rtvals[i] = zfs_vm_pagerret_bad;
- }
- }
- }
- zfs_vmobject_wunlock(object);
-
- if (ncount == 0)
- goto out;
-
- if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
- zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
- goto out;
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, zp->z_id, off, len);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- goto out;
- }
-
- if (zp->z_blksz < PAGE_SIZE) {
- for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
- tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
- va = zfs_map_page(ma[i], &sf);
- dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
- zfs_unmap_page(sf);
- }
- } else {
- err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
- }
-
- if (err == 0) {
- uint64_t mtime[2], ctime[2];
- sa_bulk_attr_t bulk[3];
- int count = 0;
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
- B_TRUE);
- err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT0(err);
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
-
- zfs_vmobject_wlock(object);
- for (i = 0; i < ncount; i++) {
- rtvals[i] = zfs_vm_pagerret_ok;
- vm_page_undirty(ma[i]);
- }
- zfs_vmobject_wunlock(object);
- VM_CNT_INC(v_vnodeout);
- VM_CNT_ADD(v_vnodepgsout, ncount);
- }
- dmu_tx_commit(tx);
-
-out:
- rangelock_exit(lr);
- if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
- zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- return (rtvals[0]);
-}
-
-int
-zfs_freebsd_putpages(ap)
- struct vop_putpages_args /* {
- struct vnode *a_vp;
- vm_page_t *a_m;
- int a_count;
- int a_sync;
- int *a_rtvals;
- } */ *ap;
-{
-
- return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
- ap->a_rtvals));
-}
-
-static int
-zfs_freebsd_bmap(ap)
- struct vop_bmap_args /* {
- struct vnode *a_vp;
- daddr_t a_bn;
- struct bufobj **a_bop;
- daddr_t *a_bnp;
- int *a_runp;
- int *a_runb;
- } */ *ap;
-{
-
- if (ap->a_bop != NULL)
- *ap->a_bop = &ap->a_vp->v_bufobj;
- if (ap->a_bnp != NULL)
- *ap->a_bnp = ap->a_bn;
- if (ap->a_runp != NULL)
- *ap->a_runp = 0;
- if (ap->a_runb != NULL)
- *ap->a_runb = 0;
-
- return (0);
-}
-
-static int
-zfs_freebsd_open(ap)
- struct vop_open_args /* {
- struct vnode *a_vp;
- int a_mode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- int error;
-
- error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
- if (error == 0)
- vnode_create_vobject(vp, zp->z_size, ap->a_td);
- return (error);
-}
-
-static int
-zfs_freebsd_close(ap)
- struct vop_close_args /* {
- struct vnode *a_vp;
- int a_fflag;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
-
- return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_ioctl(ap)
- struct vop_ioctl_args /* {
- struct vnode *a_vp;
- u_long a_command;
- caddr_t a_data;
- int a_fflag;
- struct ucred *cred;
- struct thread *td;
- } */ *ap;
-{
-
- return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
- ap->a_fflag, ap->a_cred, NULL, NULL));
-}
-
-static int
-ioflags(int ioflags)
-{
- int flags = 0;
-
- if (ioflags & IO_APPEND)
- flags |= FAPPEND;
- if (ioflags & IO_NDELAY)
- flags |= FNONBLOCK;
- if (ioflags & IO_SYNC)
- flags |= (FSYNC | FDSYNC | FRSYNC);
-
- return (flags);
-}
-
-static int
-zfs_freebsd_read(ap)
- struct vop_read_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
- ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_write(ap)
- struct vop_write_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
- ap->a_cred, NULL));
-}
-
-/*
- * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
- * the comment above cache_fplookup for details.
- */
-static int
-zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
-{
- vnode_t *vp;
- znode_t *zp;
- uint64_t pflags;
-
- vp = v->a_vp;
- zp = VTOZ_SMR(vp);
- if (__predict_false(zp == NULL))
- return (EAGAIN);
- pflags = atomic_load_64(&zp->z_pflags);
- if (pflags & ZFS_AV_QUARANTINED)
- return (EAGAIN);
- if (pflags & ZFS_XATTR)
- return (EAGAIN);
- if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
- return (EAGAIN);
- return (0);
-}
-
-static int
-zfs_freebsd_access(ap)
- struct vop_access_args /* {
- struct vnode *a_vp;
- accmode_t a_accmode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- accmode_t accmode;
- int error = 0;
-
- if (ap->a_accmode == VEXEC) {
- if (zfs_freebsd_fastaccesschk_execute(ap->a_vp, ap->a_cred) == 0)
- return (0);
- }
-
- /*
- * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
- */
- accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
- if (accmode != 0)
- error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
-
- /*
- * VADMIN has to be handled by vaccess().
- */
- if (error == 0) {
- accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
- if (accmode != 0) {
- error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
- zp->z_gid, accmode, ap->a_cred);
- }
- }
-
- /*
- * For VEXEC, ensure that at least one execute bit is set for
- * non-directories.
- */
- if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
- (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
- error = EACCES;
- }
-
- return (error);
-}
-
-static int
-zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
-{
- struct componentname *cnp = ap->a_cnp;
- char nm[NAME_MAX + 1];
-
- ASSERT(cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
-
- return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
- cnp->cn_cred, cnp->cn_thread, 0, cached));
-}
-
-static int
-zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
-{
-
- return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
-}
-
-static int
-zfs_cache_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- zfsvfs_t *zfsvfs;
-
- zfsvfs = ap->a_dvp->v_mount->mnt_data;
- if (zfsvfs->z_use_namecache)
- return (vfs_cache_lookup(ap));
- else
- return (zfs_freebsd_lookup(ap, B_FALSE));
-}
-
-static int
-zfs_freebsd_create(ap)
- struct vop_create_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- } */ *ap;
-{
- zfsvfs_t *zfsvfs;
- struct componentname *cnp = ap->a_cnp;
- vattr_t *vap = ap->a_vap;
- int error, mode;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- vattr_init_mask(vap);
- mode = vap->va_mode & ALLPERMS;
- zfsvfs = ap->a_dvp->v_mount->mnt_data;
-
- error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
- ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
- if (zfsvfs->z_use_namecache &&
- error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
- cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
- return (error);
-}
-
-static int
-zfs_freebsd_remove(ap)
- struct vop_remove_args /* {
- struct vnode *a_dvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
-
- ASSERT(ap->a_cnp->cn_flags & SAVENAME);
-
- return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_mkdir(ap)
- struct vop_mkdir_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- } */ *ap;
-{
- vattr_t *vap = ap->a_vap;
-
- ASSERT(ap->a_cnp->cn_flags & SAVENAME);
-
- vattr_init_mask(vap);
-
- return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
- ap->a_cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_rmdir(ap)
- struct vop_rmdir_args /* {
- struct vnode *a_dvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_readdir(ap)
- struct vop_readdir_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- int *a_eofflag;
- int *a_ncookies;
- u_long **a_cookies;
- } */ *ap;
-{
-
- return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
- ap->a_ncookies, ap->a_cookies));
-}
-
-static int
-zfs_freebsd_fsync(ap)
- struct vop_fsync_args /* {
- struct vnode *a_vp;
- int a_waitfor;
- struct thread *a_td;
- } */ *ap;
-{
-
- vop_stdfsync(ap);
- return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
-}
-
-static int
-zfs_freebsd_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- } */ *ap;
-{
- vattr_t *vap = ap->a_vap;
- xvattr_t xvap;
- u_long fflags = 0;
- int error;
-
- xva_init(&xvap);
- xvap.xva_vattr = *vap;
- xvap.xva_vattr.va_mask |= AT_XVATTR;
-
- /* Convert chflags into ZFS-type flags. */
- /* XXX: what about SF_SETTABLE?. */
- XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
- XVA_SET_REQ(&xvap, XAT_APPENDONLY);
- XVA_SET_REQ(&xvap, XAT_NOUNLINK);
- XVA_SET_REQ(&xvap, XAT_NODUMP);
- XVA_SET_REQ(&xvap, XAT_READONLY);
- XVA_SET_REQ(&xvap, XAT_ARCHIVE);
- XVA_SET_REQ(&xvap, XAT_SYSTEM);
- XVA_SET_REQ(&xvap, XAT_HIDDEN);
- XVA_SET_REQ(&xvap, XAT_REPARSE);
- XVA_SET_REQ(&xvap, XAT_OFFLINE);
- XVA_SET_REQ(&xvap, XAT_SPARSE);
-
- error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
- if (error != 0)
- return (error);
-
- /* Convert ZFS xattr into chflags. */
-#define FLAG_CHECK(fflag, xflag, xfield) do { \
- if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
- fflags |= (fflag); \
-} while (0)
- FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
- xvap.xva_xoptattrs.xoa_immutable);
- FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
- xvap.xva_xoptattrs.xoa_appendonly);
- FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
- xvap.xva_xoptattrs.xoa_nounlink);
- FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
- xvap.xva_xoptattrs.xoa_archive);
- FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
- xvap.xva_xoptattrs.xoa_nodump);
- FLAG_CHECK(UF_READONLY, XAT_READONLY,
- xvap.xva_xoptattrs.xoa_readonly);
- FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
- xvap.xva_xoptattrs.xoa_system);
- FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
- xvap.xva_xoptattrs.xoa_hidden);
- FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
- xvap.xva_xoptattrs.xoa_reparse);
- FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
- xvap.xva_xoptattrs.xoa_offline);
- FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
- xvap.xva_xoptattrs.xoa_sparse);
-
-#undef FLAG_CHECK
- *vap = xvap.xva_vattr;
- vap->va_flags = fflags;
- return (0);
-}
-
-static int
-zfs_freebsd_setattr(ap)
- struct vop_setattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- vattr_t *vap = ap->a_vap;
- cred_t *cred = ap->a_cred;
- xvattr_t xvap;
- u_long fflags;
- uint64_t zflags;
-
- vattr_init_mask(vap);
- vap->va_mask &= ~AT_NOSET;
-
- xva_init(&xvap);
- xvap.xva_vattr = *vap;
-
- zflags = VTOZ(vp)->z_pflags;
-
- if (vap->va_flags != VNOVAL) {
- zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
- int error;
-
- if (zfsvfs->z_use_fuids == B_FALSE)
- return (EOPNOTSUPP);
-
- fflags = vap->va_flags;
- /*
- * XXX KDM
- * We need to figure out whether it makes sense to allow
- * UF_REPARSE through, since we don't really have other
- * facilities to handle reparse points and zfs_setattr()
- * doesn't currently allow setting that attribute anyway.
- */
- if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
- UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
- UF_OFFLINE|UF_SPARSE)) != 0)
- return (EOPNOTSUPP);
- /*
- * Unprivileged processes are not permitted to unset system
- * flags, or modify flags if any system flags are set.
- * Privileged non-jail processes may not modify system flags
- * if securelevel > 0 and any existing system flags are set.
- * Privileged jail processes behave like privileged non-jail
- * processes if the PR_ALLOW_CHFLAGS permission bit is set;
- * otherwise, they behave like unprivileged processes.
- */
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
- priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
- if (zflags &
- (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
- error = securelevel_gt(cred, 0);
- if (error != 0)
- return (error);
- }
- } else {
- /*
- * Callers may only modify the file flags on objects they
- * have VADMIN rights for.
- */
- if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
- return (error);
- if (zflags &
- (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
- return (EPERM);
- }
- if (fflags &
- (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
- return (EPERM);
- }
- }
-
-#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
- if (((fflags & (fflag)) && !(zflags & (zflag))) || \
- ((zflags & (zflag)) && !(fflags & (fflag)))) { \
- XVA_SET_REQ(&xvap, (xflag)); \
- (xfield) = ((fflags & (fflag)) != 0); \
- } \
-} while (0)
- /* Convert chflags into ZFS-type flags. */
- /* XXX: what about SF_SETTABLE?. */
- FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
- xvap.xva_xoptattrs.xoa_immutable);
- FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
- xvap.xva_xoptattrs.xoa_appendonly);
- FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
- xvap.xva_xoptattrs.xoa_nounlink);
- FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
- xvap.xva_xoptattrs.xoa_archive);
- FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
- xvap.xva_xoptattrs.xoa_nodump);
- FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
- xvap.xva_xoptattrs.xoa_readonly);
- FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
- xvap.xva_xoptattrs.xoa_system);
- FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
- xvap.xva_xoptattrs.xoa_hidden);
- FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
- xvap.xva_xoptattrs.xoa_reparse);
- FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
- xvap.xva_xoptattrs.xoa_offline);
- FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
- xvap.xva_xoptattrs.xoa_sparse);
-#undef FLAG_CHANGE
- }
- if (vap->va_birthtime.tv_sec != VNOVAL) {
- xvap.xva_vattr.va_mask |= AT_XVATTR;
- XVA_SET_REQ(&xvap, XAT_CREATETIME);
- }
- return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
-}
-
-static int
-zfs_freebsd_rename(ap)
- struct vop_rename_args /* {
- struct vnode *a_fdvp;
- struct vnode *a_fvp;
- struct componentname *a_fcnp;
- struct vnode *a_tdvp;
- struct vnode *a_tvp;
- struct componentname *a_tcnp;
- } */ *ap;
-{
- vnode_t *fdvp = ap->a_fdvp;
- vnode_t *fvp = ap->a_fvp;
- vnode_t *tdvp = ap->a_tdvp;
- vnode_t *tvp = ap->a_tvp;
- int error;
-
- ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
- ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
-
- error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
- ap->a_tcnp, ap->a_fcnp->cn_cred);
-
- vrele(fdvp);
- vrele(fvp);
- vrele(tdvp);
- if (tvp != NULL)
- vrele(tvp);
-
- return (error);
-}
-
-static int
-zfs_freebsd_symlink(ap)
- struct vop_symlink_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- char *a_target;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- vattr_t *vap = ap->a_vap;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
- vattr_init_mask(vap);
-
- return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
- __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread));
-}
-
-static int
-zfs_freebsd_readlink(ap)
- struct vop_readlink_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_link(ap)
- struct vop_link_args /* {
- struct vnode *a_tdvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- vnode_t *vp = ap->a_vp;
- vnode_t *tdvp = ap->a_tdvp;
-
- if (tdvp->v_mount != vp->v_mount)
- return (EXDEV);
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
-}
-
-static int
-zfs_freebsd_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
-
- zfs_inactive(vp, ap->a_td->td_ucred, NULL);
- return (0);
-}
-
-static int
-zfs_freebsd_need_inactive(ap)
- struct vop_need_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int need;
-
- if (vn_need_pageq_flush(vp))
- return (1);
-
- if (!ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs))
- return (1);
- need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
- ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
-
- return (need);
-}
-
-static int
-zfs_freebsd_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ASSERT(zp != NULL);
-
- /*
- * z_teardown_inactive_lock protects from a race with
- * zfs_znode_dmu_fini in zfsvfs_teardown during
- * force unmount.
- */
- ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs);
- if (zp->z_sa_hdl == NULL)
- zfs_znode_free(zp);
- else
- zfs_zinactive(zp);
- ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
-
- vp->v_data = NULL;
- return (0);
-}
-
-static int
-zfs_freebsd_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
-
- return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
-}
-
-static int
-zfs_freebsd_pathconf(ap)
- struct vop_pathconf_args /* {
- struct vnode *a_vp;
- int a_name;
- register_t *a_retval;
- } */ *ap;
-{
- ulong_t val;
- int error;
-
- error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
- if (error == 0) {
- *ap->a_retval = val;
- return (error);
- }
- if (error != EOPNOTSUPP)
- return (error);
-
- switch (ap->a_name) {
- case _PC_NAME_MAX:
- *ap->a_retval = NAME_MAX;
- return (0);
- case _PC_PIPE_BUF:
- if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
- *ap->a_retval = PIPE_BUF;
- return (0);
- }
- return (EINVAL);
- default:
- return (vop_stdpathconf(ap));
- }
-}
-
-/*
- * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
- * extended attribute name:
- *
- * NAMESPACE PREFIX
- * system freebsd:system:
- * user (none, can be used to access ZFS fsattr(5) attributes
- * created on Solaris)
- */
-static int
-zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
- size_t size)
-{
- const char *namespace, *prefix, *suffix;
-
- /* We don't allow '/' character in attribute name. */
- if (strchr(name, '/') != NULL)
- return (EINVAL);
- /* We don't allow attribute names that start with "freebsd:" string. */
- if (strncmp(name, "freebsd:", 8) == 0)
- return (EINVAL);
-
- bzero(attrname, size);
-
- switch (attrnamespace) {
- case EXTATTR_NAMESPACE_USER:
-#if 0
- prefix = "freebsd:";
- namespace = EXTATTR_NAMESPACE_USER_STRING;
- suffix = ":";
-#else
- /*
- * This is the default namespace by which we can access all
- * attributes created on Solaris.
- */
- prefix = namespace = suffix = "";
-#endif
- break;
- case EXTATTR_NAMESPACE_SYSTEM:
- prefix = "freebsd:";
- namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
- suffix = ":";
- break;
- case EXTATTR_NAMESPACE_EMPTY:
- default:
- return (EINVAL);
- }
- if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
- name) >= size) {
- return (ENAMETOOLONG);
- }
- return (0);
-}
-
-/*
- * Vnode operating to retrieve a named extended attribute.
- */
-static int
-zfs_getextattr(struct vop_getextattr_args *ap)
-/*
-vop_getextattr {
- IN struct vnode *a_vp;
- IN int a_attrnamespace;
- IN const char *a_name;
- INOUT struct uio *a_uio;
- OUT size_t *a_size;
- IN struct ucred *a_cred;
- IN struct thread *a_td;
-};
-*/
-{
- zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
- struct thread *td = ap->a_td;
- struct nameidata nd;
- char attrname[255];
- struct vattr va;
- vnode_t *xvp = NULL, *vp;
- int error, flags;
-
- error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
- ap->a_cred, ap->a_td, VREAD);
- if (error != 0)
- return (error);
-
- error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
- sizeof(attrname));
- if (error != 0)
- return (error);
-
- ZFS_ENTER(zfsvfs);
-
- error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
- LOOKUP_XATTR, B_FALSE);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- flags = FREAD;
- NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
- xvp, td);
- error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL);
- vp = nd.ni_vp;
- NDFREE(&nd, NDF_ONLY_PNBUF);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- if (error == ENOENT)
- error = ENOATTR;
- return (error);
- }
-
- if (ap->a_size != NULL) {
- error = VOP_GETATTR(vp, &va, ap->a_cred);
- if (error == 0)
- *ap->a_size = (size_t)va.va_size;
- } else if (ap->a_uio != NULL)
- error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
-
- VOP_UNLOCK(vp);
- vn_close(vp, flags, ap->a_cred, td);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Vnode operation to remove a named attribute.
- */
-int
-zfs_deleteextattr(struct vop_deleteextattr_args *ap)
-/*
-vop_deleteextattr {
- IN struct vnode *a_vp;
- IN int a_attrnamespace;
- IN const char *a_name;
- IN struct ucred *a_cred;
- IN struct thread *a_td;
-};
-*/
-{
- zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
- struct thread *td = ap->a_td;
- struct nameidata nd;
- char attrname[255];
- struct vattr va;
- vnode_t *xvp = NULL, *vp;
- int error, flags;
-
- error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
- ap->a_cred, ap->a_td, VWRITE);
- if (error != 0)
- return (error);
-
- error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
- sizeof(attrname));
- if (error != 0)
- return (error);
-
- ZFS_ENTER(zfsvfs);
-
- error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
- LOOKUP_XATTR, B_FALSE);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
- UIO_SYSSPACE, attrname, xvp, td);
- error = namei(&nd);
- vp = nd.ni_vp;
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- NDFREE(&nd, NDF_ONLY_PNBUF);
- if (error == ENOENT)
- error = ENOATTR;
- return (error);
- }
-
- error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
- NDFREE(&nd, NDF_ONLY_PNBUF);
-
- vput(nd.ni_dvp);
- if (vp == nd.ni_dvp)
- vrele(vp);
- else
- vput(vp);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Vnode operation to set a named attribute.
- */
-static int
-zfs_setextattr(struct vop_setextattr_args *ap)
-/*
-vop_setextattr {
- IN struct vnode *a_vp;
- IN int a_attrnamespace;
- IN const char *a_name;
- INOUT struct uio *a_uio;
- IN struct ucred *a_cred;
- IN struct thread *a_td;
-};
-*/
-{
- zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
- struct thread *td = ap->a_td;
- struct nameidata nd;
- char attrname[255];
- struct vattr va;
- vnode_t *xvp = NULL, *vp;
- int error, flags;
-
- error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
- ap->a_cred, ap->a_td, VWRITE);
- if (error != 0)
- return (error);
-
- error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
- sizeof(attrname));
- if (error != 0)
- return (error);
-
- ZFS_ENTER(zfsvfs);
-
- error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
- LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- flags = FFLAGS(O_WRONLY | O_CREAT);
- NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
- xvp, td);
- error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
- NULL);
- vp = nd.ni_vp;
- NDFREE(&nd, NDF_ONLY_PNBUF);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- VATTR_NULL(&va);
- va.va_size = 0;
- error = VOP_SETATTR(vp, &va, ap->a_cred);
- if (error == 0)
- VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
-
- VOP_UNLOCK(vp);
- vn_close(vp, flags, ap->a_cred, td);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Vnode operation to retrieve extended attributes on a vnode.
- */
-static int
-zfs_listextattr(struct vop_listextattr_args *ap)
-/*
-vop_listextattr {
- IN struct vnode *a_vp;
- IN int a_attrnamespace;
- INOUT struct uio *a_uio;
- OUT size_t *a_size;
- IN struct ucred *a_cred;
- IN struct thread *a_td;
-};
-*/
-{
- zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
- struct thread *td = ap->a_td;
- struct nameidata nd;
- char attrprefix[16];
- u_char dirbuf[sizeof(struct dirent)];
- struct dirent *dp;
- struct iovec aiov;
- struct uio auio, *uio = ap->a_uio;
- size_t *sizep = ap->a_size;
- size_t plen;
- vnode_t *xvp = NULL, *vp;
- int done, error, eof, pos;
-
- error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
- ap->a_cred, ap->a_td, VREAD);
- if (error != 0)
- return (error);
-
- error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
- sizeof(attrprefix));
- if (error != 0)
- return (error);
- plen = strlen(attrprefix);
-
- ZFS_ENTER(zfsvfs);
-
- if (sizep != NULL)
- *sizep = 0;
-
- error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
- LOOKUP_XATTR, B_FALSE);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- /*
- * ENOATTR means that the EA directory does not yet exist,
- * i.e. there are no extended attributes there.
- */
- if (error == ENOATTR)
- error = 0;
- return (error);
- }
-
- NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
- UIO_SYSSPACE, ".", xvp, td);
- error = namei(&nd);
- vp = nd.ni_vp;
- NDFREE(&nd, NDF_ONLY_PNBUF);
- if (error != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_td = td;
- auio.uio_rw = UIO_READ;
- auio.uio_offset = 0;
-
- do {
- u_char nlen;
-
- aiov.iov_base = (void *)dirbuf;
- aiov.iov_len = sizeof(dirbuf);
- auio.uio_resid = sizeof(dirbuf);
- error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
- done = sizeof(dirbuf) - auio.uio_resid;
- if (error != 0)
- break;
- for (pos = 0; pos < done;) {
- dp = (struct dirent *)(dirbuf + pos);
- pos += dp->d_reclen;
- /*
- * XXX: Temporarily we also accept DT_UNKNOWN, as this
- * is what we get when attribute was created on Solaris.
- */
- if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
- continue;
- if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
- continue;
- else if (strncmp(dp->d_name, attrprefix, plen) != 0)
- continue;
- nlen = dp->d_namlen - plen;
- if (sizep != NULL)
- *sizep += 1 + nlen;
- else if (uio != NULL) {
- /*
- * Format of extattr name entry is one byte for
- * length and the rest for name.
- */
- error = uiomove(&nlen, 1, uio->uio_rw, uio);
- if (error == 0) {
- error = uiomove(dp->d_name + plen, nlen,
- uio->uio_rw, uio);
- }
- if (error != 0)
- break;
- }
- }
- } while (!eof && error == 0);
-
- vput(vp);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-int
-zfs_freebsd_getacl(ap)
- struct vop_getacl_args /* {
- struct vnode *vp;
- acl_type_t type;
- struct acl *aclp;
- struct ucred *cred;
- struct thread *td;
- } */ *ap;
-{
- int error;
- vsecattr_t vsecattr;
-
- if (ap->a_type != ACL_TYPE_NFS4)
- return (EINVAL);
-
- vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
- if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
- return (error);
-
- error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
- if (vsecattr.vsa_aclentp != NULL)
- kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
-
- return (error);
-}
-
-int
-zfs_freebsd_setacl(ap)
- struct vop_setacl_args /* {
- struct vnode *vp;
- acl_type_t type;
- struct acl *aclp;
- struct ucred *cred;
- struct thread *td;
- } */ *ap;
-{
- int error;
- vsecattr_t vsecattr;
- int aclbsize; /* size of acl list in bytes */
- aclent_t *aaclp;
-
- if (ap->a_type != ACL_TYPE_NFS4)
- return (EINVAL);
-
- if (ap->a_aclp == NULL)
- return (EINVAL);
-
- if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
- return (EINVAL);
-
- /*
- * With NFSv4 ACLs, chmod(2) may need to add additional entries,
- * splitting every entry into two and appending "canonical six"
- * entries at the end. Don't allow for setting an ACL that would
- * cause chmod(2) to run out of ACL entries.
- */
- if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
- return (ENOSPC);
-
- error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
- if (error != 0)
- return (error);
-
- vsecattr.vsa_mask = VSA_ACE;
- aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
- vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
- aaclp = vsecattr.vsa_aclentp;
- vsecattr.vsa_aclentsz = aclbsize;
-
- aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
- error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
- kmem_free(aaclp, aclbsize);
-
- return (error);
-}
-
-int
-zfs_freebsd_aclcheck(ap)
- struct vop_aclcheck_args /* {
- struct vnode *vp;
- acl_type_t type;
- struct acl *aclp;
- struct ucred *cred;
- struct thread *td;
- } */ *ap;
-{
-
- return (EOPNOTSUPP);
-}
-
-static int
-zfs_vptocnp(struct vop_vptocnp_args *ap)
-{
- vnode_t *covered_vp;
- vnode_t *vp = ap->a_vp;;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- znode_t *zp = VTOZ(vp);
- enum vgetstate vs;
- int ltype;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * If we are a snapshot mounted under .zfs, run the operation
- * on the covered vnode.
- */
- if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
- char name[MAXNAMLEN + 1];
- znode_t *dzp;
- size_t len;
-
- error = zfs_znode_parent_and_name(zp, &dzp, name);
- if (error == 0) {
- len = strlen(name);
- if (*ap->a_buflen < len)
- error = SET_ERROR(ENOMEM);
- }
- if (error == 0) {
- *ap->a_buflen -= len;
- bcopy(name, ap->a_buf + *ap->a_buflen, len);
- *ap->a_vpp = ZTOV(dzp);
- }
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- ZFS_EXIT(zfsvfs);
-
- covered_vp = vp->v_mount->mnt_vnodecovered;
- vs = vget_prep(covered_vp);
- ltype = VOP_ISLOCKED(vp);
- VOP_UNLOCK(vp);
- error = vget_finish(covered_vp, LK_SHARED, vs);
- if (error == 0) {
- error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
- ap->a_buf, ap->a_buflen);
- vput(covered_vp);
- }
- vn_lock(vp, ltype | LK_RETRY);
- if (VN_IS_DOOMED(vp))
- error = SET_ERROR(ENOENT);
- return (error);
-}
-
-#ifdef DIAGNOSTIC
-static int
-zfs_lock(ap)
- struct vop_lock1_args /* {
- struct vnode *a_vp;
- int a_flags;
- char *file;
- int line;
- } */ *ap;
-{
- vnode_t *vp;
- znode_t *zp;
- int err;
-
- err = vop_lock(ap);
- if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
- vp = ap->a_vp;
- zp = vp->v_data;
- if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) &&
- zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
- VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
- }
- return (err);
-}
-#endif
-
-struct vop_vector zfs_vnodeops;
-struct vop_vector zfs_fifoops;
-struct vop_vector zfs_shareops;
-
-struct vop_vector zfs_vnodeops = {
- .vop_default = &default_vnodeops,
- .vop_inactive = zfs_freebsd_inactive,
- .vop_need_inactive = zfs_freebsd_need_inactive,
- .vop_reclaim = zfs_freebsd_reclaim,
- .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
- .vop_access = zfs_freebsd_access,
- .vop_allocate = VOP_EINVAL,
- .vop_lookup = zfs_cache_lookup,
- .vop_cachedlookup = zfs_freebsd_cachedlookup,
- .vop_getattr = zfs_freebsd_getattr,
- .vop_setattr = zfs_freebsd_setattr,
- .vop_create = zfs_freebsd_create,
- .vop_mknod = zfs_freebsd_create,
- .vop_mkdir = zfs_freebsd_mkdir,
- .vop_readdir = zfs_freebsd_readdir,
- .vop_fsync = zfs_freebsd_fsync,
- .vop_open = zfs_freebsd_open,
- .vop_close = zfs_freebsd_close,
- .vop_rmdir = zfs_freebsd_rmdir,
- .vop_ioctl = zfs_freebsd_ioctl,
- .vop_link = zfs_freebsd_link,
- .vop_symlink = zfs_freebsd_symlink,
- .vop_readlink = zfs_freebsd_readlink,
- .vop_read = zfs_freebsd_read,
- .vop_write = zfs_freebsd_write,
- .vop_remove = zfs_freebsd_remove,
- .vop_rename = zfs_freebsd_rename,
- .vop_pathconf = zfs_freebsd_pathconf,
- .vop_bmap = zfs_freebsd_bmap,
- .vop_fid = zfs_freebsd_fid,
- .vop_getextattr = zfs_getextattr,
- .vop_deleteextattr = zfs_deleteextattr,
- .vop_setextattr = zfs_setextattr,
- .vop_listextattr = zfs_listextattr,
- .vop_getacl = zfs_freebsd_getacl,
- .vop_setacl = zfs_freebsd_setacl,
- .vop_aclcheck = zfs_freebsd_aclcheck,
- .vop_getpages = zfs_freebsd_getpages,
- .vop_putpages = zfs_freebsd_putpages,
- .vop_vptocnp = zfs_vptocnp,
-#ifdef DIAGNOSTIC
- .vop_lock1 = zfs_lock,
-#else
- .vop_lock1 = vop_lock,
-#endif
- .vop_unlock = vop_unlock,
- .vop_islocked = vop_islocked,
-};
-VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
-
-struct vop_vector zfs_fifoops = {
- .vop_default = &fifo_specops,
- .vop_fsync = zfs_freebsd_fsync,
- .vop_access = zfs_freebsd_access,
- .vop_getattr = zfs_freebsd_getattr,
- .vop_inactive = zfs_freebsd_inactive,
- .vop_read = VOP_PANIC,
- .vop_reclaim = zfs_freebsd_reclaim,
- .vop_setattr = zfs_freebsd_setattr,
- .vop_write = VOP_PANIC,
- .vop_pathconf = zfs_freebsd_pathconf,
- .vop_fid = zfs_freebsd_fid,
- .vop_getacl = zfs_freebsd_getacl,
- .vop_setacl = zfs_freebsd_setacl,
- .vop_aclcheck = zfs_freebsd_aclcheck,
-};
-VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
-
-/*
- * special share hidden files vnode operations template
- */
-struct vop_vector zfs_shareops = {
- .vop_default = &default_vnodeops,
- .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
- .vop_access = zfs_freebsd_access,
- .vop_inactive = zfs_freebsd_inactive,
- .vop_reclaim = zfs_freebsd_reclaim,
- .vop_fid = zfs_freebsd_fid,
- .vop_pathconf = zfs_freebsd_pathconf,
-};
-VFS_VOP_VECTOR_REGISTER(zfs_shareops);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -1,2388 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
-
-#ifdef _KERNEL
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/mntent.h>
-#include <sys/u8_textprep.h>
-#include <sys/dsl_dataset.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/atomic.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_rlock.h>
-#include <sys/zfs_fuid.h>
-#include <sys/dnode.h>
-#include <sys/fs/zfs.h>
-#include <sys/kidmap.h>
-#endif /* _KERNEL */
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/refcount.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/zfs_znode.h>
-#include <sys/sa.h>
-#include <sys/zfs_sa.h>
-#include <sys/zfs_stat.h>
-#include <sys/refcount.h>
-
-#include "zfs_prop.h"
-#include "zfs_comutil.h"
-
-/* Used by fstat(1). */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
- SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)");
-
-/*
- * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef DEBUG
-#define ZNODE_STATS
-#endif /* DEBUG */
-
-#ifdef ZNODE_STATS
-#define ZNODE_STAT_ADD(stat) ((stat)++)
-#else
-#define ZNODE_STAT_ADD(stat) /* nothing */
-#endif /* ZNODE_STATS */
-
-/*
- * Functions needed for userland (ie: libzpool) are not put under
- * #ifdef_KERNEL; the rest of the functions have dependencies
- * (such as VFS logic) that will not compile easily in userland.
- */
-#ifdef _KERNEL
-/*
- * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
- * be freed before it can be safely accessed.
- */
-krwlock_t zfsvfs_lock;
-
-#if defined(_KERNEL) && !defined(KMEM_DEBUG)
-#define _ZFS_USE_SMR
-static uma_zone_t znode_uma_zone;
-#else
-static kmem_cache_t *znode_cache = NULL;
-#endif
-
-/*ARGSUSED*/
-static void
-znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
-{
- /*
- * We should never drop all dbuf refs without first clearing
- * the eviction callback.
- */
- panic("evicting znode %p\n", user_ptr);
-}
-
-extern struct vop_vector zfs_vnodeops;
-extern struct vop_vector zfs_fifoops;
-extern struct vop_vector zfs_shareops;
-
-/*
- * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
- * z_rangelock. It will modify the offset and length of the lock to reflect
- * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
- * called with the rangelock_t's rl_lock held, which avoids races.
- */
-static void
-zfs_rangelock_cb(locked_range_t *new, void *arg)
-{
- znode_t *zp = arg;
-
- /*
- * If in append mode, convert to writer and lock starting at the
- * current end of file.
- */
- if (new->lr_type == RL_APPEND) {
- new->lr_offset = zp->z_size;
- new->lr_type = RL_WRITER;
- }
-
- /*
- * If we need to grow the block size then lock the whole file range.
- */
- uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
- if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
- zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
- new->lr_offset = 0;
- new->lr_length = UINT64_MAX;
- }
-}
-
-/*ARGSUSED*/
-static int
-zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
-{
- znode_t *zp = buf;
-
- POINTER_INVALIDATE(&zp->z_zfsvfs);
-
- list_link_init(&zp->z_link_node);
-
- mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
-
- rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
-
- zp->z_acl_cached = NULL;
- zp->z_vnode = NULL;
- zp->z_moved = 0;
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-zfs_znode_cache_destructor(void *buf, void *arg)
-{
- znode_t *zp = buf;
-
- ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
- ASSERT3P(zp->z_vnode, ==, NULL);
- ASSERT(!list_link_active(&zp->z_link_node));
- mutex_destroy(&zp->z_acl_lock);
- rangelock_fini(&zp->z_rangelock);
-
- ASSERT(zp->z_acl_cached == NULL);
-}
-
-#ifdef ZNODE_STATS
-static struct {
- uint64_t zms_zfsvfs_invalid;
- uint64_t zms_zfsvfs_recheck1;
- uint64_t zms_zfsvfs_unmounted;
- uint64_t zms_zfsvfs_recheck2;
- uint64_t zms_obj_held;
- uint64_t zms_vnode_locked;
- uint64_t zms_not_only_dnlc;
-} znode_move_stats;
-#endif /* ZNODE_STATS */
-
-#ifdef illumos
-static void
-zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
-{
- vnode_t *vp;
-
- /* Copy fields. */
- nzp->z_zfsvfs = ozp->z_zfsvfs;
-
- /* Swap vnodes. */
- vp = nzp->z_vnode;
- nzp->z_vnode = ozp->z_vnode;
- ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
- ZTOV(ozp)->v_data = ozp;
- ZTOV(nzp)->v_data = nzp;
-
- nzp->z_id = ozp->z_id;
- ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
- nzp->z_unlinked = ozp->z_unlinked;
- nzp->z_atime_dirty = ozp->z_atime_dirty;
- nzp->z_zn_prefetch = ozp->z_zn_prefetch;
- nzp->z_blksz = ozp->z_blksz;
- nzp->z_seq = ozp->z_seq;
- nzp->z_mapcnt = ozp->z_mapcnt;
- nzp->z_gen = ozp->z_gen;
- nzp->z_sync_cnt = ozp->z_sync_cnt;
- nzp->z_is_sa = ozp->z_is_sa;
- nzp->z_sa_hdl = ozp->z_sa_hdl;
- bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
- nzp->z_links = ozp->z_links;
- nzp->z_size = ozp->z_size;
- nzp->z_pflags = ozp->z_pflags;
- nzp->z_uid = ozp->z_uid;
- nzp->z_gid = ozp->z_gid;
- nzp->z_mode = ozp->z_mode;
-
- /*
- * Since this is just an idle znode and kmem is already dealing with
- * memory pressure, release any cached ACL.
- */
- if (ozp->z_acl_cached) {
- zfs_acl_free(ozp->z_acl_cached);
- ozp->z_acl_cached = NULL;
- }
-
- sa_set_userp(nzp->z_sa_hdl, nzp);
-
- /*
- * Invalidate the original znode by clearing fields that provide a
- * pointer back to the znode. Set the low bit of the vfs pointer to
- * ensure that zfs_znode_move() recognizes the znode as invalid in any
- * subsequent callback.
- */
- ozp->z_sa_hdl = NULL;
- POINTER_INVALIDATE(&ozp->z_zfsvfs);
-
- /*
- * Mark the znode.
- */
- nzp->z_moved = 1;
- ozp->z_moved = (uint8_t)-1;
-}
-
-/*ARGSUSED*/
-static kmem_cbrc_t
-zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
-{
- znode_t *ozp = buf, *nzp = newbuf;
- zfsvfs_t *zfsvfs;
- vnode_t *vp;
-
- /*
- * The znode is on the file system's list of known znodes if the vfs
- * pointer is valid. We set the low bit of the vfs pointer when freeing
- * the znode to invalidate it, and the memory patterns written by kmem
- * (baddcafe and deadbeef) set at least one of the two low bits. A newly
- * created znode sets the vfs pointer last of all to indicate that the
- * znode is known and in a valid state to be moved by this function.
- */
- zfsvfs = ozp->z_zfsvfs;
- if (!POINTER_IS_VALID(zfsvfs)) {
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
- return (KMEM_CBRC_DONT_KNOW);
- }
-
- /*
- * Close a small window in which it's possible that the filesystem could
- * be unmounted and freed, and zfsvfs, though valid in the previous
- * statement, could point to unrelated memory by the time we try to
- * prevent the filesystem from being unmounted.
- */
- rw_enter(&zfsvfs_lock, RW_WRITER);
- if (zfsvfs != ozp->z_zfsvfs) {
- rw_exit(&zfsvfs_lock);
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
- return (KMEM_CBRC_DONT_KNOW);
- }
-
- /*
- * If the znode is still valid, then so is the file system. We know that
- * no valid file system can be freed while we hold zfsvfs_lock, so we
- * can safely ensure that the filesystem is not and will not be
- * unmounted. The next statement is equivalent to ZFS_ENTER().
- */
- rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
- if (zfsvfs->z_unmounted) {
- ZFS_EXIT(zfsvfs);
- rw_exit(&zfsvfs_lock);
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
- return (KMEM_CBRC_DONT_KNOW);
- }
- rw_exit(&zfsvfs_lock);
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- /*
- * Recheck the vfs pointer in case the znode was removed just before
- * acquiring the lock.
- */
- if (zfsvfs != ozp->z_zfsvfs) {
- mutex_exit(&zfsvfs->z_znodes_lock);
- ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
- return (KMEM_CBRC_DONT_KNOW);
- }
-
- /*
- * At this point we know that as long as we hold z_znodes_lock, the
- * znode cannot be freed and fields within the znode can be safely
- * accessed. Now, prevent a race with zfs_zget().
- */
- if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
- mutex_exit(&zfsvfs->z_znodes_lock);
- ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
- return (KMEM_CBRC_LATER);
- }
-
- vp = ZTOV(ozp);
- if (mutex_tryenter(&vp->v_lock) == 0) {
- ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
- mutex_exit(&zfsvfs->z_znodes_lock);
- ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
- return (KMEM_CBRC_LATER);
- }
-
- /* Only move znodes that are referenced _only_ by the DNLC. */
- if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
- mutex_exit(&vp->v_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
- mutex_exit(&zfsvfs->z_znodes_lock);
- ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
- return (KMEM_CBRC_LATER);
- }
-
- /*
- * The znode is known and in a valid state to move. We're holding the
- * locks needed to execute the critical section.
- */
- zfs_znode_move_impl(ozp, nzp);
- mutex_exit(&vp->v_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
-
- list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
- mutex_exit(&zfsvfs->z_znodes_lock);
- ZFS_EXIT(zfsvfs);
-
- return (KMEM_CBRC_YES);
-}
-#endif /* illumos */
-
-#ifdef _ZFS_USE_SMR
-VFS_SMR_DECLARE;
-
-static int
-zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags)
-{
-
- return (zfs_znode_cache_constructor(mem, private, flags));
-}
-
-static void
-zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private)
-{
-
- zfs_znode_cache_destructor(mem, private);
-}
-
-void
-zfs_znode_init(void)
-{
- /*
- * Initialize zcache
- */
- rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
- ASSERT(znode_uma_zone == NULL);
- znode_uma_zone = uma_zcreate("zfs_znode_cache",
- sizeof (znode_t), zfs_znode_cache_constructor_smr,
- zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
- VFS_SMR_ZONE_SET(znode_uma_zone);
-}
-
-static znode_t *
-zfs_znode_alloc_kmem(int flags)
-{
-
- return (uma_zalloc_smr(znode_uma_zone, flags));
-}
-
-static void
-zfs_znode_free_kmem(znode_t *zp)
-{
-
- uma_zfree_smr(znode_uma_zone, zp);
-}
-#else
-void
-zfs_znode_init(void)
-{
- /*
- * Initialize zcache
- */
- rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
- ASSERT(znode_cache == NULL);
- znode_cache = kmem_cache_create("zfs_znode_cache",
- sizeof (znode_t), 0, zfs_znode_cache_constructor,
- zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
- kmem_cache_set_move(znode_cache, zfs_znode_move);
-}
-
-static znode_t *
-zfs_znode_alloc_kmem(int flags)
-{
-
- return (kmem_cache_alloc(znode_cache, flags));
-}
-
-static void
-zfs_znode_free_kmem(znode_t *zp)
-{
-
- kmem_cache_free(znode_cache, zp);
-}
-#endif
-
-void
-zfs_znode_fini(void)
-{
-#ifdef illumos
- /*
- * Cleanup vfs & vnode ops
- */
- zfs_remove_op_tables();
-#endif
-
- /*
- * Cleanup zcache
- */
-#ifdef _ZFS_USE_SMR
- if (znode_uma_zone) {
- uma_zdestroy(znode_uma_zone);
- znode_uma_zone = NULL;
- }
-#else
- if (znode_cache) {
- kmem_cache_destroy(znode_cache);
- znode_cache = NULL;
- }
-#endif
- rw_destroy(&zfsvfs_lock);
-}
-
-#ifdef illumos
-struct vnodeops *zfs_dvnodeops;
-struct vnodeops *zfs_fvnodeops;
-struct vnodeops *zfs_symvnodeops;
-struct vnodeops *zfs_xdvnodeops;
-struct vnodeops *zfs_evnodeops;
-struct vnodeops *zfs_sharevnodeops;
-
-void
-zfs_remove_op_tables()
-{
- /*
- * Remove vfs ops
- */
- ASSERT(zfsfstype);
- (void) vfs_freevfsops_by_type(zfsfstype);
- zfsfstype = 0;
-
- /*
- * Remove vnode ops
- */
- if (zfs_dvnodeops)
- vn_freevnodeops(zfs_dvnodeops);
- if (zfs_fvnodeops)
- vn_freevnodeops(zfs_fvnodeops);
- if (zfs_symvnodeops)
- vn_freevnodeops(zfs_symvnodeops);
- if (zfs_xdvnodeops)
- vn_freevnodeops(zfs_xdvnodeops);
- if (zfs_evnodeops)
- vn_freevnodeops(zfs_evnodeops);
- if (zfs_sharevnodeops)
- vn_freevnodeops(zfs_sharevnodeops);
-
- zfs_dvnodeops = NULL;
- zfs_fvnodeops = NULL;
- zfs_symvnodeops = NULL;
- zfs_xdvnodeops = NULL;
- zfs_evnodeops = NULL;
- zfs_sharevnodeops = NULL;
-}
-
-extern const fs_operation_def_t zfs_dvnodeops_template[];
-extern const fs_operation_def_t zfs_fvnodeops_template[];
-extern const fs_operation_def_t zfs_xdvnodeops_template[];
-extern const fs_operation_def_t zfs_symvnodeops_template[];
-extern const fs_operation_def_t zfs_evnodeops_template[];
-extern const fs_operation_def_t zfs_sharevnodeops_template[];
-
-int
-zfs_create_op_tables()
-{
- int error;
-
- /*
- * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
- * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
- * In this case we just return as the ops vectors are already set up.
- */
- if (zfs_dvnodeops)
- return (0);
-
- error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
- &zfs_dvnodeops);
- if (error)
- return (error);
-
- error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
- &zfs_fvnodeops);
- if (error)
- return (error);
-
- error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
- &zfs_symvnodeops);
- if (error)
- return (error);
-
- error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
- &zfs_xdvnodeops);
- if (error)
- return (error);
-
- error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
- &zfs_evnodeops);
- if (error)
- return (error);
-
- error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
- &zfs_sharevnodeops);
-
- return (error);
-}
-#endif /* illumos */
-
-int
-zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
-{
- zfs_acl_ids_t acl_ids;
- vattr_t vattr;
- znode_t *sharezp;
- znode_t *zp;
- int error;
-
- vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
- vattr.va_type = VDIR;
- vattr.va_mode = S_IFDIR|0555;
- vattr.va_uid = crgetuid(kcred);
- vattr.va_gid = crgetgid(kcred);
-
- sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
- ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
- sharezp->z_moved = 0;
- sharezp->z_unlinked = 0;
- sharezp->z_atime_dirty = 0;
- sharezp->z_zfsvfs = zfsvfs;
- sharezp->z_is_sa = zfsvfs->z_use_sa;
-
- VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
- kcred, NULL, &acl_ids));
- zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
- ASSERT3P(zp, ==, sharezp);
- POINTER_INVALIDATE(&sharezp->z_zfsvfs);
- error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
- zfsvfs->z_shares_dir = sharezp->z_id;
-
- zfs_acl_ids_free(&acl_ids);
- sa_handle_destroy(sharezp->z_sa_hdl);
- zfs_znode_free_kmem(sharezp);
-
- return (error);
-}
-
-/*
- * define a couple of values we need available
- * for both 64 and 32 bit environments.
- */
-#ifndef NBITSMINOR64
-#define NBITSMINOR64 32
-#endif
-#ifndef MAXMAJ64
-#define MAXMAJ64 0xffffffffUL
-#endif
-#ifndef MAXMIN64
-#define MAXMIN64 0xffffffffUL
-#endif
-
-/*
- * Create special expldev for ZFS private use.
- * Can't use standard expldev since it doesn't do
- * what we want. The standard expldev() takes a
- * dev32_t in LP64 and expands it to a long dev_t.
- * We need an interface that takes a dev32_t in ILP32
- * and expands it to a long dev_t.
- */
-static uint64_t
-zfs_expldev(dev_t dev)
-{
- return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
-}
-/*
- * Special cmpldev for ZFS private use.
- * Can't use standard cmpldev since it takes
- * a long dev_t and compresses it to dev32_t in
- * LP64. We need to do a compaction of a long dev_t
- * to a dev32_t in ILP32.
- */
-dev_t
-zfs_cmpldev(uint64_t dev)
-{
- return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
-}
-
-static void
-zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
- dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
-{
- ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
- ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
-
- ASSERT(zp->z_sa_hdl == NULL);
- ASSERT(zp->z_acl_cached == NULL);
- if (sa_hdl == NULL) {
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
- SA_HDL_SHARED, &zp->z_sa_hdl));
- } else {
- zp->z_sa_hdl = sa_hdl;
- sa_set_userp(sa_hdl, zp);
- }
-
- zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
-
- /*
- * Slap on VROOT if we are the root znode unless we are the root
- * node of a snapshot mounted under .zfs.
- */
- if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
- ZTOV(zp)->v_flag |= VROOT;
-
- vn_exists(ZTOV(zp));
-}
-
-void
-zfs_znode_dmu_fini(znode_t *zp)
-{
- ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
- zp->z_unlinked ||
- ZFS_TEARDOWN_INACTIVE_WLOCKED(zp->z_zfsvfs));
-
- sa_handle_destroy(zp->z_sa_hdl);
- zp->z_sa_hdl = NULL;
-}
-
-static void
-zfs_vnode_forget(vnode_t *vp)
-{
-
- /* copied from insmntque_stddtr */
- vp->v_data = NULL;
- vp->v_op = &dead_vnodeops;
- vgone(vp);
- vput(vp);
-}
-
-/*
- * Construct a new znode/vnode and intialize.
- *
- * This does not do a call to dmu_set_user() that is
- * up to the caller to do, in case you don't want to
- * return the znode
- */
-static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
- dmu_object_type_t obj_type, sa_handle_t *hdl)
-{
- znode_t *zp;
- vnode_t *vp;
- uint64_t mode;
- uint64_t parent;
- sa_bulk_attr_t bulk[9];
- int count = 0;
- int error;
-
- zp = zfs_znode_alloc_kmem(KM_SLEEP);
-
-#ifndef _ZFS_USE_SMR
- KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0,
- ("%s: fast path lookup enabled without smr", __func__));
-#endif
-
- KASSERT(curthread->td_vp_reserved != NULL,
- ("zfs_znode_alloc: getnewvnode without preallocated vnode"));
- error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
- if (error != 0) {
- zfs_znode_free_kmem(zp);
- return (NULL);
- }
- zp->z_vnode = vp;
- vp->v_data = zp;
-
- ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
- zp->z_moved = 0;
-
- /*
- * Defer setting z_zfsvfs until the znode is ready to be a candidate for
- * the zfs_znode_move() callback.
- */
- zp->z_sa_hdl = NULL;
- zp->z_unlinked = 0;
- zp->z_atime_dirty = 0;
- zp->z_mapcnt = 0;
- zp->z_id = db->db_object;
- zp->z_blksz = blksz;
- zp->z_seq = 0x7A4653;
- zp->z_sync_cnt = 0;
-
- vp = ZTOV(zp);
-
- zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
- &zp->z_links, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
- &zp->z_atime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &zp->z_uid, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
- &zp->z_gid, 8);
-
- if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
- if (hdl == NULL)
- sa_handle_destroy(zp->z_sa_hdl);
- zfs_vnode_forget(vp);
- zp->z_vnode = NULL;
- zfs_znode_free_kmem(zp);
- return (NULL);
- }
-
- zp->z_mode = mode;
-
- vp->v_type = IFTOVT((mode_t)mode);
-
- switch (vp->v_type) {
- case VDIR:
- zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
- break;
-#ifdef illumos
- case VBLK:
- case VCHR:
- {
- uint64_t rdev;
- VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
- &rdev, sizeof (rdev)) == 0);
-
- vp->v_rdev = zfs_cmpldev(rdev);
- }
- break;
-#endif
- case VFIFO:
-#ifdef illumos
- case VSOCK:
- case VDOOR:
-#endif
- vp->v_op = &zfs_fifoops;
- break;
- case VREG:
- if (parent == zfsvfs->z_shares_dir) {
- ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
- vp->v_op = &zfs_shareops;
- }
- break;
-#ifdef illumos
- case VLNK:
- vn_setops(vp, zfs_symvnodeops);
- break;
- default:
- vn_setops(vp, zfs_evnodeops);
- break;
-#endif
- }
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- list_insert_tail(&zfsvfs->z_all_znodes, zp);
- membar_producer();
- /*
- * Everything else must be valid before assigning z_zfsvfs makes the
- * znode eligible for zfs_znode_move().
- */
- zp->z_zfsvfs = zfsvfs;
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- /*
- * Acquire vnode lock before making it available to the world.
- */
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- VN_LOCK_AREC(vp);
- if (vp->v_type != VFIFO)
- VN_LOCK_ASHARE(vp);
-
-#ifdef illumos
- VFS_HOLD(zfsvfs->z_vfs);
-#endif
- return (zp);
-}
-
-static uint64_t empty_xattr;
-static uint64_t pad[4];
-static zfs_acl_phys_t acl_phys;
-/*
- * Create a new DMU object to hold a zfs znode.
- *
- * IN: dzp - parent directory for new znode
- * vap - file attributes for new znode
- * tx - dmu transaction id for zap operations
- * cr - credentials of caller
- * flag - flags:
- * IS_ROOT_NODE - new object will be root
- * IS_XATTR - new object is an attribute
- * bonuslen - length of bonus buffer
- * setaclp - File/Dir initial ACL
- * fuidp - Tracks fuid allocation.
- *
- * OUT: zpp - allocated znode
- *
- */
-void
-zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
-{
- uint64_t crtime[2], atime[2], mtime[2], ctime[2];
- uint64_t mode, size, links, parent, pflags;
- uint64_t dzp_pflags = 0;
- uint64_t rdev = 0;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- dmu_buf_t *db;
- timestruc_t now;
- uint64_t gen, obj;
- int err;
- int bonuslen;
- int dnodesize;
- sa_handle_t *sa_hdl;
- dmu_object_type_t obj_type;
- sa_bulk_attr_t *sa_attrs;
- int cnt = 0;
- zfs_acl_locator_cb_t locate = { 0 };
-
- ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
-
- if (zfsvfs->z_replay) {
- obj = vap->va_nodeid;
- now = vap->va_ctime; /* see zfs_replay_create() */
- gen = vap->va_nblocks; /* ditto */
- dnodesize = vap->va_fsid; /* ditto */
- } else {
- obj = 0;
- vfs_timestamp(&now);
- gen = dmu_tx_get_txg(tx);
- dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
- }
-
- if (dnodesize == 0)
- dnodesize = DNODE_MIN_SIZE;
-
- obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
- bonuslen = (obj_type == DMU_OT_SA) ?
- DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
-
- /*
- * Create a new DMU object.
- */
- /*
- * There's currently no mechanism for pre-reading the blocks that will
- * be needed to allocate a new object, so we accept the small chance
- * that there will be an i/o error and we will fail one of the
- * assertions below.
- */
- if (vap->va_type == VDIR) {
- if (zfsvfs->z_replay) {
- VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
- zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, dnodesize, tx));
- } else {
- obj = zap_create_norm_dnsize(zfsvfs->z_os,
- zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, dnodesize, tx);
- }
- } else {
- if (zfsvfs->z_replay) {
- VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, dnodesize, tx));
- } else {
- obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, dnodesize, tx);
- }
- }
-
- ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
-
- /*
- * If this is the root, fix up the half-initialized parent pointer
- * to reference the just-allocated physical data area.
- */
- if (flag & IS_ROOT_NODE) {
- dzp->z_id = obj;
- } else {
- dzp_pflags = dzp->z_pflags;
- }
-
- /*
- * If parent is an xattr, so am I.
- */
- if (dzp_pflags & ZFS_XATTR) {
- flag |= IS_XATTR;
- }
-
- if (zfsvfs->z_use_fuids)
- pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
- else
- pflags = 0;
-
- if (vap->va_type == VDIR) {
- size = 2; /* contents ("." and "..") */
- links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
- } else {
- size = links = 0;
- }
-
- if (vap->va_type == VBLK || vap->va_type == VCHR) {
- rdev = zfs_expldev(vap->va_rdev);
- }
-
- parent = dzp->z_id;
- mode = acl_ids->z_mode;
- if (flag & IS_XATTR)
- pflags |= ZFS_XATTR;
-
- /*
- * No execs denied will be deterimed when zfs_mode_compute() is called.
- */
- pflags |= acl_ids->z_aclp->z_hints &
- (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
- ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
-
- ZFS_TIME_ENCODE(&now, crtime);
- ZFS_TIME_ENCODE(&now, ctime);
-
- if (vap->va_mask & AT_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, atime);
- } else {
- ZFS_TIME_ENCODE(&now, atime);
- }
-
- if (vap->va_mask & AT_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
- } else {
- ZFS_TIME_ENCODE(&now, mtime);
- }
-
- /* Now add in all of the "SA" attributes */
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
- &sa_hdl));
-
- /*
- * Setup the array of attributes to be replaced/set on the new file
- *
- * order for DMU_OT_ZNODE is critical since it needs to be constructed
- * in the old znode_phys_t format. Don't change this ordering
- */
- sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
-
- if (obj_type == DMU_OT_ZNODE) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
- NULL, &atime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
- NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
- NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
- NULL, &crtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
- NULL, &gen, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
- NULL, &mode, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
- NULL, &size, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
- NULL, &parent, 8);
- } else {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
- NULL, &mode, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
- NULL, &size, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
- NULL, &gen, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
- NULL, &acl_ids->z_fuid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
- NULL, &acl_ids->z_fgid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
- NULL, &parent, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
- NULL, &pflags, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
- NULL, &atime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
- NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
- NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
- NULL, &crtime, 16);
- }
-
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
-
- if (obj_type == DMU_OT_ZNODE) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
- &empty_xattr, 8);
- }
- if (obj_type == DMU_OT_ZNODE ||
- (vap->va_type == VBLK || vap->va_type == VCHR)) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
- NULL, &rdev, 8);
-
- }
- if (obj_type == DMU_OT_ZNODE) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
- NULL, &pflags, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
- &acl_ids->z_fuid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
- &acl_ids->z_fgid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
- sizeof (uint64_t) * 4);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
- &acl_phys, sizeof (zfs_acl_phys_t));
- } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
- &acl_ids->z_aclp->z_acl_count, 8);
- locate.cb_aclp = acl_ids->z_aclp;
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
- zfs_acl_data_locator, &locate,
- acl_ids->z_aclp->z_acl_bytes);
- mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
- acl_ids->z_fuid, acl_ids->z_fgid);
- }
-
- VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
-
- if (!(flag & IS_ROOT_NODE)) {
- *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
- ASSERT(*zpp != NULL);
- } else {
- /*
- * If we are creating the root node, the "parent" we
- * passed in is the znode for the root.
- */
- *zpp = dzp;
-
- (*zpp)->z_sa_hdl = sa_hdl;
- }
-
- (*zpp)->z_pflags = pflags;
- (*zpp)->z_mode = mode;
- (*zpp)->z_dnodesize = dnodesize;
-
- if (vap->va_mask & AT_XVATTR)
- zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
-
- if (obj_type == DMU_OT_ZNODE ||
- acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
- VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
- }
- if (!(flag & IS_ROOT_NODE)) {
- vnode_t *vp;
-
- vp = ZTOV(*zpp);
- vp->v_vflag |= VV_FORCEINSMQ;
- err = insmntque(vp, zfsvfs->z_vfs);
- vp->v_vflag &= ~VV_FORCEINSMQ;
- KASSERT(err == 0, ("insmntque() failed: error %d", err));
- }
- kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
-}
-
-/*
- * Update in-core attributes. It is assumed the caller will be doing an
- * sa_bulk_update to push the changes out.
- */
-void
-zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
-{
- xoptattr_t *xoap;
-
- xoap = xva_getxoptattr(xvap);
- ASSERT(xoap);
-
- ASSERT_VOP_IN_SEQC(ZTOV(zp));
-
- if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- uint64_t times[2];
- ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
- &times, sizeof (times), tx);
- XVA_SET_RTN(xvap, XAT_CREATETIME);
- }
- if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_READONLY);
- }
- if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_HIDDEN);
- }
- if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_SYSTEM);
- }
- if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_ARCHIVE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_IMMUTABLE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_NOUNLINK);
- }
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_APPENDONLY);
- }
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_NODUMP);
- }
- if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_OPAQUE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
- ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
- xoap->xoa_av_quarantined, zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
- }
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
- }
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- zfs_sa_set_scanstamp(zp, xvap, tx);
- XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
- }
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_REPARSE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
- ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_OFFLINE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
- ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_SPARSE);
- }
-}
-
-int
-zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
-{
- dmu_object_info_t doi;
- dmu_buf_t *db;
- znode_t *zp;
- vnode_t *vp;
- sa_handle_t *hdl;
- struct thread *td;
- int locked;
- int err;
-
- td = curthread;
- getnewvnode_reserve();
-again:
- *zpp = NULL;
- ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
-
- err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
- if (err) {
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- getnewvnode_drop_reserve();
- return (err);
- }
-
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_SA &&
- (doi.doi_bonus_type != DMU_OT_ZNODE ||
- (doi.doi_bonus_type == DMU_OT_ZNODE &&
- doi.doi_bonus_size < sizeof (znode_phys_t)))) {
- sa_buf_rele(db, NULL);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-#ifdef __FreeBSD__
- getnewvnode_drop_reserve();
-#endif
- return (SET_ERROR(EINVAL));
- }
-
- hdl = dmu_buf_get_user(db);
- if (hdl != NULL) {
- zp = sa_get_userdata(hdl);
-
- /*
- * Since "SA" does immediate eviction we
- * should never find a sa handle that doesn't
- * know about the znode.
- */
- ASSERT3P(zp, !=, NULL);
- ASSERT3U(zp->z_id, ==, obj_num);
- if (zp->z_unlinked) {
- err = SET_ERROR(ENOENT);
- } else {
- vp = ZTOV(zp);
- /*
- * Don't let the vnode disappear after
- * ZFS_OBJ_HOLD_EXIT.
- */
- VN_HOLD(vp);
- *zpp = zp;
- err = 0;
- }
-
- sa_buf_rele(db, NULL);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-
- if (err) {
- getnewvnode_drop_reserve();
- return (err);
- }
-
- locked = VOP_ISLOCKED(vp);
- VI_LOCK(vp);
- if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
- /*
- * The vnode is doomed and this thread doesn't
- * hold the exclusive lock on it, so the vnode
- * must be being reclaimed by another thread.
- * Otherwise the doomed vnode is being reclaimed
- * by this thread and zfs_zget is called from
- * ZIL internals.
- */
- VI_UNLOCK(vp);
-
- /*
- * XXX vrele() locks the vnode when the last reference
- * is dropped. Although in this case the vnode is
- * doomed / dead and so no inactivation is required,
- * the vnode lock is still acquired. That could result
- * in a LOR with z_teardown_lock if another thread holds
- * the vnode's lock and tries to take z_teardown_lock.
- * But that is only possible if the other thread peforms
- * a ZFS vnode operation on the vnode. That either
- * should not happen if the vnode is dead or the thread
- * should also have a refrence to the vnode and thus
- * our reference is not last.
- */
- VN_RELE(vp);
- goto again;
- }
- VI_UNLOCK(vp);
- getnewvnode_drop_reserve();
- return (err);
- }
-
- /*
- * Not found create new znode/vnode
- * but only if file exists.
- *
- * There is a small window where zfs_vget() could
- * find this object while a file create is still in
- * progress. This is checked for in zfs_znode_alloc()
- *
- * if zfs_znode_alloc() fails it will drop the hold on the
- * bonus buffer.
- */
- zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
- doi.doi_bonus_type, NULL);
- if (zp == NULL) {
- err = SET_ERROR(ENOENT);
- } else {
- *zpp = zp;
- }
- if (err == 0) {
- vnode_t *vp = ZTOV(zp);
-
- err = insmntque(vp, zfsvfs->z_vfs);
- if (err == 0) {
- vp->v_hash = obj_num;
- VOP_UNLOCK(vp);
- } else {
- zp->z_vnode = NULL;
- zfs_znode_dmu_fini(zp);
- zfs_znode_free(zp);
- *zpp = NULL;
- }
- }
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- getnewvnode_drop_reserve();
- return (err);
-}
-
-int
-zfs_rezget(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_object_info_t doi;
- dmu_buf_t *db;
- vnode_t *vp;
- uint64_t obj_num = zp->z_id;
- uint64_t mode, size;
- sa_bulk_attr_t bulk[8];
- int err;
- int count = 0;
- uint64_t gen;
-
- /*
- * Remove cached pages before reloading the znode, so that they are not
- * lingering after we run into any error. Ideally, we should vgone()
- * the vnode in case of error, but currently we cannot do that
- * because of the LOR between the vnode lock and z_teardown_lock.
- * So, instead, we have to "doom" the znode in the illumos style.
- */
- vp = ZTOV(zp);
- vn_pages_remove(vp, 0, 0);
-
- ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
-
- mutex_enter(&zp->z_acl_lock);
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
- }
-
- mutex_exit(&zp->z_acl_lock);
- ASSERT(zp->z_sa_hdl == NULL);
- err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
- if (err) {
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (err);
- }
-
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_SA &&
- (doi.doi_bonus_type != DMU_OT_ZNODE ||
- (doi.doi_bonus_type == DMU_OT_ZNODE &&
- doi.doi_bonus_size < sizeof (znode_phys_t)))) {
- sa_buf_rele(db, NULL);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (SET_ERROR(EINVAL));
- }
-
- zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
- size = zp->z_size;
-
- /* reload cached values */
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
- &gen, sizeof (gen));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, sizeof (zp->z_size));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
- &zp->z_links, sizeof (zp->z_links));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
- &zp->z_atime, sizeof (zp->z_atime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &zp->z_uid, sizeof (zp->z_uid));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
- &zp->z_gid, sizeof (zp->z_gid));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
- &mode, sizeof (mode));
-
- if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
- zfs_znode_dmu_fini(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (SET_ERROR(EIO));
- }
-
- zp->z_mode = mode;
-
- if (gen != zp->z_gen) {
- zfs_znode_dmu_fini(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (SET_ERROR(EIO));
- }
-
- /*
- * It is highly improbable but still quite possible that two
- * objects in different datasets are created with the same
- * object numbers and in transaction groups with the same
- * numbers. znodes corresponding to those objects would
- * have the same z_id and z_gen, but their other attributes
- * may be different.
- * zfs recv -F may replace one of such objects with the other.
- * As a result file properties recorded in the replaced
- * object's vnode may no longer match the received object's
- * properties. At present the only cached property is the
- * files type recorded in v_type.
- * So, handle this case by leaving the old vnode and znode
- * disassociated from the actual object. A new vnode and a
- * znode will be created if the object is accessed
- * (e.g. via a look-up). The old vnode and znode will be
- * recycled when the last vnode reference is dropped.
- */
- if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
- zfs_znode_dmu_fini(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (SET_ERROR(EIO));
- }
-
- /*
- * If the file has zero links, then it has been unlinked on the send
- * side and it must be in the received unlinked set.
- * We call zfs_znode_dmu_fini() now to prevent any accesses to the
- * stale data and to prevent automatical removal of the file in
- * zfs_zinactive(). The file will be removed either when it is removed
- * on the send side and the next incremental stream is received or
- * when the unlinked set gets processed.
- */
- zp->z_unlinked = (zp->z_links == 0);
- if (zp->z_unlinked) {
- zfs_znode_dmu_fini(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (0);
- }
-
- zp->z_blksz = doi.doi_data_block_size;
- if (zp->z_size != size)
- vnode_pager_setsize(vp, zp->z_size);
-
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-
- return (0);
-}
-
-void
-zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
- uint64_t obj = zp->z_id;
- uint64_t acl_obj = zfs_external_acl(zp);
-
- ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- if (acl_obj) {
- VERIFY(!zp->z_is_sa);
- VERIFY(0 == dmu_object_free(os, acl_obj, tx));
- }
- VERIFY(0 == dmu_object_free(os, obj, tx));
- zfs_znode_dmu_fini(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
- zfs_znode_free(zp);
-}
-
-void
-zfs_zinactive(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t z_id = zp->z_id;
-
- ASSERT(zp->z_sa_hdl);
-
- /*
- * Don't allow a zfs_zget() while were trying to release this znode
- */
- ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
-
- /*
- * If this was the last reference to a file with no links, remove
- * the file from the file system unless the file system is mounted
- * read-only. That can happen, for example, if the file system was
- * originally read-write, the file was opened, then unlinked and
- * the file system was made read-only before the file was finally
- * closed. The file will remain in the unlinked set.
- */
- if (zp->z_unlinked) {
- ASSERT(!zfsvfs->z_issnap);
- if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- zfs_rmnode(zp);
- return;
- }
- }
-
- zfs_znode_dmu_fini(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- zfs_znode_free(zp);
-}
-
-void
-zfs_znode_free(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ASSERT(zp->z_sa_hdl == NULL);
- zp->z_vnode = NULL;
- mutex_enter(&zfsvfs->z_znodes_lock);
- POINTER_INVALIDATE(&zp->z_zfsvfs);
- list_remove(&zfsvfs->z_all_znodes, zp);
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
- }
-
- zfs_znode_free_kmem(zp);
-
-#ifdef illumos
- VFS_RELE(zfsvfs->z_vfs);
-#endif
-}
-
-void
-zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
- uint64_t ctime[2], boolean_t have_tx)
-{
- timestruc_t now;
-
- vfs_timestamp(&now);
-
- if (have_tx) { /* will sa_bulk_update happen really soon? */
- zp->z_atime_dirty = 0;
- zp->z_seq++;
- } else {
- zp->z_atime_dirty = 1;
- }
-
- if (flag & AT_ATIME) {
- ZFS_TIME_ENCODE(&now, zp->z_atime);
- }
-
- if (flag & AT_MTIME) {
- ZFS_TIME_ENCODE(&now, mtime);
- if (zp->z_zfsvfs->z_use_fuids) {
- zp->z_pflags |= (ZFS_ARCHIVE |
- ZFS_AV_MODIFIED);
- }
- }
-
- if (flag & AT_CTIME) {
- ZFS_TIME_ENCODE(&now, ctime);
- if (zp->z_zfsvfs->z_use_fuids)
- zp->z_pflags |= ZFS_ARCHIVE;
- }
-}
-
-/*
- * Grow the block size for a file.
- *
- * IN: zp - znode of file to free data in.
- * size - requested block size
- * tx - open transaction.
- *
- * NOTE: this function assumes that the znode is write locked.
- */
-void
-zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
-{
- int error;
- u_longlong_t dummy;
-
- if (size <= zp->z_blksz)
- return;
- /*
- * If the file size is already greater than the current blocksize,
- * we will not grow. If there is more than one block in a file,
- * the blocksize cannot change.
- */
- if (zp->z_blksz && zp->z_size > zp->z_blksz)
- return;
-
- error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
- size, 0, tx);
-
- if (error == ENOTSUP)
- return;
- ASSERT0(error);
-
- /* What blocksize did we actually get? */
- dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
-}
-
-#ifdef illumos
-/*
- * This is a dummy interface used when pvn_vplist_dirty() should *not*
- * be calling back into the fs for a putpage(). E.g.: when truncating
- * a file, the pages being "thrown away* don't need to be written out.
- */
-/* ARGSUSED */
-static int
-zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
- int flags, cred_t *cr)
-{
- ASSERT(0);
- return (0);
-}
-#endif
-
-/*
- * Increase the file length
- *
- * IN: zp - znode of file to free data in.
- * end - new end-of-file
- *
- * RETURN: 0 on success, error code on failure
- */
-static int
-zfs_extend(znode_t *zp, uint64_t end)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_tx_t *tx;
- locked_range_t *lr;
- uint64_t newblksz;
- int error;
-
- /*
- * We will change zp_size, lock the whole file.
- */
- lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
-
- /*
- * Nothing to do if file already at desired length.
- */
- if (end <= zp->z_size) {
- rangelock_exit(lr);
- return (0);
- }
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- if (end > zp->z_blksz &&
- (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
- /*
- * We are growing the file past the current block size.
- */
- if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- ASSERT(!ISP2(zp->z_blksz));
- newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
- } else {
- newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
- }
- dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
- } else {
- newblksz = 0;
- }
-
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- rangelock_exit(lr);
- return (error);
- }
-
- if (newblksz)
- zfs_grow_blocksize(zp, newblksz, tx);
-
- zp->z_size = end;
-
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
- &zp->z_size, sizeof (zp->z_size), tx));
-
- vnode_pager_setsize(ZTOV(zp), end);
-
- rangelock_exit(lr);
-
- dmu_tx_commit(tx);
-
- return (0);
-}
-
-/*
- * Free space in a file.
- *
- * IN: zp - znode of file to free data in.
- * off - start of section to free.
- * len - length of section to free.
- *
- * RETURN: 0 on success, error code on failure
- */
-static int
-zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- locked_range_t *lr;
- int error;
-
- /*
- * Lock the range being freed.
- */
- lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
-
- /*
- * Nothing to do if file already at desired length.
- */
- if (off >= zp->z_size) {
- rangelock_exit(lr);
- return (0);
- }
-
- if (off + len > zp->z_size)
- len = zp->z_size - off;
-
- error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
-
- if (error == 0) {
- /*
- * In FreeBSD we cannot free block in the middle of a file,
- * but only at the end of a file, so this code path should
- * never happen.
- */
- vnode_pager_setsize(ZTOV(zp), off);
- }
-
- rangelock_exit(lr);
-
- return (error);
-}
-
-/*
- * Truncate a file
- *
- * IN: zp - znode of file to free data in.
- * end - new end-of-file.
- *
- * RETURN: 0 on success, error code on failure
- */
-static int
-zfs_trunc(znode_t *zp, uint64_t end)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- vnode_t *vp = ZTOV(zp);
- dmu_tx_t *tx;
- locked_range_t *lr;
- int error;
- sa_bulk_attr_t bulk[2];
- int count = 0;
-
- /*
- * We will change zp_size, lock the whole file.
- */
- lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
-
- /*
- * Nothing to do if file already at desired length.
- */
- if (end >= zp->z_size) {
- rangelock_exit(lr);
- return (0);
- }
-
- error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
- DMU_OBJECT_END);
- if (error) {
- rangelock_exit(lr);
- return (error);
- }
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- rangelock_exit(lr);
- return (error);
- }
-
- zp->z_size = end;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
- NULL, &zp->z_size, sizeof (zp->z_size));
-
- if (end == 0) {
- zp->z_pflags &= ~ZFS_SPARSE;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
- NULL, &zp->z_pflags, 8);
- }
- VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
-
- dmu_tx_commit(tx);
-
- /*
- * Clear any mapped pages in the truncated region. This has to
- * happen outside of the transaction to avoid the possibility of
- * a deadlock with someone trying to push a page that we are
- * about to invalidate.
- */
- vnode_pager_setsize(vp, end);
-
- rangelock_exit(lr);
-
- return (0);
-}
-
-/*
- * Free space in a file
- *
- * IN: zp - znode of file to free data in.
- * off - start of range
- * len - end of range (0 => EOF)
- * flag - current file open mode flags.
- * log - TRUE if this action should be logged
- *
- * RETURN: 0 on success, error code on failure
- */
-int
-zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
-{
- vnode_t *vp = ZTOV(zp);
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- uint64_t mode;
- uint64_t mtime[2], ctime[2];
- sa_bulk_attr_t bulk[3];
- int count = 0;
- int error;
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
- sizeof (mode))) != 0)
- return (error);
-
- if (off > zp->z_size) {
- error = zfs_extend(zp, off+len);
- if (error == 0 && log)
- goto log;
- else
- return (error);
- }
-
- /*
- * Check for any locks in the region to be freed.
- */
-
- if (MANDLOCK(vp, (mode_t)mode)) {
- uint64_t length = (len ? len : zp->z_size - off);
- if (error = chklock(vp, FWRITE, off, length, flag, NULL))
- return (error);
- }
-
- if (len == 0) {
- error = zfs_trunc(zp, off);
- } else {
- if ((error = zfs_free_range(zp, off, len)) == 0 &&
- off + len > zp->z_size)
- error = zfs_extend(zp, off+len);
- }
- if (error || !log)
- return (error);
-log:
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
- NULL, &zp->z_pflags, 8);
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
-
- zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
-
- dmu_tx_commit(tx);
- return (0);
-}
-
-void
-zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
-{
- uint64_t moid, obj, sa_obj, version;
- uint64_t sense = ZFS_CASE_SENSITIVE;
- uint64_t norm = 0;
- nvpair_t *elem;
- int error;
- int i;
- znode_t *rootzp = NULL;
- zfsvfs_t *zfsvfs;
- vattr_t vattr;
- znode_t *zp;
- zfs_acl_ids_t acl_ids;
-
- /*
- * First attempt to create master node.
- */
- /*
- * In an empty objset, there are no blocks to read and thus
- * there can be no i/o errors (which we assert below).
- */
- moid = MASTER_NODE_OBJ;
- error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- /*
- * Set starting attributes.
- */
- version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
- elem = NULL;
- while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
- /* For the moment we expect all zpl props to be uint64_ts */
- uint64_t val;
- char *name;
-
- ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
- VERIFY(nvpair_value_uint64(elem, &val) == 0);
- name = nvpair_name(elem);
- if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
- if (val < version)
- version = val;
- } else {
- error = zap_update(os, moid, name, 8, 1, &val, tx);
- }
- ASSERT(error == 0);
- if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
- norm = val;
- else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
- sense = val;
- }
- ASSERT(version != 0);
- error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
-
- /*
- * Create zap object used for SA attribute registration
- */
-
- if (version >= ZPL_VERSION_SA) {
- sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
- DMU_OT_NONE, 0, tx);
- error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
- ASSERT(error == 0);
- } else {
- sa_obj = 0;
- }
- /*
- * Create a delete queue.
- */
- obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
-
- error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
- ASSERT(error == 0);
-
- /*
- * Create root znode. Create minimal znode/vnode/zfsvfs
- * to allow zfs_mknode to work.
- */
- VATTR_NULL(&vattr);
- vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
- vattr.va_type = VDIR;
- vattr.va_mode = S_IFDIR|0755;
- vattr.va_uid = crgetuid(cr);
- vattr.va_gid = crgetgid(cr);
-
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
-
- rootzp = zfs_znode_alloc_kmem(KM_SLEEP);
- ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
- rootzp->z_moved = 0;
- rootzp->z_unlinked = 0;
- rootzp->z_atime_dirty = 0;
- rootzp->z_is_sa = USE_SA(version, os);
-
- zfsvfs->z_os = os;
- zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_version = version;
- zfsvfs->z_use_fuids = USE_FUIDS(version, os);
- zfsvfs->z_use_sa = USE_SA(version, os);
- zfsvfs->z_norm = norm;
-
- error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
- &zfsvfs->z_attr_table);
-
- ASSERT(error == 0);
-
- /*
- * Fold case on file systems that are always or sometimes case
- * insensitive.
- */
- if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
- zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
-
- mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
-
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
-
- rootzp->z_zfsvfs = zfsvfs;
- VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
- cr, NULL, &acl_ids));
- zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
- ASSERT3P(zp, ==, rootzp);
- error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
- ASSERT(error == 0);
- zfs_acl_ids_free(&acl_ids);
- POINTER_INVALIDATE(&rootzp->z_zfsvfs);
-
- sa_handle_destroy(rootzp->z_sa_hdl);
- zfs_znode_free_kmem(rootzp);
-
- /*
- * Create shares directory
- */
-
- error = zfs_create_share_dir(zfsvfs, tx);
-
- ASSERT(error == 0);
-
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_destroy(&zfsvfs->z_hold_mtx[i]);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
-}
-#endif /* _KERNEL */
-
-static int
-zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
-{
- uint64_t sa_obj = 0;
- int error;
-
- error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
- if (error != 0 && error != ENOENT)
- return (error);
-
- error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
- return (error);
-}
-
-static int
-zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
- dmu_buf_t **db, void *tag)
-{
- dmu_object_info_t doi;
- int error;
-
- if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
- return (error);
-
- dmu_object_info_from_db(*db, &doi);
- if ((doi.doi_bonus_type != DMU_OT_SA &&
- doi.doi_bonus_type != DMU_OT_ZNODE) ||
- doi.doi_bonus_type == DMU_OT_ZNODE &&
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- sa_buf_rele(*db, tag);
- return (SET_ERROR(ENOTSUP));
- }
-
- error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
- if (error != 0) {
- sa_buf_rele(*db, tag);
- return (error);
- }
-
- return (0);
-}
-
-void
-zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
-{
- sa_handle_destroy(hdl);
- sa_buf_rele(db, tag);
-}
-
-/*
- * Given an object number, return its parent object number and whether
- * or not the object is an extended attribute directory.
- */
-static int
-zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
- uint64_t *pobjp, int *is_xattrdir)
-{
- uint64_t parent;
- uint64_t pflags;
- uint64_t mode;
- uint64_t parent_mode;
- sa_bulk_attr_t bulk[3];
- sa_handle_t *sa_hdl;
- dmu_buf_t *sa_db;
- int count = 0;
- int error;
-
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
- &parent, sizeof (parent));
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
- &pflags, sizeof (pflags));
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
- &mode, sizeof (mode));
-
- if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
- return (error);
-
- /*
- * When a link is removed its parent pointer is not changed and will
- * be invalid. There are two cases where a link is removed but the
- * file stays around, when it goes to the delete queue and when there
- * are additional links.
- */
- error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
- if (error != 0)
- return (error);
-
- error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
- zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
- if (error != 0)
- return (error);
-
- *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
-
- /*
- * Extended attributes can be applied to files, directories, etc.
- * Otherwise the parent must be a directory.
- */
- if (!*is_xattrdir && !S_ISDIR(parent_mode))
- return (SET_ERROR(EINVAL));
-
- *pobjp = parent;
-
- return (0);
-}
-
-/*
- * Given an object number, return some zpl level statistics
- */
-static int
-zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
- zfs_stat_t *sb)
-{
- sa_bulk_attr_t bulk[4];
- int count = 0;
-
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
- &sb->zs_mode, sizeof (sb->zs_mode));
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
- &sb->zs_gen, sizeof (sb->zs_gen));
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
- &sb->zs_links, sizeof (sb->zs_links));
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
- &sb->zs_ctime, sizeof (sb->zs_ctime));
-
- return (sa_bulk_lookup(hdl, bulk, count));
-}
-
-static int
-zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
- sa_attr_type_t *sa_table, char *buf, int len)
-{
- sa_handle_t *sa_hdl;
- sa_handle_t *prevhdl = NULL;
- dmu_buf_t *prevdb = NULL;
- dmu_buf_t *sa_db = NULL;
- char *path = buf + len - 1;
- int error;
-
- *path = '\0';
- sa_hdl = hdl;
-
- uint64_t deleteq_obj;
- VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
- ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
- error = zap_lookup_int(osp, deleteq_obj, obj);
- if (error == 0) {
- return (ESTALE);
- } else if (error != ENOENT) {
- return (error);
- }
- error = 0;
-
- for (;;) {
- uint64_t pobj;
- char component[MAXNAMELEN + 2];
- size_t complen;
- int is_xattrdir;
-
- if (prevdb)
- zfs_release_sa_handle(prevhdl, prevdb, FTAG);
-
- if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
- &is_xattrdir)) != 0)
- break;
-
- if (pobj == obj) {
- if (path[0] != '/')
- *--path = '/';
- break;
- }
-
- component[0] = '/';
- if (is_xattrdir) {
- (void) sprintf(component + 1, "<xattrdir>");
- } else {
- error = zap_value_search(osp, pobj, obj,
- ZFS_DIRENT_OBJ(-1ULL), component + 1);
- if (error != 0)
- break;
- }
-
- complen = strlen(component);
- path -= complen;
- ASSERT(path >= buf);
- bcopy(component, path, complen);
- obj = pobj;
-
- if (sa_hdl != hdl) {
- prevhdl = sa_hdl;
- prevdb = sa_db;
- }
- error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
- if (error != 0) {
- sa_hdl = prevhdl;
- sa_db = prevdb;
- break;
- }
- }
-
- if (sa_hdl != NULL && sa_hdl != hdl) {
- ASSERT(sa_db != NULL);
- zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
- }
-
- if (error == 0)
- (void) memmove(buf, path, buf + len - path);
-
- return (error);
-}
-
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
-{
- sa_attr_type_t *sa_table;
- sa_handle_t *hdl;
- dmu_buf_t *db;
- int error;
-
- error = zfs_sa_setup(osp, &sa_table);
- if (error != 0)
- return (error);
-
- error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
- if (error != 0)
- return (error);
-
- error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
-
- zfs_release_sa_handle(hdl, db, FTAG);
- return (error);
-}
-
-int
-zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
- char *buf, int len)
-{
- char *path = buf + len - 1;
- sa_attr_type_t *sa_table;
- sa_handle_t *hdl;
- dmu_buf_t *db;
- int error;
-
- *path = '\0';
-
- error = zfs_sa_setup(osp, &sa_table);
- if (error != 0)
- return (error);
-
- error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
- if (error != 0)
- return (error);
-
- error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
- if (error != 0) {
- zfs_release_sa_handle(hdl, db, FTAG);
- return (error);
- }
-
- error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
-
- zfs_release_sa_handle(hdl, db, FTAG);
- return (error);
-}
-
-#ifdef _KERNEL
-int
-zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t parent;
- int is_xattrdir;
- int err;
-
- /* Extended attributes should not be visible as regular files. */
- if ((zp->z_pflags & ZFS_XATTR) != 0)
- return (SET_ERROR(EINVAL));
-
- err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
- &parent, &is_xattrdir);
- if (err != 0)
- return (err);
- ASSERT0(is_xattrdir);
-
- /* No name as this is a root object. */
- if (parent == zp->z_id)
- return (SET_ERROR(EINVAL));
-
- err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
- ZFS_DIRENT_OBJ(-1ULL), buf);
- if (err != 0)
- return (err);
- err = zfs_zget(zfsvfs, parent, dzpp);
- return (err);
-}
-#endif /* _KERNEL */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -1,3499 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/arc.h>
-#include <sys/stat.h>
-#include <sys/resource.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/dsl_dataset.h>
-#include <sys/vdev_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-#include <sys/abd.h>
-
-/*
- * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
- * calls that change the file system. Each itx has enough information to
- * be able to replay them after a system crash, power loss, or
- * equivalent failure mode. These are stored in memory until either:
- *
- * 1. they are committed to the pool by the DMU transaction group
- * (txg), at which point they can be discarded; or
- * 2. they are committed to the on-disk ZIL for the dataset being
- * modified (e.g. due to an fsync, O_DSYNC, or other synchronous
- * requirement).
- *
- * In the event of a crash or power loss, the itxs contained by each
- * dataset's on-disk ZIL will be replayed when that dataset is first
- * instantianted (e.g. if the dataset is a normal fileystem, when it is
- * first mounted).
- *
- * As hinted at above, there is one ZIL per dataset (both the in-memory
- * representation, and the on-disk representation). The on-disk format
- * consists of 3 parts:
- *
- * - a single, per-dataset, ZIL header; which points to a chain of
- * - zero or more ZIL blocks; each of which contains
- * - zero or more ZIL records
- *
- * A ZIL record holds the information necessary to replay a single
- * system call transaction. A ZIL block can hold many ZIL records, and
- * the blocks are chained together, similarly to a singly linked list.
- *
- * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
- * block in the chain, and the ZIL header points to the first block in
- * the chain.
- *
- * Note, there is not a fixed place in the pool to hold these ZIL
- * blocks; they are dynamically allocated and freed as needed from the
- * blocks available on the pool, though they can be preferentially
- * allocated from a dedicated "log" vdev.
- */
-
-/*
- * This controls the amount of time that a ZIL block (lwb) will remain
- * "open" when it isn't "full", and it has a thread waiting for it to be
- * committed to stable storage. Please refer to the zil_commit_waiter()
- * function (and the comments within it) for more details.
- */
-int zfs_commit_timeout_pct = 5;
-
-/*
- * Disable intent logging replay. This global ZIL switch affects all pools.
- */
-int zil_replay_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
- &zil_replay_disable, 0, "Disable intent logging replay");
-
-/*
- * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
- * the disk(s) by the ZIL after an LWB write has completed. Setting this
- * will cause ZIL corruption on power loss if a volatile out-of-order
- * write cache is enabled.
- */
-boolean_t zil_nocacheflush = B_FALSE;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_nocacheflush, CTLFLAG_RWTUN,
- &zil_nocacheflush, 0, "Disable ZIL cache flush");
-
-boolean_t zfs_trim_enabled = B_TRUE;
-SYSCTL_DECL(_vfs_zfs_trim);
-SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
- "Enable ZFS TRIM");
-
-/*
- * Limit SLOG write size per commit executed with synchronous priority.
- * Any writes above that will be executed with lower (asynchronous) priority
- * to limit potential SLOG device abuse by single active ZIL writer.
- */
-uint64_t zil_slog_bulk = 768 * 1024;
-SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN,
- &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority");
-
-static kmem_cache_t *zil_lwb_cache;
-static kmem_cache_t *zil_zcw_cache;
-
-#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
- sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
-
-static int
-zil_bp_compare(const void *x1, const void *x2)
-{
- const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
- const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
-
- int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
- if (likely(cmp))
- return (cmp);
-
- return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
-}
-
-static void
-zil_bp_tree_init(zilog_t *zilog)
-{
- avl_create(&zilog->zl_bp_tree, zil_bp_compare,
- sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
-}
-
-static void
-zil_bp_tree_fini(zilog_t *zilog)
-{
- avl_tree_t *t = &zilog->zl_bp_tree;
- zil_bp_node_t *zn;
- void *cookie = NULL;
-
- while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(zn, sizeof (zil_bp_node_t));
-
- avl_destroy(t);
-}
-
-int
-zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
-{
- avl_tree_t *t = &zilog->zl_bp_tree;
- const dva_t *dva;
- zil_bp_node_t *zn;
- avl_index_t where;
-
- if (BP_IS_EMBEDDED(bp))
- return (0);
-
- dva = BP_IDENTITY(bp);
-
- if (avl_find(t, dva, &where) != NULL)
- return (SET_ERROR(EEXIST));
-
- zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
- zn->zn_dva = *dva;
- avl_insert(t, zn, where);
-
- return (0);
-}
-
-static zil_header_t *
-zil_header_in_syncing_context(zilog_t *zilog)
-{
- return ((zil_header_t *)zilog->zl_header);
-}
-
-static void
-zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
-{
- zio_cksum_t *zc = &bp->blk_cksum;
-
- zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
- zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
- zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
- zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
-}
-
-/*
- * Read a log block and make sure it's valid.
- */
-static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
- char **end)
-{
- enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
- arc_flags_t aflags = ARC_FLAG_WAIT;
- arc_buf_t *abuf = NULL;
- zbookmark_phys_t zb;
- int error;
-
- if (zilog->zl_header->zh_claim_txg == 0)
- zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
-
- if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
- zio_flags |= ZIO_FLAG_SPECULATIVE;
-
- SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
- ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
- error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
-
- if (error == 0) {
- zio_cksum_t cksum = bp->blk_cksum;
-
- /*
- * Validate the checksummed log block.
- *
- * Sequence numbers should be... sequential. The checksum
- * verifier for the next block should be bp's checksum plus 1.
- *
- * Also check the log chain linkage and size used.
- */
- cksum.zc_word[ZIL_ZC_SEQ]++;
-
- if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
- zil_chain_t *zilc = abuf->b_data;
- char *lr = (char *)(zilc + 1);
- uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
-
- if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
- error = SET_ERROR(ECKSUM);
- } else {
- ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
- bcopy(lr, dst, len);
- *end = (char *)dst + len;
- *nbp = zilc->zc_next_blk;
- }
- } else {
- char *lr = abuf->b_data;
- uint64_t size = BP_GET_LSIZE(bp);
- zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
-
- if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
- (zilc->zc_nused > (size - sizeof (*zilc)))) {
- error = SET_ERROR(ECKSUM);
- } else {
- ASSERT3U(zilc->zc_nused, <=,
- SPA_OLD_MAXBLOCKSIZE);
- bcopy(lr, dst, zilc->zc_nused);
- *end = (char *)dst + zilc->zc_nused;
- *nbp = zilc->zc_next_blk;
- }
- }
-
- arc_buf_destroy(abuf, &abuf);
- }
-
- return (error);
-}
-
-/*
- * Read a TX_WRITE log data block.
- */
-static int
-zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
-{
- enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
- const blkptr_t *bp = &lr->lr_blkptr;
- arc_flags_t aflags = ARC_FLAG_WAIT;
- arc_buf_t *abuf = NULL;
- zbookmark_phys_t zb;
- int error;
-
- if (BP_IS_HOLE(bp)) {
- if (wbuf != NULL)
- bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
- return (0);
- }
-
- if (zilog->zl_header->zh_claim_txg == 0)
- zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
-
- SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
- ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
-
- error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
-
- if (error == 0) {
- if (wbuf != NULL)
- bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
- arc_buf_destroy(abuf, &abuf);
- }
-
- return (error);
-}
-
-/*
- * Parse the intent log, and call parse_func for each valid record within.
- */
-int
-zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
- zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
-{
- const zil_header_t *zh = zilog->zl_header;
- boolean_t claimed = !!zh->zh_claim_txg;
- uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
- uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
- uint64_t max_blk_seq = 0;
- uint64_t max_lr_seq = 0;
- uint64_t blk_count = 0;
- uint64_t lr_count = 0;
- blkptr_t blk, next_blk;
- char *lrbuf, *lrp;
- int error = 0;
-
- /*
- * Old logs didn't record the maximum zh_claim_lr_seq.
- */
- if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
- claim_lr_seq = UINT64_MAX;
-
- /*
- * Starting at the block pointed to by zh_log we read the log chain.
- * For each block in the chain we strongly check that block to
- * ensure its validity. We stop when an invalid block is found.
- * For each block pointer in the chain we call parse_blk_func().
- * For each record in each valid block we call parse_lr_func().
- * If the log has been claimed, stop if we encounter a sequence
- * number greater than the highest claimed sequence number.
- */
- lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
- zil_bp_tree_init(zilog);
-
- for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
- uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
- int reclen;
- char *end;
-
- if (blk_seq > claim_blk_seq)
- break;
- if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
- break;
- ASSERT3U(max_blk_seq, <, blk_seq);
- max_blk_seq = blk_seq;
- blk_count++;
-
- if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
- break;
-
- error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
- if (error != 0)
- break;
-
- for (lrp = lrbuf; lrp < end; lrp += reclen) {
- lr_t *lr = (lr_t *)lrp;
- reclen = lr->lrc_reclen;
- ASSERT3U(reclen, >=, sizeof (lr_t));
- if (lr->lrc_seq > claim_lr_seq)
- goto done;
- if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
- goto done;
- ASSERT3U(max_lr_seq, <, lr->lrc_seq);
- max_lr_seq = lr->lrc_seq;
- lr_count++;
- }
- }
-done:
- zilog->zl_parse_error = error;
- zilog->zl_parse_blk_seq = max_blk_seq;
- zilog->zl_parse_lr_seq = max_lr_seq;
- zilog->zl_parse_blk_count = blk_count;
- zilog->zl_parse_lr_count = lr_count;
-
- ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
- (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
-
- zil_bp_tree_fini(zilog);
- zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
-{
- ASSERT(!BP_IS_HOLE(bp));
-
- /*
- * As we call this function from the context of a rewind to a
- * checkpoint, each ZIL block whose txg is later than the txg
- * that we rewind to is invalid. Thus, we return -1 so
- * zil_parse() doesn't attempt to read it.
- */
- if (bp->blk_birth >= first_txg)
- return (-1);
-
- if (zil_bp_tree_add(zilog, bp) != 0)
- return (0);
-
- zio_free(zilog->zl_spa, first_txg, bp);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
-{
- return (0);
-}
-
-static int
-zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
-{
- /*
- * Claim log block if not already committed and not already claimed.
- * If tx == NULL, just verify that the block is claimable.
- */
- if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
- zil_bp_tree_add(zilog, bp) != 0)
- return (0);
-
- return (zio_wait(zio_claim(NULL, zilog->zl_spa,
- tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
-}
-
-static int
-zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
-{
- lr_write_t *lr = (lr_write_t *)lrc;
- int error;
-
- if (lrc->lrc_txtype != TX_WRITE)
- return (0);
-
- /*
- * If the block is not readable, don't claim it. This can happen
- * in normal operation when a log block is written to disk before
- * some of the dmu_sync() blocks it points to. In this case, the
- * transaction cannot have been committed to anyone (we would have
- * waited for all writes to be stable first), so it is semantically
- * correct to declare this the end of the log.
- */
- if (lr->lr_blkptr.blk_birth >= first_txg &&
- (error = zil_read_log_data(zilog, lr, NULL)) != 0)
- return (error);
- return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
-}
-
-/* ARGSUSED */
-static int
-zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
-{
- zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
-
- return (0);
-}
-
-static int
-zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
-{
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
-
- /*
- * If we previously claimed it, we need to free it.
- */
- if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
- bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
- !BP_IS_HOLE(bp))
- zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
-
- return (0);
-}
-
-static int
-zil_lwb_vdev_compare(const void *x1, const void *x2)
-{
- const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
- const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
-
- return (AVL_CMP(v1, v2));
-}
-
-static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
-{
- lwb_t *lwb;
-
- lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
- lwb->lwb_zilog = zilog;
- lwb->lwb_blk = *bp;
- lwb->lwb_slog = slog;
- lwb->lwb_state = LWB_STATE_CLOSED;
- lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
- lwb->lwb_max_txg = txg;
- lwb->lwb_write_zio = NULL;
- lwb->lwb_root_zio = NULL;
- lwb->lwb_tx = NULL;
- lwb->lwb_issued_timestamp = 0;
- if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
- lwb->lwb_nused = sizeof (zil_chain_t);
- lwb->lwb_sz = BP_GET_LSIZE(bp);
- } else {
- lwb->lwb_nused = 0;
- lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
- }
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, lwb);
- mutex_exit(&zilog->zl_lock);
-
- ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
- ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
- VERIFY(list_is_empty(&lwb->lwb_waiters));
-
- return (lwb);
-}
-
-static void
-zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
-{
- ASSERT(MUTEX_HELD(&zilog->zl_lock));
- ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
- VERIFY(list_is_empty(&lwb->lwb_waiters));
- ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
- ASSERT3P(lwb->lwb_write_zio, ==, NULL);
- ASSERT3P(lwb->lwb_root_zio, ==, NULL);
- ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
- ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE);
-
- /*
- * Clear the zilog's field to indicate this lwb is no longer
- * valid, and prevent use-after-free errors.
- */
- if (zilog->zl_last_lwb_opened == lwb)
- zilog->zl_last_lwb_opened = NULL;
-
- kmem_cache_free(zil_lwb_cache, lwb);
-}
-
-/*
- * Called when we create in-memory log transactions so that we know
- * to cleanup the itxs at the end of spa_sync().
- */
-void
-zilog_dirty(zilog_t *zilog, uint64_t txg)
-{
- dsl_pool_t *dp = zilog->zl_dmu_pool;
- dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
-
- ASSERT(spa_writeable(zilog->zl_spa));
-
- if (ds->ds_is_snapshot)
- panic("dirtying snapshot!");
-
- if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
- /* up the hold count until we can be written out */
- dmu_buf_add_ref(ds->ds_dbuf, zilog);
-
- zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
- }
-}
-
-/*
- * Determine if the zil is dirty in the specified txg. Callers wanting to
- * ensure that the dirty state does not change must hold the itxg_lock for
- * the specified txg. Holding the lock will ensure that the zil cannot be
- * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
- * state.
- */
-boolean_t
-zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
-{
- dsl_pool_t *dp = zilog->zl_dmu_pool;
-
- if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
- return (B_TRUE);
- return (B_FALSE);
-}
-
-/*
- * Determine if the zil is dirty. The zil is considered dirty if it has
- * any pending itx records that have not been cleaned by zil_clean().
- */
-boolean_t
-zilog_is_dirty(zilog_t *zilog)
-{
- dsl_pool_t *dp = zilog->zl_dmu_pool;
-
- for (int t = 0; t < TXG_SIZE; t++) {
- if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Create an on-disk intent log.
- */
-static lwb_t *
-zil_create(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb = NULL;
- uint64_t txg = 0;
- dmu_tx_t *tx = NULL;
- blkptr_t blk;
- int error = 0;
- boolean_t slog = FALSE;
-
- /*
- * Wait for any previous destroy to complete.
- */
- txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
-
- ASSERT(zh->zh_claim_txg == 0);
- ASSERT(zh->zh_replay_seq == 0);
-
- blk = zh->zh_log;
-
- /*
- * Allocate an initial log block if:
- * - there isn't one already
- * - the existing block is the wrong endianess
- */
- if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
- tx = dmu_tx_create(zilog->zl_os);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- if (!BP_IS_HOLE(&blk)) {
- zio_free(zilog->zl_spa, txg, &blk);
- BP_ZERO(&blk);
- }
-
- error = zio_alloc_zil(zilog->zl_spa,
- zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
- ZIL_MIN_BLKSZ, &slog);
-
- if (error == 0)
- zil_init_log_chain(zilog, &blk);
- }
-
- /*
- * Allocate a log write block (lwb) for the first log block.
- */
- if (error == 0)
- lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
-
- /*
- * If we just allocated the first log block, commit our transaction
- * and wait for zil_sync() to stuff the block poiner into zh_log.
- * (zh is part of the MOS, so we cannot modify it in open context.)
- */
- if (tx != NULL) {
- dmu_tx_commit(tx);
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
-
- ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
-
- return (lwb);
-}
-
-/*
- * In one tx, free all log blocks and clear the log header. If keep_first
- * is set, then we're replaying a log with no content. We want to keep the
- * first block, however, so that the first synchronous transaction doesn't
- * require a txg_wait_synced() in zil_create(). We don't need to
- * txg_wait_synced() here either when keep_first is set, because both
- * zil_create() and zil_destroy() will wait for any in-progress destroys
- * to complete.
- */
-void
-zil_destroy(zilog_t *zilog, boolean_t keep_first)
-{
- const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
- dmu_tx_t *tx;
- uint64_t txg;
-
- /*
- * Wait for any previous destroy to complete.
- */
- txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
-
- zilog->zl_old_header = *zh; /* debugging aid */
-
- if (BP_IS_HOLE(&zh->zh_log))
- return;
-
- tx = dmu_tx_create(zilog->zl_os);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- mutex_enter(&zilog->zl_lock);
-
- ASSERT3U(zilog->zl_destroy_txg, <, txg);
- zilog->zl_destroy_txg = txg;
- zilog->zl_keep_first = keep_first;
-
- if (!list_is_empty(&zilog->zl_lwb_list)) {
- ASSERT(zh->zh_claim_txg == 0);
- VERIFY(!keep_first);
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
- zil_free_lwb(zilog, lwb);
- }
- } else if (!keep_first) {
- zil_destroy_sync(zilog, tx);
- }
- mutex_exit(&zilog->zl_lock);
-
- dmu_tx_commit(tx);
-}
-
-void
-zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
-{
- ASSERT(list_is_empty(&zilog->zl_lwb_list));
- (void) zil_parse(zilog, zil_free_log_block,
- zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
-}
-
-int
-zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
-{
- dmu_tx_t *tx = txarg;
- zilog_t *zilog;
- uint64_t first_txg;
- zil_header_t *zh;
- objset_t *os;
- int error;
-
- error = dmu_objset_own_obj(dp, ds->ds_object,
- DMU_OST_ANY, B_FALSE, FTAG, &os);
- if (error != 0) {
- /*
- * EBUSY indicates that the objset is inconsistent, in which
- * case it can not have a ZIL.
- */
- if (error != EBUSY) {
- cmn_err(CE_WARN, "can't open objset for %llu, error %u",
- (unsigned long long)ds->ds_object, error);
- }
- return (0);
- }
-
- zilog = dmu_objset_zil(os);
- zh = zil_header_in_syncing_context(zilog);
- ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
- first_txg = spa_min_claim_txg(zilog->zl_spa);
-
- /*
- * If the spa_log_state is not set to be cleared, check whether
- * the current uberblock is a checkpoint one and if the current
- * header has been claimed before moving on.
- *
- * If the current uberblock is a checkpointed uberblock then
- * one of the following scenarios took place:
- *
- * 1] We are currently rewinding to the checkpoint of the pool.
- * 2] We crashed in the middle of a checkpoint rewind but we
- * did manage to write the checkpointed uberblock to the
- * vdev labels, so when we tried to import the pool again
- * the checkpointed uberblock was selected from the import
- * procedure.
- *
- * In both cases we want to zero out all the ZIL blocks, except
- * the ones that have been claimed at the time of the checkpoint
- * (their zh_claim_txg != 0). The reason is that these blocks
- * may be corrupted since we may have reused their locations on
- * disk after we took the checkpoint.
- *
- * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
- * when we first figure out whether the current uberblock is
- * checkpointed or not. Unfortunately, that would discard all
- * the logs, including the ones that are claimed, and we would
- * leak space.
- */
- if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
- (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
- zh->zh_claim_txg == 0)) {
- if (!BP_IS_HOLE(&zh->zh_log)) {
- (void) zil_parse(zilog, zil_clear_log_block,
- zil_noop_log_record, tx, first_txg);
- }
- BP_ZERO(&zh->zh_log);
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
- dmu_objset_disown(os, FTAG);
- return (0);
- }
-
- /*
- * If we are not rewinding and opening the pool normally, then
- * the min_claim_txg should be equal to the first txg of the pool.
- */
- ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
-
- /*
- * Claim all log blocks if we haven't already done so, and remember
- * the highest claimed sequence number. This ensures that if we can
- * read only part of the log now (e.g. due to a missing device),
- * but we can read the entire log later, we will not try to replay
- * or destroy beyond the last block we successfully claimed.
- */
- ASSERT3U(zh->zh_claim_txg, <=, first_txg);
- if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
- (void) zil_parse(zilog, zil_claim_log_block,
- zil_claim_log_record, tx, first_txg);
- zh->zh_claim_txg = first_txg;
- zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
- zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
- if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
- zh->zh_flags |= ZIL_REPLAY_NEEDED;
- zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
- }
-
- ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
- dmu_objset_disown(os, FTAG);
- return (0);
-}
-
-/*
- * Check the log by walking the log chain.
- * Checksum errors are ok as they indicate the end of the chain.
- * Any other error (no device or read failure) returns an error.
- */
-/* ARGSUSED */
-int
-zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
-{
- zilog_t *zilog;
- objset_t *os;
- blkptr_t *bp;
- int error;
-
- ASSERT(tx == NULL);
-
- error = dmu_objset_from_ds(ds, &os);
- if (error != 0) {
- cmn_err(CE_WARN, "can't open objset %llu, error %d",
- (unsigned long long)ds->ds_object, error);
- return (0);
- }
-
- zilog = dmu_objset_zil(os);
- bp = (blkptr_t *)&zilog->zl_header->zh_log;
-
- if (!BP_IS_HOLE(bp)) {
- vdev_t *vd;
- boolean_t valid = B_TRUE;
-
- /*
- * Check the first block and determine if it's on a log device
- * which may have been removed or faulted prior to loading this
- * pool. If so, there's no point in checking the rest of the
- * log as its content should have already been synced to the
- * pool.
- */
- spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
- vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
- if (vd->vdev_islog && vdev_is_dead(vd))
- valid = vdev_log_state_valid(vd);
- spa_config_exit(os->os_spa, SCL_STATE, FTAG);
-
- if (!valid)
- return (0);
-
- /*
- * Check whether the current uberblock is checkpointed (e.g.
- * we are rewinding) and whether the current header has been
- * claimed or not. If it hasn't then skip verifying it. We
- * do this because its ZIL blocks may be part of the pool's
- * state before the rewind, which is no longer valid.
- */
- zil_header_t *zh = zil_header_in_syncing_context(zilog);
- if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
- zh->zh_claim_txg == 0)
- return (0);
- }
-
- /*
- * Because tx == NULL, zil_claim_log_block() will not actually claim
- * any blocks, but just determine whether it is possible to do so.
- * In addition to checking the log chain, zil_claim_log_block()
- * will invoke zio_claim() with a done func of spa_claim_notify(),
- * which will update spa_max_claim_txg. See spa_load() for details.
- */
- error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
- zilog->zl_header->zh_claim_txg ? -1ULL :
- spa_min_claim_txg(os->os_spa));
-
- return ((error == ECKSUM || error == ENOENT) ? 0 : error);
-}
-
-/*
- * When an itx is "skipped", this function is used to properly mark the
- * waiter as "done, and signal any thread(s) waiting on it. An itx can
- * be skipped (and not committed to an lwb) for a variety of reasons,
- * one of them being that the itx was committed via spa_sync(), prior to
- * it being committed to an lwb; this can happen if a thread calling
- * zil_commit() is racing with spa_sync().
- */
-static void
-zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
-{
- mutex_enter(&zcw->zcw_lock);
- ASSERT3B(zcw->zcw_done, ==, B_FALSE);
- zcw->zcw_done = B_TRUE;
- cv_broadcast(&zcw->zcw_cv);
- mutex_exit(&zcw->zcw_lock);
-}
-
-/*
- * This function is used when the given waiter is to be linked into an
- * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
- * At this point, the waiter will no longer be referenced by the itx,
- * and instead, will be referenced by the lwb.
- */
-static void
-zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
-{
- /*
- * The lwb_waiters field of the lwb is protected by the zilog's
- * zl_lock, thus it must be held when calling this function.
- */
- ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
-
- mutex_enter(&zcw->zcw_lock);
- ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
- ASSERT3P(lwb, !=, NULL);
- ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
- lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE);
-
- list_insert_tail(&lwb->lwb_waiters, zcw);
- zcw->zcw_lwb = lwb;
- mutex_exit(&zcw->zcw_lock);
-}
-
-/*
- * This function is used when zio_alloc_zil() fails to allocate a ZIL
- * block, and the given waiter must be linked to the "nolwb waiters"
- * list inside of zil_process_commit_list().
- */
-static void
-zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
-{
- mutex_enter(&zcw->zcw_lock);
- ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
- list_insert_tail(nolwb, zcw);
- mutex_exit(&zcw->zcw_lock);
-}
-
-void
-zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
-{
- avl_tree_t *t = &lwb->lwb_vdev_tree;
- avl_index_t where;
- zil_vdev_node_t *zv, zvsearch;
- int ndvas = BP_GET_NDVAS(bp);
- int i;
-
- if (zil_nocacheflush)
- return;
-
- mutex_enter(&lwb->lwb_vdev_lock);
- for (i = 0; i < ndvas; i++) {
- zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
- if (avl_find(t, &zvsearch, &where) == NULL) {
- zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
- zv->zv_vdev = zvsearch.zv_vdev;
- avl_insert(t, zv, where);
- }
- }
- mutex_exit(&lwb->lwb_vdev_lock);
-}
-
-static void
-zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
-{
- avl_tree_t *src = &lwb->lwb_vdev_tree;
- avl_tree_t *dst = &nlwb->lwb_vdev_tree;
- void *cookie = NULL;
- zil_vdev_node_t *zv;
-
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
- ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
- ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
-
- /*
- * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
- * not need the protection of lwb_vdev_lock (it will only be modified
- * while holding zilog->zl_lock) as its writes and those of its
- * children have all completed. The younger 'nlwb' may be waiting on
- * future writes to additional vdevs.
- */
- mutex_enter(&nlwb->lwb_vdev_lock);
- /*
- * Tear down the 'lwb' vdev tree, ensuring that entries which do not
- * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
- */
- while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
- avl_index_t where;
-
- if (avl_find(dst, zv, &where) == NULL) {
- avl_insert(dst, zv, where);
- } else {
- kmem_free(zv, sizeof (*zv));
- }
- }
- mutex_exit(&nlwb->lwb_vdev_lock);
-}
-
-void
-zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
-{
- lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
-}
-
-/*
- * This function is a called after all vdevs associated with a given lwb
- * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set. Further,
- * all "previous" lwb's will have completed before this function is
- * called; i.e. this function is called for all previous lwbs before
- * it's called for "this" lwb (enforced via zio the dependencies
- * configured in zil_lwb_set_zio_dependency()).
- *
- * The intention is for this function to be called as soon as the
- * contents of an lwb are considered "stable" on disk, and will survive
- * any sudden loss of power. At this point, any threads waiting for the
- * lwb to reach this state are signalled, and the "waiter" structures
- * are marked "done".
- */
-static void
-zil_lwb_flush_vdevs_done(zio_t *zio)
-{
- lwb_t *lwb = zio->io_private;
- zilog_t *zilog = lwb->lwb_zilog;
- dmu_tx_t *tx = lwb->lwb_tx;
- zil_commit_waiter_t *zcw;
-
- spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
-
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-
- mutex_enter(&zilog->zl_lock);
-
- /*
- * Ensure the lwb buffer pointer is cleared before releasing the
- * txg. If we have had an allocation failure and the txg is
- * waiting to sync then we want zil_sync() to remove the lwb so
- * that it's not picked up as the next new one in
- * zil_process_commit_list(). zil_sync() will only remove the
- * lwb if lwb_buf is null.
- */
- lwb->lwb_buf = NULL;
- lwb->lwb_tx = NULL;
-
- ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
- zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
-
- lwb->lwb_root_zio = NULL;
-
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
- lwb->lwb_state = LWB_STATE_FLUSH_DONE;
-
- if (zilog->zl_last_lwb_opened == lwb) {
- /*
- * Remember the highest committed log sequence number
- * for ztest. We only update this value when all the log
- * writes succeeded, because ztest wants to ASSERT that
- * it got the whole log chain.
- */
- zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
- }
-
- while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
- mutex_enter(&zcw->zcw_lock);
-
- ASSERT(list_link_active(&zcw->zcw_node));
- list_remove(&lwb->lwb_waiters, zcw);
-
- ASSERT3P(zcw->zcw_lwb, ==, lwb);
- zcw->zcw_lwb = NULL;
-
- zcw->zcw_zio_error = zio->io_error;
-
- ASSERT3B(zcw->zcw_done, ==, B_FALSE);
- zcw->zcw_done = B_TRUE;
- cv_broadcast(&zcw->zcw_cv);
-
- mutex_exit(&zcw->zcw_lock);
- }
-
- mutex_exit(&zilog->zl_lock);
-
- /*
- * Now that we've written this log block, we have a stable pointer
- * to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync.
- */
- dmu_tx_commit(tx);
-}
-
-/*
- * This is called when an lwb's write zio completes. The callback's
- * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
- * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
- * in writing out this specific lwb's data, and in the case that cache
- * flushes have been deferred, vdevs involved in writing the data for
- * previous lwbs. The writes corresponding to all the vdevs in the
- * lwb_vdev_tree will have completed by the time this is called, due to
- * the zio dependencies configured in zil_lwb_set_zio_dependency(),
- * which takes deferred flushes into account. The lwb will be "done"
- * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
- * completion callback for the lwb's root zio.
- */
-static void
-zil_lwb_write_done(zio_t *zio)
-{
- lwb_t *lwb = zio->io_private;
- spa_t *spa = zio->io_spa;
- zilog_t *zilog = lwb->lwb_zilog;
- avl_tree_t *t = &lwb->lwb_vdev_tree;
- void *cookie = NULL;
- zil_vdev_node_t *zv;
- lwb_t *nlwb;
-
- ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
-
- ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
- ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
- ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
- ASSERT(!BP_IS_GANG(zio->io_bp));
- ASSERT(!BP_IS_HOLE(zio->io_bp));
- ASSERT(BP_GET_FILL(zio->io_bp) == 0);
-
- abd_put(zio->io_abd);
-
- mutex_enter(&zilog->zl_lock);
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
- lwb->lwb_state = LWB_STATE_WRITE_DONE;
- lwb->lwb_write_zio = NULL;
- nlwb = list_next(&zilog->zl_lwb_list, lwb);
- mutex_exit(&zilog->zl_lock);
-
- if (avl_numnodes(t) == 0)
- return;
-
- /*
- * If there was an IO error, we're not going to call zio_flush()
- * on these vdevs, so we simply empty the tree and free the
- * nodes. We avoid calling zio_flush() since there isn't any
- * good reason for doing so, after the lwb block failed to be
- * written out.
- */
- if (zio->io_error != 0) {
- while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(zv, sizeof (*zv));
- return;
- }
-
- /*
- * If this lwb does not have any threads waiting for it to
- * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
- * command to the vdevs written to by "this" lwb, and instead
- * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
- * command for those vdevs. Thus, we merge the vdev tree of
- * "this" lwb with the vdev tree of the "next" lwb in the list,
- * and assume the "next" lwb will handle flushing the vdevs (or
- * deferring the flush(s) again).
- *
- * This is a useful performance optimization, especially for
- * workloads with lots of async write activity and few sync
- * write and/or fsync activity, as it has the potential to
- * coalesce multiple flush commands to a vdev into one.
- */
- if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
- zil_lwb_flush_defer(lwb, nlwb);
- ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
- return;
- }
-
- while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
- vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
- if (vd != NULL)
- zio_flush(lwb->lwb_root_zio, vd);
- kmem_free(zv, sizeof (*zv));
- }
-}
-
-static void
-zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
-{
- lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT(MUTEX_HELD(&zilog->zl_lock));
-
- /*
- * The zilog's "zl_last_lwb_opened" field is used to build the
- * lwb/zio dependency chain, which is used to preserve the
- * ordering of lwb completions that is required by the semantics
- * of the ZIL. Each new lwb zio becomes a parent of the
- * "previous" lwb zio, such that the new lwb's zio cannot
- * complete until the "previous" lwb's zio completes.
- *
- * This is required by the semantics of zil_commit(); the commit
- * waiters attached to the lwbs will be woken in the lwb zio's
- * completion callback, so this zio dependency graph ensures the
- * waiters are woken in the correct order (the same order the
- * lwbs were created).
- */
- if (last_lwb_opened != NULL &&
- last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
- ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
- last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
- last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
-
- ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
- zio_add_child(lwb->lwb_root_zio,
- last_lwb_opened->lwb_root_zio);
-
- /*
- * If the previous lwb's write hasn't already completed,
- * we also want to order the completion of the lwb write
- * zios (above, we only order the completion of the lwb
- * root zios). This is required because of how we can
- * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
- *
- * When the DKIOCFLUSHWRITECACHE commands are defered,
- * the previous lwb will rely on this lwb to flush the
- * vdevs written to by that previous lwb. Thus, we need
- * to ensure this lwb doesn't issue the flush until
- * after the previous lwb's write completes. We ensure
- * this ordering by setting the zio parent/child
- * relationship here.
- *
- * Without this relationship on the lwb's write zio,
- * it's possible for this lwb's write to complete prior
- * to the previous lwb's write completing; and thus, the
- * vdevs for the previous lwb would be flushed prior to
- * that lwb's data being written to those vdevs (the
- * vdevs are flushed in the lwb write zio's completion
- * handler, zil_lwb_write_done()).
- */
- if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
- ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
- last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
-
- ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
- zio_add_child(lwb->lwb_write_zio,
- last_lwb_opened->lwb_write_zio);
- }
- }
-}
-
-
-/*
- * This function's purpose is to "open" an lwb such that it is ready to
- * accept new itxs being committed to it. To do this, the lwb's zio
- * structures are created, and linked to the lwb. This function is
- * idempotent; if the passed in lwb has already been opened, this
- * function is essentially a no-op.
- */
-static void
-zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
-{
- zbookmark_phys_t zb;
- zio_priority_t prio;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT3P(lwb, !=, NULL);
- EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
- EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
-
- SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
- ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
- lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
- if (lwb->lwb_root_zio == NULL) {
- abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
- BP_GET_LSIZE(&lwb->lwb_blk));
-
- if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
- prio = ZIO_PRIORITY_SYNC_WRITE;
- else
- prio = ZIO_PRIORITY_ASYNC_WRITE;
-
- lwb->lwb_root_zio = zio_root(zilog->zl_spa,
- zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
- ASSERT3P(lwb->lwb_root_zio, !=, NULL);
-
- lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
- zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
- BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
- prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
- ASSERT3P(lwb->lwb_write_zio, !=, NULL);
-
- lwb->lwb_state = LWB_STATE_OPENED;
-
- mutex_enter(&zilog->zl_lock);
- zil_lwb_set_zio_dependency(zilog, lwb);
- zilog->zl_last_lwb_opened = lwb;
- mutex_exit(&zilog->zl_lock);
- }
-
- ASSERT3P(lwb->lwb_root_zio, !=, NULL);
- ASSERT3P(lwb->lwb_write_zio, !=, NULL);
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
-}
-
-/*
- * Define a limited set of intent log block sizes.
- *
- * These must be a multiple of 4KB. Note only the amount used (again
- * aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
- */
-struct {
- uint64_t limit;
- uint64_t blksz;
-} zil_block_buckets[] = {
- { 4096, 4096 }, /* non TX_WRITE */
- { 8192 + 4096, 8192 + 4096 }, /* database */
- { 32768 + 4096, 32768 + 4096 }, /* NFS writes */
- { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
- { 131072, 131072 }, /* < 128KB writes */
- { 131072 + 4096, 65536 + 4096 }, /* 128KB writes */
- { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */
-};
-
-/*
- * Maximum block size used by the ZIL. This is picked up when the ZIL is
- * initialized. Otherwise this should not be used directly; see
- * zl_max_block_size instead.
- */
-int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_maxblocksize, CTLFLAG_RWTUN,
- &zil_maxblocksize, 0, "Limit in bytes of ZIL log block size");
-
-/*
- * Start a log block write and advance to the next log block.
- * Calls are serialized.
- */
-static lwb_t *
-zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
-{
- lwb_t *nlwb = NULL;
- zil_chain_t *zilc;
- spa_t *spa = zilog->zl_spa;
- blkptr_t *bp;
- dmu_tx_t *tx;
- uint64_t txg;
- uint64_t zil_blksz, wsz;
- int i, error;
- boolean_t slog;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT3P(lwb->lwb_root_zio, !=, NULL);
- ASSERT3P(lwb->lwb_write_zio, !=, NULL);
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
-
- if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
- zilc = (zil_chain_t *)lwb->lwb_buf;
- bp = &zilc->zc_next_blk;
- } else {
- zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
- bp = &zilc->zc_next_blk;
- }
-
- ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
-
- /*
- * Allocate the next block and save its address in this block
- * before writing it in order to establish the log chain.
- * Note that if the allocation of nlwb synced before we wrote
- * the block that points at it (lwb), we'd leak it if we crashed.
- * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
- * We dirty the dataset to ensure that zil_sync() will be called
- * to clean up in the event of allocation failure or I/O failure.
- */
-
- tx = dmu_tx_create(zilog->zl_os);
-
- /*
- * Since we are not going to create any new dirty data, and we
- * can even help with clearing the existing dirty data, we
- * should not be subject to the dirty data based delays. We
- * use TXG_NOTHROTTLE to bypass the delay mechanism.
- */
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
-
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- lwb->lwb_tx = tx;
-
- /*
- * Log blocks are pre-allocated. Here we select the size of the next
- * block, based on size used in the last block.
- * - first find the smallest bucket that will fit the block from a
- * limited set of block sizes. This is because it's faster to write
- * blocks allocated from the same metaslab as they are adjacent or
- * close.
- * - next find the maximum from the new suggested size and an array of
- * previous sizes. This lessens a picket fence effect of wrongly
- * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k
- * requests.
- *
- * Note we only write what is used, but we can't just allocate
- * the maximum block size because we can exhaust the available
- * pool log space.
- */
- zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
- for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
- continue;
- zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
- zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
- for (i = 0; i < ZIL_PREV_BLKS; i++)
- zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
- zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
-
- BP_ZERO(bp);
-
- /* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
- txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
- if (error == 0) {
- ASSERT3U(bp->blk_birth, ==, txg);
- bp->blk_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
-
- /*
- * Allocate a new log write block (lwb).
- */
- nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
- }
-
- if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
- /* For Slim ZIL only write what is used. */
- wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
- ASSERT3U(wsz, <=, lwb->lwb_sz);
- zio_shrink(lwb->lwb_write_zio, wsz);
-
- } else {
- wsz = lwb->lwb_sz;
- }
-
- zilc->zc_pad = 0;
- zilc->zc_nused = lwb->lwb_nused;
- zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
-
- /*
- * clear unused data for security
- */
- bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
-
- spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
-
- zil_lwb_add_block(lwb, &lwb->lwb_blk);
- lwb->lwb_issued_timestamp = gethrtime();
- lwb->lwb_state = LWB_STATE_ISSUED;
-
- zio_nowait(lwb->lwb_root_zio);
- zio_nowait(lwb->lwb_write_zio);
-
- /*
- * If there was an allocation failure then nlwb will be null which
- * forces a txg_wait_synced().
- */
- return (nlwb);
-}
-
-/*
- * Maximum amount of write data that can be put into single log block.
- */
-uint64_t
-zil_max_log_data(zilog_t *zilog)
-{
- return (zilog->zl_max_block_size -
- sizeof (zil_chain_t) - sizeof (lr_write_t));
-}
-
-/*
- * Maximum amount of log space we agree to waste to reduce number of
- * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
- */
-static inline uint64_t
-zil_max_waste_space(zilog_t *zilog)
-{
- return (zil_max_log_data(zilog) / 8);
-}
-
-/*
- * Maximum amount of write data for WR_COPIED. For correctness, consumers
- * must fall back to WR_NEED_COPY if we can't fit the entire record into one
- * maximum sized log block, because each WR_COPIED record must fit in a
- * single log block. For space efficiency, we want to fit two records into a
- * max-sized log block.
- */
-uint64_t
-zil_max_copied_data(zilog_t *zilog)
-{
- return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
- sizeof (lr_write_t));
-}
-
-static lwb_t *
-zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
-{
- lr_t *lrcb, *lrc;
- lr_write_t *lrwb, *lrw;
- char *lr_buf;
- uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(lwb->lwb_buf, !=, NULL);
-
- zil_lwb_write_open(zilog, lwb);
-
- lrc = &itx->itx_lr;
- lrw = (lr_write_t *)lrc;
-
- /*
- * A commit itx doesn't represent any on-disk state; instead
- * it's simply used as a place holder on the commit list, and
- * provides a mechanism for attaching a "commit waiter" onto the
- * correct lwb (such that the waiter can be signalled upon
- * completion of that lwb). Thus, we don't process this itx's
- * log record if it's a commit itx (these itx's don't have log
- * records), and instead link the itx's waiter onto the lwb's
- * list of waiters.
- *
- * For more details, see the comment above zil_commit().
- */
- if (lrc->lrc_txtype == TX_COMMIT) {
- mutex_enter(&zilog->zl_lock);
- zil_commit_waiter_link_lwb(itx->itx_private, lwb);
- itx->itx_private = NULL;
- mutex_exit(&zilog->zl_lock);
- return (lwb);
- }
-
- if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
- dlen = P2ROUNDUP_TYPED(
- lrw->lr_length, sizeof (uint64_t), uint64_t);
- } else {
- dlen = 0;
- }
- reclen = lrc->lrc_reclen;
- zilog->zl_cur_used += (reclen + dlen);
- txg = lrc->lrc_txg;
-
- ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
-
-cont:
- /*
- * If this record won't fit in the current log block, start a new one.
- * For WR_NEED_COPY optimize layout for minimal number of chunks.
- */
- lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
- max_log_data = zil_max_log_data(zilog);
- if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
- lwb_sp < zil_max_waste_space(zilog) &&
- (dlen % max_log_data == 0 ||
- lwb_sp < reclen + dlen % max_log_data))) {
- lwb = zil_lwb_write_issue(zilog, lwb);
- if (lwb == NULL)
- return (NULL);
- zil_lwb_write_open(zilog, lwb);
- ASSERT(LWB_EMPTY(lwb));
- lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
-
- /*
- * There must be enough space in the new, empty log block to
- * hold reclen. For WR_COPIED, we need to fit the whole
- * record in one block, and reclen is the header size + the
- * data size. For WR_NEED_COPY, we can create multiple
- * records, splitting the data into multiple blocks, so we
- * only need to fit one word of data per block; in this case
- * reclen is just the header size (no data).
- */
- ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
- }
-
- dnow = MIN(dlen, lwb_sp - reclen);
- lr_buf = lwb->lwb_buf + lwb->lwb_nused;
- bcopy(lrc, lr_buf, reclen);
- lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
- lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
-
- /*
- * If it's a write, fetch the data or get its blkptr as appropriate.
- */
- if (lrc->lrc_txtype == TX_WRITE) {
- if (txg > spa_freeze_txg(zilog->zl_spa))
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- if (itx->itx_wr_state != WR_COPIED) {
- char *dbuf;
- int error;
-
- if (itx->itx_wr_state == WR_NEED_COPY) {
- dbuf = lr_buf + reclen;
- lrcb->lrc_reclen += dnow;
- if (lrwb->lr_length > dnow)
- lrwb->lr_length = dnow;
- lrw->lr_offset += dnow;
- lrw->lr_length -= dnow;
- } else {
- ASSERT(itx->itx_wr_state == WR_INDIRECT);
- dbuf = NULL;
- }
-
- /*
- * We pass in the "lwb_write_zio" rather than
- * "lwb_root_zio" so that the "lwb_write_zio"
- * becomes the parent of any zio's created by
- * the "zl_get_data" callback. The vdevs are
- * flushed after the "lwb_write_zio" completes,
- * so we want to make sure that completion
- * callback waits for these additional zio's,
- * such that the vdevs used by those zio's will
- * be included in the lwb's vdev tree, and those
- * vdevs will be properly flushed. If we passed
- * in "lwb_root_zio" here, then these additional
- * vdevs may not be flushed; e.g. if these zio's
- * completed after "lwb_write_zio" completed.
- */
- error = zilog->zl_get_data(itx->itx_private,
- lrwb, dbuf, lwb, lwb->lwb_write_zio);
-
- if (error == EIO) {
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- return (lwb);
- }
- if (error != 0) {
- ASSERT(error == ENOENT || error == EEXIST ||
- error == EALREADY);
- return (lwb);
- }
- }
- }
-
- /*
- * We're actually making an entry, so update lrc_seq to be the
- * log record sequence number. Note that this is generally not
- * equal to the itx sequence number because not all transactions
- * are synchronous, and sometimes spa_sync() gets there first.
- */
- lrcb->lrc_seq = ++zilog->zl_lr_seq;
- lwb->lwb_nused += reclen + dnow;
-
- zil_lwb_add_txg(lwb, txg);
-
- ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
- ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
-
- dlen -= dnow;
- if (dlen > 0) {
- zilog->zl_cur_used += reclen;
- goto cont;
- }
-
- return (lwb);
-}
-
-itx_t *
-zil_itx_create(uint64_t txtype, size_t lrsize)
-{
- itx_t *itx;
-
- lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
-
- itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
- itx->itx_lr.lrc_txtype = txtype;
- itx->itx_lr.lrc_reclen = lrsize;
- itx->itx_lr.lrc_seq = 0; /* defensive */
- itx->itx_sync = B_TRUE; /* default is synchronous */
-
- return (itx);
-}
-
-void
-zil_itx_destroy(itx_t *itx)
-{
- kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
-}
-
-/*
- * Free up the sync and async itxs. The itxs_t has already been detached
- * so no locks are needed.
- */
-static void
-zil_itxg_clean(itxs_t *itxs)
-{
- itx_t *itx;
- list_t *list;
- avl_tree_t *t;
- void *cookie;
- itx_async_node_t *ian;
-
- list = &itxs->i_sync_list;
- while ((itx = list_head(list)) != NULL) {
- /*
- * In the general case, commit itxs will not be found
- * here, as they'll be committed to an lwb via
- * zil_lwb_commit(), and free'd in that function. Having
- * said that, it is still possible for commit itxs to be
- * found here, due to the following race:
- *
- * - a thread calls zil_commit() which assigns the
- * commit itx to a per-txg i_sync_list
- * - zil_itxg_clean() is called (e.g. via spa_sync())
- * while the waiter is still on the i_sync_list
- *
- * There's nothing to prevent syncing the txg while the
- * waiter is on the i_sync_list. This normally doesn't
- * happen because spa_sync() is slower than zil_commit(),
- * but if zil_commit() calls txg_wait_synced() (e.g.
- * because zil_create() or zil_commit_writer_stall() is
- * called) we will hit this case.
- */
- if (itx->itx_lr.lrc_txtype == TX_COMMIT)
- zil_commit_waiter_skip(itx->itx_private);
-
- list_remove(list, itx);
- zil_itx_destroy(itx);
- }
-
- cookie = NULL;
- t = &itxs->i_async_tree;
- while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
- list = &ian->ia_list;
- while ((itx = list_head(list)) != NULL) {
- list_remove(list, itx);
- /* commit itxs should never be on the async lists. */
- ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
- zil_itx_destroy(itx);
- }
- list_destroy(list);
- kmem_free(ian, sizeof (itx_async_node_t));
- }
- avl_destroy(t);
-
- kmem_free(itxs, sizeof (itxs_t));
-}
-
-static int
-zil_aitx_compare(const void *x1, const void *x2)
-{
- const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
- const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
-
- return (AVL_CMP(o1, o2));
-}
-
-/*
- * Remove all async itx with the given oid.
- */
-static void
-zil_remove_async(zilog_t *zilog, uint64_t oid)
-{
- uint64_t otxg, txg;
- itx_async_node_t *ian;
- avl_tree_t *t;
- avl_index_t where;
- list_t clean_list;
- itx_t *itx;
-
- ASSERT(oid != 0);
- list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
-
- if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
- otxg = ZILTEST_TXG;
- else
- otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
-
- for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
- itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
-
- mutex_enter(&itxg->itxg_lock);
- if (itxg->itxg_txg != txg) {
- mutex_exit(&itxg->itxg_lock);
- continue;
- }
-
- /*
- * Locate the object node and append its list.
- */
- t = &itxg->itxg_itxs->i_async_tree;
- ian = avl_find(t, &oid, &where);
- if (ian != NULL)
- list_move_tail(&clean_list, &ian->ia_list);
- mutex_exit(&itxg->itxg_lock);
- }
- while ((itx = list_head(&clean_list)) != NULL) {
- list_remove(&clean_list, itx);
- /* commit itxs should never be on the async lists. */
- ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
- zil_itx_destroy(itx);
- }
- list_destroy(&clean_list);
-}
-
-void
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
-{
- uint64_t txg;
- itxg_t *itxg;
- itxs_t *itxs, *clean = NULL;
-
- /*
- * Object ids can be re-instantiated in the next txg so
- * remove any async transactions to avoid future leaks.
- * This can happen if a fsync occurs on the re-instantiated
- * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
- * the new file data and flushes a write record for the old object.
- */
- if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
- zil_remove_async(zilog, itx->itx_oid);
-
- /*
- * Ensure the data of a renamed file is committed before the rename.
- */
- if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
- zil_async_to_sync(zilog, itx->itx_oid);
-
- if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
- txg = ZILTEST_TXG;
- else
- txg = dmu_tx_get_txg(tx);
-
- itxg = &zilog->zl_itxg[txg & TXG_MASK];
- mutex_enter(&itxg->itxg_lock);
- itxs = itxg->itxg_itxs;
- if (itxg->itxg_txg != txg) {
- if (itxs != NULL) {
- /*
- * The zil_clean callback hasn't got around to cleaning
- * this itxg. Save the itxs for release below.
- * This should be rare.
- */
- zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
- "txg %llu", itxg->itxg_txg);
- clean = itxg->itxg_itxs;
- }
- itxg->itxg_txg = txg;
- itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
-
- list_create(&itxs->i_sync_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
- avl_create(&itxs->i_async_tree, zil_aitx_compare,
- sizeof (itx_async_node_t),
- offsetof(itx_async_node_t, ia_node));
- }
- if (itx->itx_sync) {
- list_insert_tail(&itxs->i_sync_list, itx);
- } else {
- avl_tree_t *t = &itxs->i_async_tree;
- uint64_t foid =
- LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
- itx_async_node_t *ian;
- avl_index_t where;
-
- ian = avl_find(t, &foid, &where);
- if (ian == NULL) {
- ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
- list_create(&ian->ia_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
- ian->ia_foid = foid;
- avl_insert(t, ian, where);
- }
- list_insert_tail(&ian->ia_list, itx);
- }
-
- itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
-
- /*
- * We don't want to dirty the ZIL using ZILTEST_TXG, because
- * zil_clean() will never be called using ZILTEST_TXG. Thus, we
- * need to be careful to always dirty the ZIL using the "real"
- * TXG (not itxg_txg) even when the SPA is frozen.
- */
- zilog_dirty(zilog, dmu_tx_get_txg(tx));
- mutex_exit(&itxg->itxg_lock);
-
- /* Release the old itxs now we've dropped the lock */
- if (clean != NULL)
- zil_itxg_clean(clean);
-}
-
-/*
- * If there are any in-memory intent log transactions which have now been
- * synced then start up a taskq to free them. We should only do this after we
- * have written out the uberblocks (i.e. txg has been comitted) so that
- * don't inadvertently clean out in-memory log records that would be required
- * by zil_commit().
- */
-void
-zil_clean(zilog_t *zilog, uint64_t synced_txg)
-{
- itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
- itxs_t *clean_me;
-
- ASSERT3U(synced_txg, <, ZILTEST_TXG);
-
- mutex_enter(&itxg->itxg_lock);
- if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
- mutex_exit(&itxg->itxg_lock);
- return;
- }
- ASSERT3U(itxg->itxg_txg, <=, synced_txg);
- ASSERT3U(itxg->itxg_txg, !=, 0);
- clean_me = itxg->itxg_itxs;
- itxg->itxg_itxs = NULL;
- itxg->itxg_txg = 0;
- mutex_exit(&itxg->itxg_lock);
- /*
- * Preferably start a task queue to free up the old itxs but
- * if taskq_dispatch can't allocate resources to do that then
- * free it in-line. This should be rare. Note, using TQ_SLEEP
- * created a bad performance problem.
- */
- ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
- ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
- if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
- (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
- zil_itxg_clean(clean_me);
-}
-
-/*
- * This function will traverse the queue of itxs that need to be
- * committed, and move them onto the ZIL's zl_itx_commit_list.
- */
-static void
-zil_get_commit_list(zilog_t *zilog)
-{
- uint64_t otxg, txg;
- list_t *commit_list = &zilog->zl_itx_commit_list;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
-
- if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
- otxg = ZILTEST_TXG;
- else
- otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
-
- /*
- * This is inherently racy, since there is nothing to prevent
- * the last synced txg from changing. That's okay since we'll
- * only commit things in the future.
- */
- for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
- itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
-
- mutex_enter(&itxg->itxg_lock);
- if (itxg->itxg_txg != txg) {
- mutex_exit(&itxg->itxg_lock);
- continue;
- }
-
- /*
- * If we're adding itx records to the zl_itx_commit_list,
- * then the zil better be dirty in this "txg". We can assert
- * that here since we're holding the itxg_lock which will
- * prevent spa_sync from cleaning it. Once we add the itxs
- * to the zl_itx_commit_list we must commit it to disk even
- * if it's unnecessary (i.e. the txg was synced).
- */
- ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
- spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
- list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
-
- mutex_exit(&itxg->itxg_lock);
- }
-}
-
-/*
- * Move the async itxs for a specified object to commit into sync lists.
- */
-void
-zil_async_to_sync(zilog_t *zilog, uint64_t foid)
-{
- uint64_t otxg, txg;
- itx_async_node_t *ian;
- avl_tree_t *t;
- avl_index_t where;
-
- if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
- otxg = ZILTEST_TXG;
- else
- otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
-
- /*
- * This is inherently racy, since there is nothing to prevent
- * the last synced txg from changing.
- */
- for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
- itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
-
- mutex_enter(&itxg->itxg_lock);
- if (itxg->itxg_txg != txg) {
- mutex_exit(&itxg->itxg_lock);
- continue;
- }
-
- /*
- * If a foid is specified then find that node and append its
- * list. Otherwise walk the tree appending all the lists
- * to the sync list. We add to the end rather than the
- * beginning to ensure the create has happened.
- */
- t = &itxg->itxg_itxs->i_async_tree;
- if (foid != 0) {
- ian = avl_find(t, &foid, &where);
- if (ian != NULL) {
- list_move_tail(&itxg->itxg_itxs->i_sync_list,
- &ian->ia_list);
- }
- } else {
- void *cookie = NULL;
-
- while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
- list_move_tail(&itxg->itxg_itxs->i_sync_list,
- &ian->ia_list);
- list_destroy(&ian->ia_list);
- kmem_free(ian, sizeof (itx_async_node_t));
- }
- }
- mutex_exit(&itxg->itxg_lock);
- }
-}
-
-/*
- * This function will prune commit itxs that are at the head of the
- * commit list (it won't prune past the first non-commit itx), and
- * either: a) attach them to the last lwb that's still pending
- * completion, or b) skip them altogether.
- *
- * This is used as a performance optimization to prevent commit itxs
- * from generating new lwbs when it's unnecessary to do so.
- */
-static void
-zil_prune_commit_list(zilog_t *zilog)
-{
- itx_t *itx;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
-
- while (itx = list_head(&zilog->zl_itx_commit_list)) {
- lr_t *lrc = &itx->itx_lr;
- if (lrc->lrc_txtype != TX_COMMIT)
- break;
-
- mutex_enter(&zilog->zl_lock);
-
- lwb_t *last_lwb = zilog->zl_last_lwb_opened;
- if (last_lwb == NULL ||
- last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
- /*
- * All of the itxs this waiter was waiting on
- * must have already completed (or there were
- * never any itx's for it to wait on), so it's
- * safe to skip this waiter and mark it done.
- */
- zil_commit_waiter_skip(itx->itx_private);
- } else {
- zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
- itx->itx_private = NULL;
- }
-
- mutex_exit(&zilog->zl_lock);
-
- list_remove(&zilog->zl_itx_commit_list, itx);
- zil_itx_destroy(itx);
- }
-
- IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
-}
-
-static void
-zil_commit_writer_stall(zilog_t *zilog)
-{
- /*
- * When zio_alloc_zil() fails to allocate the next lwb block on
- * disk, we must call txg_wait_synced() to ensure all of the
- * lwbs in the zilog's zl_lwb_list are synced and then freed (in
- * zil_sync()), such that any subsequent ZIL writer (i.e. a call
- * to zil_process_commit_list()) will have to call zil_create(),
- * and start a new ZIL chain.
- *
- * Since zil_alloc_zil() failed, the lwb that was previously
- * issued does not have a pointer to the "next" lwb on disk.
- * Thus, if another ZIL writer thread was to allocate the "next"
- * on-disk lwb, that block could be leaked in the event of a
- * crash (because the previous lwb on-disk would not point to
- * it).
- *
- * We must hold the zilog's zl_issuer_lock while we do this, to
- * ensure no new threads enter zil_process_commit_list() until
- * all lwb's in the zl_lwb_list have been synced and freed
- * (which is achieved via the txg_wait_synced() call).
- */
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- txg_wait_synced(zilog->zl_dmu_pool, 0);
- ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
-}
-
-/*
- * This function will traverse the commit list, creating new lwbs as
- * needed, and committing the itxs from the commit list to these newly
- * created lwbs. Additionally, as a new lwb is created, the previous
- * lwb will be issued to the zio layer to be written to disk.
- */
-static void
-zil_process_commit_list(zilog_t *zilog)
-{
- spa_t *spa = zilog->zl_spa;
- list_t nolwb_waiters;
- lwb_t *lwb;
- itx_t *itx;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
-
- /*
- * Return if there's nothing to commit before we dirty the fs by
- * calling zil_create().
- */
- if (list_head(&zilog->zl_itx_commit_list) == NULL)
- return;
-
- list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
- offsetof(zil_commit_waiter_t, zcw_node));
-
- lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- lwb = zil_create(zilog);
- } else {
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
- }
-
- while (itx = list_head(&zilog->zl_itx_commit_list)) {
- lr_t *lrc = &itx->itx_lr;
- uint64_t txg = lrc->lrc_txg;
-
- ASSERT3U(txg, !=, 0);
-
- if (lrc->lrc_txtype == TX_COMMIT) {
- DTRACE_PROBE2(zil__process__commit__itx,
- zilog_t *, zilog, itx_t *, itx);
- } else {
- DTRACE_PROBE2(zil__process__normal__itx,
- zilog_t *, zilog, itx_t *, itx);
- }
-
- boolean_t synced = txg <= spa_last_synced_txg(spa);
- boolean_t frozen = txg > spa_freeze_txg(spa);
-
- /*
- * If the txg of this itx has already been synced out, then
- * we don't need to commit this itx to an lwb. This is
- * because the data of this itx will have already been
- * written to the main pool. This is inherently racy, and
- * it's still ok to commit an itx whose txg has already
- * been synced; this will result in a write that's
- * unnecessary, but will do no harm.
- *
- * With that said, we always want to commit TX_COMMIT itxs
- * to an lwb, regardless of whether or not that itx's txg
- * has been synced out. We do this to ensure any OPENED lwb
- * will always have at least one zil_commit_waiter_t linked
- * to the lwb.
- *
- * As a counter-example, if we skipped TX_COMMIT itx's
- * whose txg had already been synced, the following
- * situation could occur if we happened to be racing with
- * spa_sync:
- *
- * 1. we commit a non-TX_COMMIT itx to an lwb, where the
- * itx's txg is 10 and the last synced txg is 9.
- * 2. spa_sync finishes syncing out txg 10.
- * 3. we move to the next itx in the list, it's a TX_COMMIT
- * whose txg is 10, so we skip it rather than committing
- * it to the lwb used in (1).
- *
- * If the itx that is skipped in (3) is the last TX_COMMIT
- * itx in the commit list, than it's possible for the lwb
- * used in (1) to remain in the OPENED state indefinitely.
- *
- * To prevent the above scenario from occuring, ensuring
- * that once an lwb is OPENED it will transition to ISSUED
- * and eventually DONE, we always commit TX_COMMIT itx's to
- * an lwb here, even if that itx's txg has already been
- * synced.
- *
- * Finally, if the pool is frozen, we _always_ commit the
- * itx. The point of freezing the pool is to prevent data
- * from being written to the main pool via spa_sync, and
- * instead rely solely on the ZIL to persistently store the
- * data; i.e. when the pool is frozen, the last synced txg
- * value can't be trusted.
- */
- if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
- if (lwb != NULL) {
- lwb = zil_lwb_commit(zilog, itx, lwb);
- } else if (lrc->lrc_txtype == TX_COMMIT) {
- ASSERT3P(lwb, ==, NULL);
- zil_commit_waiter_link_nolwb(
- itx->itx_private, &nolwb_waiters);
- }
- }
-
- list_remove(&zilog->zl_itx_commit_list, itx);
- zil_itx_destroy(itx);
- }
-
- if (lwb == NULL) {
- /*
- * This indicates zio_alloc_zil() failed to allocate the
- * "next" lwb on-disk. When this happens, we must stall
- * the ZIL write pipeline; see the comment within
- * zil_commit_writer_stall() for more details.
- */
- zil_commit_writer_stall(zilog);
-
- /*
- * Additionally, we have to signal and mark the "nolwb"
- * waiters as "done" here, since without an lwb, we
- * can't do this via zil_lwb_flush_vdevs_done() like
- * normal.
- */
- zil_commit_waiter_t *zcw;
- while (zcw = list_head(&nolwb_waiters)) {
- zil_commit_waiter_skip(zcw);
- list_remove(&nolwb_waiters, zcw);
- }
- } else {
- ASSERT(list_is_empty(&nolwb_waiters));
- ASSERT3P(lwb, !=, NULL);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
-
- /*
- * At this point, the ZIL block pointed at by the "lwb"
- * variable is in one of the following states: "closed"
- * or "open".
- *
- * If its "closed", then no itxs have been committed to
- * it, so there's no point in issuing its zio (i.e.
- * it's "empty").
- *
- * If its "open" state, then it contains one or more
- * itxs that eventually need to be committed to stable
- * storage. In this case we intentionally do not issue
- * the lwb's zio to disk yet, and instead rely on one of
- * the following two mechanisms for issuing the zio:
- *
- * 1. Ideally, there will be more ZIL activity occuring
- * on the system, such that this function will be
- * immediately called again (not necessarily by the same
- * thread) and this lwb's zio will be issued via
- * zil_lwb_commit(). This way, the lwb is guaranteed to
- * be "full" when it is issued to disk, and we'll make
- * use of the lwb's size the best we can.
- *
- * 2. If there isn't sufficient ZIL activity occuring on
- * the system, such that this lwb's zio isn't issued via
- * zil_lwb_commit(), zil_commit_waiter() will issue the
- * lwb's zio. If this occurs, the lwb is not guaranteed
- * to be "full" by the time its zio is issued, and means
- * the size of the lwb was "too large" given the amount
- * of ZIL activity occuring on the system at that time.
- *
- * We do this for a couple of reasons:
- *
- * 1. To try and reduce the number of IOPs needed to
- * write the same number of itxs. If an lwb has space
- * available in it's buffer for more itxs, and more itxs
- * will be committed relatively soon (relative to the
- * latency of performing a write), then it's beneficial
- * to wait for these "next" itxs. This way, more itxs
- * can be committed to stable storage with fewer writes.
- *
- * 2. To try and use the largest lwb block size that the
- * incoming rate of itxs can support. Again, this is to
- * try and pack as many itxs into as few lwbs as
- * possible, without significantly impacting the latency
- * of each individual itx.
- */
- }
-}
-
-/*
- * This function is responsible for ensuring the passed in commit waiter
- * (and associated commit itx) is committed to an lwb. If the waiter is
- * not already committed to an lwb, all itxs in the zilog's queue of
- * itxs will be processed. The assumption is the passed in waiter's
- * commit itx will found in the queue just like the other non-commit
- * itxs, such that when the entire queue is processed, the waiter will
- * have been commited to an lwb.
- *
- * The lwb associated with the passed in waiter is not guaranteed to
- * have been issued by the time this function completes. If the lwb is
- * not issued, we rely on future calls to zil_commit_writer() to issue
- * the lwb, or the timeout mechanism found in zil_commit_waiter().
- */
-static void
-zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
-{
- ASSERT(!MUTEX_HELD(&zilog->zl_lock));
- ASSERT(spa_writeable(zilog->zl_spa));
-
- mutex_enter(&zilog->zl_issuer_lock);
-
- if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
- /*
- * It's possible that, while we were waiting to acquire
- * the "zl_issuer_lock", another thread committed this
- * waiter to an lwb. If that occurs, we bail out early,
- * without processing any of the zilog's queue of itxs.
- *
- * On certain workloads and system configurations, the
- * "zl_issuer_lock" can become highly contended. In an
- * attempt to reduce this contention, we immediately drop
- * the lock if the waiter has already been processed.
- *
- * We've measured this optimization to reduce CPU spent
- * contending on this lock by up to 5%, using a system
- * with 32 CPUs, low latency storage (~50 usec writes),
- * and 1024 threads performing sync writes.
- */
- goto out;
- }
-
- zil_get_commit_list(zilog);
- zil_prune_commit_list(zilog);
- zil_process_commit_list(zilog);
-
-out:
- mutex_exit(&zilog->zl_issuer_lock);
-}
-
-static void
-zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
-{
- ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT(MUTEX_HELD(&zcw->zcw_lock));
- ASSERT3B(zcw->zcw_done, ==, B_FALSE);
-
- lwb_t *lwb = zcw->zcw_lwb;
- ASSERT3P(lwb, !=, NULL);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
-
- /*
- * If the lwb has already been issued by another thread, we can
- * immediately return since there's no work to be done (the
- * point of this function is to issue the lwb). Additionally, we
- * do this prior to acquiring the zl_issuer_lock, to avoid
- * acquiring it when it's not necessary to do so.
- */
- if (lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE)
- return;
-
- /*
- * In order to call zil_lwb_write_issue() we must hold the
- * zilog's "zl_issuer_lock". We can't simply acquire that lock,
- * since we're already holding the commit waiter's "zcw_lock",
- * and those two locks are aquired in the opposite order
- * elsewhere.
- */
- mutex_exit(&zcw->zcw_lock);
- mutex_enter(&zilog->zl_issuer_lock);
- mutex_enter(&zcw->zcw_lock);
-
- /*
- * Since we just dropped and re-acquired the commit waiter's
- * lock, we have to re-check to see if the waiter was marked
- * "done" during that process. If the waiter was marked "done",
- * the "lwb" pointer is no longer valid (it can be free'd after
- * the waiter is marked "done"), so without this check we could
- * wind up with a use-after-free error below.
- */
- if (zcw->zcw_done)
- goto out;
-
- ASSERT3P(lwb, ==, zcw->zcw_lwb);
-
- /*
- * We've already checked this above, but since we hadn't acquired
- * the zilog's zl_issuer_lock, we have to perform this check a
- * second time while holding the lock.
- *
- * We don't need to hold the zl_lock since the lwb cannot transition
- * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
- * _can_ transition from ISSUED to DONE, but it's OK to race with
- * that transition since we treat the lwb the same, whether it's in
- * the ISSUED or DONE states.
- *
- * The important thing, is we treat the lwb differently depending on
- * if it's ISSUED or OPENED, and block any other threads that might
- * attempt to issue this lwb. For that reason we hold the
- * zl_issuer_lock when checking the lwb_state; we must not call
- * zil_lwb_write_issue() if the lwb had already been issued.
- *
- * See the comment above the lwb_state_t structure definition for
- * more details on the lwb states, and locking requirements.
- */
- if (lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE)
- goto out;
-
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
-
- /*
- * As described in the comments above zil_commit_waiter() and
- * zil_process_commit_list(), we need to issue this lwb's zio
- * since we've reached the commit waiter's timeout and it still
- * hasn't been issued.
- */
- lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
-
- IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
-
- /*
- * Since the lwb's zio hadn't been issued by the time this thread
- * reached its timeout, we reset the zilog's "zl_cur_used" field
- * to influence the zil block size selection algorithm.
- *
- * By having to issue the lwb's zio here, it means the size of the
- * lwb was too large, given the incoming throughput of itxs. By
- * setting "zl_cur_used" to zero, we communicate this fact to the
- * block size selection algorithm, so it can take this informaiton
- * into account, and potentially select a smaller size for the
- * next lwb block that is allocated.
- */
- zilog->zl_cur_used = 0;
-
- if (nlwb == NULL) {
- /*
- * When zil_lwb_write_issue() returns NULL, this
- * indicates zio_alloc_zil() failed to allocate the
- * "next" lwb on-disk. When this occurs, the ZIL write
- * pipeline must be stalled; see the comment within the
- * zil_commit_writer_stall() function for more details.
- *
- * We must drop the commit waiter's lock prior to
- * calling zil_commit_writer_stall() or else we can wind
- * up with the following deadlock:
- *
- * - This thread is waiting for the txg to sync while
- * holding the waiter's lock; txg_wait_synced() is
- * used within txg_commit_writer_stall().
- *
- * - The txg can't sync because it is waiting for this
- * lwb's zio callback to call dmu_tx_commit().
- *
- * - The lwb's zio callback can't call dmu_tx_commit()
- * because it's blocked trying to acquire the waiter's
- * lock, which occurs prior to calling dmu_tx_commit()
- */
- mutex_exit(&zcw->zcw_lock);
- zil_commit_writer_stall(zilog);
- mutex_enter(&zcw->zcw_lock);
- }
-
-out:
- mutex_exit(&zilog->zl_issuer_lock);
- ASSERT(MUTEX_HELD(&zcw->zcw_lock));
-}
-
-/*
- * This function is responsible for performing the following two tasks:
- *
- * 1. its primary responsibility is to block until the given "commit
- * waiter" is considered "done".
- *
- * 2. its secondary responsibility is to issue the zio for the lwb that
- * the given "commit waiter" is waiting on, if this function has
- * waited "long enough" and the lwb is still in the "open" state.
- *
- * Given a sufficient amount of itxs being generated and written using
- * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
- * function. If this does not occur, this secondary responsibility will
- * ensure the lwb is issued even if there is not other synchronous
- * activity on the system.
- *
- * For more details, see zil_process_commit_list(); more specifically,
- * the comment at the bottom of that function.
- */
-static void
-zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
-{
- ASSERT(!MUTEX_HELD(&zilog->zl_lock));
- ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT(spa_writeable(zilog->zl_spa));
-
- mutex_enter(&zcw->zcw_lock);
-
- /*
- * The timeout is scaled based on the lwb latency to avoid
- * significantly impacting the latency of each individual itx.
- * For more details, see the comment at the bottom of the
- * zil_process_commit_list() function.
- */
- int pct = MAX(zfs_commit_timeout_pct, 1);
-#if defined(illumos) || !defined(_KERNEL)
- hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
- hrtime_t wakeup = gethrtime() + sleep;
-#else
- sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100);
- sbintime_t wakeup = getsbinuptime() + sleep;
-#endif
- boolean_t timedout = B_FALSE;
-
- while (!zcw->zcw_done) {
- ASSERT(MUTEX_HELD(&zcw->zcw_lock));
-
- lwb_t *lwb = zcw->zcw_lwb;
-
- /*
- * Usually, the waiter will have a non-NULL lwb field here,
- * but it's possible for it to be NULL as a result of
- * zil_commit() racing with spa_sync().
- *
- * When zil_clean() is called, it's possible for the itxg
- * list (which may be cleaned via a taskq) to contain
- * commit itxs. When this occurs, the commit waiters linked
- * off of these commit itxs will not be committed to an
- * lwb. Additionally, these commit waiters will not be
- * marked done until zil_commit_waiter_skip() is called via
- * zil_itxg_clean().
- *
- * Thus, it's possible for this commit waiter (i.e. the
- * "zcw" variable) to be found in this "in between" state;
- * where it's "zcw_lwb" field is NULL, and it hasn't yet
- * been skipped, so it's "zcw_done" field is still B_FALSE.
- */
- IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
-
- if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
- ASSERT3B(timedout, ==, B_FALSE);
-
- /*
- * If the lwb hasn't been issued yet, then we
- * need to wait with a timeout, in case this
- * function needs to issue the lwb after the
- * timeout is reached; responsibility (2) from
- * the comment above this function.
- */
-#if defined(illumos) || !defined(_KERNEL)
- clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
- &zcw->zcw_lock, wakeup, USEC2NSEC(1),
- CALLOUT_FLAG_ABSOLUTE);
-
- if (timeleft >= 0 || zcw->zcw_done)
- continue;
-#else
- int wait_err = cv_timedwait_sbt(&zcw->zcw_cv,
- &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE);
- if (wait_err != EWOULDBLOCK || zcw->zcw_done)
- continue;
-#endif
-
- timedout = B_TRUE;
- zil_commit_waiter_timeout(zilog, zcw);
-
- if (!zcw->zcw_done) {
- /*
- * If the commit waiter has already been
- * marked "done", it's possible for the
- * waiter's lwb structure to have already
- * been freed. Thus, we can only reliably
- * make these assertions if the waiter
- * isn't done.
- */
- ASSERT3P(lwb, ==, zcw->zcw_lwb);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
- }
- } else {
- /*
- * If the lwb isn't open, then it must have already
- * been issued. In that case, there's no need to
- * use a timeout when waiting for the lwb to
- * complete.
- *
- * Additionally, if the lwb is NULL, the waiter
- * will soon be signalled and marked done via
- * zil_clean() and zil_itxg_clean(), so no timeout
- * is required.
- */
-
- IMPLY(lwb != NULL,
- lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE);
- cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
- }
- }
-
- mutex_exit(&zcw->zcw_lock);
-}
-
-static zil_commit_waiter_t *
-zil_alloc_commit_waiter()
-{
- zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
-
- cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
- list_link_init(&zcw->zcw_node);
- zcw->zcw_lwb = NULL;
- zcw->zcw_done = B_FALSE;
- zcw->zcw_zio_error = 0;
-
- return (zcw);
-}
-
-static void
-zil_free_commit_waiter(zil_commit_waiter_t *zcw)
-{
- ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
- ASSERT3B(zcw->zcw_done, ==, B_TRUE);
- mutex_destroy(&zcw->zcw_lock);
- cv_destroy(&zcw->zcw_cv);
- kmem_cache_free(zil_zcw_cache, zcw);
-}
-
-/*
- * This function is used to create a TX_COMMIT itx and assign it. This
- * way, it will be linked into the ZIL's list of synchronous itxs, and
- * then later committed to an lwb (or skipped) when
- * zil_process_commit_list() is called.
- */
-static void
-zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
-{
- dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
-
- itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
- itx->itx_sync = B_TRUE;
- itx->itx_private = zcw;
-
- zil_itx_assign(zilog, itx, tx);
-
- dmu_tx_commit(tx);
-}
-
-/*
- * Commit ZFS Intent Log transactions (itxs) to stable storage.
- *
- * When writing ZIL transactions to the on-disk representation of the
- * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
- * itxs can be committed to a single lwb. Once a lwb is written and
- * committed to stable storage (i.e. the lwb is written, and vdevs have
- * been flushed), each itx that was committed to that lwb is also
- * considered to be committed to stable storage.
- *
- * When an itx is committed to an lwb, the log record (lr_t) contained
- * by the itx is copied into the lwb's zio buffer, and once this buffer
- * is written to disk, it becomes an on-disk ZIL block.
- *
- * As itxs are generated, they're inserted into the ZIL's queue of
- * uncommitted itxs. The semantics of zil_commit() are such that it will
- * block until all itxs that were in the queue when it was called, are
- * committed to stable storage.
- *
- * If "foid" is zero, this means all "synchronous" and "asynchronous"
- * itxs, for all objects in the dataset, will be committed to stable
- * storage prior to zil_commit() returning. If "foid" is non-zero, all
- * "synchronous" itxs for all objects, but only "asynchronous" itxs
- * that correspond to the foid passed in, will be committed to stable
- * storage prior to zil_commit() returning.
- *
- * Generally speaking, when zil_commit() is called, the consumer doesn't
- * actually care about _all_ of the uncommitted itxs. Instead, they're
- * simply trying to waiting for a specific itx to be committed to disk,
- * but the interface(s) for interacting with the ZIL don't allow such
- * fine-grained communication. A better interface would allow a consumer
- * to create and assign an itx, and then pass a reference to this itx to
- * zil_commit(); such that zil_commit() would return as soon as that
- * specific itx was committed to disk (instead of waiting for _all_
- * itxs to be committed).
- *
- * When a thread calls zil_commit() a special "commit itx" will be
- * generated, along with a corresponding "waiter" for this commit itx.
- * zil_commit() will wait on this waiter's CV, such that when the waiter
- * is marked done, and signalled, zil_commit() will return.
- *
- * This commit itx is inserted into the queue of uncommitted itxs. This
- * provides an easy mechanism for determining which itxs were in the
- * queue prior to zil_commit() having been called, and which itxs were
- * added after zil_commit() was called.
- *
- * The commit it is special; it doesn't have any on-disk representation.
- * When a commit itx is "committed" to an lwb, the waiter associated
- * with it is linked onto the lwb's list of waiters. Then, when that lwb
- * completes, each waiter on the lwb's list is marked done and signalled
- * -- allowing the thread waiting on the waiter to return from zil_commit().
- *
- * It's important to point out a few critical factors that allow us
- * to make use of the commit itxs, commit waiters, per-lwb lists of
- * commit waiters, and zio completion callbacks like we're doing:
- *
- * 1. The list of waiters for each lwb is traversed, and each commit
- * waiter is marked "done" and signalled, in the zio completion
- * callback of the lwb's zio[*].
- *
- * * Actually, the waiters are signalled in the zio completion
- * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
- * that are sent to the vdevs upon completion of the lwb zio.
- *
- * 2. When the itxs are inserted into the ZIL's queue of uncommitted
- * itxs, the order in which they are inserted is preserved[*]; as
- * itxs are added to the queue, they are added to the tail of
- * in-memory linked lists.
- *
- * When committing the itxs to lwbs (to be written to disk), they
- * are committed in the same order in which the itxs were added to
- * the uncommitted queue's linked list(s); i.e. the linked list of
- * itxs to commit is traversed from head to tail, and each itx is
- * committed to an lwb in that order.
- *
- * * To clarify:
- *
- * - the order of "sync" itxs is preserved w.r.t. other
- * "sync" itxs, regardless of the corresponding objects.
- * - the order of "async" itxs is preserved w.r.t. other
- * "async" itxs corresponding to the same object.
- * - the order of "async" itxs is *not* preserved w.r.t. other
- * "async" itxs corresponding to different objects.
- * - the order of "sync" itxs w.r.t. "async" itxs (or vice
- * versa) is *not* preserved, even for itxs that correspond
- * to the same object.
- *
- * For more details, see: zil_itx_assign(), zil_async_to_sync(),
- * zil_get_commit_list(), and zil_process_commit_list().
- *
- * 3. The lwbs represent a linked list of blocks on disk. Thus, any
- * lwb cannot be considered committed to stable storage, until its
- * "previous" lwb is also committed to stable storage. This fact,
- * coupled with the fact described above, means that itxs are
- * committed in (roughly) the order in which they were generated.
- * This is essential because itxs are dependent on prior itxs.
- * Thus, we *must not* deem an itx as being committed to stable
- * storage, until *all* prior itxs have also been committed to
- * stable storage.
- *
- * To enforce this ordering of lwb zio's, while still leveraging as
- * much of the underlying storage performance as possible, we rely
- * on two fundamental concepts:
- *
- * 1. The creation and issuance of lwb zio's is protected by
- * the zilog's "zl_issuer_lock", which ensures only a single
- * thread is creating and/or issuing lwb's at a time
- * 2. The "previous" lwb is a child of the "current" lwb
- * (leveraging the zio parent-child depenency graph)
- *
- * By relying on this parent-child zio relationship, we can have
- * many lwb zio's concurrently issued to the underlying storage,
- * but the order in which they complete will be the same order in
- * which they were created.
- */
-void
-zil_commit(zilog_t *zilog, uint64_t foid)
-{
- /*
- * We should never attempt to call zil_commit on a snapshot for
- * a couple of reasons:
- *
- * 1. A snapshot may never be modified, thus it cannot have any
- * in-flight itxs that would have modified the dataset.
- *
- * 2. By design, when zil_commit() is called, a commit itx will
- * be assigned to this zilog; as a result, the zilog will be
- * dirtied. We must not dirty the zilog of a snapshot; there's
- * checks in the code that enforce this invariant, and will
- * cause a panic if it's not upheld.
- */
- ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
-
- if (zilog->zl_sync == ZFS_SYNC_DISABLED)
- return;
-
- if (!spa_writeable(zilog->zl_spa)) {
- /*
- * If the SPA is not writable, there should never be any
- * pending itxs waiting to be committed to disk. If that
- * weren't true, we'd skip writing those itxs out, and
- * would break the sematics of zil_commit(); thus, we're
- * verifying that truth before we return to the caller.
- */
- ASSERT(list_is_empty(&zilog->zl_lwb_list));
- ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
- for (int i = 0; i < TXG_SIZE; i++)
- ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
- return;
- }
-
- /*
- * If the ZIL is suspended, we don't want to dirty it by calling
- * zil_commit_itx_assign() below, nor can we write out
- * lwbs like would be done in zil_commit_write(). Thus, we
- * simply rely on txg_wait_synced() to maintain the necessary
- * semantics, and avoid calling those functions altogether.
- */
- if (zilog->zl_suspend > 0) {
- txg_wait_synced(zilog->zl_dmu_pool, 0);
- return;
- }
-
- zil_commit_impl(zilog, foid);
-}
-
-void
-zil_commit_impl(zilog_t *zilog, uint64_t foid)
-{
- /*
- * Move the "async" itxs for the specified foid to the "sync"
- * queues, such that they will be later committed (or skipped)
- * to an lwb when zil_process_commit_list() is called.
- *
- * Since these "async" itxs must be committed prior to this
- * call to zil_commit returning, we must perform this operation
- * before we call zil_commit_itx_assign().
- */
- zil_async_to_sync(zilog, foid);
-
- /*
- * We allocate a new "waiter" structure which will initially be
- * linked to the commit itx using the itx's "itx_private" field.
- * Since the commit itx doesn't represent any on-disk state,
- * when it's committed to an lwb, rather than copying the its
- * lr_t into the lwb's buffer, the commit itx's "waiter" will be
- * added to the lwb's list of waiters. Then, when the lwb is
- * committed to stable storage, each waiter in the lwb's list of
- * waiters will be marked "done", and signalled.
- *
- * We must create the waiter and assign the commit itx prior to
- * calling zil_commit_writer(), or else our specific commit itx
- * is not guaranteed to be committed to an lwb prior to calling
- * zil_commit_waiter().
- */
- zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
- zil_commit_itx_assign(zilog, zcw);
-
- zil_commit_writer(zilog, zcw);
- zil_commit_waiter(zilog, zcw);
-
- if (zcw->zcw_zio_error != 0) {
- /*
- * If there was an error writing out the ZIL blocks that
- * this thread is waiting on, then we fallback to
- * relying on spa_sync() to write out the data this
- * thread is waiting on. Obviously this has performance
- * implications, but the expectation is for this to be
- * an exceptional case, and shouldn't occur often.
- */
- DTRACE_PROBE2(zil__commit__io__error,
- zilog_t *, zilog, zil_commit_waiter_t *, zcw);
- txg_wait_synced(zilog->zl_dmu_pool, 0);
- }
-
- zil_free_commit_waiter(zcw);
-}
-
-/*
- * Called in syncing context to free committed log blocks and update log header.
- */
-void
-zil_sync(zilog_t *zilog, dmu_tx_t *tx)
-{
- zil_header_t *zh = zil_header_in_syncing_context(zilog);
- uint64_t txg = dmu_tx_get_txg(tx);
- spa_t *spa = zilog->zl_spa;
- uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
- lwb_t *lwb;
-
- /*
- * We don't zero out zl_destroy_txg, so make sure we don't try
- * to destroy it twice.
- */
- if (spa_sync_pass(spa) != 1)
- return;
-
- mutex_enter(&zilog->zl_lock);
-
- ASSERT(zilog->zl_stop_sync == 0);
-
- if (*replayed_seq != 0) {
- ASSERT(zh->zh_replay_seq < *replayed_seq);
- zh->zh_replay_seq = *replayed_seq;
- *replayed_seq = 0;
- }
-
- if (zilog->zl_destroy_txg == txg) {
- blkptr_t blk = zh->zh_log;
-
- ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
-
- bzero(zh, sizeof (zil_header_t));
- bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
-
- if (zilog->zl_keep_first) {
- /*
- * If this block was part of log chain that couldn't
- * be claimed because a device was missing during
- * zil_claim(), but that device later returns,
- * then this block could erroneously appear valid.
- * To guard against this, assign a new GUID to the new
- * log chain so it doesn't matter what blk points to.
- */
- zil_init_log_chain(zilog, &blk);
- zh->zh_log = blk;
- }
- }
-
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- zh->zh_log = lwb->lwb_blk;
- if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
- break;
- list_remove(&zilog->zl_lwb_list, lwb);
- zio_free(spa, txg, &lwb->lwb_blk);
- zil_free_lwb(zilog, lwb);
-
- /*
- * If we don't have anything left in the lwb list then
- * we've had an allocation failure and we need to zero
- * out the zil_header blkptr so that we don't end
- * up freeing the same block twice.
- */
- if (list_head(&zilog->zl_lwb_list) == NULL)
- BP_ZERO(&zh->zh_log);
- }
- mutex_exit(&zilog->zl_lock);
-}
-
-/* ARGSUSED */
-static int
-zil_lwb_cons(void *vbuf, void *unused, int kmflag)
-{
- lwb_t *lwb = vbuf;
- list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
- offsetof(zil_commit_waiter_t, zcw_node));
- avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
- sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
- mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-zil_lwb_dest(void *vbuf, void *unused)
-{
- lwb_t *lwb = vbuf;
- mutex_destroy(&lwb->lwb_vdev_lock);
- avl_destroy(&lwb->lwb_vdev_tree);
- list_destroy(&lwb->lwb_waiters);
-}
-
-void
-zil_init(void)
-{
- zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
- sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
-
- zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
- sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-zil_fini(void)
-{
- kmem_cache_destroy(zil_zcw_cache);
- kmem_cache_destroy(zil_lwb_cache);
-}
-
-void
-zil_set_sync(zilog_t *zilog, uint64_t sync)
-{
- zilog->zl_sync = sync;
-}
-
-void
-zil_set_logbias(zilog_t *zilog, uint64_t logbias)
-{
- zilog->zl_logbias = logbias;
-}
-
-zilog_t *
-zil_alloc(objset_t *os, zil_header_t *zh_phys)
-{
- zilog_t *zilog;
-
- zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
-
- zilog->zl_header = zh_phys;
- zilog->zl_os = os;
- zilog->zl_spa = dmu_objset_spa(os);
- zilog->zl_dmu_pool = dmu_objset_pool(os);
- zilog->zl_destroy_txg = TXG_INITIAL - 1;
- zilog->zl_logbias = dmu_objset_logbias(os);
- zilog->zl_sync = dmu_objset_syncprop(os);
- zilog->zl_dirty_max_txg = 0;
- zilog->zl_last_lwb_opened = NULL;
- zilog->zl_last_lwb_latency = 0;
- zilog->zl_max_block_size = zil_maxblocksize;
-
- mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
-
- for (int i = 0; i < TXG_SIZE; i++) {
- mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
- MUTEX_DEFAULT, NULL);
- }
-
- list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
- offsetof(lwb_t, lwb_node));
-
- list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
-
- cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
-
- return (zilog);
-}
-
-void
-zil_free(zilog_t *zilog)
-{
- zilog->zl_stop_sync = 1;
-
- ASSERT0(zilog->zl_suspend);
- ASSERT0(zilog->zl_suspending);
-
- ASSERT(list_is_empty(&zilog->zl_lwb_list));
- list_destroy(&zilog->zl_lwb_list);
-
- ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
- list_destroy(&zilog->zl_itx_commit_list);
-
- for (int i = 0; i < TXG_SIZE; i++) {
- /*
- * It's possible for an itx to be generated that doesn't dirty
- * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
- * callback to remove the entry. We remove those here.
- *
- * Also free up the ziltest itxs.
- */
- if (zilog->zl_itxg[i].itxg_itxs)
- zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
- mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
- }
-
- mutex_destroy(&zilog->zl_issuer_lock);
- mutex_destroy(&zilog->zl_lock);
-
- cv_destroy(&zilog->zl_cv_suspend);
-
- kmem_free(zilog, sizeof (zilog_t));
-}
-
-/*
- * Open an intent log.
- */
-zilog_t *
-zil_open(objset_t *os, zil_get_data_t *get_data)
-{
- zilog_t *zilog = dmu_objset_zil(os);
-
- ASSERT3P(zilog->zl_get_data, ==, NULL);
- ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
- ASSERT(list_is_empty(&zilog->zl_lwb_list));
-
- zilog->zl_get_data = get_data;
-
- return (zilog);
-}
-
-/*
- * Close an intent log.
- */
-void
-zil_close(zilog_t *zilog)
-{
- lwb_t *lwb;
- uint64_t txg;
-
- if (!dmu_objset_is_snapshot(zilog->zl_os)) {
- zil_commit(zilog, 0);
- } else {
- ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
- ASSERT0(zilog->zl_dirty_max_txg);
- ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
- }
-
- mutex_enter(&zilog->zl_lock);
- lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL)
- txg = zilog->zl_dirty_max_txg;
- else
- txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
- mutex_exit(&zilog->zl_lock);
-
- /*
- * We need to use txg_wait_synced() to wait long enough for the
- * ZIL to be clean, and to wait for all pending lwbs to be
- * written out.
- */
- if (txg)
- txg_wait_synced(zilog->zl_dmu_pool, txg);
-
- if (zilog_is_dirty(zilog))
- zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
- if (txg < spa_freeze_txg(zilog->zl_spa))
- VERIFY(!zilog_is_dirty(zilog));
-
- zilog->zl_get_data = NULL;
-
- /*
- * We should have only one lwb left on the list; remove it now.
- */
- mutex_enter(&zilog->zl_lock);
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb != NULL) {
- ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
- list_remove(&zilog->zl_lwb_list, lwb);
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zil_free_lwb(zilog, lwb);
- }
- mutex_exit(&zilog->zl_lock);
-}
-
-static char *suspend_tag = "zil suspending";
-
-/*
- * Suspend an intent log. While in suspended mode, we still honor
- * synchronous semantics, but we rely on txg_wait_synced() to do it.
- * On old version pools, we suspend the log briefly when taking a
- * snapshot so that it will have an empty intent log.
- *
- * Long holds are not really intended to be used the way we do here --
- * held for such a short time. A concurrent caller of dsl_dataset_long_held()
- * could fail. Therefore we take pains to only put a long hold if it is
- * actually necessary. Fortunately, it will only be necessary if the
- * objset is currently mounted (or the ZVOL equivalent). In that case it
- * will already have a long hold, so we are not really making things any worse.
- *
- * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
- * zvol_state_t), and use their mechanism to prevent their hold from being
- * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
- * very little gain.
- *
- * if cookiep == NULL, this does both the suspend & resume.
- * Otherwise, it returns with the dataset "long held", and the cookie
- * should be passed into zil_resume().
- */
-int
-zil_suspend(const char *osname, void **cookiep)
-{
- objset_t *os;
- zilog_t *zilog;
- const zil_header_t *zh;
- int error;
-
- error = dmu_objset_hold(osname, suspend_tag, &os);
- if (error != 0)
- return (error);
- zilog = dmu_objset_zil(os);
-
- mutex_enter(&zilog->zl_lock);
- zh = zilog->zl_header;
-
- if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
- mutex_exit(&zilog->zl_lock);
- dmu_objset_rele(os, suspend_tag);
- return (SET_ERROR(EBUSY));
- }
-
- /*
- * Don't put a long hold in the cases where we can avoid it. This
- * is when there is no cookie so we are doing a suspend & resume
- * (i.e. called from zil_vdev_offline()), and there's nothing to do
- * for the suspend because it's already suspended, or there's no ZIL.
- */
- if (cookiep == NULL && !zilog->zl_suspending &&
- (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
- mutex_exit(&zilog->zl_lock);
- dmu_objset_rele(os, suspend_tag);
- return (0);
- }
-
- dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
- dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
-
- zilog->zl_suspend++;
-
- if (zilog->zl_suspend > 1) {
- /*
- * Someone else is already suspending it.
- * Just wait for them to finish.
- */
-
- while (zilog->zl_suspending)
- cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
- mutex_exit(&zilog->zl_lock);
-
- if (cookiep == NULL)
- zil_resume(os);
- else
- *cookiep = os;
- return (0);
- }
-
- /*
- * If there is no pointer to an on-disk block, this ZIL must not
- * be active (e.g. filesystem not mounted), so there's nothing
- * to clean up.
- */
- if (BP_IS_HOLE(&zh->zh_log)) {
- ASSERT(cookiep != NULL); /* fast path already handled */
-
- *cookiep = os;
- mutex_exit(&zilog->zl_lock);
- return (0);
- }
-
- zilog->zl_suspending = B_TRUE;
- mutex_exit(&zilog->zl_lock);
-
- /*
- * We need to use zil_commit_impl to ensure we wait for all
- * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
- * to disk before proceeding. If we used zil_commit instead, it
- * would just call txg_wait_synced(), because zl_suspend is set.
- * txg_wait_synced() doesn't wait for these lwb's to be
- * LWB_STATE_FLUSH_DONE before returning.
- */
- zil_commit_impl(zilog, 0);
-
- /*
- * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
- * use txg_wait_synced() to ensure the data from the zilog has
- * migrated to the main pool before calling zil_destroy().
- */
- txg_wait_synced(zilog->zl_dmu_pool, 0);
-
- zil_destroy(zilog, B_FALSE);
-
- mutex_enter(&zilog->zl_lock);
- zilog->zl_suspending = B_FALSE;
- cv_broadcast(&zilog->zl_cv_suspend);
- mutex_exit(&zilog->zl_lock);
-
- if (cookiep == NULL)
- zil_resume(os);
- else
- *cookiep = os;
- return (0);
-}
-
-void
-zil_resume(void *cookie)
-{
- objset_t *os = cookie;
- zilog_t *zilog = dmu_objset_zil(os);
-
- mutex_enter(&zilog->zl_lock);
- ASSERT(zilog->zl_suspend != 0);
- zilog->zl_suspend--;
- mutex_exit(&zilog->zl_lock);
- dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
- dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
-}
-
-typedef struct zil_replay_arg {
- zil_replay_func_t **zr_replay;
- void *zr_arg;
- boolean_t zr_byteswap;
- char *zr_lr;
-} zil_replay_arg_t;
-
-static int
-zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
-{
- char name[ZFS_MAX_DATASET_NAME_LEN];
-
- zilog->zl_replaying_seq--; /* didn't actually replay this one */
-
- dmu_objset_name(zilog->zl_os, name);
-
- cmn_err(CE_WARN, "ZFS replay transaction error %d, "
- "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
- (u_longlong_t)lr->lrc_seq,
- (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
- (lr->lrc_txtype & TX_CI) ? "CI" : "");
-
- return (error);
-}
-
-static int
-zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
-{
- zil_replay_arg_t *zr = zra;
- const zil_header_t *zh = zilog->zl_header;
- uint64_t reclen = lr->lrc_reclen;
- uint64_t txtype = lr->lrc_txtype;
- int error = 0;
-
- zilog->zl_replaying_seq = lr->lrc_seq;
-
- if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
- return (0);
-
- if (lr->lrc_txg < claim_txg) /* already committed */
- return (0);
-
- /* Strip case-insensitive bit, still present in log record */
- txtype &= ~TX_CI;
-
- if (txtype == 0 || txtype >= TX_MAX_TYPE)
- return (zil_replay_error(zilog, lr, EINVAL));
-
- /*
- * If this record type can be logged out of order, the object
- * (lr_foid) may no longer exist. That's legitimate, not an error.
- */
- if (TX_OOO(txtype)) {
- error = dmu_object_info(zilog->zl_os,
- LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
- if (error == ENOENT || error == EEXIST)
- return (0);
- }
-
- /*
- * Make a copy of the data so we can revise and extend it.
- */
- bcopy(lr, zr->zr_lr, reclen);
-
- /*
- * If this is a TX_WRITE with a blkptr, suck in the data.
- */
- if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
- error = zil_read_log_data(zilog, (lr_write_t *)lr,
- zr->zr_lr + reclen);
- if (error != 0)
- return (zil_replay_error(zilog, lr, error));
- }
-
- /*
- * The log block containing this lr may have been byteswapped
- * so that we can easily examine common fields like lrc_txtype.
- * However, the log is a mix of different record types, and only the
- * replay vectors know how to byteswap their records. Therefore, if
- * the lr was byteswapped, undo it before invoking the replay vector.
- */
- if (zr->zr_byteswap)
- byteswap_uint64_array(zr->zr_lr, reclen);
-
- /*
- * We must now do two things atomically: replay this log record,
- * and update the log header sequence number to reflect the fact that
- * we did so. At the end of each replay function the sequence number
- * is updated if we are in replay mode.
- */
- error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
- if (error != 0) {
- /*
- * The DMU's dnode layer doesn't see removes until the txg
- * commits, so a subsequent claim can spuriously fail with
- * EEXIST. So if we receive any error we try syncing out
- * any removes then retry the transaction. Note that we
- * specify B_FALSE for byteswap now, so we don't do it twice.
- */
- txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
- error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
- if (error != 0)
- return (zil_replay_error(zilog, lr, error));
- }
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- zilog->zl_replay_blks++;
-
- return (0);
-}
-
-/*
- * If this dataset has a non-empty intent log, replay it and destroy it.
- */
-void
-zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
-{
- zilog_t *zilog = dmu_objset_zil(os);
- const zil_header_t *zh = zilog->zl_header;
- zil_replay_arg_t zr;
-
- if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
- zil_destroy(zilog, B_TRUE);
- return;
- }
-
- zr.zr_replay = replay_func;
- zr.zr_arg = arg;
- zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
- zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
-
- /*
- * Wait for in-progress removes to sync before starting replay.
- */
- txg_wait_synced(zilog->zl_dmu_pool, 0);
-
- zilog->zl_replay = B_TRUE;
- zilog->zl_replay_time = ddi_get_lbolt();
- ASSERT(zilog->zl_replay_blks == 0);
- (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
- zh->zh_claim_txg);
- kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
-
- zil_destroy(zilog, B_FALSE);
- txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
- zilog->zl_replay = B_FALSE;
-}
-
-boolean_t
-zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
-{
- if (zilog->zl_sync == ZFS_SYNC_DISABLED)
- return (B_TRUE);
-
- if (zilog->zl_replay) {
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
- zilog->zl_replaying_seq;
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/* ARGSUSED */
-int
-zil_reset(const char *osname, void *arg)
-{
- int error;
-
- error = zil_suspend(osname, NULL);
- if (error != 0)
- return (SET_ERROR(EEXIST));
- return (0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -1,4386 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2017, Intel Corporation.
- */
-
-#include <sys/sysmacros.h>
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio_impl.h>
-#include <sys/zio_compress.h>
-#include <sys/zio_checksum.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/ddt.h>
-#include <sys/trim_map.h>
-#include <sys/blkptr.h>
-#include <sys/zfeature.h>
-#include <sys/dsl_scan.h>
-#include <sys/metaslab_impl.h>
-#include <sys/abd.h>
-#include <sys/cityhash.h>
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS ZIO");
-#if defined(__amd64__)
-static int zio_use_uma = 1;
-#else
-static int zio_use_uma = 0;
-#endif
-SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
- "Use uma(9) for ZIO allocations");
-static int zio_exclude_metadata = 0;
-SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
- "Exclude metadata buffers from dumps as well");
-
-zio_trim_stats_t zio_trim_stats = {
- { "bytes", KSTAT_DATA_UINT64,
- "Number of bytes successfully TRIMmed" },
- { "success", KSTAT_DATA_UINT64,
- "Number of successful TRIM requests" },
- { "unsupported", KSTAT_DATA_UINT64,
- "Number of TRIM requests that failed because TRIM is not supported" },
- { "failed", KSTAT_DATA_UINT64,
- "Number of TRIM requests that failed for reasons other than not supported" },
-};
-
-static kstat_t *zio_trim_ksp;
-
-/*
- * ==========================================================================
- * I/O type descriptions
- * ==========================================================================
- */
-const char *zio_type_name[ZIO_TYPES] = {
- "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
- "zio_ioctl"
-};
-
-boolean_t zio_dva_throttle_enabled = B_TRUE;
-SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RWTUN,
- &zio_dva_throttle_enabled, 0, "Enable allocation throttling");
-
-/*
- * ==========================================================================
- * I/O kmem caches
- * ==========================================================================
- */
-kmem_cache_t *zio_cache;
-kmem_cache_t *zio_link_cache;
-kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
-kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
-
-#ifdef _KERNEL
-extern vmem_t *zio_alloc_arena;
-#endif
-
-#define BP_SPANB(indblkshift, level) \
- (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
-#define COMPARE_META_LEVEL 0x80000000ul
-/*
- * The following actions directly effect the spa's sync-to-convergence logic.
- * The values below define the sync pass when we start performing the action.
- * Care should be taken when changing these values as they directly impact
- * spa_sync() performance. Tuning these values may introduce subtle performance
- * pathologies and should only be done in the context of performance analysis.
- * These tunables will eventually be removed and replaced with #defines once
- * enough analysis has been done to determine optimal values.
- *
- * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
- * regular blocks are not deferred.
- */
-int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
-SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
- &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
-int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
-SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
- &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
-int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
-SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
- &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
-
-/*
- * An allocating zio is one that either currently has the DVA allocate
- * stage set or will have it later in its lifetime.
- */
-#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
-
-boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
-
-#ifdef illumos
-#ifdef ZFS_DEBUG
-int zio_buf_debug_limit = 16384;
-#else
-int zio_buf_debug_limit = 0;
-#endif
-#endif
-
-static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
-
-void
-zio_init(void)
-{
- size_t c;
- zio_cache = kmem_cache_create("zio_cache",
- sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
- zio_link_cache = kmem_cache_create("zio_link_cache",
- sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
- if (!zio_use_uma)
- goto out;
-
- /*
- * For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
- * for each quarter-power of 2.
- */
- for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
- size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
- size_t p2 = size;
- size_t align = 0;
- int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;
-
- while (!ISP2(p2))
- p2 &= p2 - 1;
-
-#ifdef illumos
-#ifndef _KERNEL
- /*
- * If we are using watchpoints, put each buffer on its own page,
- * to eliminate the performance overhead of trapping to the
- * kernel when modifying a non-watched buffer that shares the
- * page with a watched buffer.
- */
- if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
- continue;
-#endif
-#endif /* illumos */
- if (size <= 4 * SPA_MINBLOCKSIZE) {
- align = SPA_MINBLOCKSIZE;
- } else if (IS_P2ALIGNED(size, p2 >> 2)) {
- align = MIN(p2 >> 2, PAGESIZE);
- }
-
- if (align != 0) {
- char name[36];
- (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
- zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, cflags);
-
- /*
- * Since zio_data bufs do not appear in crash dumps, we
- * pass KMC_NOTOUCH so that no allocator metadata is
- * stored with the buffers.
- */
- (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
- zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL,
- cflags | KMC_NOTOUCH | KMC_NODEBUG);
- }
- }
-
- while (--c != 0) {
- ASSERT(zio_buf_cache[c] != NULL);
- if (zio_buf_cache[c - 1] == NULL)
- zio_buf_cache[c - 1] = zio_buf_cache[c];
-
- ASSERT(zio_data_buf_cache[c] != NULL);
- if (zio_data_buf_cache[c - 1] == NULL)
- zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
- }
-out:
-
- zio_inject_init();
-
- zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
- KSTAT_TYPE_NAMED,
- sizeof(zio_trim_stats) / sizeof(kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
-
- if (zio_trim_ksp != NULL) {
- zio_trim_ksp->ks_data = &zio_trim_stats;
- kstat_install(zio_trim_ksp);
- }
-}
-
-void
-zio_fini(void)
-{
- size_t c;
- kmem_cache_t *last_cache = NULL;
- kmem_cache_t *last_data_cache = NULL;
-
- for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
- if (zio_buf_cache[c] != last_cache) {
- last_cache = zio_buf_cache[c];
- kmem_cache_destroy(zio_buf_cache[c]);
- }
- zio_buf_cache[c] = NULL;
-
- if (zio_data_buf_cache[c] != last_data_cache) {
- last_data_cache = zio_data_buf_cache[c];
- kmem_cache_destroy(zio_data_buf_cache[c]);
- }
- zio_data_buf_cache[c] = NULL;
- }
-
- kmem_cache_destroy(zio_link_cache);
- kmem_cache_destroy(zio_cache);
-
- zio_inject_fini();
-
- if (zio_trim_ksp != NULL) {
- kstat_delete(zio_trim_ksp);
- zio_trim_ksp = NULL;
- }
-}
-
-/*
- * ==========================================================================
- * Allocate and free I/O buffers
- * ==========================================================================
- */
-
-/*
- * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
- * crashdump if the kernel panics, so use it judiciously. Obviously, it's
- * useful to inspect ZFS metadata, but if possible, we should avoid keeping
- * excess / transient data in-core during a crashdump.
- */
-void *
-zio_buf_alloc(size_t size)
-{
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
- int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
-
- VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- if (zio_use_uma)
- return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
- else
- return (kmem_alloc(size, KM_SLEEP|flags));
-}
-
-/*
- * Use zio_data_buf_alloc to allocate data. The data will not appear in a
- * crashdump if the kernel panics. This exists so that we will limit the amount
- * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
- * of kernel heap dumped to disk when the kernel panics)
- */
-void *
-zio_data_buf_alloc(size_t size)
-{
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- if (zio_use_uma)
- return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
- else
- return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
-}
-
-void
-zio_buf_free(void *buf, size_t size)
-{
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- if (zio_use_uma)
- kmem_cache_free(zio_buf_cache[c], buf);
- else
- kmem_free(buf, size);
-}
-
-void
-zio_data_buf_free(void *buf, size_t size)
-{
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- if (zio_use_uma)
- kmem_cache_free(zio_data_buf_cache[c], buf);
- else
- kmem_free(buf, size);
-}
-
-/*
- * ==========================================================================
- * Push and pop I/O transform buffers
- * ==========================================================================
- */
-void
-zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
- zio_transform_func_t *transform)
-{
- zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
-
- /*
- * Ensure that anyone expecting this zio to contain a linear ABD isn't
- * going to get a nasty surprise when they try to access the data.
- */
-#ifdef illumos
- IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
-#else
- IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd),
- abd_is_linear(data));
-#endif
-
- zt->zt_orig_abd = zio->io_abd;
- zt->zt_orig_size = zio->io_size;
- zt->zt_bufsize = bufsize;
- zt->zt_transform = transform;
-
- zt->zt_next = zio->io_transform_stack;
- zio->io_transform_stack = zt;
-
- zio->io_abd = data;
- zio->io_size = size;
-}
-
-void
-zio_pop_transforms(zio_t *zio)
-{
- zio_transform_t *zt;
-
- while ((zt = zio->io_transform_stack) != NULL) {
- if (zt->zt_transform != NULL)
- zt->zt_transform(zio,
- zt->zt_orig_abd, zt->zt_orig_size);
-
- if (zt->zt_bufsize != 0)
- abd_free(zio->io_abd);
-
- zio->io_abd = zt->zt_orig_abd;
- zio->io_size = zt->zt_orig_size;
- zio->io_transform_stack = zt->zt_next;
-
- kmem_free(zt, sizeof (zio_transform_t));
- }
-}
-
-/*
- * ==========================================================================
- * I/O transform callbacks for subblocks and decompression
- * ==========================================================================
- */
-static void
-zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
-{
- ASSERT(zio->io_size > size);
-
- if (zio->io_type == ZIO_TYPE_READ)
- abd_copy(data, zio->io_abd, size);
-}
-
-static void
-zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
-{
- if (zio->io_error == 0) {
- void *tmp = abd_borrow_buf(data, size);
- int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
- zio->io_abd, tmp, zio->io_size, size);
- abd_return_buf_copy(data, tmp, size);
-
- if (ret != 0)
- zio->io_error = SET_ERROR(EIO);
- }
-}
-
-/*
- * ==========================================================================
- * I/O parent/child relationships and pipeline interlocks
- * ==========================================================================
- */
-zio_t *
-zio_walk_parents(zio_t *cio, zio_link_t **zl)
-{
- list_t *pl = &cio->io_parent_list;
-
- *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
- if (*zl == NULL)
- return (NULL);
-
- ASSERT((*zl)->zl_child == cio);
- return ((*zl)->zl_parent);
-}
-
-zio_t *
-zio_walk_children(zio_t *pio, zio_link_t **zl)
-{
- list_t *cl = &pio->io_child_list;
-
- ASSERT(MUTEX_HELD(&pio->io_lock));
-
- *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
- if (*zl == NULL)
- return (NULL);
-
- ASSERT((*zl)->zl_parent == pio);
- return ((*zl)->zl_child);
-}
-
-zio_t *
-zio_unique_parent(zio_t *cio)
-{
- zio_link_t *zl = NULL;
- zio_t *pio = zio_walk_parents(cio, &zl);
-
- VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
- return (pio);
-}
-
-void
-zio_add_child(zio_t *pio, zio_t *cio)
-{
- zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
-
- /*
- * Logical I/Os can have logical, gang, or vdev children.
- * Gang I/Os can have gang or vdev children.
- * Vdev I/Os can only have vdev children.
- * The following ASSERT captures all of these constraints.
- */
- ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
-
- zl->zl_parent = pio;
- zl->zl_child = cio;
-
- mutex_enter(&pio->io_lock);
- mutex_enter(&cio->io_lock);
-
- ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
-
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
-
- list_insert_head(&pio->io_child_list, zl);
- list_insert_head(&cio->io_parent_list, zl);
-
- pio->io_child_count++;
- cio->io_parent_count++;
-
- mutex_exit(&cio->io_lock);
- mutex_exit(&pio->io_lock);
-}
-
-static void
-zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
-{
- ASSERT(zl->zl_parent == pio);
- ASSERT(zl->zl_child == cio);
-
- mutex_enter(&pio->io_lock);
- mutex_enter(&cio->io_lock);
-
- list_remove(&pio->io_child_list, zl);
- list_remove(&cio->io_parent_list, zl);
-
- pio->io_child_count--;
- cio->io_parent_count--;
-
- mutex_exit(&cio->io_lock);
- mutex_exit(&pio->io_lock);
- kmem_cache_free(zio_link_cache, zl);
-}
-
-static boolean_t
-zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
-{
- boolean_t waiting = B_FALSE;
-
- mutex_enter(&zio->io_lock);
- ASSERT(zio->io_stall == NULL);
- for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
- if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
- continue;
-
- uint64_t *countp = &zio->io_children[c][wait];
- if (*countp != 0) {
- zio->io_stage >>= 1;
- ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
- zio->io_stall = countp;
- waiting = B_TRUE;
- break;
- }
- }
- mutex_exit(&zio->io_lock);
- return (waiting);
-}
-
-static void
-zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
- zio_t **next_to_executep)
-{
- uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
- int *errorp = &pio->io_child_error[zio->io_child_type];
-
- mutex_enter(&pio->io_lock);
- if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
- *errorp = zio_worst_error(*errorp, zio->io_error);
- pio->io_reexecute |= zio->io_reexecute;
- ASSERT3U(*countp, >, 0);
-
- (*countp)--;
-
- if (*countp == 0 && pio->io_stall == countp) {
- zio_taskq_type_t type =
- pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
- ZIO_TASKQ_INTERRUPT;
- pio->io_stall = NULL;
- mutex_exit(&pio->io_lock);
-
- /*
- * If we can tell the caller to execute this parent next, do
- * so. Otherwise dispatch the parent zio as its own task.
- *
- * Having the caller execute the parent when possible reduces
- * locking on the zio taskq's, reduces context switch
- * overhead, and has no recursion penalty. Note that one
- * read from disk typically causes at least 3 zio's: a
- * zio_null(), the logical zio_read(), and then a physical
- * zio. When the physical ZIO completes, we are able to call
- * zio_done() on all 3 of these zio's from one invocation of
- * zio_execute() by returning the parent back to
- * zio_execute(). Since the parent isn't executed until this
- * thread returns back to zio_execute(), the caller should do
- * so promptly.
- *
- * In other cases, dispatching the parent prevents
- * overflowing the stack when we have deeply nested
- * parent-child relationships, as we do with the "mega zio"
- * of writes for spa_sync(), and the chain of ZIL blocks.
- */
- if (next_to_executep != NULL && *next_to_executep == NULL) {
- *next_to_executep = pio;
- } else {
- zio_taskq_dispatch(pio, type, B_FALSE);
- }
- } else {
- mutex_exit(&pio->io_lock);
- }
-}
-
-static void
-zio_inherit_child_errors(zio_t *zio, enum zio_child c)
-{
- if (zio->io_child_error[c] != 0 && zio->io_error == 0)
- zio->io_error = zio->io_child_error[c];
-}
-
-int
-zio_bookmark_compare(const void *x1, const void *x2)
-{
- const zio_t *z1 = x1;
- const zio_t *z2 = x2;
-
- if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
- return (-1);
- if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
- return (1);
-
- if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
- return (-1);
- if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
- return (1);
-
- if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
- return (-1);
- if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
- return (1);
-
- if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
- return (-1);
- if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
- return (1);
-
- if (z1 < z2)
- return (-1);
- if (z1 > z2)
- return (1);
-
- return (0);
-}
-
-/*
- * ==========================================================================
- * Create the various types of I/O (read, write, free, etc)
- * ==========================================================================
- */
-static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
- void *private, zio_type_t type, zio_priority_t priority,
- enum zio_flag flags, vdev_t *vd, uint64_t offset,
- const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
-{
- zio_t *zio;
-
- IMPLY(type != ZIO_TYPE_FREE, psize <= SPA_MAXBLOCKSIZE);
- ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
- ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
-
- ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
- ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
- ASSERT(vd || stage == ZIO_STAGE_OPEN);
-
- IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);
-
- zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
- bzero(zio, sizeof (zio_t));
-
- mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
-#if defined(__FreeBSD__) && defined(_KERNEL)
- callout_init(&zio->io_timer, 1);
-#endif
-
- list_create(&zio->io_parent_list, sizeof (zio_link_t),
- offsetof(zio_link_t, zl_parent_node));
- list_create(&zio->io_child_list, sizeof (zio_link_t),
- offsetof(zio_link_t, zl_child_node));
- metaslab_trace_init(&zio->io_alloc_list);
-
- if (vd != NULL)
- zio->io_child_type = ZIO_CHILD_VDEV;
- else if (flags & ZIO_FLAG_GANG_CHILD)
- zio->io_child_type = ZIO_CHILD_GANG;
- else if (flags & ZIO_FLAG_DDT_CHILD)
- zio->io_child_type = ZIO_CHILD_DDT;
- else
- zio->io_child_type = ZIO_CHILD_LOGICAL;
-
- if (bp != NULL) {
- zio->io_bp = (blkptr_t *)bp;
- zio->io_bp_copy = *bp;
- zio->io_bp_orig = *bp;
- if (type != ZIO_TYPE_WRITE ||
- zio->io_child_type == ZIO_CHILD_DDT)
- zio->io_bp = &zio->io_bp_copy; /* so caller can free */
- if (zio->io_child_type == ZIO_CHILD_LOGICAL)
- zio->io_logical = zio;
- if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
- pipeline |= ZIO_GANG_STAGES;
- }
-
- zio->io_spa = spa;
- zio->io_txg = txg;
- zio->io_done = done;
- zio->io_private = private;
- zio->io_type = type;
- zio->io_priority = priority;
- zio->io_vd = vd;
- zio->io_offset = offset;
- zio->io_orig_abd = zio->io_abd = data;
- zio->io_orig_size = zio->io_size = psize;
- zio->io_lsize = lsize;
- zio->io_orig_flags = zio->io_flags = flags;
- zio->io_orig_stage = zio->io_stage = stage;
- zio->io_orig_pipeline = zio->io_pipeline = pipeline;
- zio->io_pipeline_trace = ZIO_STAGE_OPEN;
-
- zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
- zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
-
- if (zb != NULL)
- zio->io_bookmark = *zb;
-
- if (pio != NULL) {
- if (zio->io_metaslab_class == NULL)
- zio->io_metaslab_class = pio->io_metaslab_class;
- if (zio->io_logical == NULL)
- zio->io_logical = pio->io_logical;
- if (zio->io_child_type == ZIO_CHILD_GANG)
- zio->io_gang_leader = pio->io_gang_leader;
- zio_add_child(pio, zio);
- }
-
- return (zio);
-}
-
-static void
-zio_destroy(zio_t *zio)
-{
-#ifdef __FreeBSD__
- KASSERT(!(callout_active(&zio->io_timer) ||
- callout_pending(&zio->io_timer)), ("zio_destroy: timer active"));
-#endif
- metaslab_trace_fini(&zio->io_alloc_list);
- list_destroy(&zio->io_parent_list);
- list_destroy(&zio->io_child_list);
- mutex_destroy(&zio->io_lock);
- cv_destroy(&zio->io_cv);
- kmem_cache_free(zio_cache, zio);
-}
-
-zio_t *
-zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
- void *private, enum zio_flag flags)
-{
- zio_t *zio;
-
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
- ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
- ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
-
- return (zio);
-}
-
-zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
-{
- return (zio_null(NULL, spa, NULL, done, private, flags));
-}
-
-void
-zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
-{
- if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
- zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
- bp, (longlong_t)BP_GET_TYPE(bp));
- }
- if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
- BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
- zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
- bp, (longlong_t)BP_GET_CHECKSUM(bp));
- }
- if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
- BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
- zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
- bp, (longlong_t)BP_GET_COMPRESS(bp));
- }
- if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
- zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
- bp, (longlong_t)BP_GET_LSIZE(bp));
- }
- if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
- zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
- bp, (longlong_t)BP_GET_PSIZE(bp));
- }
-
- if (BP_IS_EMBEDDED(bp)) {
- if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
- zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
- bp, (longlong_t)BPE_GET_ETYPE(bp));
- }
- }
-
- /*
- * Do not verify individual DVAs if the config is not trusted. This
- * will be done once the zio is executed in vdev_mirror_map_alloc.
- */
- if (!spa->spa_trust_config)
- return;
-
- /*
- * Pool-specific checks.
- *
- * Note: it would be nice to verify that the blk_birth and
- * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
- * allows the birth time of log blocks (and dmu_sync()-ed blocks
- * that are in the log) to be arbitrarily large.
- */
- for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
- uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
- if (vdevid >= spa->spa_root_vdev->vdev_children) {
- zfs_panic_recover("blkptr at %p DVA %u has invalid "
- "VDEV %llu",
- bp, i, (longlong_t)vdevid);
- continue;
- }
- vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
- if (vd == NULL) {
- zfs_panic_recover("blkptr at %p DVA %u has invalid "
- "VDEV %llu",
- bp, i, (longlong_t)vdevid);
- continue;
- }
- if (vd->vdev_ops == &vdev_hole_ops) {
- zfs_panic_recover("blkptr at %p DVA %u has hole "
- "VDEV %llu",
- bp, i, (longlong_t)vdevid);
- continue;
- }
- if (vd->vdev_ops == &vdev_missing_ops) {
- /*
- * "missing" vdevs are valid during import, but we
- * don't have their detailed info (e.g. asize), so
- * we can't perform any more checks on them.
- */
- continue;
- }
- uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
- uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
- if (BP_IS_GANG(bp))
- asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
- if (offset + asize > vd->vdev_asize) {
- zfs_panic_recover("blkptr at %p DVA %u has invalid "
- "OFFSET %llu",
- bp, i, (longlong_t)offset);
- }
- }
-}
-
-boolean_t
-zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
-{
- uint64_t vdevid = DVA_GET_VDEV(dva);
-
- if (vdevid >= spa->spa_root_vdev->vdev_children)
- return (B_FALSE);
-
- vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
- if (vd == NULL)
- return (B_FALSE);
-
- if (vd->vdev_ops == &vdev_hole_ops)
- return (B_FALSE);
-
- if (vd->vdev_ops == &vdev_missing_ops) {
- return (B_FALSE);
- }
-
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t asize = DVA_GET_ASIZE(dva);
-
- if (BP_IS_GANG(bp))
- asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
- if (offset + asize > vd->vdev_asize)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-zio_t *
-zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
-{
- zio_t *zio;
-
- zfs_blkptr_verify(spa, bp);
-
- zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
- data, size, size, done, private,
- ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
- ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
-
- return (zio);
-}
-
-zio_t *
-zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
- zio_done_func_t *ready, zio_done_func_t *children_ready,
- zio_done_func_t *physdone, zio_done_func_t *done,
- void *private, zio_priority_t priority, enum zio_flag flags,
- const zbookmark_phys_t *zb)
-{
- zio_t *zio;
-
- ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
- zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
- zp->zp_compress >= ZIO_COMPRESS_OFF &&
- zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
- DMU_OT_IS_VALID(zp->zp_type) &&
- zp->zp_level < 32 &&
- zp->zp_copies > 0 &&
- zp->zp_copies <= spa_max_replication(spa));
-
- zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
- ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
-
- zio->io_ready = ready;
- zio->io_children_ready = children_ready;
- zio->io_physdone = physdone;
- zio->io_prop = *zp;
-
- /*
- * Data can be NULL if we are going to call zio_write_override() to
- * provide the already-allocated BP. But we may need the data to
- * verify a dedup hit (if requested). In this case, don't try to
- * dedup (just take the already-allocated BP verbatim).
- */
- if (data == NULL && zio->io_prop.zp_dedup_verify) {
- zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
- }
-
- return (zio);
-}
-
-zio_t *
-zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
- uint64_t size, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
-{
- zio_t *zio;
-
- zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
- ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
-
- return (zio);
-}
-
-void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
-{
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
- ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
-
- /*
- * We must reset the io_prop to match the values that existed
- * when the bp was first written by dmu_sync() keeping in mind
- * that nopwrite and dedup are mutually exclusive.
- */
- zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
- zio->io_prop.zp_nopwrite = nopwrite;
- zio->io_prop.zp_copies = copies;
- zio->io_bp_override = bp;
-}
-
-void
-zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
-{
-
- zfs_blkptr_verify(spa, bp);
-
- /*
- * The check for EMBEDDED is a performance optimization. We
- * process the free here (by ignoring it) rather than
- * putting it on the list and then processing it in zio_free_sync().
- */
- if (BP_IS_EMBEDDED(bp))
- return;
- metaslab_check_free(spa, bp);
-
- /*
- * Frees that are for the currently-syncing txg, are not going to be
- * deferred, and which will not need to do a read (i.e. not GANG or
- * DEDUP), can be processed immediately. Otherwise, put them on the
- * in-memory list for later processing.
- */
- if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
- txg != spa->spa_syncing_txg ||
- spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
- bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
- } else {
- VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
- BP_GET_PSIZE(bp), 0)));
- }
-}
-
-zio_t *
-zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- uint64_t size, enum zio_flag flags)
-{
- zio_t *zio;
- enum zio_stage stage = ZIO_FREE_PIPELINE;
-
- ASSERT(!BP_IS_HOLE(bp));
- ASSERT(spa_syncing_txg(spa) == txg);
- ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
-
- if (BP_IS_EMBEDDED(bp))
- return (zio_null(pio, spa, NULL, NULL, NULL, 0));
-
- metaslab_check_free(spa, bp);
- arc_freed(spa, bp);
- dsl_scan_freed(spa, bp);
-
- if (zfs_trim_enabled)
- stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
- ZIO_STAGE_VDEV_IO_ASSESS;
- /*
- * GANG and DEDUP blocks can induce a read (for the gang block header,
- * or the DDT), so issue them asynchronously so that this thread is
- * not tied up.
- */
- else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
- stage |= ZIO_STAGE_ISSUE_ASYNC;
-
- flags |= ZIO_FLAG_DONT_QUEUE;
-
- zio = zio_create(pio, spa, txg, bp, NULL, size,
- size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
- flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
-
- return (zio);
-}
-
-zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- zio_done_func_t *done, void *private, enum zio_flag flags)
-{
- zio_t *zio;
-
- zfs_blkptr_verify(spa, bp);
-
- if (BP_IS_EMBEDDED(bp))
- return (zio_null(pio, spa, NULL, NULL, NULL, 0));
-
- /*
- * A claim is an allocation of a specific block. Claims are needed
- * to support immediate writes in the intent log. The issue is that
- * immediate writes contain committed data, but in a txg that was
- * *not* committed. Upon opening the pool after an unclean shutdown,
- * the intent log claims all blocks that contain immediate write data
- * so that the SPA knows they're in use.
- *
- * All claims *must* be resolved in the first txg -- before the SPA
- * starts allocating blocks -- so that nothing is allocated twice.
- * If txg == 0 we just verify that the block is claimable.
- */
- ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
- spa_min_claim_txg(spa));
- ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
- ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
-
- zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
- flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
- ASSERT0(zio->io_queued_timestamp);
-
- return (zio);
-}
-
-zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags)
-{
- zio_t *zio;
- int c;
-
- if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
- ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
- ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-
- zio->io_cmd = cmd;
- } else {
- zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
-
- for (c = 0; c < vd->vdev_children; c++)
- zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
- offset, size, done, private, priority, flags));
- }
-
- return (zio);
-}
-
-zio_t *
-zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- abd_t *data, int checksum, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, boolean_t labels)
-{
- zio_t *zio;
-
- ASSERT(vd->vdev_children == 0);
- ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
- offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
- ASSERT3U(offset + size, <=, vd->vdev_psize);
-
- zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
- private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
- offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
-
- zio->io_prop.zp_checksum = checksum;
-
- return (zio);
-}
-
-zio_t *
-zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- abd_t *data, int checksum, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, boolean_t labels)
-{
- zio_t *zio;
-
- ASSERT(vd->vdev_children == 0);
- ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
- offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
- ASSERT3U(offset + size, <=, vd->vdev_psize);
-
- zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
- private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
- offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
-
- zio->io_prop.zp_checksum = checksum;
-
- if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
- /*
- * zec checksums are necessarily destructive -- they modify
- * the end of the write buffer to hold the verifier/checksum.
- * Therefore, we must make a local copy in case the data is
- * being written to multiple places in parallel.
- */
- abd_t *wbuf = abd_alloc_sametype(data, size);
- abd_copy(wbuf, data, size);
-
- zio_push_transform(zio, wbuf, size, size, NULL);
- }
-
- return (zio);
-}
-
-/*
- * Create a child I/O to do some work for us.
- */
-zio_t *
-zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- abd_t *data, uint64_t size, int type, zio_priority_t priority,
- enum zio_flag flags, zio_done_func_t *done, void *private)
-{
- enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
- zio_t *zio;
-
- /*
- * vdev child I/Os do not propagate their error to the parent.
- * Therefore, for correct operation the caller *must* check for
- * and handle the error in the child i/o's done callback.
- * The only exceptions are i/os that we don't care about
- * (OPTIONAL or REPAIR).
- */
- ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
- done != NULL);
-
- if (type == ZIO_TYPE_READ && bp != NULL) {
- /*
- * If we have the bp, then the child should perform the
- * checksum and the parent need not. This pushes error
- * detection as close to the leaves as possible and
- * eliminates redundant checksums in the interior nodes.
- */
- pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
- pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
- }
-
- /* Not all IO types require vdev io done stage e.g. free */
- if (type == ZIO_TYPE_FREE &&
- !(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
- pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- ASSERT0(vd->vdev_children);
- offset += VDEV_LABEL_START_SIZE;
- }
-
- flags |= ZIO_VDEV_CHILD_FLAGS(pio);
-
- /*
- * If we've decided to do a repair, the write is not speculative --
- * even if the original read was.
- */
- if (flags & ZIO_FLAG_IO_REPAIR)
- flags &= ~ZIO_FLAG_SPECULATIVE;
-
- /*
- * If we're creating a child I/O that is not associated with a
- * top-level vdev, then the child zio is not an allocating I/O.
- * If this is a retried I/O then we ignore it since we will
- * have already processed the original allocating I/O.
- */
- if (flags & ZIO_FLAG_IO_ALLOCATING &&
- (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
- ASSERT(pio->io_metaslab_class != NULL);
- ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
- ASSERT(type == ZIO_TYPE_WRITE);
- ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
- ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
- ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
- pio->io_child_type == ZIO_CHILD_GANG);
-
- flags &= ~ZIO_FLAG_IO_ALLOCATING;
- }
-
- zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
- done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
- ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
- ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
-
- zio->io_physdone = pio->io_physdone;
- if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
- zio->io_logical->io_phys_children++;
-
- return (zio);
-}
-
-zio_t *
-zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
- zio_type_t type, zio_priority_t priority, enum zio_flag flags,
- zio_done_func_t *done, void *private)
-{
- zio_t *zio;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
- data, size, size, done, private, type, priority,
- flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
- vd, offset, NULL,
- ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
-
- return (zio);
-}
-
-void
-zio_flush(zio_t *zio, vdev_t *vd)
-{
- zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
-}
-
-zio_t *
-zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
-{
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
-
- return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL,
- ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
- vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
-}
-
-void
-zio_shrink(zio_t *zio, uint64_t size)
-{
- ASSERT3P(zio->io_executor, ==, NULL);
- ASSERT3P(zio->io_orig_size, ==, zio->io_size);
- ASSERT3U(size, <=, zio->io_size);
-
- /*
- * We don't shrink for raidz because of problems with the
- * reconstruction when reading back less than the block size.
- * Note, BP_IS_RAIDZ() assumes no compression.
- */
- ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- if (!BP_IS_RAIDZ(zio->io_bp)) {
- /* we are not doing a raw write */
- ASSERT3U(zio->io_size, ==, zio->io_lsize);
- zio->io_orig_size = zio->io_size = zio->io_lsize = size;
- }
-}
-
-/*
- * ==========================================================================
- * Prepare to read and write logical blocks
- * ==========================================================================
- */
-
-static zio_t *
-zio_read_bp_init(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
-
- if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
- zio->io_child_type == ZIO_CHILD_LOGICAL &&
- !(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t psize =
- BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
- zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
- psize, psize, zio_decompress);
- }
-
- if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-
- int psize = BPE_GET_PSIZE(bp);
- void *data = abd_borrow_buf(zio->io_abd, psize);
- decode_embedded_bp_compressed(bp, data);
- abd_return_buf_copy(zio->io_abd, data, psize);
- } else {
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
- }
-
- if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
- zio->io_flags |= ZIO_FLAG_DONT_CACHE;
-
- if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
- zio->io_flags |= ZIO_FLAG_DONT_CACHE;
-
- if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
- zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
-
- return (zio);
-}
-
-static zio_t *
-zio_write_bp_init(zio_t *zio)
-{
- if (!IO_IS_ALLOCATING(zio))
- return (zio);
-
- ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
-
- if (zio->io_bp_override) {
- blkptr_t *bp = zio->io_bp;
- zio_prop_t *zp = &zio->io_prop;
-
- ASSERT(bp->blk_birth != zio->io_txg);
- ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
-
- *bp = *zio->io_bp_override;
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-
- if (BP_IS_EMBEDDED(bp))
- return (zio);
-
- /*
- * If we've been overridden and nopwrite is set then
- * set the flag accordingly to indicate that a nopwrite
- * has already occurred.
- */
- if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
- ASSERT(!zp->zp_dedup);
- ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
- zio->io_flags |= ZIO_FLAG_NOPWRITE;
- return (zio);
- }
-
- ASSERT(!zp->zp_nopwrite);
-
- if (BP_IS_HOLE(bp) || !zp->zp_dedup)
- return (zio);
-
- ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
- ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
-
- if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
- BP_SET_DEDUP(bp, 1);
- zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
- return (zio);
- }
-
- /*
- * We were unable to handle this as an override bp, treat
- * it as a regular write I/O.
- */
- zio->io_bp_override = NULL;
- *bp = zio->io_bp_orig;
- zio->io_pipeline = zio->io_orig_pipeline;
- }
-
- return (zio);
-}
-
-static zio_t *
-zio_write_compress(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- zio_prop_t *zp = &zio->io_prop;
- enum zio_compress compress = zp->zp_compress;
- blkptr_t *bp = zio->io_bp;
- uint64_t lsize = zio->io_lsize;
- uint64_t psize = zio->io_size;
- int pass = 1;
-
- EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
-
- /*
- * If our children haven't all reached the ready stage,
- * wait for them and then repeat this pipeline stage.
- */
- if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
- ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
- return (NULL);
- }
-
- if (!IO_IS_ALLOCATING(zio))
- return (zio);
-
- if (zio->io_children_ready != NULL) {
- /*
- * Now that all our children are ready, run the callback
- * associated with this zio in case it wants to modify the
- * data to be written.
- */
- ASSERT3U(zp->zp_level, >, 0);
- zio->io_children_ready(zio);
- }
-
- ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
- ASSERT(zio->io_bp_override == NULL);
-
- if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
- /*
- * We're rewriting an existing block, which means we're
- * working on behalf of spa_sync(). For spa_sync() to
- * converge, it must eventually be the case that we don't
- * have to allocate new blocks. But compression changes
- * the blocksize, which forces a reallocate, and makes
- * convergence take longer. Therefore, after the first
- * few passes, stop compressing to ensure convergence.
- */
- pass = spa_sync_pass(spa);
-
- ASSERT(zio->io_txg == spa_syncing_txg(spa));
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- ASSERT(!BP_GET_DEDUP(bp));
-
- if (pass >= zfs_sync_pass_dont_compress)
- compress = ZIO_COMPRESS_OFF;
-
- /* Make sure someone doesn't change their mind on overwrites */
- ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
- spa_max_replication(spa)) == BP_GET_NDVAS(bp));
- }
-
- /* If it's a compressed write that is not raw, compress the buffer. */
- if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
- void *cbuf = zio_buf_alloc(lsize);
- psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
- if (psize == 0 || psize == lsize) {
- compress = ZIO_COMPRESS_OFF;
- zio_buf_free(cbuf, lsize);
- } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
- zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
- spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
- encode_embedded_bp_compressed(bp,
- cbuf, compress, lsize, psize);
- BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
- BP_SET_TYPE(bp, zio->io_prop.zp_type);
- BP_SET_LEVEL(bp, zio->io_prop.zp_level);
- zio_buf_free(cbuf, lsize);
- bp->blk_birth = zio->io_txg;
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- ASSERT(spa_feature_is_active(spa,
- SPA_FEATURE_EMBEDDED_DATA));
- return (zio);
- } else {
- /*
- * Round up compressed size up to the ashift
- * of the smallest-ashift device, and zero the tail.
- * This ensures that the compressed size of the BP
- * (and thus compressratio property) are correct,
- * in that we charge for the padding used to fill out
- * the last sector.
- */
- ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
- size_t rounded = (size_t)P2ROUNDUP(psize,
- 1ULL << spa->spa_min_ashift);
- if (rounded >= lsize) {
- compress = ZIO_COMPRESS_OFF;
- zio_buf_free(cbuf, lsize);
- psize = lsize;
- } else {
- abd_t *cdata = abd_get_from_buf(cbuf, lsize);
- abd_take_ownership_of_buf(cdata, B_TRUE);
- abd_zero_off(cdata, psize, rounded - psize);
- psize = rounded;
- zio_push_transform(zio, cdata,
- psize, lsize, NULL);
- }
- }
-
- /*
- * We were unable to handle this as an override bp, treat
- * it as a regular write I/O.
- */
- zio->io_bp_override = NULL;
- *bp = zio->io_bp_orig;
- zio->io_pipeline = zio->io_orig_pipeline;
- } else {
- ASSERT3U(psize, !=, 0);
- }
-
- /*
- * The final pass of spa_sync() must be all rewrites, but the first
- * few passes offer a trade-off: allocating blocks defers convergence,
- * but newly allocated blocks are sequential, so they can be written
- * to disk faster. Therefore, we allow the first few passes of
- * spa_sync() to allocate new blocks, but force rewrites after that.
- * There should only be a handful of blocks after pass 1 in any case.
- */
- if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
- BP_GET_PSIZE(bp) == psize &&
- pass >= zfs_sync_pass_rewrite) {
- VERIFY3U(psize, !=, 0);
- enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
-
- zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
- zio->io_flags |= ZIO_FLAG_IO_REWRITE;
- } else {
- BP_ZERO(bp);
- zio->io_pipeline = ZIO_WRITE_PIPELINE;
- }
-
- if (psize == 0) {
- if (zio->io_bp_orig.blk_birth != 0 &&
- spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
- BP_SET_LSIZE(bp, lsize);
- BP_SET_TYPE(bp, zp->zp_type);
- BP_SET_LEVEL(bp, zp->zp_level);
- BP_SET_BIRTH(bp, zio->io_txg, 0);
- }
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- } else {
- ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
- BP_SET_LSIZE(bp, lsize);
- BP_SET_TYPE(bp, zp->zp_type);
- BP_SET_LEVEL(bp, zp->zp_level);
- BP_SET_PSIZE(bp, psize);
- BP_SET_COMPRESS(bp, compress);
- BP_SET_CHECKSUM(bp, zp->zp_checksum);
- BP_SET_DEDUP(bp, zp->zp_dedup);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
- if (zp->zp_dedup) {
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
- zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
- }
- if (zp->zp_nopwrite) {
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
- zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
- }
- }
- return (zio);
-}
-
-static zio_t *
-zio_free_bp_init(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
- if (BP_GET_DEDUP(bp))
- zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
- }
-
- ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
-
- return (zio);
-}
-
-/*
- * ==========================================================================
- * Execute the I/O pipeline
- * ==========================================================================
- */
-
-static void
-zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
-{
- spa_t *spa = zio->io_spa;
- zio_type_t t = zio->io_type;
- int flags = (cutinline ? TQ_FRONT : 0);
-
- ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
-
- /*
- * If we're a config writer or a probe, the normal issue and
- * interrupt threads may all be blocked waiting for the config lock.
- * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
- */
- if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
- t = ZIO_TYPE_NULL;
-
- /*
- * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
- */
- if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
- t = ZIO_TYPE_NULL;
-
- /*
- * If this is a high priority I/O, then use the high priority taskq if
- * available.
- */
- if ((zio->io_priority == ZIO_PRIORITY_NOW ||
- zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
- spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
- q++;
-
- ASSERT3U(q, <, ZIO_TASKQ_TYPES);
-
- /*
- * NB: We are assuming that the zio can only be dispatched
- * to a single taskq at a time. It would be a grievous error
- * to dispatch the zio to another taskq at the same time.
- */
-#if defined(illumos) || !defined(_KERNEL)
- ASSERT(zio->io_tqent.tqent_next == NULL);
-#else
- ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
-#endif
- spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
- flags, &zio->io_tqent);
-}
-
-static boolean_t
-zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
-{
- kthread_t *executor = zio->io_executor;
- spa_t *spa = zio->io_spa;
-
- for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
- spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
- uint_t i;
- for (i = 0; i < tqs->stqs_count; i++) {
- if (taskq_member(tqs->stqs_taskq[i], executor))
- return (B_TRUE);
- }
- }
-
- return (B_FALSE);
-}
-
-static zio_t *
-zio_issue_async(zio_t *zio)
-{
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
-
- return (NULL);
-}
-
-void
-zio_interrupt(zio_t *zio)
-{
- zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
-}
-
-void
-zio_delay_interrupt(zio_t *zio)
-{
- /*
- * The timeout_generic() function isn't defined in userspace, so
- * rather than trying to implement the function, the zio delay
- * functionality has been disabled for userspace builds.
- */
-
-#ifdef _KERNEL
- /*
- * If io_target_timestamp is zero, then no delay has been registered
- * for this IO, thus jump to the end of this function and "skip" the
- * delay; issuing it directly to the zio layer.
- */
- if (zio->io_target_timestamp != 0) {
- hrtime_t now = gethrtime();
-
- if (now >= zio->io_target_timestamp) {
- /*
- * This IO has already taken longer than the target
- * delay to complete, so we don't want to delay it
- * any longer; we "miss" the delay and issue it
- * directly to the zio layer. This is likely due to
- * the target latency being set to a value less than
- * the underlying hardware can satisfy (e.g. delay
- * set to 1ms, but the disks take 10ms to complete an
- * IO request).
- */
-
- DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
- hrtime_t, now);
-
- zio_interrupt(zio);
- } else {
- hrtime_t diff = zio->io_target_timestamp - now;
-
- DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
- hrtime_t, now, hrtime_t, diff);
-
-#ifdef __FreeBSD__
- callout_reset_sbt(&zio->io_timer, nstosbt(diff), 0,
- (void (*)(void *))zio_interrupt, zio, C_HARDCLOCK);
-#else
- (void) timeout_generic(CALLOUT_NORMAL,
- (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
-#endif
- }
-
- return;
- }
-#endif
-
- DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
- zio_interrupt(zio);
-}
-
-/*
- * Execute the I/O pipeline until one of the following occurs:
- *
- * (1) the I/O completes
- * (2) the pipeline stalls waiting for dependent child I/Os
- * (3) the I/O issues, so we're waiting for an I/O completion interrupt
- * (4) the I/O is delegated by vdev-level caching or aggregation
- * (5) the I/O is deferred due to vdev-level queueing
- * (6) the I/O is handed off to another thread.
- *
- * In all cases, the pipeline stops whenever there's no CPU work; it never
- * burns a thread in cv_wait().
- *
- * There's no locking on io_stage because there's no legitimate way
- * for multiple threads to be attempting to process the same I/O.
- */
-static zio_pipe_stage_t *zio_pipeline[];
-
-void
-zio_execute(zio_t *zio)
-{
- ASSERT3U(zio->io_queued_timestamp, >, 0);
-
- while (zio->io_stage < ZIO_STAGE_DONE) {
- enum zio_stage pipeline = zio->io_pipeline;
- enum zio_stage stage = zio->io_stage;
-
- zio->io_executor = curthread;
-
- ASSERT(!MUTEX_HELD(&zio->io_lock));
- ASSERT(ISP2(stage));
- ASSERT(zio->io_stall == NULL);
-
- do {
- stage <<= 1;
- } while ((stage & pipeline) == 0);
-
- ASSERT(stage <= ZIO_STAGE_DONE);
-
- /*
- * If we are in interrupt context and this pipeline stage
- * will grab a config lock that is held across I/O,
- * or may wait for an I/O that needs an interrupt thread
- * to complete, issue async to avoid deadlock.
- *
- * For VDEV_IO_START, we cut in line so that the io will
- * be sent to disk promptly.
- */
- if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
- zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
- boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
- zio_requeue_io_start_cut_in_line : B_FALSE;
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
- return;
- }
-
- zio->io_stage = stage;
- zio->io_pipeline_trace |= zio->io_stage;
-
- /*
- * The zio pipeline stage returns the next zio to execute
- * (typically the same as this one), or NULL if we should
- * stop.
- */
- zio = zio_pipeline[highbit64(stage) - 1](zio);
-
- if (zio == NULL)
- return;
- }
-}
-
-/*
- * ==========================================================================
- * Initiate I/O, either sync or async
- * ==========================================================================
- */
-int
-zio_wait(zio_t *zio)
-{
- int error;
-
- ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN);
- ASSERT3P(zio->io_executor, ==, NULL);
-
- zio->io_waiter = curthread;
- ASSERT0(zio->io_queued_timestamp);
- zio->io_queued_timestamp = gethrtime();
-
- zio_execute(zio);
-
- mutex_enter(&zio->io_lock);
- while (zio->io_executor != NULL)
- cv_wait(&zio->io_cv, &zio->io_lock);
- mutex_exit(&zio->io_lock);
-
- error = zio->io_error;
- zio_destroy(zio);
-
- return (error);
-}
-
-void
-zio_nowait(zio_t *zio)
-{
- ASSERT3P(zio->io_executor, ==, NULL);
-
- if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
- zio_unique_parent(zio) == NULL) {
- /*
- * This is a logical async I/O with no parent to wait for it.
- * We add it to the spa_async_root_zio "Godfather" I/O which
- * will ensure they complete prior to unloading the pool.
- */
- spa_t *spa = zio->io_spa;
-
- zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
- }
-
- ASSERT0(zio->io_queued_timestamp);
- zio->io_queued_timestamp = gethrtime();
- zio_execute(zio);
-}
-
-/*
- * ==========================================================================
- * Reexecute, cancel, or suspend/resume failed I/O
- * ==========================================================================
- */
-
-static void
-zio_reexecute(zio_t *pio)
-{
- zio_t *cio, *cio_next;
-
- ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
- ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
- ASSERT(pio->io_gang_leader == NULL);
- ASSERT(pio->io_gang_tree == NULL);
-
- pio->io_flags = pio->io_orig_flags;
- pio->io_stage = pio->io_orig_stage;
- pio->io_pipeline = pio->io_orig_pipeline;
- pio->io_reexecute = 0;
- pio->io_flags |= ZIO_FLAG_REEXECUTED;
- pio->io_pipeline_trace = 0;
- pio->io_error = 0;
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_state[w] = 0;
- for (int c = 0; c < ZIO_CHILD_TYPES; c++)
- pio->io_child_error[c] = 0;
-
- if (IO_IS_ALLOCATING(pio))
- BP_ZERO(pio->io_bp);
-
- /*
- * As we reexecute pio's children, new children could be created.
- * New children go to the head of pio's io_child_list, however,
- * so we will (correctly) not reexecute them. The key is that
- * the remainder of pio's io_child_list, from 'cio_next' onward,
- * cannot be affected by any side effects of reexecuting 'cio'.
- */
- zio_link_t *zl = NULL;
- mutex_enter(&pio->io_lock);
- for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
- cio_next = zio_walk_children(pio, &zl);
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_children[cio->io_child_type][w]++;
- mutex_exit(&pio->io_lock);
- zio_reexecute(cio);
- mutex_enter(&pio->io_lock);
- }
- mutex_exit(&pio->io_lock);
-
- /*
- * Now that all children have been reexecuted, execute the parent.
- * We don't reexecute "The Godfather" I/O here as it's the
- * responsibility of the caller to wait on it.
- */
- if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
- pio->io_queued_timestamp = gethrtime();
- zio_execute(pio);
- }
-}
-
-void
-zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
-{
- if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
- fm_panic("Pool '%s' has encountered an uncorrectable I/O "
- "failure and the failure mode property for this pool "
- "is set to panic.", spa_name(spa));
-
- zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
-
- mutex_enter(&spa->spa_suspend_lock);
-
- if (spa->spa_suspend_zio_root == NULL)
- spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
- ZIO_FLAG_GODFATHER);
-
- spa->spa_suspended = reason;
-
- if (zio != NULL) {
- ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
- ASSERT(zio != spa->spa_suspend_zio_root);
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- ASSERT(zio_unique_parent(zio) == NULL);
- ASSERT(zio->io_stage == ZIO_STAGE_DONE);
- zio_add_child(spa->spa_suspend_zio_root, zio);
- }
-
- mutex_exit(&spa->spa_suspend_lock);
-}
-
-int
-zio_resume(spa_t *spa)
-{
- zio_t *pio;
-
- /*
- * Reexecute all previously suspended i/o.
- */
- mutex_enter(&spa->spa_suspend_lock);
- spa->spa_suspended = ZIO_SUSPEND_NONE;
- cv_broadcast(&spa->spa_suspend_cv);
- pio = spa->spa_suspend_zio_root;
- spa->spa_suspend_zio_root = NULL;
- mutex_exit(&spa->spa_suspend_lock);
-
- if (pio == NULL)
- return (0);
-
- zio_reexecute(pio);
- return (zio_wait(pio));
-}
-
-void
-zio_resume_wait(spa_t *spa)
-{
- mutex_enter(&spa->spa_suspend_lock);
- while (spa_suspended(spa))
- cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
- mutex_exit(&spa->spa_suspend_lock);
-}
-
-/*
- * ==========================================================================
- * Gang blocks.
- *
- * A gang block is a collection of small blocks that looks to the DMU
- * like one large block. When zio_dva_allocate() cannot find a block
- * of the requested size, due to either severe fragmentation or the pool
- * being nearly full, it calls zio_write_gang_block() to construct the
- * block from smaller fragments.
- *
- * A gang block consists of a gang header (zio_gbh_phys_t) and up to
- * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
- * an indirect block: it's an array of block pointers. It consumes
- * only one sector and hence is allocatable regardless of fragmentation.
- * The gang header's bps point to its gang members, which hold the data.
- *
- * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
- * as the verifier to ensure uniqueness of the SHA256 checksum.
- * Critically, the gang block bp's blk_cksum is the checksum of the data,
- * not the gang header. This ensures that data block signatures (needed for
- * deduplication) are independent of how the block is physically stored.
- *
- * Gang blocks can be nested: a gang member may itself be a gang block.
- * Thus every gang block is a tree in which root and all interior nodes are
- * gang headers, and the leaves are normal blocks that contain user data.
- * The root of the gang tree is called the gang leader.
- *
- * To perform any operation (read, rewrite, free, claim) on a gang block,
- * zio_gang_assemble() first assembles the gang tree (minus data leaves)
- * in the io_gang_tree field of the original logical i/o by recursively
- * reading the gang leader and all gang headers below it. This yields
- * an in-core tree containing the contents of every gang header and the
- * bps for every constituent of the gang block.
- *
- * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
- * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
- * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
- * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
- * zio_read_gang() is a wrapper around zio_read() that omits reading gang
- * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
- * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
- * of the gang header plus zio_checksum_compute() of the data to update the
- * gang header's blk_cksum as described above.
- *
- * The two-phase assemble/issue model solves the problem of partial failure --
- * what if you'd freed part of a gang block but then couldn't read the
- * gang header for another part? Assembling the entire gang tree first
- * ensures that all the necessary gang header I/O has succeeded before
- * starting the actual work of free, claim, or write. Once the gang tree
- * is assembled, free and claim are in-memory operations that cannot fail.
- *
- * In the event that a gang write fails, zio_dva_unallocate() walks the
- * gang tree to immediately free (i.e. insert back into the space map)
- * everything we've allocated. This ensures that we don't get ENOSPC
- * errors during repeated suspend/resume cycles due to a flaky device.
- *
- * Gang rewrites only happen during sync-to-convergence. If we can't assemble
- * the gang tree, we won't modify the block, so we can safely defer the free
- * (knowing that the block is still intact). If we *can* assemble the gang
- * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
- * each constituent bp and we can allocate a new block on the next sync pass.
- *
- * In all cases, the gang tree allows complete recovery from partial failure.
- * ==========================================================================
- */
-
-static void
-zio_gang_issue_func_done(zio_t *zio)
-{
- abd_put(zio->io_abd);
-}
-
-static zio_t *
-zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
- uint64_t offset)
-{
- if (gn != NULL)
- return (pio);
-
- return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
- BP_GET_PSIZE(bp), zio_gang_issue_func_done,
- NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
- &pio->io_bookmark));
-}
-
-static zio_t *
-zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
- uint64_t offset)
-{
- zio_t *zio;
-
- if (gn != NULL) {
- abd_t *gbh_abd =
- abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
- zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
- &pio->io_bookmark);
- /*
- * As we rewrite each gang header, the pipeline will compute
- * a new gang block header checksum for it; but no one will
- * compute a new data checksum, so we do that here. The one
- * exception is the gang leader: the pipeline already computed
- * its data checksum because that stage precedes gang assembly.
- * (Presently, nothing actually uses interior data checksums;
- * this is just good hygiene.)
- */
- if (gn != pio->io_gang_leader->io_gang_tree) {
- abd_t *buf = abd_get_offset(data, offset);
-
- zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
- buf, BP_GET_PSIZE(bp));
-
- abd_put(buf);
- }
- /*
- * If we are here to damage data for testing purposes,
- * leave the GBH alone so that we can detect the damage.
- */
- if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
- zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
- } else {
- zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- abd_get_offset(data, offset), BP_GET_PSIZE(bp),
- zio_gang_issue_func_done, NULL, pio->io_priority,
- ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
- }
-
- return (zio);
-}
-
-/* ARGSUSED */
-static zio_t *
-zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
- uint64_t offset)
-{
- return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
- BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
- ZIO_GANG_CHILD_FLAGS(pio)));
-}
-
-/* ARGSUSED */
-static zio_t *
-zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
- uint64_t offset)
-{
- return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
- NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
-}
-
-static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
- NULL,
- zio_read_gang,
- zio_rewrite_gang,
- zio_free_gang,
- zio_claim_gang,
- NULL
-};
-
-static void zio_gang_tree_assemble_done(zio_t *zio);
-
-static zio_gang_node_t *
-zio_gang_node_alloc(zio_gang_node_t **gnpp)
-{
- zio_gang_node_t *gn;
-
- ASSERT(*gnpp == NULL);
-
- gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
- gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
- *gnpp = gn;
-
- return (gn);
-}
-
-static void
-zio_gang_node_free(zio_gang_node_t **gnpp)
-{
- zio_gang_node_t *gn = *gnpp;
-
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
- ASSERT(gn->gn_child[g] == NULL);
-
- zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
- kmem_free(gn, sizeof (*gn));
- *gnpp = NULL;
-}
-
-static void
-zio_gang_tree_free(zio_gang_node_t **gnpp)
-{
- zio_gang_node_t *gn = *gnpp;
-
- if (gn == NULL)
- return;
-
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
- zio_gang_tree_free(&gn->gn_child[g]);
-
- zio_gang_node_free(gnpp);
-}
-
-static void
-zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
-{
- zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
- abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
-
- ASSERT(gio->io_gang_leader == gio);
- ASSERT(BP_IS_GANG(bp));
-
- zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
- zio_gang_tree_assemble_done, gn, gio->io_priority,
- ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
-}
-
-static void
-zio_gang_tree_assemble_done(zio_t *zio)
-{
- zio_t *gio = zio->io_gang_leader;
- zio_gang_node_t *gn = zio->io_private;
- blkptr_t *bp = zio->io_bp;
-
- ASSERT(gio == zio_unique_parent(zio));
- ASSERT(zio->io_child_count == 0);
-
- if (zio->io_error)
- return;
-
- /* this ABD was created from a linear buf in zio_gang_tree_assemble */
- if (BP_SHOULD_BYTESWAP(bp))
- byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
-
- ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
- ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
-
- abd_put(zio->io_abd);
-
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
- if (!BP_IS_GANG(gbp))
- continue;
- zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
- }
-}
-
-static void
-zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
- uint64_t offset)
-{
- zio_t *gio = pio->io_gang_leader;
- zio_t *zio;
-
- ASSERT(BP_IS_GANG(bp) == !!gn);
- ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
- ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
-
- /*
- * If you're a gang header, your data is in gn->gn_gbh.
- * If you're a gang member, your data is in 'data' and gn == NULL.
- */
- zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
-
- if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
-
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
- if (BP_IS_HOLE(gbp))
- continue;
- zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
- offset);
- offset += BP_GET_PSIZE(gbp);
- }
- }
-
- if (gn == gio->io_gang_tree && gio->io_abd != NULL)
- ASSERT3U(gio->io_size, ==, offset);
-
- if (zio != pio)
- zio_nowait(zio);
-}
-
-static zio_t *
-zio_gang_assemble(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
- ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
-
- zio->io_gang_leader = zio;
-
- zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
-
- return (zio);
-}
-
-static zio_t *
-zio_gang_issue(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
- return (NULL);
- }
-
- ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
- ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
-
- if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
- zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
- 0);
- else
- zio_gang_tree_free(&zio->io_gang_tree);
-
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-
- return (zio);
-}
-
-static void
-zio_write_gang_member_ready(zio_t *zio)
-{
- zio_t *pio = zio_unique_parent(zio);
- zio_t *gio = zio->io_gang_leader;
- dva_t *cdva = zio->io_bp->blk_dva;
- dva_t *pdva = pio->io_bp->blk_dva;
- uint64_t asize;
-
- if (BP_IS_HOLE(zio->io_bp))
- return;
-
- ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
-
- ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
- ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
- ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
- ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
- ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
-
- mutex_enter(&pio->io_lock);
- for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
- ASSERT(DVA_GET_GANG(&pdva[d]));
- asize = DVA_GET_ASIZE(&pdva[d]);
- asize += DVA_GET_ASIZE(&cdva[d]);
- DVA_SET_ASIZE(&pdva[d], asize);
- }
- mutex_exit(&pio->io_lock);
-}
-
-static void
-zio_write_gang_done(zio_t *zio)
-{
- /*
- * The io_abd field will be NULL for a zio with no data. The io_flags
- * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
- * check for it here as it is cleared in zio_ready.
- */
- if (zio->io_abd != NULL)
- abd_put(zio->io_abd);
-}
-
-static zio_t *
-zio_write_gang_block(zio_t *pio)
-{
- spa_t *spa = pio->io_spa;
- metaslab_class_t *mc = spa_normal_class(spa);
- blkptr_t *bp = pio->io_bp;
- zio_t *gio = pio->io_gang_leader;
- zio_t *zio;
- zio_gang_node_t *gn, **gnpp;
- zio_gbh_phys_t *gbh;
- abd_t *gbh_abd;
- uint64_t txg = pio->io_txg;
- uint64_t resid = pio->io_size;
- uint64_t lsize;
- int copies = gio->io_prop.zp_copies;
- int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
- zio_prop_t zp;
- int error;
- boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
-
- int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
- if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
- ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
- ASSERT(has_data);
-
- flags |= METASLAB_ASYNC_ALLOC;
- VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
- pio));
-
- /*
- * The logical zio has already placed a reservation for
- * 'copies' allocation slots but gang blocks may require
- * additional copies. These additional copies
- * (i.e. gbh_copies - copies) are guaranteed to succeed
- * since metaslab_class_throttle_reserve() always allows
- * additional reservations for gang blocks.
- */
- VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
- pio->io_allocator, pio, flags));
- }
-
- error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
- bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio, pio->io_allocator);
- if (error) {
- if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
- ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
- ASSERT(has_data);
-
- /*
- * If we failed to allocate the gang block header then
- * we remove any additional allocation reservations that
- * we placed here. The original reservation will
- * be removed when the logical I/O goes to the ready
- * stage.
- */
- metaslab_class_throttle_unreserve(mc,
- gbh_copies - copies, pio->io_allocator, pio);
- }
- pio->io_error = error;
- return (pio);
- }
-
- if (pio == gio) {
- gnpp = &gio->io_gang_tree;
- } else {
- gnpp = pio->io_private;
- ASSERT(pio->io_ready == zio_write_gang_member_ready);
- }
-
- gn = zio_gang_node_alloc(gnpp);
- gbh = gn->gn_gbh;
- bzero(gbh, SPA_GANGBLOCKSIZE);
- gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
-
- /*
- * Create the gang header.
- */
- zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
- zio_write_gang_done, NULL, pio->io_priority,
- ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
-
- /*
- * Create and nowait the gang children.
- */
- for (int g = 0; resid != 0; resid -= lsize, g++) {
- lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
- SPA_MINBLOCKSIZE);
- ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
-
- zp.zp_checksum = gio->io_prop.zp_checksum;
- zp.zp_compress = ZIO_COMPRESS_OFF;
- zp.zp_type = DMU_OT_NONE;
- zp.zp_level = 0;
- zp.zp_copies = gio->io_prop.zp_copies;
- zp.zp_dedup = B_FALSE;
- zp.zp_dedup_verify = B_FALSE;
- zp.zp_nopwrite = B_FALSE;
-
- zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
- has_data ? abd_get_offset(pio->io_abd, pio->io_size -
- resid) : NULL, lsize, lsize, &zp,
- zio_write_gang_member_ready, NULL, NULL,
- zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
- ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
-
- if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
- ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
- ASSERT(has_data);
-
- /*
- * Gang children won't throttle but we should
- * account for their work, so reserve an allocation
- * slot for them here.
- */
- VERIFY(metaslab_class_throttle_reserve(mc,
- zp.zp_copies, cio->io_allocator, cio, flags));
- }
- zio_nowait(cio);
- }
-
- /*
- * Set pio's pipeline to just wait for zio to finish.
- */
- pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-
- zio_nowait(zio);
-
- return (pio);
-}
-
-/*
- * The zio_nop_write stage in the pipeline determines if allocating a
- * new bp is necessary. The nopwrite feature can handle writes in
- * either syncing or open context (i.e. zil writes) and as a result is
- * mutually exclusive with dedup.
- *
- * By leveraging a cryptographically secure checksum, such as SHA256, we
- * can compare the checksums of the new data and the old to determine if
- * allocating a new block is required. Note that our requirements for
- * cryptographic strength are fairly weak: there can't be any accidental
- * hash collisions, but we don't need to be secure against intentional
- * (malicious) collisions. To trigger a nopwrite, you have to be able
- * to write the file to begin with, and triggering an incorrect (hash
- * collision) nopwrite is no worse than simply writing to the file.
- * That said, there are no known attacks against the checksum algorithms
- * used for nopwrite, assuming that the salt and the checksums
- * themselves remain secret.
- */
-static zio_t *
-zio_nop_write(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- zio_prop_t *zp = &zio->io_prop;
-
- ASSERT(BP_GET_LEVEL(bp) == 0);
- ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
- ASSERT(zp->zp_nopwrite);
- ASSERT(!zp->zp_dedup);
- ASSERT(zio->io_bp_override == NULL);
- ASSERT(IO_IS_ALLOCATING(zio));
-
- /*
- * Check to see if the original bp and the new bp have matching
- * characteristics (i.e. same checksum, compression algorithms, etc).
- * If they don't then just continue with the pipeline which will
- * allocate a new bp.
- */
- if (BP_IS_HOLE(bp_orig) ||
- !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
- ZCHECKSUM_FLAG_NOPWRITE) ||
- BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
- BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
- BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
- zp->zp_copies != BP_GET_NDVAS(bp_orig))
- return (zio);
-
- /*
- * If the checksums match then reset the pipeline so that we
- * avoid allocating a new bp and issuing any I/O.
- */
- if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
- ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
- ZCHECKSUM_FLAG_NOPWRITE);
- ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
- ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
- ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
- ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
- sizeof (uint64_t)) == 0);
-
- *bp = *bp_orig;
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- zio->io_flags |= ZIO_FLAG_NOPWRITE;
- }
-
- return (zio);
-}
-
-/*
- * ==========================================================================
- * Dedup
- * ==========================================================================
- */
-static void
-zio_ddt_child_read_done(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- ddt_entry_t *dde = zio->io_private;
- ddt_phys_t *ddp;
- zio_t *pio = zio_unique_parent(zio);
-
- mutex_enter(&pio->io_lock);
- ddp = ddt_phys_select(dde, bp);
- if (zio->io_error == 0)
- ddt_phys_clear(ddp); /* this ddp doesn't need repair */
-
- if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
- dde->dde_repair_abd = zio->io_abd;
- else
- abd_free(zio->io_abd);
- mutex_exit(&pio->io_lock);
-}
-
-static zio_t *
-zio_ddt_read_start(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- ASSERT(BP_GET_DEDUP(bp));
- ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-
- if (zio->io_child_error[ZIO_CHILD_DDT]) {
- ddt_t *ddt = ddt_select(zio->io_spa, bp);
- ddt_entry_t *dde = ddt_repair_start(ddt, bp);
- ddt_phys_t *ddp = dde->dde_phys;
- ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
- blkptr_t blk;
-
- ASSERT(zio->io_vsd == NULL);
- zio->io_vsd = dde;
-
- if (ddp_self == NULL)
- return (zio);
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
- continue;
- ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
- &blk);
- zio_nowait(zio_read(zio, zio->io_spa, &blk,
- abd_alloc_for_io(zio->io_size, B_TRUE),
- zio->io_size, zio_ddt_child_read_done, dde,
- zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
- ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
- }
- return (zio);
- }
-
- zio_nowait(zio_read(zio, zio->io_spa, bp,
- zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
- ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
-
- return (zio);
-}
-
-static zio_t *
-zio_ddt_read_done(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
- return (NULL);
- }
-
- ASSERT(BP_GET_DEDUP(bp));
- ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-
- if (zio->io_child_error[ZIO_CHILD_DDT]) {
- ddt_t *ddt = ddt_select(zio->io_spa, bp);
- ddt_entry_t *dde = zio->io_vsd;
- if (ddt == NULL) {
- ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
- return (zio);
- }
- if (dde == NULL) {
- zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
- return (NULL);
- }
- if (dde->dde_repair_abd != NULL) {
- abd_copy(zio->io_abd, dde->dde_repair_abd,
- zio->io_size);
- zio->io_child_error[ZIO_CHILD_DDT] = 0;
- }
- ddt_repair_done(ddt, dde);
- zio->io_vsd = NULL;
- }
-
- ASSERT(zio->io_vsd == NULL);
-
- return (zio);
-}
-
-static boolean_t
-zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
-{
- spa_t *spa = zio->io_spa;
- boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW);
-
- /* We should never get a raw, override zio */
- ASSERT(!(zio->io_bp_override && do_raw));
-
- /*
- * Note: we compare the original data, not the transformed data,
- * because when zio->io_bp is an override bp, we will not have
- * pushed the I/O transforms. That's an important optimization
- * because otherwise we'd compress/encrypt all dmu_sync() data twice.
- */
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
- zio_t *lio = dde->dde_lead_zio[p];
-
- if (lio != NULL) {
- return (lio->io_orig_size != zio->io_orig_size ||
- abd_cmp(zio->io_orig_abd, lio->io_orig_abd,
- zio->io_orig_size) != 0);
- }
- }
-
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
- ddt_phys_t *ddp = &dde->dde_phys[p];
-
- if (ddp->ddp_phys_birth != 0) {
- arc_buf_t *abuf = NULL;
- arc_flags_t aflags = ARC_FLAG_WAIT;
- int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
- blkptr_t blk = *zio->io_bp;
- int error;
-
- ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
-
- ddt_exit(ddt);
-
- /*
- * Intuitively, it would make more sense to compare
- * io_abd than io_orig_abd in the raw case since you
- * don't want to look at any transformations that have
- * happened to the data. However, for raw I/Os the
- * data will actually be the same in io_abd and
- * io_orig_abd, so all we have to do is issue this as
- * a raw ARC read.
- */
- if (do_raw) {
- zio_flags |= ZIO_FLAG_RAW;
- ASSERT3U(zio->io_size, ==, zio->io_orig_size);
- ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd,
- zio->io_size));
- ASSERT3P(zio->io_transform_stack, ==, NULL);
- }
-
- error = arc_read(NULL, spa, &blk,
- arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
- zio_flags, &aflags, &zio->io_bookmark);
-
- if (error == 0) {
- if (arc_buf_size(abuf) != zio->io_orig_size ||
- abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
- zio->io_orig_size) != 0)
- error = SET_ERROR(EEXIST);
- arc_buf_destroy(abuf, &abuf);
- }
-
- ddt_enter(ddt);
- return (error != 0);
- }
- }
-
- return (B_FALSE);
-}
-
-static void
-zio_ddt_child_write_ready(zio_t *zio)
-{
- int p = zio->io_prop.zp_copies;
- ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
- ddt_entry_t *dde = zio->io_private;
- ddt_phys_t *ddp = &dde->dde_phys[p];
- zio_t *pio;
-
- if (zio->io_error)
- return;
-
- ddt_enter(ddt);
-
- ASSERT(dde->dde_lead_zio[p] == zio);
-
- ddt_phys_fill(ddp, zio->io_bp);
-
- zio_link_t *zl = NULL;
- while ((pio = zio_walk_parents(zio, &zl)) != NULL)
- ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
-
- ddt_exit(ddt);
-}
-
-static void
-zio_ddt_child_write_done(zio_t *zio)
-{
- int p = zio->io_prop.zp_copies;
- ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
- ddt_entry_t *dde = zio->io_private;
- ddt_phys_t *ddp = &dde->dde_phys[p];
-
- ddt_enter(ddt);
-
- ASSERT(ddp->ddp_refcnt == 0);
- ASSERT(dde->dde_lead_zio[p] == zio);
- dde->dde_lead_zio[p] = NULL;
-
- if (zio->io_error == 0) {
- zio_link_t *zl = NULL;
- while (zio_walk_parents(zio, &zl) != NULL)
- ddt_phys_addref(ddp);
- } else {
- ddt_phys_clear(ddp);
- }
-
- ddt_exit(ddt);
-}
-
-static void
-zio_ddt_ditto_write_done(zio_t *zio)
-{
- int p = DDT_PHYS_DITTO;
- zio_prop_t *zp = &zio->io_prop;
- blkptr_t *bp = zio->io_bp;
- ddt_t *ddt = ddt_select(zio->io_spa, bp);
- ddt_entry_t *dde = zio->io_private;
- ddt_phys_t *ddp = &dde->dde_phys[p];
- ddt_key_t *ddk = &dde->dde_key;
-
- ddt_enter(ddt);
-
- ASSERT(ddp->ddp_refcnt == 0);
- ASSERT(dde->dde_lead_zio[p] == zio);
- dde->dde_lead_zio[p] = NULL;
-
- if (zio->io_error == 0) {
- ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
- ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
- ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
- if (ddp->ddp_phys_birth != 0)
- ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
- ddt_phys_fill(ddp, bp);
- }
-
- ddt_exit(ddt);
-}
-
-static zio_t *
-zio_ddt_write(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- blkptr_t *bp = zio->io_bp;
- uint64_t txg = zio->io_txg;
- zio_prop_t *zp = &zio->io_prop;
- int p = zp->zp_copies;
- int ditto_copies;
- zio_t *cio = NULL;
- zio_t *dio = NULL;
- ddt_t *ddt = ddt_select(spa, bp);
- ddt_entry_t *dde;
- ddt_phys_t *ddp;
-
- ASSERT(BP_GET_DEDUP(bp));
- ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
- ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
- ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
-
- ddt_enter(ddt);
- dde = ddt_lookup(ddt, bp, B_TRUE);
- ddp = &dde->dde_phys[p];
-
- if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
- /*
- * If we're using a weak checksum, upgrade to a strong checksum
- * and try again. If we're already using a strong checksum,
- * we can't resolve it, so just convert to an ordinary write.
- * (And automatically e-mail a paper to Nature?)
- */
- if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
- ZCHECKSUM_FLAG_DEDUP)) {
- zp->zp_checksum = spa_dedup_checksum(spa);
- zio_pop_transforms(zio);
- zio->io_stage = ZIO_STAGE_OPEN;
- BP_ZERO(bp);
- } else {
- zp->zp_dedup = B_FALSE;
- BP_SET_DEDUP(bp, B_FALSE);
- }
- ASSERT(!BP_GET_DEDUP(bp));
- zio->io_pipeline = ZIO_WRITE_PIPELINE;
- ddt_exit(ddt);
- return (zio);
- }
-
- ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
- ASSERT(ditto_copies < SPA_DVAS_PER_BP);
-
- if (ditto_copies > ddt_ditto_copies_present(dde) &&
- dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
- zio_prop_t czp = *zp;
-
- czp.zp_copies = ditto_copies;
-
- /*
- * If we arrived here with an override bp, we won't have run
- * the transform stack, so we won't have the data we need to
- * generate a child i/o. So, toss the override bp and restart.
- * This is safe, because using the override bp is just an
- * optimization; and it's rare, so the cost doesn't matter.
- */
- if (zio->io_bp_override) {
- zio_pop_transforms(zio);
- zio->io_stage = ZIO_STAGE_OPEN;
- zio->io_pipeline = ZIO_WRITE_PIPELINE;
- zio->io_bp_override = NULL;
- BP_ZERO(bp);
- ddt_exit(ddt);
- return (zio);
- }
-
- dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
- zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
- NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
- ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
-
- zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
- dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
- }
-
- if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
- if (ddp->ddp_phys_birth != 0)
- ddt_bp_fill(ddp, bp, txg);
- if (dde->dde_lead_zio[p] != NULL)
- zio_add_child(zio, dde->dde_lead_zio[p]);
- else
- ddt_phys_addref(ddp);
- } else if (zio->io_bp_override) {
- ASSERT(bp->blk_birth == txg);
- ASSERT(BP_EQUAL(bp, zio->io_bp_override));
- ddt_phys_fill(ddp, bp);
- ddt_phys_addref(ddp);
- } else {
- cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
- zio->io_orig_size, zio->io_orig_size, zp,
- zio_ddt_child_write_ready, NULL, NULL,
- zio_ddt_child_write_done, dde, zio->io_priority,
- ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
-
- zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
- dde->dde_lead_zio[p] = cio;
- }
-
- ddt_exit(ddt);
-
- if (cio)
- zio_nowait(cio);
- if (dio)
- zio_nowait(dio);
-
- return (zio);
-}
-
-ddt_entry_t *freedde; /* for debugging */
-
-static zio_t *
-zio_ddt_free(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- blkptr_t *bp = zio->io_bp;
- ddt_t *ddt = ddt_select(spa, bp);
- ddt_entry_t *dde;
- ddt_phys_t *ddp;
-
- ASSERT(BP_GET_DEDUP(bp));
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-
- ddt_enter(ddt);
- freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
- if (dde) {
- ddp = ddt_phys_select(dde, bp);
- if (ddp)
- ddt_phys_decref(ddp);
- }
- ddt_exit(ddt);
-
- return (zio);
-}
-
-/*
- * ==========================================================================
- * Allocate and free blocks
- * ==========================================================================
- */
-
-static zio_t *
-zio_io_to_allocate(spa_t *spa, int allocator)
-{
- zio_t *zio;
-
- ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
-
- zio = avl_first(&spa->spa_alloc_trees[allocator]);
- if (zio == NULL)
- return (NULL);
-
- ASSERT(IO_IS_ALLOCATING(zio));
-
- /*
- * Try to place a reservation for this zio. If we're unable to
- * reserve then we throttle.
- */
- ASSERT3U(zio->io_allocator, ==, allocator);
- if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
- zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
- return (NULL);
- }
-
- avl_remove(&spa->spa_alloc_trees[allocator], zio);
- ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
-
- return (zio);
-}
-
-static zio_t *
-zio_dva_throttle(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- zio_t *nio;
- metaslab_class_t *mc;
-
- /* locate an appropriate allocation class */
- mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
- zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
-
- if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
- !mc->mc_alloc_throttle_enabled ||
- zio->io_child_type == ZIO_CHILD_GANG ||
- zio->io_flags & ZIO_FLAG_NODATA) {
- return (zio);
- }
-
- ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
-
- ASSERT3U(zio->io_queued_timestamp, >, 0);
- ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
-
- zbookmark_phys_t *bm = &zio->io_bookmark;
- /*
- * We want to try to use as many allocators as possible to help improve
- * performance, but we also want logically adjacent IOs to be physically
- * adjacent to improve sequential read performance. We chunk each object
- * into 2^20 block regions, and then hash based on the objset, object,
- * level, and region to accomplish both of these goals.
- */
- zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
- bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
- mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- zio->io_metaslab_class = mc;
- avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
- nio = zio_io_to_allocate(spa, zio->io_allocator);
- mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
-
- return (nio);
-}
-
-static void
-zio_allocate_dispatch(spa_t *spa, int allocator)
-{
- zio_t *zio;
-
- mutex_enter(&spa->spa_alloc_locks[allocator]);
- zio = zio_io_to_allocate(spa, allocator);
- mutex_exit(&spa->spa_alloc_locks[allocator]);
- if (zio == NULL)
- return;
-
- ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
- ASSERT0(zio->io_error);
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
-}
-
-static zio_t *
-zio_dva_allocate(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- metaslab_class_t *mc;
- blkptr_t *bp = zio->io_bp;
- int error;
- int flags = 0;
-
- if (zio->io_gang_leader == NULL) {
- ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
- zio->io_gang_leader = zio;
- }
-
- ASSERT(BP_IS_HOLE(bp));
- ASSERT0(BP_GET_NDVAS(bp));
- ASSERT3U(zio->io_prop.zp_copies, >, 0);
- ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
- ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
-
- if (zio->io_flags & ZIO_FLAG_NODATA)
- flags |= METASLAB_DONT_THROTTLE;
- if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
- flags |= METASLAB_GANG_CHILD;
- if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
- flags |= METASLAB_ASYNC_ALLOC;
-
- /*
- * if not already chosen, locate an appropriate allocation class
- */
- mc = zio->io_metaslab_class;
- if (mc == NULL) {
- mc = spa_preferred_class(spa, zio->io_size,
- zio->io_prop.zp_type, zio->io_prop.zp_level,
- zio->io_prop.zp_zpl_smallblk);
- zio->io_metaslab_class = mc;
- }
-
- error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
- &zio->io_alloc_list, zio, zio->io_allocator);
-
- /*
- * Fallback to normal class when an alloc class is full
- */
- if (error == ENOSPC && mc != spa_normal_class(spa)) {
- /*
- * If throttling, transfer reservation over to normal class.
- * The io_allocator slot can remain the same even though we
- * are switching classes.
- */
- if (mc->mc_alloc_throttle_enabled &&
- (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
- metaslab_class_throttle_unreserve(mc,
- zio->io_prop.zp_copies, zio->io_allocator, zio);
- zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
-
- mc = spa_normal_class(spa);
- VERIFY(metaslab_class_throttle_reserve(mc,
- zio->io_prop.zp_copies, zio->io_allocator, zio,
- flags | METASLAB_MUST_RESERVE));
- } else {
- mc = spa_normal_class(spa);
- }
- zio->io_metaslab_class = mc;
-
- error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
- &zio->io_alloc_list, zio, zio->io_allocator);
- }
-
- if (error != 0) {
- zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
- "size %llu, error %d", spa_name(spa), zio, zio->io_size,
- error);
- if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
- return (zio_write_gang_block(zio));
- zio->io_error = error;
- }
-
- return (zio);
-}
-
-static zio_t *
-zio_dva_free(zio_t *zio)
-{
- metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
-
- return (zio);
-}
-
-static zio_t *
-zio_dva_claim(zio_t *zio)
-{
- int error;
-
- error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
- if (error)
- zio->io_error = error;
-
- return (zio);
-}
-
-/*
- * Undo an allocation. This is used by zio_done() when an I/O fails
- * and we want to give back the block we just allocated.
- * This handles both normal blocks and gang blocks.
- */
-static void
-zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
-{
- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
- ASSERT(zio->io_bp_override == NULL);
-
- if (!BP_IS_HOLE(bp))
- metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
-
- if (gn != NULL) {
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- zio_dva_unallocate(zio, gn->gn_child[g],
- &gn->gn_gbh->zg_blkptr[g]);
- }
- }
-}
-
-/*
- * Try to allocate an intent log block. Return 0 on success, errno on failure.
- */
-int
-zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t size, boolean_t *slog)
-{
- int error = 1;
- zio_alloc_list_t io_alloc_list;
-
- ASSERT(txg > spa_syncing_txg(spa));
-
- metaslab_trace_init(&io_alloc_list);
-
- /*
- * Block pointer fields are useful to metaslabs for stats and debugging.
- * Fill in the obvious ones before calling into metaslab_alloc().
- */
- BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
- BP_SET_PSIZE(new_bp, size);
- BP_SET_LEVEL(new_bp, 0);
-
- /*
- * When allocating a zil block, we don't have information about
- * the final destination of the block except the objset it's part
- * of, so we just hash the objset ID to pick the allocator to get
- * some parallelism.
- */
- error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
- txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL,
- cityhash4(0, 0, 0, objset) % spa->spa_alloc_count);
- if (error == 0) {
- *slog = TRUE;
- } else {
- error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
- &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) %
- spa->spa_alloc_count);
- if (error == 0)
- *slog = FALSE;
- }
- metaslab_trace_fini(&io_alloc_list);
-
- if (error == 0) {
- BP_SET_LSIZE(new_bp, size);
- BP_SET_PSIZE(new_bp, size);
- BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(new_bp,
- spa_version(spa) >= SPA_VERSION_SLIM_ZIL
- ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
- BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
- BP_SET_LEVEL(new_bp, 0);
- BP_SET_DEDUP(new_bp, 0);
- BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
- } else {
- zfs_dbgmsg("%s: zil block allocation failure: "
- "size %llu, error %d", spa_name(spa), size, error);
- }
-
- return (error);
-}
-
-/*
- * ==========================================================================
- * Read, write and delete to physical devices
- * ==========================================================================
- */
-
-
-/*
- * Issue an I/O to the underlying vdev. Typically the issue pipeline
- * stops after this stage and will resume upon I/O completion.
- * However, there are instances where the vdev layer may need to
- * continue the pipeline when an I/O was not issued. Since the I/O
- * that was sent to the vdev layer might be different than the one
- * currently active in the pipeline (see vdev_queue_io()), we explicitly
- * force the underlying vdev layers to call either zio_execute() or
- * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
- */
-static zio_t *
-zio_vdev_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- uint64_t align;
- spa_t *spa = zio->io_spa;
- int ret;
-
- ASSERT(zio->io_error == 0);
- ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
-
- if (vd == NULL) {
- if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
- spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
-
- /*
- * The mirror_ops handle multiple DVAs in a single BP.
- */
- vdev_mirror_ops.vdev_op_io_start(zio);
- return (NULL);
- }
-
- if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
- zio->io_priority == ZIO_PRIORITY_NOW) {
- trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
- return (zio);
- }
-
- ASSERT3P(zio->io_logical, !=, zio);
- if (zio->io_type == ZIO_TYPE_WRITE) {
- ASSERT(spa->spa_trust_config);
-
- if (zio->io_vd->vdev_removing) {
- /*
- * Note: the code can handle other kinds of writes,
- * but we don't expect them.
- */
- ASSERT(zio->io_flags &
- (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
- ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
- }
- }
-
- /*
- * We keep track of time-sensitive I/Os so that the scan thread
- * can quickly react to certain workloads. In particular, we care
- * about non-scrubbing, top-level reads and writes with the following
- * characteristics:
- * - synchronous writes of user data to non-slog devices
- * - any reads of user data
- * When these conditions are met, adjust the timestamp of spa_last_io
- * which allows the scan thread to adjust its workload accordingly.
- */
- if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
- vd == vd->vdev_top && !vd->vdev_islog &&
- zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
- zio->io_txg != spa_syncing_txg(spa)) {
- uint64_t old = spa->spa_last_io;
- uint64_t new = ddi_get_lbolt64();
- if (old != new)
- (void) atomic_cas_64(&spa->spa_last_io, old, new);
- }
- align = 1ULL << vd->vdev_top->vdev_ashift;
-
- if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
- P2PHASE(zio->io_size, align) != 0) {
- /* Transform logical writes to be a full physical block size. */
- uint64_t asize = P2ROUNDUP(zio->io_size, align);
- abd_t *abuf = NULL;
- if (zio->io_type == ZIO_TYPE_READ ||
- zio->io_type == ZIO_TYPE_WRITE)
- abuf = abd_alloc_sametype(zio->io_abd, asize);
- ASSERT(vd == vd->vdev_top);
- if (zio->io_type == ZIO_TYPE_WRITE) {
- abd_copy(abuf, zio->io_abd, zio->io_size);
- abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
- }
- zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
- zio_subblock);
- }
-
- /*
- * If this is not a physical io, make sure that it is properly aligned
- * before proceeding.
- */
- if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
- ASSERT0(P2PHASE(zio->io_offset, align));
- ASSERT0(P2PHASE(zio->io_size, align));
- } else {
- /*
- * For the physical io we allow alignment
- * to a logical block size.
- */
- uint64_t log_align =
- 1ULL << vd->vdev_top->vdev_logical_ashift;
- ASSERT0(P2PHASE(zio->io_offset, log_align));
- ASSERT0(P2PHASE(zio->io_size, log_align));
- }
-
- VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
-
- /*
- * If this is a repair I/O, and there's no self-healing involved --
- * that is, we're just resilvering what we expect to resilver --
- * then don't do the I/O unless zio's txg is actually in vd's DTL.
- * This prevents spurious resilvering.
- *
- * There are a few ways that we can end up creating these spurious
- * resilver i/os:
- *
- * 1. A resilver i/o will be issued if any DVA in the BP has a
- * dirty DTL. The mirror code will issue resilver writes to
- * each DVA, including the one(s) that are not on vdevs with dirty
- * DTLs.
- *
- * 2. With nested replication, which happens when we have a
- * "replacing" or "spare" vdev that's a child of a mirror or raidz.
- * For example, given mirror(replacing(A+B), C), it's likely that
- * only A is out of date (it's the new device). In this case, we'll
- * read from C, then use the data to resilver A+B -- but we don't
- * actually want to resilver B, just A. The top-level mirror has no
- * way to know this, so instead we just discard unnecessary repairs
- * as we work our way down the vdev tree.
- *
- * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
- * The same logic applies to any form of nested replication: ditto
- * + mirror, RAID-Z + replacing, etc.
- *
- * However, indirect vdevs point off to other vdevs which may have
- * DTL's, so we never bypass them. The child i/os on concrete vdevs
- * will be properly bypassed instead.
- */
- if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
- !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
- zio->io_txg != 0 && /* not a delegated i/o */
- vd->vdev_ops != &vdev_indirect_ops &&
- !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- zio_vdev_io_bypass(zio);
- return (zio);
- }
-
- if (vd->vdev_ops->vdev_op_leaf) {
- switch (zio->io_type) {
- case ZIO_TYPE_READ:
- if (vdev_cache_read(zio))
- return (zio);
- /* FALLTHROUGH */
- case ZIO_TYPE_WRITE:
- case ZIO_TYPE_FREE:
- if ((zio = vdev_queue_io(zio)) == NULL)
- return (NULL);
-
- if (!vdev_accessible(vd, zio)) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return (NULL);
- }
- break;
- }
- /*
- * Note that we ignore repair writes for TRIM because they can
- * conflict with normal writes. This isn't an issue because, by
- * definition, we only repair blocks that aren't freed.
- */
- if (zio->io_type == ZIO_TYPE_WRITE &&
- !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
- !trim_map_write_start(zio))
- return (NULL);
- }
-
- vd->vdev_ops->vdev_op_io_start(zio);
- return (NULL);
-}
-
-static zio_t *
-zio_vdev_io_done(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
- boolean_t unexpected_error = B_FALSE;
-
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
- return (NULL);
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ ||
- zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
-
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
- (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
- zio->io_type == ZIO_TYPE_FREE)) {
-
- if (zio->io_type == ZIO_TYPE_WRITE &&
- !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
- trim_map_write_done(zio);
-
- vdev_queue_io_done(zio);
-
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(vd,
- zio, EIO);
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_label_injection(zio, EIO);
-
- if (zio->io_error) {
- if (zio->io_error == ENOTSUP &&
- zio->io_type == ZIO_TYPE_FREE) {
- /* Not all devices support TRIM. */
- } else if (!vdev_accessible(vd, zio)) {
- zio->io_error = SET_ERROR(ENXIO);
- } else {
- unexpected_error = B_TRUE;
- }
- }
- }
-
- ops->vdev_op_io_done(zio);
-
- if (unexpected_error)
- VERIFY(vdev_probe(vd, zio) == NULL);
-
- return (zio);
-}
-
-/*
- * This function is used to change the priority of an existing zio that is
- * currently in-flight. This is used by the arc to upgrade priority in the
- * event that a demand read is made for a block that is currently queued
- * as a scrub or async read IO. Otherwise, the high priority read request
- * would end up having to wait for the lower priority IO.
- */
-void
-zio_change_priority(zio_t *pio, zio_priority_t priority)
-{
- zio_t *cio, *cio_next;
- zio_link_t *zl = NULL;
-
- ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-
- if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
- vdev_queue_change_io_priority(pio, priority);
- } else {
- pio->io_priority = priority;
- }
-
- mutex_enter(&pio->io_lock);
- for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
- cio_next = zio_walk_children(pio, &zl);
- zio_change_priority(cio, priority);
- }
- mutex_exit(&pio->io_lock);
-}
-
-/*
- * For non-raidz ZIOs, we can just copy aside the bad data read from the
- * disk, and use that to finish the checksum ereport later.
- */
-static void
-zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
- const void *good_buf)
-{
- /* no processing needed */
- zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
-}
-
-/*ARGSUSED*/
-void
-zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
-{
- void *buf = zio_buf_alloc(zio->io_size);
-
- abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
-
- zcr->zcr_cbinfo = zio->io_size;
- zcr->zcr_cbdata = buf;
- zcr->zcr_finish = zio_vsd_default_cksum_finish;
- zcr->zcr_free = zio_buf_free;
-}
-
-static zio_t *
-zio_vdev_io_assess(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
-
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
- return (NULL);
- }
-
- if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
- spa_config_exit(zio->io_spa, SCL_ZIO, zio);
-
- if (zio->io_vsd != NULL) {
- zio->io_vsd_ops->vsd_free(zio);
- zio->io_vsd = NULL;
- }
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_fault_injection(zio, EIO);
-
- if (zio->io_type == ZIO_TYPE_FREE &&
- zio->io_priority != ZIO_PRIORITY_NOW) {
- switch (zio->io_error) {
- case 0:
- ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
- ZIO_TRIM_STAT_BUMP(success);
- break;
- case EOPNOTSUPP:
- ZIO_TRIM_STAT_BUMP(unsupported);
- break;
- default:
- ZIO_TRIM_STAT_BUMP(failed);
- break;
- }
- }
-
- /*
- * If the I/O failed, determine whether we should attempt to retry it.
- *
- * On retry, we cut in line in the issue queue, since we don't want
- * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
- */
- if (zio->io_error && vd == NULL &&
- !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
- ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
- ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
- zio->io_error = 0;
- zio->io_flags |= ZIO_FLAG_IO_RETRY |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
- zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
- zio_requeue_io_start_cut_in_line);
- return (NULL);
- }
-
- /*
- * If we got an error on a leaf device, convert it to ENXIO
- * if the device is not accessible at all.
- */
- if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
- !vdev_accessible(vd, zio))
- zio->io_error = SET_ERROR(ENXIO);
-
- /*
- * If we can't write to an interior vdev (mirror or RAID-Z),
- * set vdev_cant_write so that we stop trying to allocate from it.
- */
- if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
- vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
- vd->vdev_cant_write = B_TRUE;
- }
-
- /*
- * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
- * attempts will ever succeed. In this case we set a persistent bit so
- * that we don't bother with it in the future.
- */
- if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
- zio->io_type == ZIO_TYPE_IOCTL &&
- zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
- vd->vdev_nowritecache = B_TRUE;
-
- if (zio->io_error)
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
- zio->io_physdone != NULL) {
- ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
- ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
- zio->io_physdone(zio->io_logical);
- }
-
- return (zio);
-}
-
-void
-zio_vdev_io_reissue(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
- ASSERT(zio->io_error == 0);
-
- zio->io_stage >>= 1;
-}
-
-void
-zio_vdev_io_redone(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
-
- zio->io_stage >>= 1;
-}
-
-void
-zio_vdev_io_bypass(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
- ASSERT(zio->io_error == 0);
-
- zio->io_flags |= ZIO_FLAG_IO_BYPASS;
- zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
-}
-
-/*
- * ==========================================================================
- * Generate and verify checksums
- * ==========================================================================
- */
-static zio_t *
-zio_checksum_generate(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- enum zio_checksum checksum;
-
- if (bp == NULL) {
- /*
- * This is zio_write_phys().
- * We're either generating a label checksum, or none at all.
- */
- checksum = zio->io_prop.zp_checksum;
-
- if (checksum == ZIO_CHECKSUM_OFF)
- return (zio);
-
- ASSERT(checksum == ZIO_CHECKSUM_LABEL);
- } else {
- if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
- ASSERT(!IO_IS_ALLOCATING(zio));
- checksum = ZIO_CHECKSUM_GANG_HEADER;
- } else {
- checksum = BP_GET_CHECKSUM(bp);
- }
- }
-
- zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
-
- return (zio);
-}
-
-static zio_t *
-zio_checksum_verify(zio_t *zio)
-{
- zio_bad_cksum_t info;
- blkptr_t *bp = zio->io_bp;
- int error;
-
- ASSERT(zio->io_vd != NULL);
-
- if (bp == NULL) {
- /*
- * This is zio_read_phys().
- * We're either verifying a label checksum, or nothing at all.
- */
- if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
- return (zio);
-
- ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
- }
-
- if ((error = zio_checksum_error(zio, &info)) != 0) {
- zio->io_error = error;
- if (error == ECKSUM &&
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- zfs_ereport_start_checksum(zio->io_spa,
- zio->io_vd, zio, zio->io_offset,
- zio->io_size, NULL, &info);
- }
- }
-
- return (zio);
-}
-
-/*
- * Called by RAID-Z to ensure we don't compute the checksum twice.
- */
-void
-zio_checksum_verified(zio_t *zio)
-{
- zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
-}
-
-/*
- * ==========================================================================
- * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
- * An error of 0 indicates success. ENXIO indicates whole-device failure,
- * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO
- * indicate errors that are specific to one I/O, and most likely permanent.
- * Any other error is presumed to be worse because we weren't expecting it.
- * ==========================================================================
- */
-int
-zio_worst_error(int e1, int e2)
-{
- static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
- int r1, r2;
-
- for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
- if (e1 == zio_error_rank[r1])
- break;
-
- for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
- if (e2 == zio_error_rank[r2])
- break;
-
- return (r1 > r2 ? e1 : e2);
-}
-
-/*
- * ==========================================================================
- * I/O completion
- * ==========================================================================
- */
-static zio_t *
-zio_ready(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- zio_t *pio, *pio_next;
- zio_link_t *zl = NULL;
-
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
- ZIO_WAIT_READY)) {
- return (NULL);
- }
-
- if (zio->io_ready) {
- ASSERT(IO_IS_ALLOCATING(zio));
- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
- (zio->io_flags & ZIO_FLAG_NOPWRITE));
- ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
-
- zio->io_ready(zio);
- }
-
- if (bp != NULL && bp != &zio->io_bp_copy)
- zio->io_bp_copy = *bp;
-
- if (zio->io_error != 0) {
- zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
- ASSERT(IO_IS_ALLOCATING(zio));
- ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
- ASSERT(zio->io_metaslab_class != NULL);
-
- /*
- * We were unable to allocate anything, unreserve and
- * issue the next I/O to allocate.
- */
- metaslab_class_throttle_unreserve(
- zio->io_metaslab_class, zio->io_prop.zp_copies,
- zio->io_allocator, zio);
- zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
- }
- }
-
- mutex_enter(&zio->io_lock);
- zio->io_state[ZIO_WAIT_READY] = 1;
- pio = zio_walk_parents(zio, &zl);
- mutex_exit(&zio->io_lock);
-
- /*
- * As we notify zio's parents, new parents could be added.
- * New parents go to the head of zio's io_parent_list, however,
- * so we will (correctly) not notify them. The remainder of zio's
- * io_parent_list, from 'pio_next' onward, cannot change because
- * all parents must wait for us to be done before they can be done.
- */
- for (; pio != NULL; pio = pio_next) {
- pio_next = zio_walk_parents(zio, &zl);
- zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
- }
-
- if (zio->io_flags & ZIO_FLAG_NODATA) {
- if (BP_IS_GANG(bp)) {
- zio->io_flags &= ~ZIO_FLAG_NODATA;
- } else {
- ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
- zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
- }
- }
-
- if (zio_injection_enabled &&
- zio->io_spa->spa_syncing_txg == zio->io_txg)
- zio_handle_ignored_writes(zio);
-
- return (zio);
-}
-
-/*
- * Update the allocation throttle accounting.
- */
-static void
-zio_dva_throttle_done(zio_t *zio)
-{
- zio_t *lio = zio->io_logical;
- zio_t *pio = zio_unique_parent(zio);
- vdev_t *vd = zio->io_vd;
- int flags = METASLAB_ASYNC_ALLOC;
-
- ASSERT3P(zio->io_bp, !=, NULL);
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
- ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
- ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
- ASSERT(vd != NULL);
- ASSERT3P(vd, ==, vd->vdev_top);
- ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
- ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
- ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
- ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
-
- /*
- * Parents of gang children can have two flavors -- ones that
- * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
- * and ones that allocated the constituent blocks. The allocation
- * throttle needs to know the allocating parent zio so we must find
- * it here.
- */
- if (pio->io_child_type == ZIO_CHILD_GANG) {
- /*
- * If our parent is a rewrite gang child then our grandparent
- * would have been the one that performed the allocation.
- */
- if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
- pio = zio_unique_parent(pio);
- flags |= METASLAB_GANG_CHILD;
- }
-
- ASSERT(IO_IS_ALLOCATING(pio));
- ASSERT3P(zio, !=, zio->io_logical);
- ASSERT(zio->io_logical != NULL);
- ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
- ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
- ASSERT(zio->io_metaslab_class != NULL);
-
- mutex_enter(&pio->io_lock);
- metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
- pio->io_allocator, B_TRUE);
- mutex_exit(&pio->io_lock);
-
- metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
- pio->io_allocator, pio);
-
- /*
- * Call into the pipeline to see if there is more work that
- * needs to be done. If there is work to be done it will be
- * dispatched to another taskq thread.
- */
- zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
-}
-
-static zio_t *
-zio_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- zio_t *lio = zio->io_logical;
- blkptr_t *bp = zio->io_bp;
- vdev_t *vd = zio->io_vd;
- uint64_t psize = zio->io_size;
- zio_t *pio, *pio_next;
- zio_link_t *zl = NULL;
-
- /*
- * If our children haven't all completed,
- * wait for them and then repeat this pipeline stage.
- */
- if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
- return (NULL);
- }
-
- /*
- * If the allocation throttle is enabled, then update the accounting.
- * We only track child I/Os that are part of an allocating async
- * write. We must do this since the allocation is performed
- * by the logical I/O but the actual write is done by child I/Os.
- */
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
- zio->io_child_type == ZIO_CHILD_VDEV) {
- ASSERT(zio->io_metaslab_class != NULL);
- ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
- zio_dva_throttle_done(zio);
- }
-
- /*
- * If the allocation throttle is enabled, verify that
- * we have decremented the refcounts for every I/O that was throttled.
- */
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
- ASSERT(bp != NULL);
-
- metaslab_group_alloc_verify(spa, zio->io_bp, zio,
- zio->io_allocator);
- VERIFY(zfs_refcount_not_held(
- &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
- zio));
- }
-
- for (int c = 0; c < ZIO_CHILD_TYPES; c++)
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- ASSERT(zio->io_children[c][w] == 0);
-
- if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
- ASSERT(bp->blk_pad[0] == 0);
- ASSERT(bp->blk_pad[1] == 0);
- ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
- (bp == zio_unique_parent(zio)->io_bp));
- if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
- zio->io_bp_override == NULL &&
- !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
- ASSERT(!BP_SHOULD_BYTESWAP(bp));
- ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
- ASSERT(BP_COUNT_GANG(bp) == 0 ||
- (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
- }
- if (zio->io_flags & ZIO_FLAG_NOPWRITE)
- VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
- }
-
- /*
- * If there were child vdev/gang/ddt errors, they apply to us now.
- */
- zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
- zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
- zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
-
- /*
- * If the I/O on the transformed data was successful, generate any
- * checksum reports now while we still have the transformed data.
- */
- if (zio->io_error == 0) {
- while (zio->io_cksum_report != NULL) {
- zio_cksum_report_t *zcr = zio->io_cksum_report;
- uint64_t align = zcr->zcr_align;
- uint64_t asize = P2ROUNDUP(psize, align);
- char *abuf = NULL;
- abd_t *adata = zio->io_abd;
-
- if (asize != psize) {
- adata = abd_alloc_linear(asize, B_TRUE);
- abd_copy(adata, zio->io_abd, psize);
- abd_zero_off(adata, psize, asize - psize);
- }
-
- if (adata != NULL)
- abuf = abd_borrow_buf_copy(adata, asize);
-
- zio->io_cksum_report = zcr->zcr_next;
- zcr->zcr_next = NULL;
- zcr->zcr_finish(zcr, abuf);
- zfs_ereport_free_checksum(zcr);
-
- if (adata != NULL)
- abd_return_buf(adata, abuf, asize);
-
- if (asize != psize)
- abd_free(adata);
- }
- }
-
- zio_pop_transforms(zio); /* note: may set zio->io_error */
-
- vdev_stat_update(zio, psize);
-
- if (zio->io_error) {
- /*
- * If this I/O is attached to a particular vdev,
- * generate an error message describing the I/O failure
- * at the block level. We ignore these errors if the
- * device is currently unavailable.
- */
- if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
- zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
-
- if ((zio->io_error == EIO || !(zio->io_flags &
- (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
- zio == lio) {
- /*
- * For logical I/O requests, tell the SPA to log the
- * error and generate a logical data ereport.
- */
- spa_log_error(spa, zio);
- zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
- 0, 0);
- }
- }
-
- if (zio->io_error && zio == lio) {
- /*
- * Determine whether zio should be reexecuted. This will
- * propagate all the way to the root via zio_notify_parent().
- */
- ASSERT(vd == NULL && bp != NULL);
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-
- if (IO_IS_ALLOCATING(zio) &&
- !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
- if (zio->io_error != ENOSPC)
- zio->io_reexecute |= ZIO_REEXECUTE_NOW;
- else
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
- }
-
- if ((zio->io_type == ZIO_TYPE_READ ||
- zio->io_type == ZIO_TYPE_FREE) &&
- !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
- zio->io_error == ENXIO &&
- spa_load_state(spa) == SPA_LOAD_NONE &&
- spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
-
- if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
-
- /*
- * Here is a possibly good place to attempt to do
- * either combinatorial reconstruction or error correction
- * based on checksums. It also might be a good place
- * to send out preliminary ereports before we suspend
- * processing.
- */
- }
-
- /*
- * If there were logical child errors, they apply to us now.
- * We defer this until now to avoid conflating logical child
- * errors with errors that happened to the zio itself when
- * updating vdev stats and reporting FMA events above.
- */
- zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
-
- if ((zio->io_error || zio->io_reexecute) &&
- IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
- !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
- zio_dva_unallocate(zio, zio->io_gang_tree, bp);
-
- zio_gang_tree_free(&zio->io_gang_tree);
-
- /*
- * Godfather I/Os should never suspend.
- */
- if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
- (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
- zio->io_reexecute = 0;
-
- if (zio->io_reexecute) {
- /*
- * This is a logical I/O that wants to reexecute.
- *
- * Reexecute is top-down. When an i/o fails, if it's not
- * the root, it simply notifies its parent and sticks around.
- * The parent, seeing that it still has children in zio_done(),
- * does the same. This percolates all the way up to the root.
- * The root i/o will reexecute or suspend the entire tree.
- *
- * This approach ensures that zio_reexecute() honors
- * all the original i/o dependency relationships, e.g.
- * parents not executing until children are ready.
- */
- ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-
- zio->io_gang_leader = NULL;
-
- mutex_enter(&zio->io_lock);
- zio->io_state[ZIO_WAIT_DONE] = 1;
- mutex_exit(&zio->io_lock);
-
- /*
- * "The Godfather" I/O monitors its children but is
- * not a true parent to them. It will track them through
- * the pipeline but severs its ties whenever they get into
- * trouble (e.g. suspended). This allows "The Godfather"
- * I/O to return status without blocking.
- */
- zl = NULL;
- for (pio = zio_walk_parents(zio, &zl); pio != NULL;
- pio = pio_next) {
- zio_link_t *remove_zl = zl;
- pio_next = zio_walk_parents(zio, &zl);
-
- if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
- (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
- zio_remove_child(pio, zio, remove_zl);
- /*
- * This is a rare code path, so we don't
- * bother with "next_to_execute".
- */
- zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
- NULL);
- }
- }
-
- if ((pio = zio_unique_parent(zio)) != NULL) {
- /*
- * We're not a root i/o, so there's nothing to do
- * but notify our parent. Don't propagate errors
- * upward since we haven't permanently failed yet.
- */
- ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
- zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
- /*
- * This is a rare code path, so we don't bother with
- * "next_to_execute".
- */
- zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
- } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
- /*
- * We'd fail again if we reexecuted now, so suspend
- * until conditions improve (e.g. device comes online).
- */
- zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
- } else {
- /*
- * Reexecution is potentially a huge amount of work.
- * Hand it off to the otherwise-unused claim taskq.
- */
-#if defined(illumos) || !defined(_KERNEL)
- ASSERT(zio->io_tqent.tqent_next == NULL);
-#else
- ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
-#endif
- spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
- ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
- 0, &zio->io_tqent);
- }
- return (NULL);
- }
-
- ASSERT(zio->io_child_count == 0);
- ASSERT(zio->io_reexecute == 0);
- ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
-
- /*
- * Report any checksum errors, since the I/O is complete.
- */
- while (zio->io_cksum_report != NULL) {
- zio_cksum_report_t *zcr = zio->io_cksum_report;
- zio->io_cksum_report = zcr->zcr_next;
- zcr->zcr_next = NULL;
- zcr->zcr_finish(zcr, NULL);
- zfs_ereport_free_checksum(zcr);
- }
-
- /*
- * It is the responsibility of the done callback to ensure that this
- * particular zio is no longer discoverable for adoption, and as
- * such, cannot acquire any new parents.
- */
- if (zio->io_done)
- zio->io_done(zio);
-
- mutex_enter(&zio->io_lock);
- zio->io_state[ZIO_WAIT_DONE] = 1;
- mutex_exit(&zio->io_lock);
-
- /*
- * We are done executing this zio. We may want to execute a parent
- * next. See the comment in zio_notify_parent().
- */
- zio_t *next_to_execute = NULL;
- zl = NULL;
- for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
- zio_link_t *remove_zl = zl;
- pio_next = zio_walk_parents(zio, &zl);
- zio_remove_child(pio, zio, remove_zl);
- zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
- }
-
- if (zio->io_waiter != NULL) {
- mutex_enter(&zio->io_lock);
- zio->io_executor = NULL;
- cv_broadcast(&zio->io_cv);
- mutex_exit(&zio->io_lock);
- } else {
- zio_destroy(zio);
- }
-
- return (next_to_execute);
-}
-
-/*
- * ==========================================================================
- * I/O pipeline definition
- * ==========================================================================
- */
-static zio_pipe_stage_t *zio_pipeline[] = {
- NULL,
- zio_read_bp_init,
- zio_write_bp_init,
- zio_free_bp_init,
- zio_issue_async,
- zio_write_compress,
- zio_checksum_generate,
- zio_nop_write,
- zio_ddt_read_start,
- zio_ddt_read_done,
- zio_ddt_write,
- zio_ddt_free,
- zio_gang_assemble,
- zio_gang_issue,
- zio_dva_throttle,
- zio_dva_allocate,
- zio_dva_free,
- zio_dva_claim,
- zio_ready,
- zio_vdev_io_start,
- zio_vdev_io_done,
- zio_vdev_io_assess,
- zio_checksum_verify,
- zio_done
-};
-
-
-
-
-/*
- * Compare two zbookmark_phys_t's to see which we would reach first in a
- * pre-order traversal of the object tree.
- *
- * This is simple in every case aside from the meta-dnode object. For all other
- * objects, we traverse them in order (object 1 before object 2, and so on).
- * However, all of these objects are traversed while traversing object 0, since
- * the data it points to is the list of objects. Thus, we need to convert to a
- * canonical representation so we can compare meta-dnode bookmarks to
- * non-meta-dnode bookmarks.
- *
- * We do this by calculating "equivalents" for each field of the zbookmark.
- * zbookmarks outside of the meta-dnode use their own object and level, and
- * calculate the level 0 equivalent (the first L0 blkid that is contained in the
- * blocks this bookmark refers to) by multiplying their blkid by their span
- * (the number of L0 blocks contained within one block at their level).
- * zbookmarks inside the meta-dnode calculate their object equivalent
- * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
- * level + 1<<31 (any value larger than a level could ever be) for their level.
- * This causes them to always compare before a bookmark in their object
- * equivalent, compare appropriately to bookmarks in other objects, and to
- * compare appropriately to other bookmarks in the meta-dnode.
- */
-int
-zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
- const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
-{
- /*
- * These variables represent the "equivalent" values for the zbookmark,
- * after converting zbookmarks inside the meta dnode to their
- * normal-object equivalents.
- */
- uint64_t zb1obj, zb2obj;
- uint64_t zb1L0, zb2L0;
- uint64_t zb1level, zb2level;
-
- if (zb1->zb_object == zb2->zb_object &&
- zb1->zb_level == zb2->zb_level &&
- zb1->zb_blkid == zb2->zb_blkid)
- return (0);
-
- /*
- * BP_SPANB calculates the span in blocks.
- */
- zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
- zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
-
- if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
- zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
- zb1L0 = 0;
- zb1level = zb1->zb_level + COMPARE_META_LEVEL;
- } else {
- zb1obj = zb1->zb_object;
- zb1level = zb1->zb_level;
- }
-
- if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
- zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
- zb2L0 = 0;
- zb2level = zb2->zb_level + COMPARE_META_LEVEL;
- } else {
- zb2obj = zb2->zb_object;
- zb2level = zb2->zb_level;
- }
-
- /* Now that we have a canonical representation, do the comparison. */
- if (zb1obj != zb2obj)
- return (zb1obj < zb2obj ? -1 : 1);
- else if (zb1L0 != zb2L0)
- return (zb1L0 < zb2L0 ? -1 : 1);
- else if (zb1level != zb2level)
- return (zb1level > zb2level ? -1 : 1);
- /*
- * This can (theoretically) happen if the bookmarks have the same object
- * and level, but different blkids, if the block sizes are not the same.
- * There is presently no way to change the indirect block sizes
- */
- return (0);
-}
-
-/*
- * This function checks the following: given that last_block is the place that
- * our traversal stopped last time, does that guarantee that we've visited
- * every node under subtree_root? Therefore, we can't just use the raw output
- * of zbookmark_compare. We have to pass in a modified version of
- * subtree_root; by incrementing the block id, and then checking whether
- * last_block is before or equal to that, we can tell whether or not having
- * visited last_block implies that all of subtree_root's children have been
- * visited.
- */
-boolean_t
-zbookmark_subtree_completed(const dnode_phys_t *dnp,
- const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
-{
- zbookmark_phys_t mod_zb = *subtree_root;
- mod_zb.zb_blkid++;
- ASSERT(last_block->zb_level == 0);
-
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
-
- /*
- * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
- * data block size in sectors, because that variable is only used if
- * the bookmark refers to a block in the meta-dnode. Since we don't
- * know without examining it what object it refers to, and there's no
- * harm in passing in this value in other cases, we always pass it in.
- *
- * We pass in 0 for the indirect block size shift because zb2 must be
- * level 0. The indirect block size is only used to calculate the span
- * of the bookmark, but since the bookmark must be level 0, the span is
- * always 1, so the math works out.
- *
- * If you make changes to how the zbookmark_compare code works, be sure
- * to make sure that this code still works afterwards.
- */
- return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
- 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
- last_block) <= 0);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -1,475 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zil.h>
-#include <sys/abd.h>
-#include <zfs_fletcher.h>
-
-/*
- * Checksum vectors.
- *
- * In the SPA, everything is checksummed. We support checksum vectors
- * for three distinct reasons:
- *
- * 1. Different kinds of data need different levels of protection.
- * For SPA metadata, we always want a very strong checksum.
- * For user data, we let users make the trade-off between speed
- * and checksum strength.
- *
- * 2. Cryptographic hash and MAC algorithms are an area of active research.
- * It is likely that in future hash functions will be at least as strong
- * as current best-of-breed, and may be substantially faster as well.
- * We want the ability to take advantage of these new hashes as soon as
- * they become available.
- *
- * 3. If someone develops hardware that can compute a strong hash quickly,
- * we want the ability to take advantage of that hardware.
- *
- * Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in eight bits of the bp.
- * This gives us room for up to 256 different checksum functions.
- *
- * When writing a block, we always checksum it with the latest-and-greatest
- * checksum function of the appropriate strength. When reading a block,
- * we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
- *
- * SALTED CHECKSUMS
- *
- * To enable the use of less secure hash algorithms with dedup, we
- * introduce the notion of salted checksums (MACs, really). A salted
- * checksum is fed both a random 256-bit value (the salt) and the data
- * to be checksummed. This salt is kept secret (stored on the pool, but
- * never shown to the user). Thus even if an attacker knew of collision
- * weaknesses in the hash algorithm, they won't be able to mount a known
- * plaintext attack on the DDT, since the actual hash value cannot be
- * known ahead of time. How the salt is used is algorithm-specific
- * (some might simply prefix it to the data block, others might need to
- * utilize a full-blown HMAC). On disk the salt is stored in a ZAP
- * object in the MOS (DMU_POOL_CHECKSUM_SALT).
- *
- * CONTEXT TEMPLATES
- *
- * Some hashing algorithms need to perform a substantial amount of
- * initialization work (e.g. salted checksums above may need to pre-hash
- * the salt) before being able to process data. Performing this
- * redundant work for each block would be wasteful, so we instead allow
- * a checksum algorithm to do the work once (the first time it's used)
- * and then keep this pre-initialized context as a template inside the
- * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains
- * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
- * construct and destruct the pre-initialized checksum context. The
- * pre-initialized context is then reused during each checksum
- * invocation and passed to the checksum function.
- */
-
-/*ARGSUSED*/
-static void
-abd_checksum_off(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
-}
-
-/*ARGSUSED*/
-void
-abd_fletcher_2_native(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) abd_iterate_func(abd, 0, size,
- fletcher_2_incremental_native, zcp);
-}
-
-/*ARGSUSED*/
-void
-abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) abd_iterate_func(abd, 0, size,
- fletcher_2_incremental_byteswap, zcp);
-}
-
-/*ARGSUSED*/
-void
-abd_fletcher_4_native(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) abd_iterate_func(abd, 0, size,
- fletcher_4_incremental_native, zcp);
-}
-
-/*ARGSUSED*/
-void
-abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
-{
- fletcher_init(zcp);
- (void) abd_iterate_func(abd, 0, size,
- fletcher_4_incremental_byteswap, zcp);
-}
-
-zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, NULL, NULL, 0, "inherit"},
- {{NULL, NULL}, NULL, NULL, 0, "on"},
- {{abd_checksum_off, abd_checksum_off},
- NULL, NULL, 0, "off"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
- NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
- "label"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
- NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
- "gang_header"},
- {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
- NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
- {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
- NULL, NULL, 0, "fletcher2"},
- {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
- NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
- NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
- ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
- {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
- NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
- {{abd_checksum_off, abd_checksum_off},
- NULL, NULL, 0, "noparity"},
- {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
- NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
- ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
- {{abd_checksum_skein_native, abd_checksum_skein_byteswap},
- abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
- ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
- ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
-#ifdef illumos
- {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
- abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
- ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
- ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
-#endif
-};
-
-/*
- * The flag corresponding to the "verify" in dedup=[checksum,]verify
- * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
- */
-spa_feature_t
-zio_checksum_to_feature(enum zio_checksum cksum)
-{
- VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
-
- switch (cksum) {
- case ZIO_CHECKSUM_SHA512:
- return (SPA_FEATURE_SHA512);
- case ZIO_CHECKSUM_SKEIN:
- return (SPA_FEATURE_SKEIN);
-#ifdef illumos
- case ZIO_CHECKSUM_EDONR:
- return (SPA_FEATURE_EDONR);
-#endif
- }
- return (SPA_FEATURE_NONE);
-}
-
-enum zio_checksum
-zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
-{
- ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
-
- if (child == ZIO_CHECKSUM_INHERIT)
- return (parent);
-
- if (child == ZIO_CHECKSUM_ON)
- return (ZIO_CHECKSUM_ON_VALUE);
-
- return (child);
-}
-
-enum zio_checksum
-zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
- enum zio_checksum parent)
-{
- ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
-
- if (child == ZIO_CHECKSUM_INHERIT)
- return (parent);
-
- if (child == ZIO_CHECKSUM_ON)
- return (spa_dedup_checksum(spa));
-
- if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
- return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
-
- ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
- ZCHECKSUM_FLAG_DEDUP) ||
- (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
-
- return (child);
-}
-
-/*
- * Set the external verifier for a gang block based on <vdev, offset, txg>,
- * a tuple which is guaranteed to be unique for the life of the pool.
- */
-static void
-zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
-{
- dva_t *dva = BP_IDENTITY(bp);
- uint64_t txg = BP_PHYSICAL_BIRTH(bp);
-
- ASSERT(BP_IS_GANG(bp));
-
- ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
-}
-
-/*
- * Set the external verifier for a label block based on its offset.
- * The vdev is implicit, and the txg is unknowable at pool open time --
- * hence the logic in vdev_uberblock_load() to find the most recent copy.
- */
-static void
-zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
-{
- ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
-}
-
-/*
- * Calls the template init function of a checksum which supports context
- * templates and installs the template into the spa_t.
- */
-static void
-zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
-{
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-
- if (ci->ci_tmpl_init == NULL)
- return;
- if (spa->spa_cksum_tmpls[checksum] != NULL)
- return;
-
- VERIFY(ci->ci_tmpl_free != NULL);
- mutex_enter(&spa->spa_cksum_tmpls_lock);
- if (spa->spa_cksum_tmpls[checksum] == NULL) {
- spa->spa_cksum_tmpls[checksum] =
- ci->ci_tmpl_init(&spa->spa_cksum_salt);
- VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
- }
- mutex_exit(&spa->spa_cksum_tmpls_lock);
-}
-
-/*
- * Generate the checksum.
- */
-void
-zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
- abd_t *abd, uint64_t size)
-{
- blkptr_t *bp = zio->io_bp;
- uint64_t offset = zio->io_offset;
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t cksum;
- spa_t *spa = zio->io_spa;
-
- ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(ci->ci_func[0] != NULL);
-
- zio_checksum_template_init(checksum, spa);
-
- if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
- zio_eck_t *eck;
- void *data = abd_to_buf(abd);
-
- if (checksum == ZIO_CHECKSUM_ZILOG2) {
- zil_chain_t *zilc = data;
-
- size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
- uint64_t);
- eck = &zilc->zc_eck;
- } else {
- eck = (zio_eck_t *)((char *)data + size) - 1;
- }
- if (checksum == ZIO_CHECKSUM_GANG_HEADER)
- zio_checksum_gang_verifier(&eck->zec_cksum, bp);
- else if (checksum == ZIO_CHECKSUM_LABEL)
- zio_checksum_label_verifier(&eck->zec_cksum, offset);
- else
- bp->blk_cksum = eck->zec_cksum;
- eck->zec_magic = ZEC_MAGIC;
- ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
- &cksum);
- eck->zec_cksum = cksum;
- } else {
- ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
- &bp->blk_cksum);
- }
-}
-
-int
-zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
- abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
-{
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t actual_cksum, expected_cksum;
- int byteswap;
-
- if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
- return (SET_ERROR(EINVAL));
-
- zio_checksum_template_init(checksum, spa);
-
- if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
- zio_eck_t *eck;
- zio_cksum_t verifier;
- uint64_t data_size = size;
- void *data = abd_borrow_buf_copy(abd, data_size);
-
- if (checksum == ZIO_CHECKSUM_ZILOG2) {
- zil_chain_t *zilc = data;
- uint64_t nused;
-
- eck = &zilc->zc_eck;
- if (eck->zec_magic == ZEC_MAGIC) {
- nused = zilc->zc_nused;
- } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) {
- nused = BSWAP_64(zilc->zc_nused);
- } else {
- abd_return_buf(abd, data, data_size);
- return (SET_ERROR(ECKSUM));
- }
-
- if (nused > data_size) {
- abd_return_buf(abd, data, data_size);
- return (SET_ERROR(ECKSUM));
- }
-
- size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
- } else {
- eck = (zio_eck_t *)((char *)data + data_size) - 1;
- }
-
- if (checksum == ZIO_CHECKSUM_GANG_HEADER)
- zio_checksum_gang_verifier(&verifier, bp);
- else if (checksum == ZIO_CHECKSUM_LABEL)
- zio_checksum_label_verifier(&verifier, offset);
- else
- verifier = bp->blk_cksum;
-
- byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
-
- if (byteswap)
- byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
-
- size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data;
- expected_cksum = eck->zec_cksum;
- eck->zec_cksum = verifier;
- abd_return_buf_copy(abd, data, data_size);
-
- ci->ci_func[byteswap](abd, size,
- spa->spa_cksum_tmpls[checksum], &actual_cksum);
- abd_copy_from_buf_off(abd, &expected_cksum,
- eck_offset, sizeof (zio_cksum_t));
-
- if (byteswap) {
- byteswap_uint64_array(&expected_cksum,
- sizeof (zio_cksum_t));
- }
- } else {
- byteswap = BP_SHOULD_BYTESWAP(bp);
- expected_cksum = bp->blk_cksum;
- ci->ci_func[byteswap](abd, size,
- spa->spa_cksum_tmpls[checksum], &actual_cksum);
- }
-
- if (info != NULL) {
- info->zbc_expected = expected_cksum;
- info->zbc_actual = actual_cksum;
- info->zbc_checksum_name = ci->ci_name;
- info->zbc_byteswapped = byteswap;
- info->zbc_injected = 0;
- info->zbc_has_cksum = 1;
- }
-
- if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
- return (SET_ERROR(ECKSUM));
-
- return (0);
-}
-
-int
-zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
-{
- blkptr_t *bp = zio->io_bp;
- uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
- (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
- int error;
- uint64_t size = (bp == NULL ? zio->io_size :
- (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
- uint64_t offset = zio->io_offset;
- abd_t *data = zio->io_abd;
- spa_t *spa = zio->io_spa;
-
- error = zio_checksum_error_impl(spa, bp, checksum, data, size,
- offset, info);
-
- if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
- error = zio_handle_fault_injection(zio, ECKSUM);
- if (error != 0)
- info->zbc_injected = 1;
- }
-
- return (error);
-}
-
-/*
- * Called by a spa_t that's about to be deallocated. This steps through
- * all of the checksum context templates and deallocates any that were
- * initialized using the algorithm-specific template init function.
- */
-void
-zio_checksum_templates_free(spa_t *spa)
-{
- for (enum zio_checksum checksum = 0;
- checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
- if (spa->spa_cksum_tmpls[checksum] != NULL) {
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-
- VERIFY(ci->ci_tmpl_free != NULL);
- ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
- spa->spa_cksum_tmpls[checksum] = NULL;
- }
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
@@ -1,215 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/compress.h>
-#include <sys/kstat.h>
-#include <sys/spa.h>
-#include <sys/zfeature.h>
-#include <sys/zio.h>
-#include <sys/zio_compress.h>
-
-typedef struct zcomp_stats {
- kstat_named_t zcompstat_attempts;
- kstat_named_t zcompstat_empty;
- kstat_named_t zcompstat_skipped_insufficient_gain;
-} zcomp_stats_t;
-
-static zcomp_stats_t zcomp_stats = {
- { "attempts", KSTAT_DATA_UINT64 },
- { "empty", KSTAT_DATA_UINT64 },
- { "skipped_insufficient_gain", KSTAT_DATA_UINT64 }
-};
-
-#define ZCOMPSTAT_INCR(stat, val) \
- atomic_add_64(&zcomp_stats.stat.value.ui64, (val));
-
-#define ZCOMPSTAT_BUMP(stat) ZCOMPSTAT_INCR(stat, 1);
-
-kstat_t *zcomp_ksp;
-
-/*
- * If nonzero, every 1/X decompression attempts will fail, simulating
- * an undetected memory error.
- */
-uint64_t zio_decompress_fail_fraction = 0;
-
-/*
- * Compression vectors.
- */
-zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
- {"inherit", 0, NULL, NULL},
- {"on", 0, NULL, NULL},
- {"uncompressed", 0, NULL, NULL},
- {"lzjb", 0, lzjb_compress, lzjb_decompress},
- {"empty", 0, NULL, NULL},
- {"gzip-1", 1, gzip_compress, gzip_decompress},
- {"gzip-2", 2, gzip_compress, gzip_decompress},
- {"gzip-3", 3, gzip_compress, gzip_decompress},
- {"gzip-4", 4, gzip_compress, gzip_decompress},
- {"gzip-5", 5, gzip_compress, gzip_decompress},
- {"gzip-6", 6, gzip_compress, gzip_decompress},
- {"gzip-7", 7, gzip_compress, gzip_decompress},
- {"gzip-8", 8, gzip_compress, gzip_decompress},
- {"gzip-9", 9, gzip_compress, gzip_decompress},
- {"zle", 64, zle_compress, zle_decompress},
- {"lz4", 0, lz4_compress, lz4_decompress}
-};
-
-enum zio_compress
-zio_compress_select(spa_t *spa, enum zio_compress child,
- enum zio_compress parent)
-{
- enum zio_compress result;
-
- ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
- ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
- ASSERT(parent != ZIO_COMPRESS_INHERIT);
-
- result = child;
- if (result == ZIO_COMPRESS_INHERIT)
- result = parent;
-
- if (result == ZIO_COMPRESS_ON) {
- if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
- result = ZIO_COMPRESS_LZ4_ON_VALUE;
- else
- result = ZIO_COMPRESS_LEGACY_ON_VALUE;
- }
-
- return (result);
-}
-
-/*ARGSUSED*/
-static int
-zio_compress_zeroed_cb(void *data, size_t len, void *private)
-{
- uint64_t *end = (uint64_t *)((char *)data + len);
- for (uint64_t *word = (uint64_t *)data; word < end; word++)
- if (*word != 0)
- return (1);
-
- return (0);
-}
-
-size_t
-zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len)
-{
- size_t c_len, d_len;
- zio_compress_info_t *ci = &zio_compress_table[c];
-
- ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
- ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
-
- ZCOMPSTAT_BUMP(zcompstat_attempts);
-
- /*
- * If the data is all zeroes, we don't even need to allocate
- * a block for it. We indicate this by returning zero size.
- */
- if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) {
- ZCOMPSTAT_BUMP(zcompstat_empty);
- return (0);
- }
-
- if (c == ZIO_COMPRESS_EMPTY)
- return (s_len);
-
- /* Compress at least 12.5% */
- d_len = s_len - (s_len >> 3);
-
- /* No compression algorithms can read from ABDs directly */
- void *tmp = abd_borrow_buf_copy(src, s_len);
- c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
- abd_return_buf(src, tmp, s_len);
-
- if (c_len > d_len) {
- ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain);
- return (s_len);
- }
-
- ASSERT3U(c_len, <=, d_len);
- return (c_len);
-}
-
-int
-zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
- size_t s_len, size_t d_len)
-{
- zio_compress_info_t *ci = &zio_compress_table[c];
- if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
- return (SET_ERROR(EINVAL));
-
- return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
-}
-
-int
-zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
- size_t s_len, size_t d_len)
-{
- void *tmp = abd_borrow_buf_copy(src, s_len);
- int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
- abd_return_buf(src, tmp, s_len);
-
- /*
- * Decompression shouldn't fail, because we've already verifyied
- * the checksum. However, for extra protection (e.g. against bitflips
- * in non-ECC RAM), we handle this error (and test it).
- */
- ASSERT0(ret);
- if (zio_decompress_fail_fraction != 0 &&
- spa_get_random(zio_decompress_fail_fraction) == 0)
- ret = SET_ERROR(EINVAL);
-
- return (ret);
-}
-
-void
-zio_compress_init(void)
-{
-
- zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc",
- KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
-
- if (zcomp_ksp != NULL) {
- zcomp_ksp->ks_data = &zcomp_stats;
- kstat_install(zcomp_ksp);
- }
-}
-
-void
-zio_compress_fini(void)
-{
- if (zcomp_ksp != NULL) {
- kstat_delete(zcomp_ksp);
- zcomp_ksp = NULL;
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -1,755 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
- */
-
-/*
- * ZFS fault injection
- *
- * To handle fault injection, we keep track of a series of zinject_record_t
- * structures which describe which logical block(s) should be injected with a
- * fault. These are kept in a global list. Each record corresponds to a given
- * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
- * or exported while the injection record exists.
- *
- * Device level injection is done using the 'zi_guid' field. If this is set, it
- * means that the error is destined for a particular device, not a piece of
- * data.
- *
- * This is a rather poor data structure and algorithm, but we don't expect more
- * than a few faults at any one time, so it should be sufficient for our needs.
- */
-
-#include <sys/arc.h>
-#include <sys/zio_impl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/vdev_impl.h>
-#include <sys/dmu_objset.h>
-#include <sys/fs/zfs.h>
-
-uint32_t zio_injection_enabled;
-
-/*
- * Data describing each zinject handler registered on the system, and
- * contains the list node linking the handler in the global zinject
- * handler list.
- */
-typedef struct inject_handler {
- int zi_id;
- spa_t *zi_spa;
- zinject_record_t zi_record;
- uint64_t *zi_lanes;
- int zi_next_lane;
- list_node_t zi_link;
-} inject_handler_t;
-
-/*
- * List of all zinject handlers registered on the system, protected by
- * the inject_lock defined below.
- */
-static list_t inject_handlers;
-
-/*
- * This protects insertion into, and traversal of, the inject handler
- * list defined above; as well as the inject_delay_count. Any time a
- * handler is inserted or removed from the list, this lock should be
- * taken as a RW_WRITER; and any time traversal is done over the list
- * (without modification to it) this lock should be taken as a RW_READER.
- */
-static krwlock_t inject_lock;
-
-/*
- * This holds the number of zinject delay handlers that have been
- * registered on the system. It is protected by the inject_lock defined
- * above. Thus modifications to this count must be a RW_WRITER of the
- * inject_lock, and reads of this count must be (at least) a RW_READER
- * of the lock.
- */
-static int inject_delay_count = 0;
-
-/*
- * This lock is used only in zio_handle_io_delay(), refer to the comment
- * in that function for more details.
- */
-static kmutex_t inject_delay_mtx;
-
-/*
- * Used to assign unique identifying numbers to each new zinject handler.
- */
-static int inject_next_id = 1;
-
-/*
- * Returns true if the given record matches the I/O in progress.
- */
-static boolean_t
-zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
- zinject_record_t *record, int error)
-{
- /*
- * Check for a match against the MOS, which is based on type
- */
- if (zb->zb_objset == DMU_META_OBJSET &&
- record->zi_objset == DMU_META_OBJSET &&
- record->zi_object == DMU_META_DNODE_OBJECT) {
- if (record->zi_type == DMU_OT_NONE ||
- type == record->zi_type)
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
- else
- return (B_FALSE);
- }
-
- /*
- * Check for an exact match.
- */
- if (zb->zb_objset == record->zi_objset &&
- zb->zb_object == record->zi_object &&
- zb->zb_level == record->zi_level &&
- zb->zb_blkid >= record->zi_start &&
- zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
-
- return (B_FALSE);
-}
-
-/*
- * Panic the system when a config change happens in the function
- * specified by tag.
- */
-void
-zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
-{
- inject_handler_t *handler;
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- if (spa != handler->zi_spa)
- continue;
-
- if (handler->zi_record.zi_type == type &&
- strcmp(tag, handler->zi_record.zi_func) == 0)
- panic("Panic requested in function %s\n", tag);
- }
-
- rw_exit(&inject_lock);
-}
-
-/*
- * Determine if the I/O in question should return failure. Returns the errno
- * to be returned to the caller.
- */
-int
-zio_handle_fault_injection(zio_t *zio, int error)
-{
- int ret = 0;
- inject_handler_t *handler;
-
- /*
- * Ignore I/O not associated with any logical data.
- */
- if (zio->io_logical == NULL)
- return (0);
-
- /*
- * Currently, we only support fault injection on reads.
- */
- if (zio->io_type != ZIO_TYPE_READ)
- return (0);
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- if (zio->io_spa != handler->zi_spa ||
- handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
- continue;
-
- /* If this handler matches, return EIO */
- if (zio_match_handler(&zio->io_logical->io_bookmark,
- zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
- ret = error;
- break;
- }
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-/*
- * Determine if the zio is part of a label update and has an injection
- * handler associated with that portion of the label. Currently, we
- * allow error injection in either the nvlist or the uberblock region of
- * of the vdev label.
- */
-int
-zio_handle_label_injection(zio_t *zio, int error)
-{
- inject_handler_t *handler;
- vdev_t *vd = zio->io_vd;
- uint64_t offset = zio->io_offset;
- int label;
- int ret = 0;
-
- if (offset >= VDEV_LABEL_START_SIZE &&
- offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
- return (0);
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
- uint64_t start = handler->zi_record.zi_start;
- uint64_t end = handler->zi_record.zi_end;
-
- if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
- continue;
-
- /*
- * The injection region is the relative offsets within a
- * vdev label. We must determine the label which is being
- * updated and adjust our region accordingly.
- */
- label = vdev_label_number(vd->vdev_psize, offset);
- start = vdev_label_offset(vd->vdev_psize, label, start);
- end = vdev_label_offset(vd->vdev_psize, label, end);
-
- if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
- (offset >= start && offset <= end)) {
- ret = error;
- break;
- }
- }
- rw_exit(&inject_lock);
- return (ret);
-}
-
-
-int
-zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
-{
- inject_handler_t *handler;
- int ret = 0;
-
- /*
- * We skip over faults in the labels unless it's during
- * device open (i.e. zio == NULL).
- */
- if (zio != NULL) {
- uint64_t offset = zio->io_offset;
-
- if (offset < VDEV_LABEL_START_SIZE ||
- offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
- return (0);
- }
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
- continue;
-
- if (vd->vdev_guid == handler->zi_record.zi_guid) {
- if (handler->zi_record.zi_failfast &&
- (zio == NULL || (zio->io_flags &
- (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
- continue;
- }
-
- /* Handle type specific I/O failures */
- if (zio != NULL &&
- handler->zi_record.zi_iotype != ZIO_TYPES &&
- handler->zi_record.zi_iotype != zio->io_type)
- continue;
-
- if (handler->zi_record.zi_error == error) {
- /*
- * For a failed open, pretend like the device
- * has gone away.
- */
- if (error == ENXIO)
- vd->vdev_stat.vs_aux =
- VDEV_AUX_OPEN_FAILED;
-
- /*
- * Treat these errors as if they had been
- * retried so that all the appropriate stats
- * and FMA events are generated.
- */
- if (!handler->zi_record.zi_failfast &&
- zio != NULL)
- zio->io_flags |= ZIO_FLAG_IO_RETRY;
-
- ret = error;
- break;
- }
- if (handler->zi_record.zi_error == ENXIO) {
- ret = SET_ERROR(EIO);
- break;
- }
- }
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-/*
- * Simulate hardware that ignores cache flushes. For requested number
- * of seconds nix the actual writing to disk.
- */
-void
-zio_handle_ignored_writes(zio_t *zio)
-{
- inject_handler_t *handler;
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- /* Ignore errors not destined for this pool */
- if (zio->io_spa != handler->zi_spa ||
- handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
- continue;
-
- /*
- * Positive duration implies # of seconds, negative
- * a number of txgs
- */
- if (handler->zi_record.zi_timer == 0) {
- if (handler->zi_record.zi_duration > 0)
- handler->zi_record.zi_timer = ddi_get_lbolt64();
- else
- handler->zi_record.zi_timer = zio->io_txg;
- }
-
- /* Have a "problem" writing 60% of the time */
- if (spa_get_random(100) < 60)
- zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
- break;
- }
-
- rw_exit(&inject_lock);
-}
-
-void
-spa_handle_ignored_writes(spa_t *spa)
-{
- inject_handler_t *handler;
-
- if (zio_injection_enabled == 0)
- return;
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- if (spa != handler->zi_spa ||
- handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
- continue;
-
- if (handler->zi_record.zi_duration > 0) {
- VERIFY(handler->zi_record.zi_timer == 0 ||
- handler->zi_record.zi_timer +
- handler->zi_record.zi_duration * hz >
- ddi_get_lbolt64());
- } else {
- /* duration is negative so the subtraction here adds */
- VERIFY(handler->zi_record.zi_timer == 0 ||
- handler->zi_record.zi_timer -
- handler->zi_record.zi_duration >=
- spa_syncing_txg(spa));
- }
- }
-
- rw_exit(&inject_lock);
-}
-
-hrtime_t
-zio_handle_io_delay(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- inject_handler_t *min_handler = NULL;
- hrtime_t min_target = 0;
-
- rw_enter(&inject_lock, RW_READER);
-
- /*
- * inject_delay_count is a subset of zio_injection_enabled that
- * is only incremented for delay handlers. These checks are
- * mainly added to remind the reader why we're not explicitly
- * checking zio_injection_enabled like the other functions.
- */
- IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
- IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
-
- /*
- * If there aren't any inject delay handlers registered, then we
- * can short circuit and simply return 0 here. A value of zero
- * informs zio_delay_interrupt() that this request should not be
- * delayed. This short circuit keeps us from acquiring the
- * inject_delay_mutex unnecessarily.
- */
- if (inject_delay_count == 0) {
- rw_exit(&inject_lock);
- return (0);
- }
-
- /*
- * Each inject handler has a number of "lanes" associated with
- * it. Each lane is able to handle requests independently of one
- * another, and at a latency defined by the inject handler
- * record's zi_timer field. Thus if a handler in configured with
- * a single lane with a 10ms latency, it will delay requests
- * such that only a single request is completed every 10ms. So,
- * if more than one request is attempted per each 10ms interval,
- * the average latency of the requests will be greater than
- * 10ms; but if only a single request is submitted each 10ms
- * interval the average latency will be 10ms.
- *
- * We need to acquire this mutex to prevent multiple concurrent
- * threads being assigned to the same lane of a given inject
- * handler. The mutex allows us to perform the following two
- * operations atomically:
- *
- * 1. determine the minimum handler and minimum target
- * value of all the possible handlers
- * 2. update that minimum handler's lane array
- *
- * Without atomicity, two (or more) threads could pick the same
- * lane in step (1), and then conflict with each other in step
- * (2). This could allow a single lane handler to process
- * multiple requests simultaneously, which shouldn't be possible.
- */
- mutex_enter(&inject_delay_mtx);
-
- for (inject_handler_t *handler = list_head(&inject_handlers);
- handler != NULL; handler = list_next(&inject_handlers, handler)) {
- if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
- continue;
-
- if (vd->vdev_guid != handler->zi_record.zi_guid)
- continue;
-
- /*
- * Defensive; should never happen as the array allocation
- * occurs prior to inserting this handler on the list.
- */
- ASSERT3P(handler->zi_lanes, !=, NULL);
-
- /*
- * This should never happen, the zinject command should
- * prevent a user from setting an IO delay with zero lanes.
- */
- ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
-
- ASSERT3U(handler->zi_record.zi_nlanes, >,
- handler->zi_next_lane);
-
- /*
- * We want to issue this IO to the lane that will become
- * idle the soonest, so we compare the soonest this
- * specific handler can complete the IO with all other
- * handlers, to find the lowest value of all possible
- * lanes. We then use this lane to submit the request.
- *
- * Since each handler has a constant value for its
- * delay, we can just use the "next" lane for that
- * handler; as it will always be the lane with the
- * lowest value for that particular handler (i.e. the
- * lane that will become idle the soonest). This saves a
- * scan of each handler's lanes array.
- *
- * There's two cases to consider when determining when
- * this specific IO request should complete. If this
- * lane is idle, we want to "submit" the request now so
- * it will complete after zi_timer milliseconds. Thus,
- * we set the target to now + zi_timer.
- *
- * If the lane is busy, we want this request to complete
- * zi_timer milliseconds after the lane becomes idle.
- * Since the 'zi_lanes' array holds the time at which
- * each lane will become idle, we use that value to
- * determine when this request should complete.
- */
- hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
- hrtime_t busy = handler->zi_record.zi_timer +
- handler->zi_lanes[handler->zi_next_lane];
- hrtime_t target = MAX(idle, busy);
-
- if (min_handler == NULL) {
- min_handler = handler;
- min_target = target;
- continue;
- }
-
- ASSERT3P(min_handler, !=, NULL);
- ASSERT3U(min_target, !=, 0);
-
- /*
- * We don't yet increment the "next lane" variable since
- * we still might find a lower value lane in another
- * handler during any remaining iterations. Once we're
- * sure we've selected the absolute minimum, we'll claim
- * the lane and increment the handler's "next lane"
- * field below.
- */
-
- if (target < min_target) {
- min_handler = handler;
- min_target = target;
- }
- }
-
- /*
- * 'min_handler' will be NULL if no IO delays are registered for
- * this vdev, otherwise it will point to the handler containing
- * the lane that will become idle the soonest.
- */
- if (min_handler != NULL) {
- ASSERT3U(min_target, !=, 0);
- min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
-
- /*
- * If we've used all possible lanes for this handler,
- * loop back and start using the first lane again;
- * otherwise, just increment the lane index.
- */
- min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
- min_handler->zi_record.zi_nlanes;
- }
-
- mutex_exit(&inject_delay_mtx);
- rw_exit(&inject_lock);
-
- return (min_target);
-}
-
-/*
- * Create a new handler for the given record. We add it to the list, adding
- * a reference to the spa_t in the process. We increment zio_injection_enabled,
- * which is the switch to trigger all fault injection.
- */
-int
-zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
-{
- inject_handler_t *handler;
- int error;
- spa_t *spa;
-
- /*
- * If this is pool-wide metadata, make sure we unload the corresponding
- * spa_t, so that the next attempt to load it will trigger the fault.
- * We call spa_reset() to unload the pool appropriately.
- */
- if (flags & ZINJECT_UNLOAD_SPA)
- if ((error = spa_reset(name)) != 0)
- return (error);
-
- if (record->zi_cmd == ZINJECT_DELAY_IO) {
- /*
- * A value of zero for the number of lanes or for the
- * delay time doesn't make sense.
- */
- if (record->zi_timer == 0 || record->zi_nlanes == 0)
- return (SET_ERROR(EINVAL));
-
- /*
- * The number of lanes is directly mapped to the size of
- * an array used by the handler. Thus, to ensure the
- * user doesn't trigger an allocation that's "too large"
- * we cap the number of lanes here.
- */
- if (record->zi_nlanes >= UINT16_MAX)
- return (SET_ERROR(EINVAL));
- }
-
- if (!(flags & ZINJECT_NULL)) {
- /*
- * spa_inject_ref() will add an injection reference, which will
- * prevent the pool from being removed from the namespace while
- * still allowing it to be unloaded.
- */
- if ((spa = spa_inject_addref(name)) == NULL)
- return (SET_ERROR(ENOENT));
-
- handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
-
- handler->zi_spa = spa;
- handler->zi_record = *record;
-
- if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
- handler->zi_lanes = kmem_zalloc(
- sizeof (*handler->zi_lanes) *
- handler->zi_record.zi_nlanes, KM_SLEEP);
- handler->zi_next_lane = 0;
- } else {
- handler->zi_lanes = NULL;
- handler->zi_next_lane = 0;
- }
-
- rw_enter(&inject_lock, RW_WRITER);
-
- /*
- * We can't move this increment into the conditional
- * above because we need to hold the RW_WRITER lock of
- * inject_lock, and we don't want to hold that while
- * allocating the handler's zi_lanes array.
- */
- if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
- ASSERT3S(inject_delay_count, >=, 0);
- inject_delay_count++;
- ASSERT3S(inject_delay_count, >, 0);
- }
-
- *id = handler->zi_id = inject_next_id++;
- list_insert_tail(&inject_handlers, handler);
- atomic_inc_32(&zio_injection_enabled);
-
- rw_exit(&inject_lock);
- }
-
- /*
- * Flush the ARC, so that any attempts to read this data will end up
- * going to the ZIO layer. Note that this is a little overkill, but
- * we don't have the necessary ARC interfaces to do anything else, and
- * fault injection isn't a performance critical path.
- */
- if (flags & ZINJECT_FLUSH_ARC)
- /*
- * We must use FALSE to ensure arc_flush returns, since
- * we're not preventing concurrent ARC insertions.
- */
- arc_flush(NULL, FALSE);
-
- return (0);
-}
-
-/*
- * Returns the next record with an ID greater than that supplied to the
- * function. Used to iterate over all handlers in the system.
- */
-int
-zio_inject_list_next(int *id, char *name, size_t buflen,
- zinject_record_t *record)
-{
- inject_handler_t *handler;
- int ret;
-
- mutex_enter(&spa_namespace_lock);
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler))
- if (handler->zi_id > *id)
- break;
-
- if (handler) {
- *record = handler->zi_record;
- *id = handler->zi_id;
- (void) strncpy(name, spa_name(handler->zi_spa), buflen);
- ret = 0;
- } else {
- ret = SET_ERROR(ENOENT);
- }
-
- rw_exit(&inject_lock);
- mutex_exit(&spa_namespace_lock);
-
- return (ret);
-}
-
-/*
- * Clear the fault handler with the given identifier, or return ENOENT if none
- * exists.
- */
-int
-zio_clear_fault(int id)
-{
- inject_handler_t *handler;
-
- rw_enter(&inject_lock, RW_WRITER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler))
- if (handler->zi_id == id)
- break;
-
- if (handler == NULL) {
- rw_exit(&inject_lock);
- return (SET_ERROR(ENOENT));
- }
-
- if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
- ASSERT3S(inject_delay_count, >, 0);
- inject_delay_count--;
- ASSERT3S(inject_delay_count, >=, 0);
- }
-
- list_remove(&inject_handlers, handler);
- rw_exit(&inject_lock);
-
- if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
- ASSERT3P(handler->zi_lanes, !=, NULL);
- kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
- handler->zi_record.zi_nlanes);
- } else {
- ASSERT3P(handler->zi_lanes, ==, NULL);
- }
-
- spa_inject_delref(handler->zi_spa);
- kmem_free(handler, sizeof (inject_handler_t));
- atomic_dec_32(&zio_injection_enabled);
-
- return (0);
-}
-
-void
-zio_inject_init(void)
-{
- rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
- mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
- list_create(&inject_handlers, sizeof (inject_handler_t),
- offsetof(inject_handler_t, zi_link));
-}
-
-void
-zio_inject_fini(void)
-{
- list_destroy(&inject_handlers);
- mutex_destroy(&inject_delay_mtx);
- rw_destroy(&inject_lock);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
@@ -1,86 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Zero-length encoding. This is a fast and simple algorithm to eliminate
- * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
- * If b < n (where n is the compression parameter) then the next b + 1 bytes
- * are literal values. If b >= n then the next (256 - b + 1) bytes are zero.
- */
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-
-size_t
-zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *s_end = src + s_len;
- uchar_t *d_end = dst + d_len;
-
- while (src < s_end && dst < d_end - 1) {
- uchar_t *first = src;
- uchar_t *len = dst++;
- if (src[0] == 0) {
- uchar_t *last = src + (256 - n);
- while (src < MIN(last, s_end) && src[0] == 0)
- src++;
- *len = src - first - 1 + n;
- } else {
- uchar_t *last = src + n;
- if (d_end - dst < n)
- break;
- while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
- *dst++ = *src++;
- if (src[0])
- *dst++ = *src++;
- *len = src - first - 1;
- }
- }
- return (src == s_end ? dst - (uchar_t *)d_start : s_len);
-}
-
-int
-zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *s_end = src + s_len;
- uchar_t *d_end = dst + d_len;
-
- while (src < s_end && dst < d_end) {
- int len = 1 + *src++;
- if (len <= n) {
- while (len-- != 0)
- *dst++ = *src++;
- } else {
- len -= n;
- while (len-- != 0)
- *dst++ = 0;
- }
- }
- return (dst == d_end ? 0 : -1);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
@@ -1,187 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
- * Copyright 2016 The MathWorks, Inc. All rights reserved.
- */
-
-/*
- * A Zero Reference Lock (ZRL) is a reference count that can lock out new
- * references only when the count is zero and only without waiting if the count
- * is not already zero. It is similar to a read-write lock in that it allows
- * multiple readers and only a single writer, but it does not allow a writer to
- * block while waiting for readers to exit, and therefore the question of
- * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
- * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
- * is perfectly safe for the same reader to acquire the same lock multiple
- * times. The fact that a ZRL is reentrant for readers (through multiple calls
- * to zrl_add()) makes it convenient for determining whether something is
- * actively referenced without the fuss of flagging lock ownership across
- * function calls.
- */
-#include <sys/zrlock.h>
-
-/*
- * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
- * treated as zero references.
- */
-#define ZRL_LOCKED -1
-#define ZRL_DESTROYED -2
-
-void
-zrl_init(zrlock_t *zrl)
-{
- mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
- zrl->zr_refcount = 0;
- cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
-#ifdef ZFS_DEBUG
- zrl->zr_owner = NULL;
- zrl->zr_caller = NULL;
-#endif
-}
-
-void
-zrl_destroy(zrlock_t *zrl)
-{
- ASSERT0(zrl->zr_refcount);
-
- mutex_destroy(&zrl->zr_mtx);
- zrl->zr_refcount = ZRL_DESTROYED;
- cv_destroy(&zrl->zr_cv);
-}
-
-void
-zrl_add_impl(zrlock_t *zrl, const char *zc)
-{
- for (;;) {
- uint32_t n = (uint32_t)zrl->zr_refcount;
- while (n != ZRL_LOCKED) {
- uint32_t cas = atomic_cas_32(
- (uint32_t *)&zrl->zr_refcount, n, n + 1);
- if (cas == n) {
- ASSERT3S((int32_t)n, >=, 0);
-#ifdef ZFS_DEBUG
- if (zrl->zr_owner == curthread) {
- DTRACE_PROBE2(zrlock__reentry,
- zrlock_t *, zrl, uint32_t, n);
- }
- zrl->zr_owner = curthread;
- zrl->zr_caller = zc;
-#endif
- return;
- }
- n = cas;
- }
-
- mutex_enter(&zrl->zr_mtx);
- while (zrl->zr_refcount == ZRL_LOCKED) {
- cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
- }
- mutex_exit(&zrl->zr_mtx);
- }
-}
-
-void
-zrl_remove(zrlock_t *zrl)
-{
- uint32_t n;
-
-#ifdef ZFS_DEBUG
- if (zrl->zr_owner == curthread) {
- zrl->zr_owner = NULL;
- zrl->zr_caller = NULL;
- }
-#endif
- n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
- ASSERT3S((int32_t)n, >=, 0);
-}
-
-int
-zrl_tryenter(zrlock_t *zrl)
-{
- uint32_t n = (uint32_t)zrl->zr_refcount;
-
- if (n == 0) {
- uint32_t cas = atomic_cas_32(
- (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
- if (cas == 0) {
-#ifdef ZFS_DEBUG
- ASSERT3P(zrl->zr_owner, ==, NULL);
- zrl->zr_owner = curthread;
-#endif
- return (1);
- }
- }
-
- ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
-
- return (0);
-}
-
-void
-zrl_exit(zrlock_t *zrl)
-{
- ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
-
- mutex_enter(&zrl->zr_mtx);
-#ifdef ZFS_DEBUG
- ASSERT3P(zrl->zr_owner, ==, curthread);
- zrl->zr_owner = NULL;
- membar_producer(); /* make sure the owner store happens first */
-#endif
- zrl->zr_refcount = 0;
- cv_broadcast(&zrl->zr_cv);
- mutex_exit(&zrl->zr_mtx);
-}
-
-int
-zrl_refcount(zrlock_t *zrl)
-{
- ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
-
- int n = (int)zrl->zr_refcount;
- return (n <= 0 ? 0 : n);
-}
-
-int
-zrl_is_zero(zrlock_t *zrl)
-{
- ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
-
- return (zrl->zr_refcount <= 0);
-}
-
-int
-zrl_is_locked(zrlock_t *zrl)
-{
- ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
-
- return (zrl->zr_refcount == ZRL_LOCKED);
-}
-
-#ifdef ZFS_DEBUG
-kthread_t *
-zrl_owner(zrlock_t *zrl)
-{
- return (zrl->zr_owner);
-}
-#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c
@@ -1,431 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2017, 2019 by Delphix. All rights reserved.
- */
-
-/*
- * ZTHR Infrastructure
- * ===================
- *
- * ZTHR threads are used for isolated operations that span multiple txgs
- * within a SPA. They generally exist from SPA creation/loading and until
- * the SPA is exported/destroyed. The ideal requirements for an operation
- * to be modeled with a zthr are the following:
- *
- * 1] The operation needs to run over multiple txgs.
- * 2] There is be a single point of reference in memory or on disk that
- * indicates whether the operation should run/is running or has
- * stopped.
- *
- * If the operation satisfies the above then the following rules guarantee
- * a certain level of correctness:
- *
- * 1] Any thread EXCEPT the zthr changes the work indicator from stopped
- * to running but not the opposite.
- * 2] Only the zthr can change the work indicator from running to stopped
- * (e.g. when it is done) but not the opposite.
- *
- * This way a normal zthr cycle should go like this:
- *
- * 1] An external thread changes the work indicator from stopped to
- * running and wakes up the zthr.
- * 2] The zthr wakes up, checks the indicator and starts working.
- * 3] When the zthr is done, it changes the indicator to stopped, allowing
- * a new cycle to start.
- *
- * Besides being awakened by other threads, a zthr can be configured
- * during creation to wakeup on it's own after a specified interval
- * [see zthr_create_timer()].
- *
- * Note: ZTHR threads are NOT a replacement for generic threads! Please
- * ensure that they fit your use-case well before using them.
- *
- * == ZTHR creation
- *
- * Every zthr needs three inputs to start running:
- *
- * 1] A user-defined checker function (checkfunc) that decides whether
- * the zthr should start working or go to sleep. The function should
- * return TRUE when the zthr needs to work or FALSE to let it sleep,
- * and should adhere to the following signature:
- * boolean_t checkfunc_name(void *args, zthr_t *t);
- *
- * 2] A user-defined ZTHR function (func) which the zthr executes when
- * it is not sleeping. The function should adhere to the following
- * signature type:
- * void func_name(void *args, zthr_t *t);
- *
- * 3] A void args pointer that will be passed to checkfunc and func
- * implicitly by the infrastructure.
- *
- * The reason why the above API needs two different functions,
- * instead of one that both checks and does the work, has to do with
- * the zthr's internal state lock (zthr_state_lock) and the allowed
- * cancellation windows. We want to hold the zthr_state_lock while
- * running checkfunc but not while running func. This way the zthr
- * can be cancelled while doing work and not while checking for work.
- *
- * To start a zthr:
- * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
- * or
- * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
- * args, max_sleep);
- *
- * After that you should be able to wakeup, cancel, and resume the
- * zthr from another thread using the zthr_pointer.
- *
- * NOTE: ZTHR threads could potentially wake up spuriously and the
- * user should take this into account when writing a checkfunc.
- * [see ZTHR state transitions]
- *
- * == ZTHR cancellation
- *
- * ZTHR threads must be cancelled when their SPA is being exported
- * or when they need to be paused so they don't interfere with other
- * operations.
- *
- * To cancel a zthr:
- * zthr_cancel(zthr_pointer);
- *
- * To resume it:
- * zthr_resume(zthr_pointer);
- *
- * A zthr will implicitly check if it has received a cancellation
- * signal every time func returns and every time it wakes up [see
- * ZTHR state transitions below].
- *
- * At times, waiting for the zthr's func to finish its job may take
- * time. This may be very time-consuming for some operations that
- * need to cancel the SPA's zthrs (e.g spa_export). For this scenario
- * the user can explicitly make their ZTHR function aware of incoming
- * cancellation signals using zthr_iscancelled(). A common pattern for
- * that looks like this:
- *
- * int
- * func_name(void *args, zthr_t *t)
- * {
- * ... <unpack args> ...
- * while (!work_done && !zthr_iscancelled(t)) {
- * ... <do more work> ...
- * }
- * }
- *
- * == ZTHR cleanup
- *
- * Cancelling a zthr doesn't clean up its metadata (internal locks,
- * function pointers to func and checkfunc, etc..). This is because
- * we want to keep them around in case we want to resume the execution
- * of the zthr later. Similarly for zthrs that exit themselves.
- *
- * To completely cleanup a zthr, cancel it first to ensure that it
- * is not running and then use zthr_destroy().
- *
- * == ZTHR state transitions
- *
- * zthr creation
- * +
- * |
- * | woke up
- * | +--------------+ sleep
- * | | ^
- * | | |
- * | | | FALSE
- * | | |
- * v v FALSE +
- * cancelled? +---------> checkfunc?
- * + ^ +
- * | | |
- * | | | TRUE
- * | | |
- * | | func returned v
- * | +---------------+ func
- * |
- * | TRUE
- * |
- * v
- * zthr stopped running
- *
- * == Implementation of ZTHR requests
- *
- * ZTHR wakeup, cancel, and resume are requests on a zthr to
- * change its internal state. Requests on a zthr are serialized
- * using the zthr_request_lock, while changes in its internal
- * state are protected by the zthr_state_lock. A request will
- * first acquire the zthr_request_lock and then immediately
- * acquire the zthr_state_lock. We do this so that incoming
- * requests are serialized using the request lock, while still
- * allowing us to use the state lock for thread communication
- * via zthr_cv.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zthr.h>
-
-struct zthr {
- /* running thread doing the work */
- kthread_t *zthr_thread;
-
- /* lock protecting internal data & invariants */
- kmutex_t zthr_state_lock;
-
- /* mutex that serializes external requests */
- kmutex_t zthr_request_lock;
-
- /* notification mechanism for requests */
- kcondvar_t zthr_cv;
-
- /* flag set to true if we are canceling the zthr */
- boolean_t zthr_cancel;
-
- /*
- * maximum amount of time that the zthr is spent sleeping;
- * if this is 0, the thread doesn't wake up until it gets
- * signaled.
- */
- hrtime_t zthr_wait_time;
-
- /* consumer-provided callbacks & data */
- zthr_checkfunc_t *zthr_checkfunc;
- zthr_func_t *zthr_func;
- void *zthr_arg;
-};
-
-static void
-zthr_procedure(void *arg)
-{
- zthr_t *t = arg;
-
- mutex_enter(&t->zthr_state_lock);
- ASSERT3P(t->zthr_thread, ==, curthread);
-
- while (!t->zthr_cancel) {
- if (t->zthr_checkfunc(t->zthr_arg, t)) {
- mutex_exit(&t->zthr_state_lock);
- t->zthr_func(t->zthr_arg, t);
- mutex_enter(&t->zthr_state_lock);
- } else {
- /* go to sleep */
- if (t->zthr_wait_time == 0) {
- cv_wait(&t->zthr_cv, &t->zthr_state_lock);
- } else {
- (void) cv_timedwait_hires(&t->zthr_cv,
- &t->zthr_state_lock, t->zthr_wait_time,
- MSEC2NSEC(1), 0);
- }
- }
- }
-
- /*
- * Clear out the kernel thread metadata and notify the
- * zthr_cancel() thread that we've stopped running.
- */
- t->zthr_thread = NULL;
- t->zthr_cancel = B_FALSE;
- cv_broadcast(&t->zthr_cv);
-
- mutex_exit(&t->zthr_state_lock);
- thread_exit();
-}
-
-zthr_t *
-zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
-{
- return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0));
-}
-
-/*
- * Create a zthr with specified maximum sleep time. If the time
- * in sleeping state exceeds max_sleep, a wakeup(do the check and
- * start working if required) will be triggered.
- */
-zthr_t *
-zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
- void *arg, hrtime_t max_sleep)
-{
- zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
- mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
-
- mutex_enter(&t->zthr_state_lock);
- t->zthr_checkfunc = checkfunc;
- t->zthr_func = func;
- t->zthr_arg = arg;
- t->zthr_wait_time = max_sleep;
-
- t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
- 0, &p0, TS_RUN, minclsyspri);
- mutex_exit(&t->zthr_state_lock);
-
- return (t);
-}
-
-void
-zthr_destroy(zthr_t *t)
-{
- ASSERT(!MUTEX_HELD(&t->zthr_state_lock));
- ASSERT(!MUTEX_HELD(&t->zthr_request_lock));
- VERIFY3P(t->zthr_thread, ==, NULL);
- mutex_destroy(&t->zthr_request_lock);
- mutex_destroy(&t->zthr_state_lock);
- cv_destroy(&t->zthr_cv);
- kmem_free(t, sizeof (*t));
-}
-
-/*
- * Wake up the zthr if it is sleeping. If the thread has been
- * cancelled that does nothing.
- */
-void
-zthr_wakeup(zthr_t *t)
-{
- mutex_enter(&t->zthr_request_lock);
- mutex_enter(&t->zthr_state_lock);
-
- /*
- * There are 4 states that we can find the zthr when issuing
- * this broadcast:
- *
- * [1] The common case of the thread being asleep, at which
- * point the broadcast will wake it up.
- * [2] The thread has been cancelled. Waking up a cancelled
- * thread is a no-op. Any work that is still left to be
- * done should be handled the next time the thread is
- * resumed.
- * [3] The thread is doing work and is already up, so this
- * is basically a no-op.
- * [4] The thread was just created/resumed, in which case the
- * behavior is similar to [3].
- */
- cv_broadcast(&t->zthr_cv);
-
- mutex_exit(&t->zthr_state_lock);
- mutex_exit(&t->zthr_request_lock);
-}
-
-/*
- * Sends a cancel request to the zthr and blocks until the zthr is
- * cancelled. If the zthr is not running (e.g. has been cancelled
- * already), this is a no-op.
- */
-void
-zthr_cancel(zthr_t *t)
-{
- mutex_enter(&t->zthr_request_lock);
- mutex_enter(&t->zthr_state_lock);
-
- /*
- * Since we are holding the zthr_state_lock at this point
- * we can find the state in one of the following 4 states:
- *
- * [1] The thread has already been cancelled, therefore
- * there is nothing for us to do.
- * [2] The thread is sleeping, so we broadcast the CV first
- * to wake it up and then we set the flag and we are
- * waiting for it to exit.
- * [3] The thread is doing work, in which case we just set
- * the flag and wait for it to finish.
- * [4] The thread was just created/resumed, in which case
- * the behavior is similar to [3].
- *
- * Since requests are serialized, by the time that we get
- * control back we expect that the zthr is cancelled and
- * not running anymore.
- */
- if (t->zthr_thread != NULL) {
- t->zthr_cancel = B_TRUE;
-
- /* broadcast in case the zthr is sleeping */
- cv_broadcast(&t->zthr_cv);
-
- while (t->zthr_thread != NULL)
- cv_wait(&t->zthr_cv, &t->zthr_state_lock);
-
- ASSERT(!t->zthr_cancel);
- }
-
- mutex_exit(&t->zthr_state_lock);
- mutex_exit(&t->zthr_request_lock);
-}
-
-/*
- * Sends a resume request to the supplied zthr. If the zthr is
- * already running this is a no-op.
- */
-void
-zthr_resume(zthr_t *t)
-{
- mutex_enter(&t->zthr_request_lock);
- mutex_enter(&t->zthr_state_lock);
-
- ASSERT3P(&t->zthr_checkfunc, !=, NULL);
- ASSERT3P(&t->zthr_func, !=, NULL);
- ASSERT(!t->zthr_cancel);
-
- /*
- * There are 4 states that we find the zthr in at this point
- * given the locks that we hold:
- *
- * [1] The zthr was cancelled, so we spawn a new thread for
- * the zthr (common case).
- * [2] The zthr is running at which point this is a no-op.
- * [3] The zthr is sleeping at which point this is a no-op.
- * [4] The zthr was just spawned at which point this is a
- * no-op.
- */
- if (t->zthr_thread == NULL) {
- t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
- 0, &p0, TS_RUN, minclsyspri);
- }
-
- mutex_exit(&t->zthr_state_lock);
- mutex_exit(&t->zthr_request_lock);
-}
-
-/*
- * This function is intended to be used by the zthr itself
- * (specifically the zthr_func callback provided) to check
- * if another thread has signaled it to stop running before
- * doing some expensive operation.
- *
- * returns TRUE if we are in the middle of trying to cancel
- * this thread.
- *
- * returns FALSE otherwise.
- */
-boolean_t
-zthr_iscancelled(zthr_t *t)
-{
- ASSERT3P(t->zthr_thread, ==, curthread);
-
- /*
- * The majority of the functions here grab zthr_request_lock
- * first and then zthr_state_lock. This function only grabs
- * the zthr_state_lock. That is because this function should
- * only be called from the zthr_func to check if someone has
- * issued a zthr_cancel() on the thread. If there is a zthr_cancel()
- * happening concurrently, attempting to grab the request lock
- * here would result in a deadlock.
- *
- * By grabbing only the zthr_state_lock this function is allowed
- * to run concurrently with a zthr_cancel() request.
- */
- mutex_enter(&t->zthr_state_lock);
- boolean_t cancelled = t->zthr_cancel;
- mutex_exit(&t->zthr_state_lock);
- return (cancelled);
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -1,3347 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Portions Copyright 2010 Robert Milkowski
- *
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- */
-
-/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
-
-/*
- * ZFS volume emulation driver.
- *
- * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
- * Volumes are accessed through the symbolic links named:
- *
- * /dev/zvol/dsk/<pool_name>/<dataset_name>
- * /dev/zvol/rdsk/<pool_name>/<dataset_name>
- *
- * These links are created by the /dev filesystem (sdev_zvolops.c).
- * Volumes are persistent through reboot. No user command needs to be
- * run before opening and using a device.
- *
- * FreeBSD notes.
- * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
- * in the system.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/disk.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dnode.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dkio.h>
-#include <sys/byteorder.h>
-#include <sys/sunddi.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/queue.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zil.h>
-#include <sys/refcount.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_rlock.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_raidz.h>
-#include <sys/zvol.h>
-#include <sys/zil_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_tx.h>
-#include <sys/zfeature.h>
-#include <sys/zio_checksum.h>
-#include <sys/zil_impl.h>
-#include <sys/filio.h>
-#include <sys/zfs_rlock.h>
-
-#include <geom/geom.h>
-
-#include "zfs_namecheck.h"
-
-#ifndef illumos
-struct g_class zfs_zvol_class = {
- .name = "ZFS::ZVOL",
- .version = G_VERSION,
-};
-
-DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
-
-#endif
-void *zfsdev_state;
-static char *zvol_tag = "zvol_tag";
-
-#define ZVOL_DUMPSIZE "dumpsize"
-
-/*
- * This lock protects the zfsdev_state structure from being modified
- * while it's being used, e.g. an open that comes in before a create
- * finishes. It also protects temporary opens of the dataset so that,
- * e.g., an open doesn't get a spurious EBUSY.
- */
-#ifdef illumos
-kmutex_t zfsdev_state_lock;
-#else
-/*
- * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
- * spa_namespace_lock in the ZVOL code.
- */
-#define zfsdev_state_lock spa_namespace_lock
-#endif
-static uint32_t zvol_minors;
-
-#ifndef illumos
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS VOLUME");
-static int volmode = ZFS_VOLMODE_GEOM;
-SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
- "Expose as GEOM providers (1), device files (2) or neither");
-static boolean_t zpool_on_zvol = B_FALSE;
-SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
- "Allow zpools to use zvols as vdevs (DANGEROUS)");
-
-#endif
-typedef struct zvol_extent {
- list_node_t ze_node;
- dva_t ze_dva; /* dva associated with this extent */
- uint64_t ze_nblks; /* number of blocks in extent */
-} zvol_extent_t;
-
-/*
- * The in-core state of each volume.
- */
-typedef struct zvol_state {
-#ifndef illumos
- LIST_ENTRY(zvol_state) zv_links;
-#endif
- char zv_name[MAXPATHLEN]; /* pool/dd name */
- uint64_t zv_volsize; /* amount of space we advertise */
- uint64_t zv_volblocksize; /* volume block size */
-#ifdef illumos
- minor_t zv_minor; /* minor number */
-#else
- struct cdev *zv_dev; /* non-GEOM device */
- struct g_provider *zv_provider; /* GEOM provider */
-#endif
- uint8_t zv_min_bs; /* minimum addressable block shift */
- uint8_t zv_flags; /* readonly, dumpified, etc. */
- objset_t *zv_objset; /* objset handle */
-#ifdef illumos
- uint32_t zv_open_count[OTYPCNT]; /* open counts */
-#endif
- uint32_t zv_total_opens; /* total open count */
- uint32_t zv_sync_cnt; /* synchronous open count */
- zilog_t *zv_zilog; /* ZIL handle */
- list_t zv_extents; /* List of extents for dump */
- rangelock_t zv_rangelock;
- dnode_t *zv_dn; /* dnode hold */
-#ifndef illumos
- int zv_state;
- int zv_volmode; /* Provide GEOM or cdev */
- struct bio_queue_head zv_queue;
- struct mtx zv_queue_mtx; /* zv_queue mutex */
-#endif
-} zvol_state_t;
-
-typedef enum {
- ZVOL_ASYNC_CREATE_MINORS,
- ZVOL_ASYNC_REMOVE_MINORS,
- ZVOL_ASYNC_RENAME_MINORS,
- ZVOL_ASYNC_MAX
-} zvol_async_op_t;
-
-typedef struct {
- zvol_async_op_t op;
- char pool[ZFS_MAX_DATASET_NAME_LEN];
- char name1[ZFS_MAX_DATASET_NAME_LEN];
- char name2[ZFS_MAX_DATASET_NAME_LEN];
-} zvol_task_t;
-
-#ifndef illumos
-static LIST_HEAD(, zvol_state) all_zvols;
-#endif
-/*
- * zvol specific flags
- */
-#define ZVOL_RDONLY 0x1
-#define ZVOL_DUMPIFIED 0x2
-#define ZVOL_EXCL 0x4
-#define ZVOL_WCE 0x8
-
-/*
- * zvol maximum transfer in one DMU tx.
- */
-int zvol_maxphys = DMU_MAX_ACCESS/2;
-
-/*
- * Toggle unmap functionality.
- */
-boolean_t zvol_unmap_enabled = B_TRUE;
-
-/*
- * If true, unmaps requested as synchronous are executed synchronously,
- * otherwise all unmaps are asynchronous.
- */
-boolean_t zvol_unmap_sync_enabled = B_FALSE;
-
-#ifndef illumos
-SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
- &zvol_unmap_enabled, 0,
- "Enable UNMAP functionality");
-
-SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN,
- &zvol_unmap_sync_enabled, 0,
- "UNMAPs requested as sync are executed synchronously");
-
-static d_open_t zvol_d_open;
-static d_close_t zvol_d_close;
-static d_read_t zvol_read;
-static d_write_t zvol_write;
-static d_ioctl_t zvol_d_ioctl;
-static d_strategy_t zvol_strategy;
-
-static struct cdevsw zvol_cdevsw = {
- .d_version = D_VERSION,
- .d_open = zvol_d_open,
- .d_close = zvol_d_close,
- .d_read = zvol_read,
- .d_write = zvol_write,
- .d_ioctl = zvol_d_ioctl,
- .d_strategy = zvol_strategy,
- .d_name = "zvol",
- .d_flags = D_DISK | D_TRACKCLOSE,
-};
-
-static void zvol_geom_run(zvol_state_t *zv);
-static void zvol_geom_destroy(zvol_state_t *zv);
-static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
-static void zvol_geom_start(struct bio *bp);
-static void zvol_geom_worker(void *arg);
-static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
- uint64_t len, boolean_t sync);
-#endif /* !illumos */
-
-extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
- nvlist_t *, nvlist_t *);
-static int zvol_remove_zv(zvol_state_t *);
-static int zvol_get_data(void *arg, lr_write_t *lr, char *buf,
- struct lwb *lwb, zio_t *zio);
-static int zvol_dumpify(zvol_state_t *zv);
-static int zvol_dump_fini(zvol_state_t *zv);
-static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
-
-static void
-zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
-{
-#ifdef illumos
- dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
-
- zv->zv_volsize = volsize;
- VERIFY(ddi_prop_update_int64(dev, zfs_dip,
- "Size", volsize) == DDI_SUCCESS);
- VERIFY(ddi_prop_update_int64(dev, zfs_dip,
- "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
-
- /* Notify specfs to invalidate the cached size */
- spec_size_invalidate(dev, VBLK);
- spec_size_invalidate(dev, VCHR);
-#else /* !illumos */
- zv->zv_volsize = volsize;
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- struct g_provider *pp;
-
- pp = zv->zv_provider;
- if (pp == NULL)
- return;
- g_topology_lock();
-
- /*
- * Do not invoke resize event when initial size was zero.
- * ZVOL initializes the size on first open, this is not
- * real resizing.
- */
- if (pp->mediasize == 0)
- pp->mediasize = zv->zv_volsize;
- else
- g_resize_provider(pp, zv->zv_volsize);
- g_topology_unlock();
- }
-#endif /* illumos */
-}
-
-int
-zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
-{
- if (volsize == 0)
- return (SET_ERROR(EINVAL));
-
- if (volsize % blocksize != 0)
- return (SET_ERROR(EINVAL));
-
-#ifdef _ILP32
- if (volsize - 1 > SPEC_MAXOFFSET_T)
- return (SET_ERROR(EOVERFLOW));
-#endif
- return (0);
-}
-
-int
-zvol_check_volblocksize(uint64_t volblocksize)
-{
- if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_OLD_MAXBLOCKSIZE ||
- !ISP2(volblocksize))
- return (SET_ERROR(EDOM));
-
- return (0);
-}
-
-int
-zvol_get_stats(objset_t *os, nvlist_t *nv)
-{
- int error;
- dmu_object_info_t doi;
- uint64_t val;
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
- if (error)
- return (error);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
-
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
-
- if (error == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
- doi.doi_data_block_size);
- }
-
- return (error);
-}
-
-static zvol_state_t *
-zvol_minor_lookup(const char *name)
-{
-#ifdef illumos
- minor_t minor;
-#endif
- zvol_state_t *zv;
-
- ASSERT(MUTEX_HELD(&zfsdev_state_lock));
-
-#ifdef illumos
- for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
- zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
- if (zv == NULL)
- continue;
-#else
- LIST_FOREACH(zv, &all_zvols, zv_links) {
-#endif
- if (strcmp(zv->zv_name, name) == 0)
- return (zv);
- }
-
- return (NULL);
-}
-
-/* extent mapping arg */
-struct maparg {
- zvol_state_t *ma_zv;
- uint64_t ma_blks;
-};
-
-/*ARGSUSED*/
-static int
-zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- struct maparg *ma = arg;
- zvol_extent_t *ze;
- int bs = ma->ma_zv->zv_volblocksize;
-
- if (bp == NULL || BP_IS_HOLE(bp) ||
- zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
- return (0);
-
- VERIFY(!BP_IS_EMBEDDED(bp));
-
- VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
- ma->ma_blks++;
-
- /* Abort immediately if we have encountered gang blocks */
- if (BP_IS_GANG(bp))
- return (SET_ERROR(EFRAGS));
-
- /*
- * See if the block is at the end of the previous extent.
- */
- ze = list_tail(&ma->ma_zv->zv_extents);
- if (ze &&
- DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
- DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
- DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
- ze->ze_nblks++;
- return (0);
- }
-
- dprintf_bp(bp, "%s", "next blkptr:");
-
- /* start a new extent */
- ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
- ze->ze_dva = bp->blk_dva[0]; /* structure assignment */
- ze->ze_nblks = 1;
- list_insert_tail(&ma->ma_zv->zv_extents, ze);
- return (0);
-}
-
-static void
-zvol_free_extents(zvol_state_t *zv)
-{
- zvol_extent_t *ze;
-
- while (ze = list_head(&zv->zv_extents)) {
- list_remove(&zv->zv_extents, ze);
- kmem_free(ze, sizeof (zvol_extent_t));
- }
-}
-
-static int
-zvol_get_lbas(zvol_state_t *zv)
-{
- objset_t *os = zv->zv_objset;
- struct maparg ma;
- int err;
-
- ma.ma_zv = zv;
- ma.ma_blks = 0;
- zvol_free_extents(zv);
-
- /* commit any in-flight changes before traversing the dataset */
- txg_wait_synced(dmu_objset_pool(os), 0);
- err = traverse_dataset(dmu_objset_ds(os), 0,
- TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
- if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
- zvol_free_extents(zv);
- return (err ? err : EIO);
- }
-
- return (0);
-}
-
-/* ARGSUSED */
-void
-zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
-{
- zfs_creat_t *zct = arg;
- nvlist_t *nvprops = zct->zct_props;
- int error;
- uint64_t volblocksize, volsize;
-
- VERIFY(nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
- if (nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
- volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
-
- /*
- * These properties must be removed from the list so the generic
- * property setting step won't apply to them.
- */
- VERIFY(nvlist_remove_all(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
- (void) nvlist_remove_all(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
-
- error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
- ASSERT(error == 0);
-}
-
-/*
- * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
- * implement DKIOCFREE/free-long-range.
- */
-static int
-zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
-{
- zvol_state_t *zv = arg1;
- lr_truncate_t *lr = arg2;
- uint64_t offset, length;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- offset = lr->lr_offset;
- length = lr->lr_length;
-
- return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
-}
-
-/*
- * Replay a TX_WRITE ZIL transaction that didn't get committed
- * after a system failure
- */
-static int
-zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
-{
- zvol_state_t *zv = arg1;
- lr_write_t *lr = arg2;
- objset_t *os = zv->zv_objset;
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t offset, length;
- dmu_tx_t *tx;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- offset = lr->lr_offset;
- length = lr->lr_length;
-
- /* If it's a dmu_sync() block, write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
- uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
- if (length < blocksize) {
- offset -= offset % blocksize;
- length = blocksize;
- }
- }
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
- dmu_tx_commit(tx);
- }
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
-{
- return (SET_ERROR(ENOTSUP));
-}
-
-/*
- * Callback vectors for replaying records.
- * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
- */
-zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
- zvol_replay_err, /* 0 no such transaction type */
- zvol_replay_err, /* TX_CREATE */
- zvol_replay_err, /* TX_MKDIR */
- zvol_replay_err, /* TX_MKXATTR */
- zvol_replay_err, /* TX_SYMLINK */
- zvol_replay_err, /* TX_REMOVE */
- zvol_replay_err, /* TX_RMDIR */
- zvol_replay_err, /* TX_LINK */
- zvol_replay_err, /* TX_RENAME */
- zvol_replay_write, /* TX_WRITE */
- zvol_replay_truncate, /* TX_TRUNCATE */
- zvol_replay_err, /* TX_SETATTR */
- zvol_replay_err, /* TX_ACL */
- zvol_replay_err, /* TX_CREATE_ACL */
- zvol_replay_err, /* TX_CREATE_ATTR */
- zvol_replay_err, /* TX_CREATE_ACL_ATTR */
- zvol_replay_err, /* TX_MKDIR_ACL */
- zvol_replay_err, /* TX_MKDIR_ATTR */
- zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
- zvol_replay_err, /* TX_WRITE2 */
-};
-
-#ifdef illumos
-int
-zvol_name2minor(const char *name, minor_t *minor)
-{
- zvol_state_t *zv;
-
- mutex_enter(&zfsdev_state_lock);
- zv = zvol_minor_lookup(name);
- if (minor && zv)
- *minor = zv->zv_minor;
- mutex_exit(&zfsdev_state_lock);
- return (zv ? 0 : -1);
-}
-#endif /* illumos */
-
-/*
- * Create a minor node (plus a whole lot more) for the specified volume.
- */
-static int
-zvol_create_minor(const char *name)
-{
- zfs_soft_state_t *zs;
- zvol_state_t *zv;
- objset_t *os;
-#ifdef illumos
- dmu_object_info_t doi;
- minor_t minor = 0;
- char chrbuf[30], blkbuf[30];
-#else
- struct g_provider *pp;
- struct g_geom *gp;
- uint64_t mode;
-#endif
- int error;
-
-#ifndef illumos
- ZFS_LOG(1, "Creating ZVOL %s...", name);
-#endif
-
- mutex_enter(&zfsdev_state_lock);
-
- if (zvol_minor_lookup(name) != NULL) {
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(EEXIST));
- }
-
- /* lie and say we're read-only */
- error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
-
- if (error) {
- mutex_exit(&zfsdev_state_lock);
- return (error);
- }
-
-#ifdef illumos
- if ((minor = zfsdev_minor_alloc()) == 0) {
- dmu_objset_disown(os, FTAG);
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(ENXIO));
- }
-
- if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
- dmu_objset_disown(os, FTAG);
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(EAGAIN));
- }
- (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
- (char *)name);
-
- (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
-
- if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
- minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
- ddi_soft_state_free(zfsdev_state, minor);
- dmu_objset_disown(os, FTAG);
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(EAGAIN));
- }
-
- (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
-
- if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
- minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
- ddi_remove_minor_node(zfs_dip, chrbuf);
- ddi_soft_state_free(zfsdev_state, minor);
- dmu_objset_disown(os, FTAG);
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(EAGAIN));
- }
-
- zs = ddi_get_soft_state(zfsdev_state, minor);
- zs->zss_type = ZSST_ZVOL;
- zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
-#else /* !illumos */
-
- zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
- zv->zv_state = 0;
- error = dsl_prop_get_integer(name,
- zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
- if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
- mode = volmode;
-
- zv->zv_volmode = mode;
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- g_topology_lock();
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_geom_start;
- gp->access = zvol_geom_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
- pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
- pp->sectorsize = DEV_BSIZE;
- pp->mediasize = 0;
- pp->private = zv;
-
- zv->zv_provider = pp;
- bioq_init(&zv->zv_queue);
- mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
- } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct make_dev_args args;
-
- make_dev_args_init(&args);
- args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
- args.mda_devsw = &zvol_cdevsw;
- args.mda_cr = NULL;
- args.mda_uid = UID_ROOT;
- args.mda_gid = GID_OPERATOR;
- args.mda_mode = 0640;
- args.mda_si_drv2 = zv;
- error = make_dev_s(&args, &zv->zv_dev,
- "%s/%s", ZVOL_DRIVER, name);
- if (error != 0) {
- kmem_free(zv, sizeof(*zv));
- dmu_objset_disown(os, FTAG);
- mutex_exit(&zfsdev_state_lock);
- return (error);
- }
- zv->zv_dev->si_iosize_max = MAXPHYS;
- }
- LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
-#endif /* illumos */
-
- (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
- zv->zv_min_bs = DEV_BSHIFT;
-#ifdef illumos
- zv->zv_minor = minor;
-#endif
- zv->zv_objset = os;
- if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
- zv->zv_flags |= ZVOL_RDONLY;
- rangelock_init(&zv->zv_rangelock, NULL, NULL);
- list_create(&zv->zv_extents, sizeof (zvol_extent_t),
- offsetof(zvol_extent_t, ze_node));
-#ifdef illumos
- /* get and cache the blocksize */
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
- ASSERT(error == 0);
- zv->zv_volblocksize = doi.doi_data_block_size;
-#endif
-
- if (spa_writeable(dmu_objset_spa(os))) {
- if (zil_replay_disable)
- zil_destroy(dmu_objset_zil(os), B_FALSE);
- else
- zil_replay(os, zv, zvol_replay_vector);
- }
- dmu_objset_disown(os, FTAG);
- zv->zv_objset = NULL;
-
- zvol_minors++;
-
- mutex_exit(&zfsdev_state_lock);
-#ifndef illumos
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- zvol_geom_run(zv);
- g_topology_unlock();
- }
-
- ZFS_LOG(1, "ZVOL %s created.", name);
-#endif
-
- return (0);
-}
-
-/*
- * Remove minor node for the specified volume.
- */
-static int
-zvol_remove_zv(zvol_state_t *zv)
-{
-#ifdef illumos
- char nmbuf[20];
- minor_t minor = zv->zv_minor;
-#endif
-
- ASSERT(MUTEX_HELD(&zfsdev_state_lock));
- if (zv->zv_total_opens != 0)
- return (SET_ERROR(EBUSY));
-
-#ifdef illumos
- (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
- ddi_remove_minor_node(zfs_dip, nmbuf);
-
- (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
- ddi_remove_minor_node(zfs_dip, nmbuf);
-#else
- ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
-
- LIST_REMOVE(zv, zv_links);
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- g_topology_lock();
- zvol_geom_destroy(zv);
- g_topology_unlock();
- } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- if (zv->zv_dev != NULL)
- destroy_dev(zv->zv_dev);
- }
-#endif
-
- rangelock_fini(&zv->zv_rangelock);
-
- kmem_free(zv, sizeof (zvol_state_t));
-#ifdef illumos
- ddi_soft_state_free(zfsdev_state, minor);
-#endif
- zvol_minors--;
- return (0);
-}
-
-int
-zvol_first_open(zvol_state_t *zv)
-{
- dmu_object_info_t doi;
- objset_t *os;
- uint64_t volsize;
- int error;
- uint64_t readonly;
-
- /* lie and say we're read-only */
- error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
- zvol_tag, &os);
- if (error)
- return (error);
-
- zv->zv_objset = os;
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
- if (error) {
- ASSERT(error == 0);
- dmu_objset_disown(os, zvol_tag);
- return (error);
- }
-
- /* get and cache the blocksize */
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
- if (error) {
- ASSERT(error == 0);
- dmu_objset_disown(os, zvol_tag);
- return (error);
- }
- zv->zv_volblocksize = doi.doi_data_block_size;
-
- error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn);
- if (error) {
- dmu_objset_disown(os, zvol_tag);
- return (error);
- }
-
- zvol_size_changed(zv, volsize);
- zv->zv_zilog = zil_open(os, zvol_get_data);
-
- VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
- NULL) == 0);
- if (readonly || dmu_objset_is_snapshot(os) ||
- !spa_writeable(dmu_objset_spa(os)))
- zv->zv_flags |= ZVOL_RDONLY;
- else
- zv->zv_flags &= ~ZVOL_RDONLY;
- return (error);
-}
-
-void
-zvol_last_close(zvol_state_t *zv)
-{
- zil_close(zv->zv_zilog);
- zv->zv_zilog = NULL;
-
- dnode_rele(zv->zv_dn, zvol_tag);
- zv->zv_dn = NULL;
-
- /*
- * Evict cached data
- */
- if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
- !(zv->zv_flags & ZVOL_RDONLY))
- txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
- dmu_objset_evict_dbufs(zv->zv_objset);
-
- dmu_objset_disown(zv->zv_objset, zvol_tag);
- zv->zv_objset = NULL;
-}
-
-#ifdef illumos
-int
-zvol_prealloc(zvol_state_t *zv)
-{
- objset_t *os = zv->zv_objset;
- dmu_tx_t *tx;
- uint64_t refd, avail, usedobjs, availobjs;
- uint64_t resid = zv->zv_volsize;
- uint64_t off = 0;
-
- /* Check the space usage before attempting to allocate the space */
- dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
- if (avail < zv->zv_volsize)
- return (SET_ERROR(ENOSPC));
-
- /* Free old extents if they exist */
- zvol_free_extents(zv);
-
- while (resid != 0) {
- int error;
- uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
- return (error);
- }
- dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
- dmu_tx_commit(tx);
- off += bytes;
- resid -= bytes;
- }
- txg_wait_synced(dmu_objset_pool(os), 0);
-
- return (0);
-}
-#endif /* illumos */
-
-static int
-zvol_update_volsize(objset_t *os, uint64_t volsize)
-{
- dmu_tx_t *tx;
- int error;
-
- ASSERT(MUTEX_HELD(&zfsdev_state_lock));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
-
- error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
- &volsize, tx);
- dmu_tx_commit(tx);
-
- if (error == 0)
- error = dmu_free_long_range(os,
- ZVOL_OBJ, volsize, DMU_OBJECT_END);
- return (error);
-}
-
-void
-zvol_remove_minors_impl(const char *name)
-{
-#ifdef illumos
- zvol_state_t *zv;
- char *namebuf;
- minor_t minor;
-
- namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
- (void) strncpy(namebuf, name, strlen(name));
- (void) strcat(namebuf, "/");
- mutex_enter(&zfsdev_state_lock);
- for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
-
- zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
- if (zv == NULL)
- continue;
- if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
- (void) zvol_remove_zv(zv);
- }
- kmem_free(namebuf, strlen(name) + 2);
-
- mutex_exit(&zfsdev_state_lock);
-#else /* !illumos */
- zvol_state_t *zv, *tzv;
- size_t namelen;
-
- namelen = strlen(name);
-
- mutex_enter(&zfsdev_state_lock);
-
- LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
- if (strcmp(zv->zv_name, name) == 0 ||
- (strncmp(zv->zv_name, name, namelen) == 0 &&
- strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' ||
- zv->zv_name[namelen] == '@'))) {
- (void) zvol_remove_zv(zv);
- }
- }
-
- mutex_exit(&zfsdev_state_lock);
-#endif /* illumos */
-}
-
-static int
-zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
-{
- uint64_t old_volsize = 0ULL;
- int error = 0;
-
- ASSERT(MUTEX_HELD(&zfsdev_state_lock));
-
- /*
- * Reinitialize the dump area to the new size. If we
- * failed to resize the dump area then restore it back to
- * its original size. We must set the new volsize prior
- * to calling dumpvp_resize() to ensure that the devices'
- * size(9P) is not visible by the dump subsystem.
- */
- old_volsize = zv->zv_volsize;
- zvol_size_changed(zv, volsize);
-
-#ifdef ZVOL_DUMP
- if (zv->zv_flags & ZVOL_DUMPIFIED) {
- if ((error = zvol_dumpify(zv)) != 0 ||
- (error = dumpvp_resize()) != 0) {
- int dumpify_error;
-
- (void) zvol_update_volsize(zv->zv_objset, old_volsize);
- zvol_size_changed(zv, old_volsize);
- dumpify_error = zvol_dumpify(zv);
- error = dumpify_error ? dumpify_error : error;
- }
- }
-#endif /* ZVOL_DUMP */
-
-#ifdef illumos
- /*
- * Generate a LUN expansion event.
- */
- if (error == 0) {
- sysevent_id_t eid;
- nvlist_t *attr;
- char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
-
- (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
- zv->zv_minor);
-
- VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
-
- (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
- ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
-
- nvlist_free(attr);
- kmem_free(physpath, MAXPATHLEN);
- }
-#endif /* illumos */
- return (error);
-}
-
-int
-zvol_set_volsize(const char *name, uint64_t volsize)
-{
- zvol_state_t *zv = NULL;
- objset_t *os;
- int error;
- dmu_object_info_t doi;
- uint64_t readonly;
- boolean_t owned = B_FALSE;
-
- error = dsl_prop_get_integer(name,
- zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
- if (error != 0)
- return (error);
- if (readonly)
- return (SET_ERROR(EROFS));
-
- mutex_enter(&zfsdev_state_lock);
- zv = zvol_minor_lookup(name);
-
- if (zv == NULL || zv->zv_objset == NULL) {
- if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
- FTAG, &os)) != 0) {
- mutex_exit(&zfsdev_state_lock);
- return (error);
- }
- owned = B_TRUE;
- if (zv != NULL)
- zv->zv_objset = os;
- } else {
- os = zv->zv_objset;
- }
-
- if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
- (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
- goto out;
-
- error = zvol_update_volsize(os, volsize);
-
- if (error == 0 && zv != NULL)
- error = zvol_update_live_volsize(zv, volsize);
-out:
- if (owned) {
- dmu_objset_disown(os, FTAG);
- if (zv != NULL)
- zv->zv_objset = NULL;
- }
- mutex_exit(&zfsdev_state_lock);
- return (error);
-}
-
-/*ARGSUSED*/
-#ifdef illumos
-int
-zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
-#else
-static int
-zvol_open(struct g_provider *pp, int flag, int count)
-#endif
-{
- zvol_state_t *zv;
- int err = 0;
-#ifdef illumos
-
- mutex_enter(&zfsdev_state_lock);
-
- zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
- if (zv == NULL) {
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(ENXIO));
- }
-
- if (zv->zv_total_opens == 0)
- err = zvol_first_open(zv);
- if (err) {
- mutex_exit(&zfsdev_state_lock);
- return (err);
- }
-#else /* !illumos */
- boolean_t locked = B_FALSE;
-
- if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
- /*
- * if zfs_geom_probe_vdev_key is set, that means that zfs is
- * attempting to probe geom providers while looking for a
- * replacement for a missing VDEV. In this case, the
- * spa_namespace_lock will not be held, but it is still illegal
- * to use a zvol as a vdev. Deadlocks can result if another
- * thread has spa_namespace_lock
- */
- return (EOPNOTSUPP);
- }
- /*
- * Protect against recursively entering spa_namespace_lock
- * when spa_open() is used for a pool on a (local) ZVOL(s).
- * This is needed since we replaced upstream zfsdev_state_lock
- * with spa_namespace_lock in the ZVOL code.
- * We are using the same trick as spa_open().
- * Note that calls in zvol_first_open which need to resolve
- * pool name to a spa object will enter spa_open()
- * recursively, but that function already has all the
- * necessary protection.
- */
- if (!MUTEX_HELD(&zfsdev_state_lock)) {
- mutex_enter(&zfsdev_state_lock);
- locked = B_TRUE;
- }
-
- zv = pp->private;
- if (zv == NULL) {
- if (locked)
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(ENXIO));
- }
-
- if (zv->zv_total_opens == 0) {
- err = zvol_first_open(zv);
- if (err) {
- if (locked)
- mutex_exit(&zfsdev_state_lock);
- return (err);
- }
- pp->mediasize = zv->zv_volsize;
- pp->stripeoffset = 0;
- pp->stripesize = zv->zv_volblocksize;
- }
-#endif /* illumos */
- if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
- err = SET_ERROR(EROFS);
- goto out;
- }
- if (zv->zv_flags & ZVOL_EXCL) {
- err = SET_ERROR(EBUSY);
- goto out;
- }
-#ifdef FEXCL
- if (flag & FEXCL) {
- if (zv->zv_total_opens != 0) {
- err = SET_ERROR(EBUSY);
- goto out;
- }
- zv->zv_flags |= ZVOL_EXCL;
- }
-#endif
-
-#ifdef illumos
- if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
- zv->zv_open_count[otyp]++;
- zv->zv_total_opens++;
- }
- mutex_exit(&zfsdev_state_lock);
-#else
- zv->zv_total_opens += count;
- if (locked)
- mutex_exit(&zfsdev_state_lock);
-#endif
-
- return (err);
-out:
- if (zv->zv_total_opens == 0)
- zvol_last_close(zv);
-#ifdef illumos
- mutex_exit(&zfsdev_state_lock);
-#else
- if (locked)
- mutex_exit(&zfsdev_state_lock);
-#endif
- return (err);
-}
-
-/*ARGSUSED*/
-#ifdef illumos
-int
-zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
-{
- minor_t minor = getminor(dev);
- zvol_state_t *zv;
- int error = 0;
-
- mutex_enter(&zfsdev_state_lock);
-
- zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
- if (zv == NULL) {
- mutex_exit(&zfsdev_state_lock);
-#else /* !illumos */
-static int
-zvol_close(struct g_provider *pp, int flag, int count)
-{
- zvol_state_t *zv;
- int error = 0;
- boolean_t locked = B_FALSE;
-
- /* See comment in zvol_open(). */
- if (!MUTEX_HELD(&zfsdev_state_lock)) {
- mutex_enter(&zfsdev_state_lock);
- locked = B_TRUE;
- }
-
- zv = pp->private;
- if (zv == NULL) {
- if (locked)
- mutex_exit(&zfsdev_state_lock);
-#endif /* illumos */
- return (SET_ERROR(ENXIO));
- }
-
- if (zv->zv_flags & ZVOL_EXCL) {
- ASSERT(zv->zv_total_opens == 1);
- zv->zv_flags &= ~ZVOL_EXCL;
- }
-
- /*
- * If the open count is zero, this is a spurious close.
- * That indicates a bug in the kernel / DDI framework.
- */
-#ifdef illumos
- ASSERT(zv->zv_open_count[otyp] != 0);
-#endif
- ASSERT(zv->zv_total_opens != 0);
-
- /*
- * You may get multiple opens, but only one close.
- */
-#ifdef illumos
- zv->zv_open_count[otyp]--;
- zv->zv_total_opens--;
-#else
- zv->zv_total_opens -= count;
-#endif
-
- if (zv->zv_total_opens == 0)
- zvol_last_close(zv);
-
-#ifdef illumos
- mutex_exit(&zfsdev_state_lock);
-#else
- if (locked)
- mutex_exit(&zfsdev_state_lock);
-#endif
- return (error);
-}
-
-/* ARGSUSED */
-static void
-zvol_get_done(zgd_t *zgd, int error)
-{
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- rangelock_exit(zgd->zgd_lr);
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
- zvol_state_t *zv = arg;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length; /* length of user data */
- dmu_buf_t *db;
- zgd_t *zgd;
- int error;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_READER);
- error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's written out
- * and its checksum is being calculated that no one can change
- * the data. Contrarily to zfs_get_data we need not re-check
- * blocksize after we get the lock because it cannot be changed.
- */
- size = zv->zv_volblocksize;
- offset = P2ALIGN(offset, size);
- zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_READER);
- error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zvol_get_done, zgd);
-
- if (error == 0)
- return (0);
- }
- }
-
- zvol_get_done(zgd, error);
-
- return (error);
-}
-
-/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
- */
-ssize_t zvol_immediate_write_sz = 32768;
-#ifdef _KERNEL
-SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
- &zvol_immediate_write_sz, 0, "Minimal size for indirect log write");
-#endif
-
-static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
- boolean_t sync)
-{
- uint32_t blocksize = zv->zv_volblocksize;
- zilog_t *zilog = zv->zv_zilog;
- itx_wr_state_t write_state;
-
- if (zil_replaying(zilog, tx))
- return;
-
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- write_state = WR_INDIRECT;
- else if (!spa_has_slogs(zilog->zl_spa) &&
- resid >= blocksize && blocksize > zvol_immediate_write_sz)
- write_state = WR_INDIRECT;
- else if (sync)
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
-
- while (resid) {
- itx_t *itx;
- lr_write_t *lr;
- itx_wr_state_t wr_state = write_state;
- ssize_t len = resid;
-
- if (wr_state == WR_COPIED && resid > zil_max_copied_data(zilog))
- wr_state = WR_NEED_COPY;
- else if (wr_state == WR_INDIRECT)
- len = MIN(blocksize - P2PHASE(off, blocksize), resid);
-
- itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
- (wr_state == WR_COPIED ? len : 0));
- lr = (lr_write_t *)&itx->itx_lr;
- if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
- off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
- zil_itx_destroy(itx);
- itx = zil_itx_create(TX_WRITE, sizeof (*lr));
- lr = (lr_write_t *)&itx->itx_lr;
- wr_state = WR_NEED_COPY;
- }
-
- itx->itx_wr_state = wr_state;
- lr->lr_foid = ZVOL_OBJ;
- lr->lr_offset = off;
- lr->lr_length = len;
- lr->lr_blkoff = 0;
- BP_ZERO(&lr->lr_blkptr);
-
- itx->itx_private = zv;
-
- if (!sync && (zv->zv_sync_cnt == 0))
- itx->itx_sync = B_FALSE;
-
- zil_itx_assign(zilog, itx, tx);
-
- off += len;
- resid -= len;
- }
-}
-
-#ifdef illumos
-static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
- uint64_t size, boolean_t doread, boolean_t isdump)
-{
- vdev_disk_t *dvd;
- int c;
- int numerrors = 0;
-
- if (vd->vdev_ops == &vdev_mirror_ops ||
- vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops) {
- for (c = 0; c < vd->vdev_children; c++) {
- int err = zvol_dumpio_vdev(vd->vdev_child[c],
- addr, offset, origoffset, size, doread, isdump);
- if (err != 0) {
- numerrors++;
- } else if (doread) {
- break;
- }
- }
- }
-
- if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
- return (numerrors < vd->vdev_children ? 0 : EIO);
-
- if (doread && !vdev_readable(vd))
- return (SET_ERROR(EIO));
- else if (!doread && !vdev_writeable(vd))
- return (SET_ERROR(EIO));
-
- if (vd->vdev_ops == &vdev_raidz_ops) {
- return (vdev_raidz_physio(vd,
- addr, size, offset, origoffset, doread, isdump));
- }
-
- offset += VDEV_LABEL_START_SIZE;
-
- if (ddi_in_panic() || isdump) {
- ASSERT(!doread);
- if (doread)
- return (SET_ERROR(EIO));
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
- return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
- lbtodb(size)));
- } else {
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
- return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
- offset, doread ? B_READ : B_WRITE));
- }
-}
-
-static int
-zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
- boolean_t doread, boolean_t isdump)
-{
- vdev_t *vd;
- int error;
- zvol_extent_t *ze;
- spa_t *spa = dmu_objset_spa(zv->zv_objset);
-
- /* Must be sector aligned, and not stradle a block boundary. */
- if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
- P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
- return (SET_ERROR(EINVAL));
- }
- ASSERT(size <= zv->zv_volblocksize);
-
- /* Locate the extent this belongs to */
- ze = list_head(&zv->zv_extents);
- while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
- offset -= ze->ze_nblks * zv->zv_volblocksize;
- ze = list_next(&zv->zv_extents, ze);
- }
-
- if (ze == NULL)
- return (SET_ERROR(EINVAL));
-
- if (!ddi_in_panic())
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
- offset += DVA_GET_OFFSET(&ze->ze_dva);
- error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
- size, doread, isdump);
-
- if (!ddi_in_panic())
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- return (error);
-}
-
-int
-zvol_strategy(buf_t *bp)
-{
- zfs_soft_state_t *zs = NULL;
-#else /* !illumos */
-void
-zvol_strategy(struct bio *bp)
-{
-#endif /* illumos */
- zvol_state_t *zv;
- uint64_t off, volsize;
- size_t resid;
- char *addr;
- objset_t *os;
- int error = 0;
-#ifdef illumos
- boolean_t doread = bp->b_flags & B_READ;
-#else
- boolean_t doread = 0;
-#endif
- boolean_t is_dumpified;
- boolean_t sync;
-
-#ifdef illumos
- if (getminor(bp->b_edev) == 0) {
- error = SET_ERROR(EINVAL);
- } else {
- zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
- if (zs == NULL)
- error = SET_ERROR(ENXIO);
- else if (zs->zss_type != ZSST_ZVOL)
- error = SET_ERROR(EINVAL);
- }
-
- if (error) {
- bioerror(bp, error);
- biodone(bp);
- return (0);
- }
-
- zv = zs->zss_data;
-
- if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
- bioerror(bp, EROFS);
- biodone(bp);
- return (0);
- }
-
- off = ldbtob(bp->b_blkno);
-#else /* !illumos */
- if (bp->bio_to)
- zv = bp->bio_to->private;
- else
- zv = bp->bio_dev->si_drv2;
-
- if (zv == NULL) {
- error = SET_ERROR(ENXIO);
- goto out;
- }
-
- if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
- error = SET_ERROR(EROFS);
- goto out;
- }
-
- switch (bp->bio_cmd) {
- case BIO_FLUSH:
- goto sync;
- case BIO_READ:
- doread = 1;
- case BIO_WRITE:
- case BIO_DELETE:
- break;
- default:
- error = EOPNOTSUPP;
- goto out;
- }
-
- off = bp->bio_offset;
-#endif /* illumos */
- volsize = zv->zv_volsize;
-
- os = zv->zv_objset;
- ASSERT(os != NULL);
-
-#ifdef illumos
- bp_mapin(bp);
- addr = bp->b_un.b_addr;
- resid = bp->b_bcount;
-
- if (resid > 0 && (off < 0 || off >= volsize)) {
- bioerror(bp, EIO);
- biodone(bp);
- return (0);
- }
-
- is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
- sync = ((!(bp->b_flags & B_ASYNC) &&
- !(zv->zv_flags & ZVOL_WCE)) ||
- (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
- !doread && !is_dumpified;
-#else /* !illumos */
- addr = bp->bio_data;
- resid = bp->bio_length;
-
- if (resid > 0 && (off < 0 || off >= volsize)) {
- error = SET_ERROR(EIO);
- goto out;
- }
-
- is_dumpified = B_FALSE;
- sync = !doread && !is_dumpified &&
- zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
-#endif /* illumos */
-
- /*
- * There must be no buffer changes when doing a dmu_sync() because
- * we can't change the data whilst calculating the checksum.
- */
- locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid,
- doread ? RL_READER : RL_WRITER);
-
-#ifndef illumos
- if (bp->bio_cmd == BIO_DELETE) {
- dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0) {
- dmu_tx_abort(tx);
- } else {
- zvol_log_truncate(zv, tx, off, resid, sync);
- dmu_tx_commit(tx);
- error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
- off, resid);
- resid = 0;
- }
- goto unlock;
- }
-#endif
- while (resid != 0 && off < volsize) {
- size_t size = MIN(resid, zvol_maxphys);
-#ifdef illumos
- if (is_dumpified) {
- size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
- error = zvol_dumpio(zv, addr, off, size,
- doread, B_FALSE);
- } else if (doread) {
-#else
- if (doread) {
-#endif
- error = dmu_read(os, ZVOL_OBJ, off, size, addr,
- DMU_READ_PREFETCH);
- } else {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size, sync);
- dmu_tx_commit(tx);
- }
- }
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- break;
- }
- off += size;
- addr += size;
- resid -= size;
- }
-#ifndef illumos
-unlock:
-#endif
- rangelock_exit(lr);
-
-#ifdef illumos
- if ((bp->b_resid = resid) == bp->b_bcount)
- bioerror(bp, off > volsize ? EINVAL : error);
-
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- biodone(bp);
-
- return (0);
-#else /* !illumos */
- bp->bio_completed = bp->bio_length - resid;
- if (bp->bio_completed < bp->bio_length && off > volsize)
- error = EINVAL;
-
- if (sync) {
-sync:
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- }
-out:
- if (bp->bio_to)
- g_io_deliver(bp, error);
- else
- biofinish(bp, NULL, error);
-#endif /* illumos */
-}
-
-#ifdef illumos
-/*
- * Set the buffer count to the zvol maximum transfer.
- * Using our own routine instead of the default minphys()
- * means that for larger writes we write bigger buffers on X86
- * (128K instead of 56K) and flush the disk write cache less often
- * (every zvol_maxphys - currently 1MB) instead of minphys (currently
- * 56K on X86 and 128K on sparc).
- */
-void
-zvol_minphys(struct buf *bp)
-{
- if (bp->b_bcount > zvol_maxphys)
- bp->b_bcount = zvol_maxphys;
-}
-
-int
-zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
-{
- minor_t minor = getminor(dev);
- zvol_state_t *zv;
- int error = 0;
- uint64_t size;
- uint64_t boff;
- uint64_t resid;
-
- zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
- if (zv == NULL)
- return (SET_ERROR(ENXIO));
-
- if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
- return (SET_ERROR(EINVAL));
-
- boff = ldbtob(blkno);
- resid = ldbtob(nblocks);
-
- VERIFY3U(boff + resid, <=, zv->zv_volsize);
-
- while (resid) {
- size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
- error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
- if (error)
- break;
- boff += size;
- addr += size;
- resid -= size;
- }
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
-{
- minor_t minor = getminor(dev);
-#else /* !illumos */
-int
-zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
-{
-#endif /* illumos */
- zvol_state_t *zv;
- uint64_t volsize;
- int error = 0;
-
-#ifdef illumos
- zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
- if (zv == NULL)
- return (SET_ERROR(ENXIO));
-#else
- zv = dev->si_drv2;
-#endif
-
- volsize = zv->zv_volsize;
- /* uio_loffset == volsize isn't an error as its required for EOF processing. */
- if (uio->uio_resid > 0 &&
- (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
- return (SET_ERROR(EIO));
-
-#ifdef illumos
- if (zv->zv_flags & ZVOL_DUMPIFIED) {
- error = physio(zvol_strategy, NULL, dev, B_READ,
- zvol_minphys, uio);
- return (error);
- }
-#endif
-
- locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
- uio->uio_loffset, uio->uio_resid, RL_READER);
- while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
- uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
-
- /* don't read past the end */
- if (bytes > volsize - uio->uio_loffset)
- bytes = volsize - uio->uio_loffset;
-
- error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- break;
- }
- }
- rangelock_exit(lr);
-
- return (error);
-}
-
-#ifdef illumos
-/*ARGSUSED*/
-int
-zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
-{
- minor_t minor = getminor(dev);
-#else /* !illumos */
-int
-zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
-{
-#endif /* illumos */
- zvol_state_t *zv;
- uint64_t volsize;
- int error = 0;
- boolean_t sync;
-
-#ifdef illumos
- zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
- if (zv == NULL)
- return (SET_ERROR(ENXIO));
-#else
- zv = dev->si_drv2;
-#endif
-
- volsize = zv->zv_volsize;
- /* uio_loffset == volsize isn't an error as its required for EOF processing. */
- if (uio->uio_resid > 0 &&
- (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
- return (SET_ERROR(EIO));
-
-#ifdef illumos
- if (zv->zv_flags & ZVOL_DUMPIFIED) {
- error = physio(zvol_strategy, NULL, dev, B_WRITE,
- zvol_minphys, uio);
- return (error);
- }
-
- sync = !(zv->zv_flags & ZVOL_WCE) ||
-#else
- sync = (ioflag & IO_SYNC) ||
-#endif
- (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
-
- locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
- uio->uio_loffset, uio->uio_resid, RL_WRITER);
- while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
- uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
- uint64_t off = uio->uio_loffset;
- dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
-
- if (bytes > volsize - off) /* don't write past the end */
- bytes = volsize - off;
-
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- break;
- }
- error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
- if (error == 0)
- zvol_log_write(zv, tx, off, bytes, sync);
- dmu_tx_commit(tx);
-
- if (error)
- break;
- }
- rangelock_exit(lr);
-
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- return (error);
-}
-
-#ifdef illumos
-int
-zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
-{
- struct uuid uuid = EFI_RESERVED;
- efi_gpe_t gpe = { 0 };
- uint32_t crc;
- dk_efi_t efi;
- int length;
- char *ptr;
-
- if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
- return (SET_ERROR(EFAULT));
- ptr = (char *)(uintptr_t)efi.dki_data_64;
- length = efi.dki_length;
- /*
- * Some clients may attempt to request a PMBR for the
- * zvol. Currently this interface will return EINVAL to
- * such requests. These requests could be supported by
- * adding a check for lba == 0 and consing up an appropriate
- * PMBR.
- */
- if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
- return (SET_ERROR(EINVAL));
-
- gpe.efi_gpe_StartingLBA = LE_64(34ULL);
- gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
- UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
-
- if (efi.dki_lba == 1) {
- efi_gpt_t gpt = { 0 };
-
- gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
- gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
- gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
- gpt.efi_gpt_MyLBA = LE_64(1ULL);
- gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
- gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
- gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
- gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
- gpt.efi_gpt_SizeOfPartitionEntry =
- LE_32(sizeof (efi_gpe_t));
- CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
- gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
- CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
- gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
- if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
- flag))
- return (SET_ERROR(EFAULT));
- ptr += sizeof (gpt);
- length -= sizeof (gpt);
- }
- if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
- length), flag))
- return (SET_ERROR(EFAULT));
- return (0);
-}
-
-/*
- * BEGIN entry points to allow external callers access to the volume.
- */
-/*
- * Return the volume parameters needed for access from an external caller.
- * These values are invariant as long as the volume is held open.
- */
-int
-zvol_get_volume_params(minor_t minor, uint64_t *blksize,
- uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
- void **rl_hdl, void **dnode_hdl)
-{
- zvol_state_t *zv;
-
- zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
- if (zv == NULL)
- return (SET_ERROR(ENXIO));
- if (zv->zv_flags & ZVOL_DUMPIFIED)
- return (SET_ERROR(ENXIO));
-
- ASSERT(blksize && max_xfer_len && minor_hdl &&
- objset_hdl && zil_hdl && rl_hdl && dnode_hdl);
-
- *blksize = zv->zv_volblocksize;
- *max_xfer_len = (uint64_t)zvol_maxphys;
- *minor_hdl = zv;
- *objset_hdl = zv->zv_objset;
- *zil_hdl = zv->zv_zilog;
- *rl_hdl = &zv->zv_rangelock;
- *dnode_hdl = zv->zv_dn;
- return (0);
-}
-
-/*
- * Return the current volume size to an external caller.
- * The size can change while the volume is open.
- */
-uint64_t
-zvol_get_volume_size(void *minor_hdl)
-{
- zvol_state_t *zv = minor_hdl;
-
- return (zv->zv_volsize);
-}
-
-/*
- * Return the current WCE setting to an external caller.
- * The WCE setting can change while the volume is open.
- */
-int
-zvol_get_volume_wce(void *minor_hdl)
-{
- zvol_state_t *zv = minor_hdl;
-
- return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
-}
-
-/*
- * Entry point for external callers to zvol_log_write
- */
-void
-zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
- boolean_t sync)
-{
- zvol_state_t *zv = minor_hdl;
-
- zvol_log_write(zv, tx, off, resid, sync);
-}
-/*
- * END entry points to allow external callers access to the volume.
- */
-#endif /* illumos */
-
-/*
- * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
- */
-static void
-zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
- boolean_t sync)
-{
- itx_t *itx;
- lr_truncate_t *lr;
- zilog_t *zilog = zv->zv_zilog;
-
- if (zil_replaying(zilog, tx))
- return;
-
- itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
- lr = (lr_truncate_t *)&itx->itx_lr;
- lr->lr_foid = ZVOL_OBJ;
- lr->lr_offset = off;
- lr->lr_length = len;
-
- itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
- zil_itx_assign(zilog, itx, tx);
-}
-
-#ifdef illumos
-/*
- * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
- * Also a dirtbag dkio ioctl for unmap/free-block functionality.
- */
-/*ARGSUSED*/
-int
-zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
-{
- zvol_state_t *zv;
- struct dk_callback *dkc;
- int error = 0;
- locked_range_t *lr;
-
- mutex_enter(&zfsdev_state_lock);
-
- zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
-
- if (zv == NULL) {
- mutex_exit(&zfsdev_state_lock);
- return (SET_ERROR(ENXIO));
- }
- ASSERT(zv->zv_total_opens > 0);
-
- switch (cmd) {
-
- case DKIOCINFO:
- {
- struct dk_cinfo dki;
-
- bzero(&dki, sizeof (dki));
- (void) strcpy(dki.dki_cname, "zvol");
- (void) strcpy(dki.dki_dname, "zvol");
- dki.dki_ctype = DKC_UNKNOWN;
- dki.dki_unit = getminor(dev);
- dki.dki_maxtransfer =
- 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
- mutex_exit(&zfsdev_state_lock);
- if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
- error = SET_ERROR(EFAULT);
- return (error);
- }
-
- case DKIOCGMEDIAINFO:
- {
- struct dk_minfo dkm;
-
- bzero(&dkm, sizeof (dkm));
- dkm.dki_lbsize = 1U << zv->zv_min_bs;
- dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
- dkm.dki_media_type = DK_UNKNOWN;
- mutex_exit(&zfsdev_state_lock);
- if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
- error = SET_ERROR(EFAULT);
- return (error);
- }
-
- case DKIOCGMEDIAINFOEXT:
- {
- struct dk_minfo_ext dkmext;
-
- bzero(&dkmext, sizeof (dkmext));
- dkmext.dki_lbsize = 1U << zv->zv_min_bs;
- dkmext.dki_pbsize = zv->zv_volblocksize;
- dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
- dkmext.dki_media_type = DK_UNKNOWN;
- mutex_exit(&zfsdev_state_lock);
- if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
- error = SET_ERROR(EFAULT);
- return (error);
- }
-
- case DKIOCGETEFI:
- {
- uint64_t vs = zv->zv_volsize;
- uint8_t bs = zv->zv_min_bs;
-
- mutex_exit(&zfsdev_state_lock);
- error = zvol_getefi((void *)arg, flag, vs, bs);
- return (error);
- }
-
- case DKIOCFLUSHWRITECACHE:
- dkc = (struct dk_callback *)arg;
- mutex_exit(&zfsdev_state_lock);
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
- (*dkc->dkc_callback)(dkc->dkc_cookie, error);
- error = 0;
- }
- return (error);
-
- case DKIOCGETWCE:
- {
- int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
- if (ddi_copyout(&wce, (void *)arg, sizeof (int),
- flag))
- error = SET_ERROR(EFAULT);
- break;
- }
- case DKIOCSETWCE:
- {
- int wce;
- if (ddi_copyin((void *)arg, &wce, sizeof (int),
- flag)) {
- error = SET_ERROR(EFAULT);
- break;
- }
- if (wce) {
- zv->zv_flags |= ZVOL_WCE;
- mutex_exit(&zfsdev_state_lock);
- } else {
- zv->zv_flags &= ~ZVOL_WCE;
- mutex_exit(&zfsdev_state_lock);
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- }
- return (0);
- }
-
- case DKIOCGGEOM:
- case DKIOCGVTOC:
- /*
- * commands using these (like prtvtoc) expect ENOTSUP
- * since we're emulating an EFI label
- */
- error = SET_ERROR(ENOTSUP);
- break;
-
- case DKIOCDUMPINIT:
- lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
- RL_WRITER);
- error = zvol_dumpify(zv);
- rangelock_exit(lr);
- break;
-
- case DKIOCDUMPFINI:
- if (!(zv->zv_flags & ZVOL_DUMPIFIED))
- break;
- lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
- RL_WRITER);
- error = zvol_dump_fini(zv);
- rangelock_exit(lr);
- break;
-
- case DKIOCFREE:
- {
- dkioc_free_list_t *dfl;
- dmu_tx_t *tx;
-
- if (!zvol_unmap_enabled)
- break;
-
- if (!(flag & FKIOCTL)) {
- error = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
- if (error != 0)
- break;
- } else {
- dfl = (dkioc_free_list_t *)arg;
- ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS);
- if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) {
- error = SET_ERROR(EINVAL);
- break;
- }
- }
-
- mutex_exit(&zfsdev_state_lock);
-
- for (int i = 0; i < dfl->dfl_num_exts; i++) {
- uint64_t start = dfl->dfl_exts[i].dfle_start,
- length = dfl->dfl_exts[i].dfle_length,
- end = start + length;
-
- /*
- * Apply Postel's Law to length-checking. If they
- * overshoot, just blank out until the end, if there's
- * a need to blank out anything.
- */
- if (start >= zv->zv_volsize)
- continue; /* No need to do anything... */
- if (end > zv->zv_volsize) {
- end = DMU_OBJECT_END;
- length = end - start;
- }
-
- lr = rangelock_enter(&zv->zv_rangelock, start, length,
- RL_WRITER);
- tx = dmu_tx_create(zv->zv_objset);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0) {
- dmu_tx_abort(tx);
- } else {
- zvol_log_truncate(zv, tx, start, length,
- B_TRUE);
- dmu_tx_commit(tx);
- error = dmu_free_long_range(zv->zv_objset,
- ZVOL_OBJ, start, length);
- }
-
- rangelock_exit(lr);
-
- if (error != 0)
- break;
- }
-
- /*
- * If the write-cache is disabled, 'sync' property
- * is set to 'always', or if the caller is asking for
- * a synchronous free, commit this operation to the zil.
- * This will sync any previous uncommitted writes to the
- * zvol object.
- * Can be overridden by the zvol_unmap_sync_enabled tunable.
- */
- if ((error == 0) && zvol_unmap_sync_enabled &&
- (!(zv->zv_flags & ZVOL_WCE) ||
- (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
- (dfl->dfl_flags & DF_WAIT_SYNC))) {
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- }
-
- if (!(flag & FKIOCTL))
- dfl_free(dfl);
-
- return (error);
- }
-
- default:
- error = SET_ERROR(ENOTTY);
- break;
-
- }
- mutex_exit(&zfsdev_state_lock);
- return (error);
-}
-#endif /* illumos */
-
-int
-zvol_busy(void)
-{
- return (zvol_minors != 0);
-}
-
-void
-zvol_init(void)
-{
- VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
- 1) == 0);
-#ifdef illumos
- mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
-#else
- ZFS_LOG(1, "ZVOL Initialized.");
-#endif
-}
-
-void
-zvol_fini(void)
-{
-#ifdef illumos
- mutex_destroy(&zfsdev_state_lock);
-#endif
- ddi_soft_state_fini(&zfsdev_state);
- ZFS_LOG(1, "ZVOL Deinitialized.");
-}
-
-#ifdef illumos
-/*ARGSUSED*/
-static int
-zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
- return (1);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-
- spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
-}
-
-static int
-zvol_dump_init(zvol_state_t *zv, boolean_t resize)
-{
- dmu_tx_t *tx;
- int error;
- objset_t *os = zv->zv_objset;
- spa_t *spa = dmu_objset_spa(os);
- vdev_t *vd = spa->spa_root_vdev;
- nvlist_t *nv = NULL;
- uint64_t version = spa_version(spa);
- uint64_t checksum, compress, refresrv, vbs, dedup;
-
- ASSERT(MUTEX_HELD(&zfsdev_state_lock));
- ASSERT(vd->vdev_ops == &vdev_root_ops);
-
- error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
- DMU_OBJECT_END);
- if (error != 0)
- return (error);
- /* wait for dmu_free_long_range to actually free the blocks */
- txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
-
- /*
- * If the pool on which the dump device is being initialized has more
- * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
- * enabled. If so, bump that feature's counter to indicate that the
- * feature is active. We also check the vdev type to handle the
- * following case:
- * # zpool create test raidz disk1 disk2 disk3
- * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
- * the raidz vdev itself has 3 children.
- */
- if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
- if (!spa_feature_is_enabled(spa,
- SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
- return (SET_ERROR(ENOTSUP));
- (void) dsl_sync_task(spa_name(spa),
- zfs_mvdev_dump_feature_check,
- zfs_mvdev_dump_activate_feature_sync, NULL,
- 2, ZFS_SPACE_CHECK_RESERVED);
- }
-
- if (!resize) {
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
- if (error == 0) {
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
- NULL);
- }
- if (error == 0) {
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
- &refresrv, NULL);
- }
- if (error == 0) {
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
- NULL);
- }
- if (version >= SPA_VERSION_DEDUP && error == 0) {
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
- }
- }
- if (error != 0)
- return (error);
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- dmu_tx_hold_bonus(tx, ZVOL_OBJ);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0) {
- dmu_tx_abort(tx);
- return (error);
- }
-
- /*
- * If we are resizing the dump device then we only need to
- * update the refreservation to match the newly updated
- * zvolsize. Otherwise, we save off the original state of the
- * zvol so that we can restore them if the zvol is ever undumpified.
- */
- if (resize) {
- error = zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
- &zv->zv_volsize, tx);
- } else {
- error = zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
- &compress, tx);
- if (error == 0) {
- error = zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
- &checksum, tx);
- }
- if (error == 0) {
- error = zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
- &refresrv, tx);
- }
- if (error == 0) {
- error = zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
- &vbs, tx);
- }
- if (error == 0) {
- error = dmu_object_set_blocksize(
- os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
- }
- if (version >= SPA_VERSION_DEDUP && error == 0) {
- error = zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
- &dedup, tx);
- }
- if (error == 0)
- zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
- }
- dmu_tx_commit(tx);
-
- /*
- * We only need update the zvol's property if we are initializing
- * the dump area for the first time.
- */
- if (error == 0 && !resize) {
- /*
- * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
- * function. Otherwise, use the old default -- OFF.
- */
- checksum = spa_feature_is_active(spa,
- SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
- ZIO_CHECKSUM_OFF;
-
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- ZIO_COMPRESS_OFF) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- checksum) == 0);
- if (version >= SPA_VERSION_DEDUP) {
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_DEDUP),
- ZIO_CHECKSUM_OFF) == 0);
- }
-
- error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
- nv, NULL);
- nvlist_free(nv);
- }
-
- /* Allocate the space for the dump */
- if (error == 0)
- error = zvol_prealloc(zv);
- return (error);
-}
-
-static int
-zvol_dumpify(zvol_state_t *zv)
-{
- int error = 0;
- uint64_t dumpsize = 0;
- dmu_tx_t *tx;
- objset_t *os = zv->zv_objset;
-
- if (zv->zv_flags & ZVOL_RDONLY)
- return (SET_ERROR(EROFS));
-
- if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
- 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
- boolean_t resize = (dumpsize > 0);
-
- if ((error = zvol_dump_init(zv, resize)) != 0) {
- (void) zvol_dump_fini(zv);
- return (error);
- }
- }
-
- /*
- * Build up our lba mapping.
- */
- error = zvol_get_lbas(zv);
- if (error) {
- (void) zvol_dump_fini(zv);
- return (error);
- }
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- (void) zvol_dump_fini(zv);
- return (error);
- }
-
- zv->zv_flags |= ZVOL_DUMPIFIED;
- error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
- &zv->zv_volsize, tx);
- dmu_tx_commit(tx);
-
- if (error) {
- (void) zvol_dump_fini(zv);
- return (error);
- }
-
- txg_wait_synced(dmu_objset_pool(os), 0);
- return (0);
-}
-
-static int
-zvol_dump_fini(zvol_state_t *zv)
-{
- dmu_tx_t *tx;
- objset_t *os = zv->zv_objset;
- nvlist_t *nv;
- int error = 0;
- uint64_t checksum, compress, refresrv, vbs, dedup;
- uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
-
- /*
- * Attempt to restore the zvol back to its pre-dumpified state.
- * This is a best-effort attempt as it's possible that not all
- * of these properties were initialized during the dumpify process
- * (i.e. error during zvol_dump_init).
- */
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
- (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
- dmu_tx_commit(tx);
-
- (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
- (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
- (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
- (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
-
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- (void) nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
- (void) nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
- (void) nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
- if (version >= SPA_VERSION_DEDUP &&
- zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
- (void) nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
- }
- (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
- nv, NULL);
- nvlist_free(nv);
-
- zvol_free_extents(zv);
- zv->zv_flags &= ~ZVOL_DUMPIFIED;
- (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
- /* wait for dmu_free_long_range to actually free the blocks */
- txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, ZVOL_OBJ);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
- if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
- zv->zv_volblocksize = vbs;
- dmu_tx_commit(tx);
-
- return (0);
-}
-#else /* !illumos */
-
-static void
-zvol_geom_run(zvol_state_t *zv)
-{
- struct g_provider *pp;
-
- pp = zv->zv_provider;
- g_error_provider(pp, 0);
-
- kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
- "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
-}
-
-static void
-zvol_geom_destroy(zvol_state_t *zv)
-{
- struct g_provider *pp;
-
- g_topology_assert();
-
- mtx_lock(&zv->zv_queue_mtx);
- zv->zv_state = 1;
- wakeup_one(&zv->zv_queue);
- while (zv->zv_state != 2)
- msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
- mtx_destroy(&zv->zv_queue_mtx);
-
- pp = zv->zv_provider;
- zv->zv_provider = NULL;
- pp->private = NULL;
- g_wither_geom(pp->geom, ENXIO);
-}
-
-static int
-zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
-{
- int count, error, flags;
-
- g_topology_assert();
-
- /*
- * To make it easier we expect either open or close, but not both
- * at the same time.
- */
- KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
- (acr <= 0 && acw <= 0 && ace <= 0),
- ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
- pp->name, acr, acw, ace));
-
- if (pp->private == NULL) {
- if (acr <= 0 && acw <= 0 && ace <= 0)
- return (0);
- return (pp->error);
- }
-
- /*
- * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
- * because GEOM already handles that and handles it a bit differently.
- * GEOM allows for multiple read/exclusive consumers and ZFS allows
- * only one exclusive consumer, no matter if it is reader or writer.
- * I like better the way GEOM works so I'll leave it for GEOM to
- * decide what to do.
- */
-
- count = acr + acw + ace;
- if (count == 0)
- return (0);
-
- flags = 0;
- if (acr != 0 || ace != 0)
- flags |= FREAD;
- if (acw != 0)
- flags |= FWRITE;
-
- g_topology_unlock();
- if (count > 0)
- error = zvol_open(pp, flags, count);
- else
- error = zvol_close(pp, flags, -count);
- g_topology_lock();
- return (error);
-}
-
-static void
-zvol_geom_start(struct bio *bp)
-{
- zvol_state_t *zv;
- boolean_t first;
-
- zv = bp->bio_to->private;
- ASSERT(zv != NULL);
- switch (bp->bio_cmd) {
- case BIO_FLUSH:
- if (!THREAD_CAN_SLEEP())
- goto enqueue;
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- g_io_deliver(bp, 0);
- break;
- case BIO_READ:
- case BIO_WRITE:
- case BIO_DELETE:
- if (!THREAD_CAN_SLEEP())
- goto enqueue;
- zvol_strategy(bp);
- break;
- case BIO_GETATTR: {
- spa_t *spa = dmu_objset_spa(zv->zv_objset);
- uint64_t refd, avail, usedobjs, availobjs, val;
-
- if (g_handleattr_int(bp, "GEOM::candelete", 1))
- return;
- if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
- dmu_objset_space(zv->zv_objset, &refd, &avail,
- &usedobjs, &availobjs);
- if (g_handleattr_off_t(bp, "blocksavail",
- avail / DEV_BSIZE))
- return;
- } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
- dmu_objset_space(zv->zv_objset, &refd, &avail,
- &usedobjs, &availobjs);
- if (g_handleattr_off_t(bp, "blocksused",
- refd / DEV_BSIZE))
- return;
- } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
- avail = metaslab_class_get_space(spa_normal_class(spa));
- avail -= metaslab_class_get_alloc(spa_normal_class(spa));
- if (g_handleattr_off_t(bp, "poolblocksavail",
- avail / DEV_BSIZE))
- return;
- } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
- refd = metaslab_class_get_alloc(spa_normal_class(spa));
- if (g_handleattr_off_t(bp, "poolblocksused",
- refd / DEV_BSIZE))
- return;
- }
- /* FALLTHROUGH */
- }
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- break;
- }
- return;
-
-enqueue:
- mtx_lock(&zv->zv_queue_mtx);
- first = (bioq_first(&zv->zv_queue) == NULL);
- bioq_insert_tail(&zv->zv_queue, bp);
- mtx_unlock(&zv->zv_queue_mtx);
- if (first)
- wakeup_one(&zv->zv_queue);
-}
-
-static void
-zvol_geom_worker(void *arg)
-{
- zvol_state_t *zv;
- struct bio *bp;
-
- thread_lock(curthread);
- sched_prio(curthread, PRIBIO);
- thread_unlock(curthread);
-
- zv = arg;
- for (;;) {
- mtx_lock(&zv->zv_queue_mtx);
- bp = bioq_takefirst(&zv->zv_queue);
- if (bp == NULL) {
- if (zv->zv_state == 1) {
- zv->zv_state = 2;
- wakeup(&zv->zv_state);
- mtx_unlock(&zv->zv_queue_mtx);
- kthread_exit();
- }
- msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
- "zvol:io", 0);
- continue;
- }
- mtx_unlock(&zv->zv_queue_mtx);
- switch (bp->bio_cmd) {
- case BIO_FLUSH:
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- g_io_deliver(bp, 0);
- break;
- case BIO_READ:
- case BIO_WRITE:
- case BIO_DELETE:
- zvol_strategy(bp);
- break;
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- break;
- }
- }
-}
-
-extern boolean_t dataset_name_hidden(const char *name);
-
-static int
-zvol_create_snapshots(objset_t *os, const char *name)
-{
- uint64_t cookie, obj;
- char *sname;
- int error, len;
-
- cookie = obj = 0;
- sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
-#if 0
- (void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
- DS_FIND_SNAPSHOTS);
-#endif
-
- for (;;) {
- len = snprintf(sname, MAXPATHLEN, "%s@", name);
- if (len >= MAXPATHLEN) {
- dmu_objset_rele(os, FTAG);
- error = ENAMETOOLONG;
- break;
- }
-
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
- error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
- sname + len, &obj, &cookie, NULL);
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
- if (error != 0) {
- if (error == ENOENT)
- error = 0;
- break;
- }
-
- error = zvol_create_minor(sname);
- if (error != 0 && error != EEXIST) {
- printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
- sname, error);
- break;
- }
- }
-
- kmem_free(sname, MAXPATHLEN);
- return (error);
-}
-
-int
-zvol_create_minors_impl(const char *name)
-{
- uint64_t cookie;
- objset_t *os;
- char *osname, *p;
- int error, len;
-
- if (dataset_name_hidden(name))
- return (0);
-
- if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
- printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
- name, error);
- return (error);
- }
- if (dmu_objset_type(os) == DMU_OST_ZVOL) {
- dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
- dsl_pool_rele(dmu_objset_pool(os), FTAG);
- error = zvol_create_minor(name);
- if (error == 0 || error == EEXIST) {
- error = zvol_create_snapshots(os, name);
- } else {
- printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
- name, error);
- }
- dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
- dsl_dataset_rele(os->os_dsl_dataset, FTAG);
- return (error);
- }
- if (dmu_objset_type(os) != DMU_OST_ZFS) {
- dmu_objset_rele(os, FTAG);
- return (0);
- }
-
- osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
- dmu_objset_rele(os, FTAG);
- kmem_free(osname, MAXPATHLEN);
- return (ENOENT);
- }
- p = osname + strlen(osname);
- len = MAXPATHLEN - (p - osname);
-
-#if 0
- /* Prefetch the datasets. */
- cookie = 0;
- while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
- if (!dataset_name_hidden(osname))
- (void) dmu_objset_prefetch(osname, NULL);
- }
-#endif
-
- cookie = 0;
- while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
- &cookie) == 0) {
- dmu_objset_rele(os, FTAG);
- (void)zvol_create_minors_impl(osname);
- if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
- printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
- name, error);
- return (error);
- }
- }
-
- dmu_objset_rele(os, FTAG);
- kmem_free(osname, MAXPATHLEN);
- return (0);
-}
-
-static void
-zvol_rename_minor(zvol_state_t *zv, const char *newname)
-{
- struct g_geom *gp;
- struct g_provider *pp;
- struct cdev *dev;
-
- ASSERT(MUTEX_HELD(&zfsdev_state_lock));
-
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- g_topology_lock();
- pp = zv->zv_provider;
- ASSERT(pp != NULL);
- gp = pp->geom;
- ASSERT(gp != NULL);
-
- zv->zv_provider = NULL;
- g_wither_provider(pp, ENXIO);
-
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
- pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
- pp->sectorsize = DEV_BSIZE;
- pp->mediasize = zv->zv_volsize;
- pp->private = zv;
- zv->zv_provider = pp;
- g_error_provider(pp, 0);
- g_topology_unlock();
- } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct make_dev_args args;
-
- if ((dev = zv->zv_dev) != NULL) {
- zv->zv_dev = NULL;
- destroy_dev(dev);
- if (zv->zv_total_opens > 0) {
- zv->zv_flags &= ~ZVOL_EXCL;
- zv->zv_total_opens = 0;
- zvol_last_close(zv);
- }
- }
-
- make_dev_args_init(&args);
- args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
- args.mda_devsw = &zvol_cdevsw;
- args.mda_cr = NULL;
- args.mda_uid = UID_ROOT;
- args.mda_gid = GID_OPERATOR;
- args.mda_mode = 0640;
- args.mda_si_drv2 = zv;
- if (make_dev_s(&args, &zv->zv_dev,
- "%s/%s", ZVOL_DRIVER, newname) == 0)
- zv->zv_dev->si_iosize_max = MAXPHYS;
- }
- strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
-}
-
-void
-zvol_rename_minors_impl(const char *oldname, const char *newname)
-{
- char name[MAXPATHLEN];
- struct g_provider *pp;
- struct g_geom *gp;
- size_t oldnamelen, newnamelen;
- zvol_state_t *zv;
- char *namebuf;
- boolean_t locked = B_FALSE;
-
- oldnamelen = strlen(oldname);
- newnamelen = strlen(newname);
-
- /* See comment in zvol_open(). */
- if (!MUTEX_HELD(&zfsdev_state_lock)) {
- mutex_enter(&zfsdev_state_lock);
- locked = B_TRUE;
- }
-
- LIST_FOREACH(zv, &all_zvols, zv_links) {
- if (strcmp(zv->zv_name, oldname) == 0) {
- zvol_rename_minor(zv, newname);
- } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
- (zv->zv_name[oldnamelen] == '/' ||
- zv->zv_name[oldnamelen] == '@')) {
- snprintf(name, sizeof(name), "%s%c%s", newname,
- zv->zv_name[oldnamelen],
- zv->zv_name + oldnamelen + 1);
- zvol_rename_minor(zv, name);
- }
- }
-
- if (locked)
- mutex_exit(&zfsdev_state_lock);
-}
-
-static zvol_task_t *
-zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2)
-{
- zvol_task_t *task;
- char *delim;
-
- task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
- task->op = op;
- delim = strchr(name1, '/');
- strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
-
- strlcpy(task->name1, name1, MAXNAMELEN);
- if (name2 != NULL)
- strlcpy(task->name2, name2, MAXNAMELEN);
-
- return (task);
-}
-
-static void
-zvol_task_free(zvol_task_t *task)
-{
- kmem_free(task, sizeof (zvol_task_t));
-}
-
-/*
- * The worker thread function performed asynchronously.
- */
-static void
-zvol_task_cb(void *param)
-{
- zvol_task_t *task = (zvol_task_t *)param;
-
- switch (task->op) {
- case ZVOL_ASYNC_CREATE_MINORS:
- (void) zvol_create_minors_impl(task->name1);
- break;
- case ZVOL_ASYNC_REMOVE_MINORS:
- zvol_remove_minors_impl(task->name1);
- break;
- case ZVOL_ASYNC_RENAME_MINORS:
- zvol_rename_minors_impl(task->name1, task->name2);
- break;
- default:
- VERIFY(0);
- break;
- }
-
- zvol_task_free(task);
-}
-
-static void
-zvol_minors_helper(spa_t *spa, zvol_async_op_t op, const char *name1,
- const char *name2)
-{
- zvol_task_t *task;
-
- if (dataset_name_hidden(name1))
- return;
- if (name2 != NULL && dataset_name_hidden(name2))
- return;
- task = zvol_task_alloc(op, name1, name2);
- (void)taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
-}
-
-void
-zvol_create_minors(spa_t *spa, const char *name)
-{
- zvol_minors_helper(spa, ZVOL_ASYNC_CREATE_MINORS, name, NULL);
-}
-
-void
-zvol_remove_minors(spa_t *spa, const char *name)
-{
- zvol_minors_helper(spa, ZVOL_ASYNC_REMOVE_MINORS, name, NULL);
-}
-
-void
-zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname)
-{
- zvol_minors_helper(spa, ZVOL_ASYNC_RENAME_MINORS, oldname, newname);
-}
-
-static int
-zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
-{
- zvol_state_t *zv = dev->si_drv2;
- int err = 0;
-
- mutex_enter(&zfsdev_state_lock);
- if (zv->zv_total_opens == 0)
- err = zvol_first_open(zv);
- if (err) {
- mutex_exit(&zfsdev_state_lock);
- return (err);
- }
- if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
- err = SET_ERROR(EROFS);
- goto out;
- }
- if (zv->zv_flags & ZVOL_EXCL) {
- err = SET_ERROR(EBUSY);
- goto out;
- }
-#ifdef FEXCL
- if (flags & FEXCL) {
- if (zv->zv_total_opens != 0) {
- err = SET_ERROR(EBUSY);
- goto out;
- }
- zv->zv_flags |= ZVOL_EXCL;
- }
-#endif
-
- zv->zv_total_opens++;
- if (flags & (FSYNC | FDSYNC)) {
- zv->zv_sync_cnt++;
- if (zv->zv_sync_cnt == 1)
- zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
- }
- mutex_exit(&zfsdev_state_lock);
- return (err);
-out:
- if (zv->zv_total_opens == 0)
- zvol_last_close(zv);
- mutex_exit(&zfsdev_state_lock);
- return (err);
-}
-
-static int
-zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
-{
- zvol_state_t *zv = dev->si_drv2;
-
- mutex_enter(&zfsdev_state_lock);
- if (zv->zv_flags & ZVOL_EXCL) {
- ASSERT(zv->zv_total_opens == 1);
- zv->zv_flags &= ~ZVOL_EXCL;
- }
-
- /*
- * If the open count is zero, this is a spurious close.
- * That indicates a bug in the kernel / DDI framework.
- */
- ASSERT(zv->zv_total_opens != 0);
-
- /*
- * You may get multiple opens, but only one close.
- */
- zv->zv_total_opens--;
- if (flags & (FSYNC | FDSYNC))
- zv->zv_sync_cnt--;
-
- if (zv->zv_total_opens == 0)
- zvol_last_close(zv);
-
- mutex_exit(&zfsdev_state_lock);
- return (0);
-}
-
-static int
-zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
-{
- zvol_state_t *zv;
- locked_range_t *lr;
- off_t offset, length;
- int i, error;
- boolean_t sync;
-
- zv = dev->si_drv2;
-
- error = 0;
- KASSERT(zv->zv_total_opens > 0,
- ("Device with zero access count in zvol_d_ioctl"));
-
- i = IOCPARM_LEN(cmd);
- switch (cmd) {
- case DIOCGSECTORSIZE:
- *(u_int *)data = DEV_BSIZE;
- break;
- case DIOCGMEDIASIZE:
- *(off_t *)data = zv->zv_volsize;
- break;
- case DIOCGFLUSH:
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- break;
- case DIOCGDELETE:
- if (!zvol_unmap_enabled)
- break;
-
- offset = ((off_t *)data)[0];
- length = ((off_t *)data)[1];
- if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
- offset < 0 || offset >= zv->zv_volsize ||
- length <= 0) {
- printf("%s: offset=%jd length=%jd\n", __func__, offset,
- length);
- error = EINVAL;
- break;
- }
-
- lr = rangelock_enter(&zv->zv_rangelock, offset, length,
- RL_WRITER);
- dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0) {
- sync = FALSE;
- dmu_tx_abort(tx);
- } else {
- sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
- zvol_log_truncate(zv, tx, offset, length, sync);
- dmu_tx_commit(tx);
- error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
- offset, length);
- }
- rangelock_exit(lr);
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- break;
- case DIOCGSTRIPESIZE:
- *(off_t *)data = zv->zv_volblocksize;
- break;
- case DIOCGSTRIPEOFFSET:
- *(off_t *)data = 0;
- break;
- case DIOCGATTR: {
- spa_t *spa = dmu_objset_spa(zv->zv_objset);
- struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
- uint64_t refd, avail, usedobjs, availobjs;
-
- if (strcmp(arg->name, "GEOM::candelete") == 0)
- arg->value.i = 1;
- else if (strcmp(arg->name, "blocksavail") == 0) {
- dmu_objset_space(zv->zv_objset, &refd, &avail,
- &usedobjs, &availobjs);
- arg->value.off = avail / DEV_BSIZE;
- } else if (strcmp(arg->name, "blocksused") == 0) {
- dmu_objset_space(zv->zv_objset, &refd, &avail,
- &usedobjs, &availobjs);
- arg->value.off = refd / DEV_BSIZE;
- } else if (strcmp(arg->name, "poolblocksavail") == 0) {
- avail = metaslab_class_get_space(spa_normal_class(spa));
- avail -= metaslab_class_get_alloc(spa_normal_class(spa));
- arg->value.off = avail / DEV_BSIZE;
- } else if (strcmp(arg->name, "poolblocksused") == 0) {
- refd = metaslab_class_get_alloc(spa_normal_class(spa));
- arg->value.off = refd / DEV_BSIZE;
- } else
- error = ENOIOCTL;
- break;
- }
- case FIOSEEKHOLE:
- case FIOSEEKDATA: {
- off_t *off = (off_t *)data;
- uint64_t noff;
- boolean_t hole;
-
- hole = (cmd == FIOSEEKHOLE);
- noff = *off;
- error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
- *off = noff;
- break;
- }
- default:
- error = ENOIOCTL;
- }
-
- return (error);
-}
-#endif /* illumos */
Index: head/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
@@ -1,438 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/callb.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/debug.h>
-#include <sys/kobj.h>
-#include <sys/systm.h> /* for delay() */
-#include <sys/taskq.h> /* For TASKQ_NAMELEN */
-#include <sys/kernel.h>
-
-#define CB_MAXNAME TASKQ_NAMELEN
-
-/*
- * The callb mechanism provides generic event scheduling/echoing.
- * A callb function is registered and called on behalf of the event.
- */
-typedef struct callb {
- struct callb *c_next; /* next in class or on freelist */
- kthread_id_t c_thread; /* ptr to caller's thread struct */
- char c_flag; /* info about the callb state */
- uchar_t c_class; /* this callb's class */
- kcondvar_t c_done_cv; /* signal callb completion */
- boolean_t (*c_func)(); /* cb function: returns true if ok */
- void *c_arg; /* arg to c_func */
- char c_name[CB_MAXNAME+1]; /* debug:max func name length */
-} callb_t;
-
-/*
- * callb c_flag bitmap definitions
- */
-#define CALLB_FREE 0x0
-#define CALLB_TAKEN 0x1
-#define CALLB_EXECUTING 0x2
-
-/*
- * Basic structure for a callb table.
- * All callbs are organized into different class groups described
- * by ct_class array.
- * The callbs within a class are single-linked and normally run by a
- * serial execution.
- */
-typedef struct callb_table {
- kmutex_t ct_lock; /* protect all callb states */
- callb_t *ct_freelist; /* free callb structures */
- int ct_busy; /* != 0 prevents additions */
- kcondvar_t ct_busy_cv; /* to wait for not busy */
- int ct_ncallb; /* num of callbs allocated */
- callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */
-} callb_table_t;
-
-int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
-
-static callb_id_t callb_add_common(boolean_t (*)(void *, int),
- void *, int, char *, kthread_id_t);
-
-static callb_table_t callb_table; /* system level callback table */
-static callb_table_t *ct = &callb_table;
-static kmutex_t callb_safe_mutex;
-callb_cpr_t callb_cprinfo_safe = {
- &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 };
-
-/*
- * Init all callb tables in the system.
- */
-void
-callb_init(void *dummy __unused)
-{
- callb_table.ct_busy = 0; /* mark table open for additions */
- mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-callb_fini(void *dummy __unused)
-{
- callb_t *cp;
- int i;
-
- mutex_enter(&ct->ct_lock);
- for (i = 0; i < 16; i++) {
- while ((cp = ct->ct_freelist) != NULL) {
- ct->ct_freelist = cp->c_next;
- ct->ct_ncallb--;
- kmem_free(cp, sizeof (callb_t));
- }
- if (ct->ct_ncallb == 0)
- break;
- /* Not all callbacks finished, waiting for the rest. */
- mutex_exit(&ct->ct_lock);
- tsleep(ct, 0, "callb", hz / 4);
- mutex_enter(&ct->ct_lock);
- }
- if (ct->ct_ncallb > 0)
- printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
- mutex_exit(&ct->ct_lock);
- mutex_destroy(&callb_safe_mutex);
- mutex_destroy(&callb_table.ct_lock);
-}
-
-/*
- * callout_add() is called to register func() be called later.
- */
-static callb_id_t
-callb_add_common(boolean_t (*func)(void *arg, int code),
- void *arg, int class, char *name, kthread_id_t t)
-{
- callb_t *cp;
-
- ASSERT(class < NCBCLASS);
-
- mutex_enter(&ct->ct_lock);
- while (ct->ct_busy)
- cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
- if ((cp = ct->ct_freelist) == NULL) {
- ct->ct_ncallb++;
- cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
- }
- ct->ct_freelist = cp->c_next;
- cp->c_thread = t;
- cp->c_func = func;
- cp->c_arg = arg;
- cp->c_class = (uchar_t)class;
- cp->c_flag |= CALLB_TAKEN;
-#ifdef DEBUG
- if (strlen(name) > CB_MAXNAME)
- cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
- "too long -- truncated to %d chars",
- name, CB_MAXNAME);
-#endif
- (void) strncpy(cp->c_name, name, CB_MAXNAME);
- cp->c_name[CB_MAXNAME] = '\0';
-
- /*
- * Insert the new callb at the head of its class list.
- */
- cp->c_next = ct->ct_first_cb[class];
- ct->ct_first_cb[class] = cp;
-
- mutex_exit(&ct->ct_lock);
- return ((callb_id_t)cp);
-}
-
-/*
- * The default function to add an entry to the callback table. Since
- * it uses curthread as the thread identifier to store in the table,
- * it should be used for the normal case of a thread which is calling
- * to add ITSELF to the table.
- */
-callb_id_t
-callb_add(boolean_t (*func)(void *arg, int code),
- void *arg, int class, char *name)
-{
- return (callb_add_common(func, arg, class, name, curthread));
-}
-
-/*
- * A special version of callb_add() above for use by threads which
- * might be adding an entry to the table on behalf of some other
- * thread (for example, one which is constructed but not yet running).
- * In this version the thread id is an argument.
- */
-callb_id_t
-callb_add_thread(boolean_t (*func)(void *arg, int code),
- void *arg, int class, char *name, kthread_id_t t)
-{
- return (callb_add_common(func, arg, class, name, t));
-}
-
-/*
- * callout_delete() is called to remove an entry identified by id
- * that was originally placed there by a call to callout_add().
- * return -1 if fail to delete a callb entry otherwise return 0.
- */
-int
-callb_delete(callb_id_t id)
-{
- callb_t **pp;
- callb_t *me = (callb_t *)id;
-
- mutex_enter(&ct->ct_lock);
-
- for (;;) {
- pp = &ct->ct_first_cb[me->c_class];
- while (*pp != NULL && *pp != me)
- pp = &(*pp)->c_next;
-
-#ifdef DEBUG
- if (*pp != me) {
- cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
- (void *)me);
- mutex_exit(&ct->ct_lock);
- return (-1);
- }
-#endif /* DEBUG */
-
- /*
- * It is not allowed to delete a callb in the middle of
- * executing otherwise, the callb_execute() will be confused.
- */
- if (!(me->c_flag & CALLB_EXECUTING))
- break;
-
- cv_wait(&me->c_done_cv, &ct->ct_lock);
- }
- /* relink the class list */
- *pp = me->c_next;
-
- /* clean up myself and return the free callb to the head of freelist */
- me->c_flag = CALLB_FREE;
- me->c_next = ct->ct_freelist;
- ct->ct_freelist = me;
-
- mutex_exit(&ct->ct_lock);
- return (0);
-}
-
-/*
- * class: indicates to execute all callbs in the same class;
- * code: optional argument for the callb functions.
- * return: = 0: success
- * != 0: ptr to string supplied when callback was registered
- */
-void *
-callb_execute_class(int class, int code)
-{
- callb_t *cp;
- void *ret = NULL;
-
- ASSERT(class < NCBCLASS);
-
- mutex_enter(&ct->ct_lock);
-
- for (cp = ct->ct_first_cb[class];
- cp != NULL && ret == 0; cp = cp->c_next) {
- while (cp->c_flag & CALLB_EXECUTING)
- cv_wait(&cp->c_done_cv, &ct->ct_lock);
- /*
- * cont if the callb is deleted while we're sleeping
- */
- if (cp->c_flag == CALLB_FREE)
- continue;
- cp->c_flag |= CALLB_EXECUTING;
-
-#ifdef CALLB_DEBUG
- printf("callb_execute: name=%s func=%p arg=%p\n",
- cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
-#endif /* CALLB_DEBUG */
-
- mutex_exit(&ct->ct_lock);
- /* If callback function fails, pass back client's name */
- if (!(*cp->c_func)(cp->c_arg, code))
- ret = cp->c_name;
- mutex_enter(&ct->ct_lock);
-
- cp->c_flag &= ~CALLB_EXECUTING;
- cv_broadcast(&cp->c_done_cv);
- }
- mutex_exit(&ct->ct_lock);
- return (ret);
-}
-
-/*
- * callers make sure no recursive entries to this func.
- * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure.
- *
- * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
- * use a cv_timedwait() in case the kernel thread is blocked.
- *
- * Note that this is a generic callback handler for daemon CPR and
- * should NOT be changed to accommodate any specific requirement in a daemon.
- * Individual daemons that require changes to the handler shall write
- * callback routines in their own daemon modules.
- */
-boolean_t
-callb_generic_cpr(void *arg, int code)
-{
- callb_cpr_t *cp = (callb_cpr_t *)arg;
- clock_t ret = 0; /* assume success */
-
- mutex_enter(cp->cc_lockp);
-
- switch (code) {
- case CB_CODE_CPR_CHKPT:
- cp->cc_events |= CALLB_CPR_START;
-#ifdef CPR_NOT_THREAD_SAFE
- while (!(cp->cc_events & CALLB_CPR_SAFE))
- /* cv_timedwait() returns -1 if it times out. */
- if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
- cp->cc_lockp, (callb_timeout_sec * hz),
- TR_CLOCK_TICK)) == -1)
- break;
-#endif
- break;
-
- case CB_CODE_CPR_RESUME:
- cp->cc_events &= ~CALLB_CPR_START;
- cv_signal(&cp->cc_stop_cv);
- break;
- }
- mutex_exit(cp->cc_lockp);
- return (ret != -1);
-}
-
-/*
- * The generic callback function associated with kernel threads which
- * are always considered safe.
- */
-/* ARGSUSED */
-boolean_t
-callb_generic_cpr_safe(void *arg, int code)
-{
- return (B_TRUE);
-}
-/*
- * Prevent additions to callback table.
- */
-void
-callb_lock_table(void)
-{
- mutex_enter(&ct->ct_lock);
- ASSERT(ct->ct_busy == 0);
- ct->ct_busy = 1;
- mutex_exit(&ct->ct_lock);
-}
-
-/*
- * Allow additions to callback table.
- */
-void
-callb_unlock_table(void)
-{
- mutex_enter(&ct->ct_lock);
- ASSERT(ct->ct_busy != 0);
- ct->ct_busy = 0;
- cv_broadcast(&ct->ct_busy_cv);
- mutex_exit(&ct->ct_lock);
-}
-
-#ifdef illumos
-/*
- * Return a boolean value indicating whether a particular kernel thread is
- * stopped in accordance with the cpr callback protocol. If returning
- * false, also return a pointer to the thread name via the 2nd argument.
- */
-boolean_t
-callb_is_stopped(kthread_id_t tp, caddr_t *thread_name)
-{
- callb_t *cp;
- boolean_t ret_val;
-
- mutex_enter(&ct->ct_lock);
-
- for (cp = ct->ct_first_cb[CB_CL_CPR_DAEMON];
- cp != NULL && tp != cp->c_thread; cp = cp->c_next)
- ;
-
- ret_val = (cp != NULL);
- if (ret_val) {
- /*
- * We found the thread in the callback table and have
- * provisionally set the return value to true. Now
- * see if it is marked "safe" and is sleeping or stopped.
- */
- callb_cpr_t *ccp = (callb_cpr_t *)cp->c_arg;
-
- *thread_name = cp->c_name; /* in case not stopped */
- mutex_enter(ccp->cc_lockp);
-
- if (ccp->cc_events & CALLB_CPR_SAFE) {
- int retry;
-
- mutex_exit(ccp->cc_lockp);
- for (retry = 0; retry < CALLB_MAX_RETRY; retry++) {
- thread_lock(tp);
- if (tp->t_state & (TS_SLEEP | TS_STOPPED)) {
- thread_unlock(tp);
- break;
- }
- thread_unlock(tp);
- delay(CALLB_THREAD_DELAY);
- }
- ret_val = retry < CALLB_MAX_RETRY;
- } else {
- ret_val =
- (ccp->cc_events & CALLB_CPR_ALWAYS_SAFE) != 0;
- mutex_exit(ccp->cc_lockp);
- }
- } else {
- /*
- * Thread not found in callback table. Make the best
- * attempt to identify the thread in the error message.
- */
- ulong_t offset;
- char *sym = kobj_getsymname((uintptr_t)tp->t_startpc,
- &offset);
-
- *thread_name = sym ? sym : "*unknown*";
- }
-
- mutex_exit(&ct->ct_lock);
- return (ret_val);
-}
-#endif /* illumos */
-
-SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
-SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
Index: head/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
@@ -1,1399 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-/*
- * Fault Management Architecture (FMA) Resource and Protocol Support
- *
- * The routines contained herein provide services to support kernel subsystems
- * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
- *
- * Name-Value Pair Lists
- *
- * The embodiment of an FMA protocol element (event, fmri or authority) is a
- * name-value pair list (nvlist_t). FMA-specific nvlist construtor and
- * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
- * to create an nvpair list using custom allocators. Callers may choose to
- * allocate either from the kernel memory allocator, or from a preallocated
- * buffer, useful in constrained contexts like high-level interrupt routines.
- *
- * Protocol Event and FMRI Construction
- *
- * Convenience routines are provided to construct nvlist events according to
- * the FMA Event Protocol and Naming Schema specification for ereports and
- * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
- *
- * ENA Manipulation
- *
- * Routines to generate ENA formats 0, 1 and 2 are available as well as
- * routines to increment formats 1 and 2. Individual fields within the
- * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
- * fm_ena_format_get() and fm_ena_gen_get().
- */
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/sysevent.h>
-#include <sys/nvpair.h>
-#include <sys/cmn_err.h>
-#include <sys/cpuvar.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/compress.h>
-#include <sys/cpuvar.h>
-#include <sys/kobj.h>
-#include <sys/kstat.h>
-#include <sys/processor.h>
-#include <sys/pcpu.h>
-#include <sys/sunddi.h>
-#include <sys/systeminfo.h>
-#include <sys/sysevent/eventdefs.h>
-#include <sys/fm/util.h>
-#include <sys/fm/protocol.h>
-
-/*
- * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These
- * values must be kept in sync with the FMA source code in usr/src/cmd/fm.
- */
-static const char *fm_url = "http://www.sun.com/msg";
-static const char *fm_msgid = "SUNOS-8000-0G";
-static char *volatile fm_panicstr = NULL;
-
-#ifdef illumos
-errorq_t *ereport_errorq;
-#endif
-void *ereport_dumpbuf;
-size_t ereport_dumplen;
-
-static uint_t ereport_chanlen = ERPT_EVCH_MAX;
-static evchan_t *ereport_chan = NULL;
-static ulong_t ereport_qlen = 0;
-static size_t ereport_size = 0;
-static int ereport_cols = 80;
-
-extern void fastreboot_disable_highpil(void);
-
-/*
- * Common fault management kstats to record ereport generation
- * failures
- */
-
-struct erpt_kstat {
- kstat_named_t erpt_dropped; /* num erpts dropped on post */
- kstat_named_t erpt_set_failed; /* num erpt set failures */
- kstat_named_t fmri_set_failed; /* num fmri set failures */
- kstat_named_t payload_set_failed; /* num payload set failures */
-};
-
-static struct erpt_kstat erpt_kstat_data = {
- { "erpt-dropped", KSTAT_DATA_UINT64 },
- { "erpt-set-failed", KSTAT_DATA_UINT64 },
- { "fmri-set-failed", KSTAT_DATA_UINT64 },
- { "payload-set-failed", KSTAT_DATA_UINT64 }
-};
-
-#ifdef illumos
-/*ARGSUSED*/
-static void
-fm_drain(void *private, void *data, errorq_elem_t *eep)
-{
- nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep);
-
- if (!KERNEL_PANICKED())
- (void) fm_ereport_post(nvl, EVCH_TRYHARD);
- else
- fm_nvprint(nvl);
-}
-#endif
-
-void
-fm_init(void)
-{
- kstat_t *ksp;
-
-#ifdef illumos
- (void) sysevent_evc_bind(FM_ERROR_CHAN,
- &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);
-
- (void) sysevent_evc_control(ereport_chan,
- EVCH_SET_CHAN_LEN, &ereport_chanlen);
-#endif
-
- if (ereport_qlen == 0)
- ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
-
- if (ereport_size == 0)
- ereport_size = ERPT_DATA_SZ;
-
-#ifdef illumos
- ereport_errorq = errorq_nvcreate("fm_ereport_queue",
- (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
- FM_ERR_PIL, ERRORQ_VITAL);
- if (ereport_errorq == NULL)
- panic("failed to create required ereport error queue");
-#endif
-
- ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP);
- ereport_dumplen = ereport_size;
-
- /* Initialize ereport allocation and generation kstats */
- ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED,
- sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
-
- if (ksp != NULL) {
- ksp->ks_data = &erpt_kstat_data;
- kstat_install(ksp);
- } else {
- cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
-
- }
-}
-
-#ifdef illumos
-/*
- * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of
- * output so they aren't split across console lines, and return the end column.
- */
-/*PRINTFLIKE4*/
-static int
-fm_printf(int depth, int c, int cols, const char *format, ...)
-{
- va_list ap;
- int width;
- char c1;
-
- va_start(ap, format);
- width = vsnprintf(&c1, sizeof (c1), format, ap);
- va_end(ap);
-
- if (c + width >= cols) {
- console_printf("\n\r");
- c = 0;
- if (format[0] != ' ' && depth > 0) {
- console_printf(" ");
- c++;
- }
- }
-
- va_start(ap, format);
- console_vprintf(format, ap);
- va_end(ap);
-
- return ((c + width) % cols);
-}
-
-/*
- * Recursively print a nvlist in the specified column width and return the
- * column we end up in. This function is called recursively by fm_nvprint(),
- * below. We generically format the entire nvpair using hexadecimal
- * integers and strings, and elide any integer arrays. Arrays are basically
- * used for cache dumps right now, so we suppress them so as not to overwhelm
- * the amount of console output we produce at panic time. This can be further
- * enhanced as FMA technology grows based upon the needs of consumers. All
- * FMA telemetry is logged using the dump device transport, so the console
- * output serves only as a fallback in case this procedure is unsuccessful.
- */
-static int
-fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
-{
- nvpair_t *nvp;
-
- for (nvp = nvlist_next_nvpair(nvl, NULL);
- nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
-
- data_type_t type = nvpair_type(nvp);
- const char *name = nvpair_name(nvp);
-
- boolean_t b;
- uint8_t i8;
- uint16_t i16;
- uint32_t i32;
- uint64_t i64;
- char *str;
- nvlist_t *cnv;
-
- if (strcmp(name, FM_CLASS) == 0)
- continue; /* already printed by caller */
-
- c = fm_printf(d, c, cols, " %s=", name);
-
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- c = fm_printf(d + 1, c, cols, " 1");
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- (void) nvpair_value_boolean_value(nvp, &b);
- c = fm_printf(d + 1, c, cols, b ? "1" : "0");
- break;
-
- case DATA_TYPE_BYTE:
- (void) nvpair_value_byte(nvp, &i8);
- c = fm_printf(d + 1, c, cols, "%x", i8);
- break;
-
- case DATA_TYPE_INT8:
- (void) nvpair_value_int8(nvp, (void *)&i8);
- c = fm_printf(d + 1, c, cols, "%x", i8);
- break;
-
- case DATA_TYPE_UINT8:
- (void) nvpair_value_uint8(nvp, &i8);
- c = fm_printf(d + 1, c, cols, "%x", i8);
- break;
-
- case DATA_TYPE_INT16:
- (void) nvpair_value_int16(nvp, (void *)&i16);
- c = fm_printf(d + 1, c, cols, "%x", i16);
- break;
-
- case DATA_TYPE_UINT16:
- (void) nvpair_value_uint16(nvp, &i16);
- c = fm_printf(d + 1, c, cols, "%x", i16);
- break;
-
- case DATA_TYPE_INT32:
- (void) nvpair_value_int32(nvp, (void *)&i32);
- c = fm_printf(d + 1, c, cols, "%x", i32);
- break;
-
- case DATA_TYPE_UINT32:
- (void) nvpair_value_uint32(nvp, &i32);
- c = fm_printf(d + 1, c, cols, "%x", i32);
- break;
-
- case DATA_TYPE_INT64:
- (void) nvpair_value_int64(nvp, (void *)&i64);
- c = fm_printf(d + 1, c, cols, "%llx",
- (u_longlong_t)i64);
- break;
-
- case DATA_TYPE_UINT64:
- (void) nvpair_value_uint64(nvp, &i64);
- c = fm_printf(d + 1, c, cols, "%llx",
- (u_longlong_t)i64);
- break;
-
- case DATA_TYPE_HRTIME:
- (void) nvpair_value_hrtime(nvp, (void *)&i64);
- c = fm_printf(d + 1, c, cols, "%llx",
- (u_longlong_t)i64);
- break;
-
- case DATA_TYPE_STRING:
- (void) nvpair_value_string(nvp, &str);
- c = fm_printf(d + 1, c, cols, "\"%s\"",
- str ? str : "<NULL>");
- break;
-
- case DATA_TYPE_NVLIST:
- c = fm_printf(d + 1, c, cols, "[");
- (void) nvpair_value_nvlist(nvp, &cnv);
- c = fm_nvprintr(cnv, d + 1, c, cols);
- c = fm_printf(d + 1, c, cols, " ]");
- break;
-
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **val;
- uint_t i, nelem;
-
- c = fm_printf(d + 1, c, cols, "[");
- (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++) {
- c = fm_nvprintr(val[i], d + 1, c, cols);
- }
- c = fm_printf(d + 1, c, cols, " ]");
- }
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_BYTE_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- case DATA_TYPE_STRING_ARRAY:
- c = fm_printf(d + 1, c, cols, "[...]");
- break;
- case DATA_TYPE_UNKNOWN:
- c = fm_printf(d + 1, c, cols, "<unknown>");
- break;
- }
- }
-
- return (c);
-}
-
-void
-fm_nvprint(nvlist_t *nvl)
-{
- char *class;
- int c = 0;
-
- console_printf("\r");
-
- if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
- c = fm_printf(0, c, ereport_cols, "%s", class);
-
- if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0)
- console_printf("\n");
-
- console_printf("\n");
-}
-
-/*
- * Wrapper for panic() that first produces an FMA-style message for admins.
- * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this
- * is the one exception to that rule and the only error that gets messaged.
- * This function is intended for use by subsystems that have detected a fatal
- * error and enqueued appropriate ereports and wish to then force a panic.
- */
-/*PRINTFLIKE1*/
-void
-fm_panic(const char *format, ...)
-{
- va_list ap;
-
- (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format);
-#if defined(__i386) || defined(__amd64)
- fastreboot_disable_highpil();
-#endif /* __i386 || __amd64 */
- va_start(ap, format);
- vpanic(format, ap);
- va_end(ap);
-}
-
-/*
- * Simply tell the caller if fm_panicstr is set, ie. an fma event has
- * caused the panic. If so, something other than the default panic
- * diagnosis method will diagnose the cause of the panic.
- */
-int
-is_fm_panic()
-{
- if (fm_panicstr)
- return (1);
- else
- return (0);
-}
-
-/*
- * Print any appropriate FMA banner message before the panic message. This
- * function is called by panicsys() and prints the message for fm_panic().
- * We print the message here so that it comes after the system is quiesced.
- * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix).
- * The rest of the message is for the console only and not needed in the log,
- * so it is printed using console_printf(). We break it up into multiple
- * chunks so as to avoid overflowing any small legacy prom_printf() buffers.
- */
-void
-fm_banner(void)
-{
- timespec_t tod;
- hrtime_t now;
-
- if (!fm_panicstr)
- return; /* panic was not initiated by fm_panic(); do nothing */
-
- if (KERNEL_PANICKED()) {
- tod = panic_hrestime;
- now = panic_hrtime;
- } else {
- gethrestime(&tod);
- now = gethrtime_waitfree();
- }
-
- cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, "
- "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid);
-
- console_printf(
-"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n"
-"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n",
- fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now);
-
- console_printf(
-"PLATFORM: %s, CSN: -, HOSTNAME: %s\n"
-"SOURCE: %s, REV: %s %s\n",
- platform, utsname.nodename, utsname.sysname,
- utsname.release, utsname.version);
-
- console_printf(
-"DESC: Errors have been detected that require a reboot to ensure system\n"
-"integrity. See %s/%s for more information.\n",
- fm_url, fm_msgid);
-
- console_printf(
-"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n"
-"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n"
-"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n");
-
- console_printf("\n");
-}
-
-/*
- * Utility function to write all of the pending ereports to the dump device.
- * This function is called at either normal reboot or panic time, and simply
- * iterates over the in-transit messages in the ereport sysevent channel.
- */
-void
-fm_ereport_dump(void)
-{
- evchanq_t *chq;
- sysevent_t *sep;
- erpt_dump_t ed;
-
- timespec_t tod;
- hrtime_t now;
- char *buf;
- size_t len;
-
- if (KERNEL_PANICKED()) {
- tod = panic_hrestime;
- now = panic_hrtime;
- } else {
- if (ereport_errorq != NULL)
- errorq_drain(ereport_errorq);
- gethrestime(&tod);
- now = gethrtime_waitfree();
- }
-
- /*
- * In the panic case, sysevent_evc_walk_init() will return NULL.
- */
- if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL &&
- !KERNEL_PANICKED())
- return; /* event channel isn't initialized yet */
-
- while ((sep = sysevent_evc_walk_step(chq)) != NULL) {
- if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL)
- break;
-
- ed.ed_magic = ERPT_MAGIC;
- ed.ed_chksum = checksum32(buf, len);
- ed.ed_size = (uint32_t)len;
- ed.ed_pad = 0;
- ed.ed_hrt_nsec = SE_TIME(sep);
- ed.ed_hrt_base = now;
- ed.ed_tod_base.sec = tod.tv_sec;
- ed.ed_tod_base.nsec = tod.tv_nsec;
-
- dumpvp_write(&ed, sizeof (ed));
- dumpvp_write(buf, len);
- }
-
- sysevent_evc_walk_fini(chq);
-}
-#endif
-
-/*
- * Post an error report (ereport) to the sysevent error channel. The error
- * channel must be established with a prior call to sysevent_evc_create()
- * before publication may occur.
- */
-void
-fm_ereport_post(nvlist_t *ereport, int evc_flag)
-{
- size_t nvl_size = 0;
- evchan_t *error_chan;
- sysevent_id_t eid;
-
- (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
- if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
- atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
- return;
- }
-
-#ifdef illumos
- if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
- EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
- atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
- return;
- }
-
- if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
- SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
- atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
- (void) sysevent_evc_unbind(error_chan);
- return;
- }
- (void) sysevent_evc_unbind(error_chan);
-#else
- (void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS,
- ESC_DEV_DLE, ereport, &eid, DDI_SLEEP);
-#endif
-}
-
-/*
- * Wrapppers for FM nvlist allocators
- */
-/* ARGSUSED */
-static void *
-i_fm_alloc(nv_alloc_t *nva, size_t size)
-{
- return (kmem_zalloc(size, KM_SLEEP));
-}
-
-/* ARGSUSED */
-static void
-i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
-{
- kmem_free(buf, size);
-}
-
-const nv_alloc_ops_t fm_mem_alloc_ops = {
- NULL,
- NULL,
- i_fm_alloc,
- i_fm_free,
- NULL
-};
-
-/*
- * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer
- * to the newly allocated nv_alloc_t structure is returned upon success or NULL
- * is returned to indicate that the nv_alloc structure could not be created.
- */
-nv_alloc_t *
-fm_nva_xcreate(char *buf, size_t bufsz)
-{
- nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
-
- if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
- kmem_free(nvhdl, sizeof (nv_alloc_t));
- return (NULL);
- }
-
- return (nvhdl);
-}
-
-/*
- * Destroy a previously allocated nv_alloc structure. The fixed buffer
- * associated with nva must be freed by the caller.
- */
-void
-fm_nva_xdestroy(nv_alloc_t *nva)
-{
- nv_alloc_fini(nva);
- kmem_free(nva, sizeof (nv_alloc_t));
-}
-
-/*
- * Create a new nv list. A pointer to a new nv list structure is returned
- * upon success or NULL is returned to indicate that the structure could
- * not be created. The newly created nv list is created and managed by the
- * operations installed in nva. If nva is NULL, the default FMA nva
- * operations are installed and used.
- *
- * When called from the kernel and nva == NULL, this function must be called
- * from passive kernel context with no locks held that can prevent a
- * sleeping memory allocation from occurring. Otherwise, this function may
- * be called from other kernel contexts as long a valid nva created via
- * fm_nva_create() is supplied.
- */
-nvlist_t *
-fm_nvlist_create(nv_alloc_t *nva)
-{
- int hdl_alloced = 0;
- nvlist_t *nvl;
- nv_alloc_t *nvhdl;
-
- if (nva == NULL) {
- nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
-
- if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
- kmem_free(nvhdl, sizeof (nv_alloc_t));
- return (NULL);
- }
- hdl_alloced = 1;
- } else {
- nvhdl = nva;
- }
-
- if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
- if (hdl_alloced) {
- nv_alloc_fini(nvhdl);
- kmem_free(nvhdl, sizeof (nv_alloc_t));
- }
- return (NULL);
- }
-
- return (nvl);
-}
-
-/*
- * Destroy a previously allocated nvlist structure. flag indicates whether
- * or not the associated nva structure should be freed (FM_NVA_FREE) or
- * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows
- * it to be re-used for future nvlist creation operations.
- */
-void
-fm_nvlist_destroy(nvlist_t *nvl, int flag)
-{
- nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);
-
- nvlist_free(nvl);
-
- if (nva != NULL) {
- if (flag == FM_NVA_FREE)
- fm_nva_xdestroy(nva);
- }
-}
-
-int
-i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
-{
- int nelem, ret = 0;
- data_type_t type;
-
- while (ret == 0 && name != NULL) {
- type = va_arg(ap, data_type_t);
- switch (type) {
- case DATA_TYPE_BYTE:
- ret = nvlist_add_byte(payload, name,
- va_arg(ap, uint_t));
- break;
- case DATA_TYPE_BYTE_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_byte_array(payload, name,
- va_arg(ap, uchar_t *), nelem);
- break;
- case DATA_TYPE_BOOLEAN_VALUE:
- ret = nvlist_add_boolean_value(payload, name,
- va_arg(ap, boolean_t));
- break;
- case DATA_TYPE_BOOLEAN_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_boolean_array(payload, name,
- va_arg(ap, boolean_t *), nelem);
- break;
- case DATA_TYPE_INT8:
- ret = nvlist_add_int8(payload, name,
- va_arg(ap, int));
- break;
- case DATA_TYPE_INT8_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_int8_array(payload, name,
- va_arg(ap, int8_t *), nelem);
- break;
- case DATA_TYPE_UINT8:
- ret = nvlist_add_uint8(payload, name,
- va_arg(ap, uint_t));
- break;
- case DATA_TYPE_UINT8_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_uint8_array(payload, name,
- va_arg(ap, uint8_t *), nelem);
- break;
- case DATA_TYPE_INT16:
- ret = nvlist_add_int16(payload, name,
- va_arg(ap, int));
- break;
- case DATA_TYPE_INT16_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_int16_array(payload, name,
- va_arg(ap, int16_t *), nelem);
- break;
- case DATA_TYPE_UINT16:
- ret = nvlist_add_uint16(payload, name,
- va_arg(ap, uint_t));
- break;
- case DATA_TYPE_UINT16_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_uint16_array(payload, name,
- va_arg(ap, uint16_t *), nelem);
- break;
- case DATA_TYPE_INT32:
- ret = nvlist_add_int32(payload, name,
- va_arg(ap, int32_t));
- break;
- case DATA_TYPE_INT32_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_int32_array(payload, name,
- va_arg(ap, int32_t *), nelem);
- break;
- case DATA_TYPE_UINT32:
- ret = nvlist_add_uint32(payload, name,
- va_arg(ap, uint32_t));
- break;
- case DATA_TYPE_UINT32_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_uint32_array(payload, name,
- va_arg(ap, uint32_t *), nelem);
- break;
- case DATA_TYPE_INT64:
- ret = nvlist_add_int64(payload, name,
- va_arg(ap, int64_t));
- break;
- case DATA_TYPE_INT64_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_int64_array(payload, name,
- va_arg(ap, int64_t *), nelem);
- break;
- case DATA_TYPE_UINT64:
- ret = nvlist_add_uint64(payload, name,
- va_arg(ap, uint64_t));
- break;
- case DATA_TYPE_UINT64_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_uint64_array(payload, name,
- va_arg(ap, uint64_t *), nelem);
- break;
- case DATA_TYPE_STRING:
- ret = nvlist_add_string(payload, name,
- va_arg(ap, char *));
- break;
- case DATA_TYPE_STRING_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_string_array(payload, name,
- va_arg(ap, char **), nelem);
- break;
- case DATA_TYPE_NVLIST:
- ret = nvlist_add_nvlist(payload, name,
- va_arg(ap, nvlist_t *));
- break;
- case DATA_TYPE_NVLIST_ARRAY:
- nelem = va_arg(ap, int);
- ret = nvlist_add_nvlist_array(payload, name,
- va_arg(ap, nvlist_t **), nelem);
- break;
- default:
- ret = EINVAL;
- }
-
- name = va_arg(ap, char *);
- }
- return (ret);
-}
-
-void
-fm_payload_set(nvlist_t *payload, ...)
-{
- int ret;
- const char *name;
- va_list ap;
-
- va_start(ap, payload);
- name = va_arg(ap, char *);
- ret = i_fm_payload_set(payload, name, ap);
- va_end(ap);
-
- if (ret)
- atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
-}
-
-/*
- * Set-up and validate the members of an ereport event according to:
- *
- * Member name Type Value
- * ====================================================
- * class string ereport
- * version uint8_t 0
- * ena uint64_t <ena>
- * detector nvlist_t <detector>
- * ereport-payload nvlist_t <var args>
- *
- * We don't actually add a 'version' member to the payload. Really,
- * the version quoted to us by our caller is that of the category 1
- * "ereport" event class (and we require FM_EREPORT_VERS0) but
- * the payload version of the actual leaf class event under construction
- * may be something else. Callers should supply a version in the varargs,
- * or (better) we could take two version arguments - one for the
- * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
- * for the leaf class.
- */
-void
-fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
- uint64_t ena, const nvlist_t *detector, ...)
-{
- char ereport_class[FM_MAX_CLASS];
- const char *name;
- va_list ap;
- int ret;
-
- if (version != FM_EREPORT_VERS0) {
- atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
- return;
- }
-
- (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
- FM_EREPORT_CLASS, erpt_class);
- if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
- atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
- return;
- }
-
- if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
- atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
- }
-
- if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
- (nvlist_t *)detector) != 0) {
- atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
- }
-
- va_start(ap, detector);
- name = va_arg(ap, const char *);
- ret = i_fm_payload_set(ereport, name, ap);
- va_end(ap);
-
- if (ret)
- atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
-}
-
-/*
- * Set-up and validate the members of an hc fmri according to;
- *
- * Member name Type Value
- * ===================================================
- * version uint8_t 0
- * auth nvlist_t <auth>
- * hc-name string <name>
- * hc-id string <id>
- *
- * Note that auth and hc-id are optional members.
- */
-
-#define HC_MAXPAIRS 20
-#define HC_MAXNAMELEN 50
-
-static int
-fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
-{
- if (version != FM_HC_SCHEME_VERSION) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return (0);
- }
-
- if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
- nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return (0);
- }
-
- if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
- (nvlist_t *)auth) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return (0);
- }
-
- return (1);
-}
-
-void
-fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
- nvlist_t *snvl, int npairs, ...)
-{
- nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
- nvlist_t *pairs[HC_MAXPAIRS];
- va_list ap;
- int i;
-
- if (!fm_fmri_hc_set_common(fmri, version, auth))
- return;
-
- npairs = MIN(npairs, HC_MAXPAIRS);
-
- va_start(ap, npairs);
- for (i = 0; i < npairs; i++) {
- const char *name = va_arg(ap, const char *);
- uint32_t id = va_arg(ap, uint32_t);
- char idstr[11];
-
- (void) snprintf(idstr, sizeof (idstr), "%u", id);
-
- pairs[i] = fm_nvlist_create(nva);
- if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
- nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- }
- }
- va_end(ap);
-
- if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
-
- for (i = 0; i < npairs; i++)
- fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
-
- if (snvl != NULL) {
- if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- }
- }
-}
-
-/*
- * Set-up and validate the members of an dev fmri according to:
- *
- * Member name Type Value
- * ====================================================
- * version uint8_t 0
- * auth nvlist_t <auth>
- * devpath string <devpath>
- * [devid] string <devid>
- * [target-port-l0id] string <target-port-lun0-id>
- *
- * Note that auth and devid are optional members.
- */
-void
-fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
- const char *devpath, const char *devid, const char *tpl0)
-{
- int err = 0;
-
- if (version != DEV_SCHEME_VERSION0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
- err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
-
- if (auth != NULL) {
- err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
- (nvlist_t *)auth);
- }
-
- err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
-
- if (devid != NULL)
- err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
-
- if (tpl0 != NULL)
- err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
-
- if (err)
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
-
-}
-
-/*
- * Set-up and validate the members of an cpu fmri according to:
- *
- * Member name Type Value
- * ====================================================
- * version uint8_t 0
- * auth nvlist_t <auth>
- * cpuid uint32_t <cpu_id>
- * cpumask uint8_t <cpu_mask>
- * serial uint64_t <serial_id>
- *
- * Note that auth, cpumask, serial are optional members.
- *
- */
-void
-fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
- uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
-{
- uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
-
- if (version < CPU_SCHEME_VERSION1) {
- atomic_inc_64(failedp);
- return;
- }
-
- if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
- atomic_inc_64(failedp);
- return;
- }
-
- if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
- FM_FMRI_SCHEME_CPU) != 0) {
- atomic_inc_64(failedp);
- return;
- }
-
- if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
- (nvlist_t *)auth) != 0)
- atomic_inc_64(failedp);
-
- if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
- atomic_inc_64(failedp);
-
- if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
- *cpu_maskp) != 0)
- atomic_inc_64(failedp);
-
- if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
- FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
- atomic_inc_64(failedp);
-}
-
-/*
- * Set-up and validate the members of a mem according to:
- *
- * Member name Type Value
- * ====================================================
- * version uint8_t 0
- * auth nvlist_t <auth> [optional]
- * unum string <unum>
- * serial string <serial> [optional*]
- * offset uint64_t <offset> [optional]
- *
- * * serial is required if offset is present
- */
-void
-fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
- const char *unum, const char *serial, uint64_t offset)
-{
- if (version != MEM_SCHEME_VERSION0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- if (!serial && (offset != (uint64_t)-1)) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- if (auth != NULL) {
- if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
- (nvlist_t *)auth) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- }
- }
-
- if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- }
-
- if (serial != NULL) {
- if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
- (char **)&serial, 1) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- }
- if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
- FM_FMRI_MEM_OFFSET, offset) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- }
- }
-}
-
-void
-fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
- uint64_t vdev_guid)
-{
- if (version != ZFS_SCHEME_VERSION0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- }
-
- if (vdev_guid != 0) {
- if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- }
- }
-}
-
-uint64_t
-fm_ena_increment(uint64_t ena)
-{
- uint64_t new_ena;
-
- switch (ENA_FORMAT(ena)) {
- case FM_ENA_FMT1:
- new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
- break;
- case FM_ENA_FMT2:
- new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
- break;
- default:
- new_ena = 0;
- }
-
- return (new_ena);
-}
-
-uint64_t
-fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
-{
- uint64_t ena = 0;
-
- switch (format) {
- case FM_ENA_FMT1:
- if (timestamp) {
- ena = (uint64_t)((format & ENA_FORMAT_MASK) |
- ((cpuid << ENA_FMT1_CPUID_SHFT) &
- ENA_FMT1_CPUID_MASK) |
- ((timestamp << ENA_FMT1_TIME_SHFT) &
- ENA_FMT1_TIME_MASK));
- } else {
- ena = (uint64_t)((format & ENA_FORMAT_MASK) |
- ((cpuid << ENA_FMT1_CPUID_SHFT) &
- ENA_FMT1_CPUID_MASK) |
- ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) &
- ENA_FMT1_TIME_MASK));
- }
- break;
- case FM_ENA_FMT2:
- ena = (uint64_t)((format & ENA_FORMAT_MASK) |
- ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
- break;
- default:
- break;
- }
-
- return (ena);
-}
-
-uint64_t
-fm_ena_generate(uint64_t timestamp, uchar_t format)
-{
- return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format));
-}
-
-uint64_t
-fm_ena_generation_get(uint64_t ena)
-{
- uint64_t gen;
-
- switch (ENA_FORMAT(ena)) {
- case FM_ENA_FMT1:
- gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
- break;
- case FM_ENA_FMT2:
- gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
- break;
- default:
- gen = 0;
- break;
- }
-
- return (gen);
-}
-
-uchar_t
-fm_ena_format_get(uint64_t ena)
-{
-
- return (ENA_FORMAT(ena));
-}
-
-uint64_t
-fm_ena_id_get(uint64_t ena)
-{
- uint64_t id;
-
- switch (ENA_FORMAT(ena)) {
- case FM_ENA_FMT1:
- id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
- break;
- case FM_ENA_FMT2:
- id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
- break;
- default:
- id = 0;
- }
-
- return (id);
-}
-
-uint64_t
-fm_ena_time_get(uint64_t ena)
-{
- uint64_t time;
-
- switch (ENA_FORMAT(ena)) {
- case FM_ENA_FMT1:
- time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
- break;
- case FM_ENA_FMT2:
- time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
- break;
- default:
- time = 0;
- }
-
- return (time);
-}
-
-#ifdef illumos
-/*
- * Convert a getpcstack() trace to symbolic name+offset, and add the resulting
- * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
- */
-void
-fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth)
-{
- int i;
- char *sym;
- ulong_t off;
- char *stkpp[FM_STK_DEPTH];
- char buf[FM_STK_DEPTH * FM_SYM_SZ];
- char *stkp = buf;
-
- for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) {
- if ((sym = kobj_getsymname(stack[i], &off)) != NULL)
- (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off);
- else
- (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]);
- stkpp[i] = stkp;
- }
-
- fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK,
- DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL);
-}
-#endif
-
-#ifdef illumos
-void
-print_msg_hwerr(ctid_t ct_id, proc_t *p)
-{
- uprintf("Killed process %d (%s) in contract id %d "
- "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
-}
-#endif
-
-void
-fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
- nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
-{
- nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
- nvlist_t *pairs[HC_MAXPAIRS];
- nvlist_t **hcl;
- uint_t n;
- int i, j;
- va_list ap;
- char *hcname, *hcid;
-
- if (!fm_fmri_hc_set_common(fmri, version, auth))
- return;
-
- /*
- * copy the bboard nvpairs to the pairs array
- */
- if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
- != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- for (i = 0; i < n; i++) {
- if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
- &hcname) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
- if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- pairs[i] = fm_nvlist_create(nva);
- if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
- nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
- for (j = 0; j <= i; j++) {
- if (pairs[j] != NULL)
- fm_nvlist_destroy(pairs[j],
- FM_NVA_RETAIN);
- }
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
- }
-
- /*
- * create the pairs from passed in pairs
- */
- npairs = MIN(npairs, HC_MAXPAIRS);
-
- va_start(ap, npairs);
- for (i = n; i < npairs + n; i++) {
- const char *name = va_arg(ap, const char *);
- uint32_t id = va_arg(ap, uint32_t);
- char idstr[11];
- (void) snprintf(idstr, sizeof (idstr), "%u", id);
- pairs[i] = fm_nvlist_create(nva);
- if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
- nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
- for (j = 0; j <= i; j++) {
- if (pairs[j] != NULL)
- fm_nvlist_destroy(pairs[j],
- FM_NVA_RETAIN);
- }
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
- }
- va_end(ap);
-
- /*
- * Create the fmri hc list
- */
- if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
- npairs + n) != 0) {
- atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
-
- for (i = 0; i < npairs + n; i++) {
- fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
- }
-
- if (snvl != NULL) {
- if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
- atomic_inc_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64);
- return;
- }
- }
-}
Index: head/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/nvpair.h>
-
-static void *
-nv_alloc_sys(nv_alloc_t *nva, size_t size)
-{
- return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg));
-}
-
-/*ARGSUSED*/
-static void
-nv_free_sys(nv_alloc_t *nva, void *buf, size_t size)
-{
- kmem_free(buf, size);
-}
-
-static const nv_alloc_ops_t system_ops = {
- NULL, /* nv_ao_init() */
- NULL, /* nv_ao_fini() */
- nv_alloc_sys, /* nv_ao_alloc() */
- nv_free_sys, /* nv_ao_free() */
- NULL /* nv_ao_reset() */
-};
-
-nv_alloc_t nv_alloc_sleep_def = {
- &system_ops,
- (void *)KM_SLEEP
-};
-
-nv_alloc_t nv_alloc_nosleep_def = {
- &system_ops,
- (void *)KM_NOSLEEP
-};
-
-nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def;
-nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def;
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
@@ -1,313 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2014 Garrett D'Amore <garrett@damore.org>
- *
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- * Copyright 2017 RackTop Systems.
- */
-
-#ifndef _SYS_ACL_H
-#define _SYS_ACL_H
-
-#include <sys/types.h>
-#include <sys/acl_impl.h>
-
-#if defined(_KERNEL)
-/*
- * When compiling OpenSolaris kernel code, this file is included instead of the
- * FreeBSD one. Include the original sys/acl.h as well.
- */
-#undef _SYS_ACL_H
-#include_next <sys/acl.h>
-#define _SYS_ACL_H
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define MAX_ACL_ENTRIES (1024) /* max entries of each type */
-typedef struct {
- int a_type; /* the type of ACL entry */
- uid_t a_id; /* the entry in -uid or gid */
- o_mode_t a_perm; /* the permission field */
-} aclent_t;
-
-typedef struct ace {
- uid_t a_who; /* uid or gid */
- uint32_t a_access_mask; /* read,write,... */
- uint16_t a_flags; /* see below */
- uint16_t a_type; /* allow or deny */
-} ace_t;
-
-#ifndef _KERNEL
-typedef struct acl_info acl_t;
-#endif
-
-/*
- * The following are Defined types for an aclent_t.
- */
-#define USER_OBJ (0x01) /* object owner */
-#define USER (0x02) /* additional users */
-#define GROUP_OBJ (0x04) /* owning group of the object */
-#define GROUP (0x08) /* additional groups */
-#define CLASS_OBJ (0x10) /* file group class and mask entry */
-#define OTHER_OBJ (0x20) /* other entry for the object */
-#define ACL_DEFAULT (0x1000) /* default flag */
-/* default object owner */
-#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ)
-/* default additional users */
-#define DEF_USER (ACL_DEFAULT | USER)
-/* default owning group */
-#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ)
-/* default additional groups */
-#define DEF_GROUP (ACL_DEFAULT | GROUP)
-/* default mask entry */
-#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ)
-/* default other entry */
-#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ)
-
-/*
- * The following are defined for ace_t.
- */
-#define ACE_READ_DATA 0x00000001
-#define ACE_LIST_DIRECTORY 0x00000001
-#define ACE_WRITE_DATA 0x00000002
-#define ACE_ADD_FILE 0x00000002
-#define ACE_APPEND_DATA 0x00000004
-#define ACE_ADD_SUBDIRECTORY 0x00000004
-#define ACE_READ_NAMED_ATTRS 0x00000008
-#define ACE_WRITE_NAMED_ATTRS 0x00000010
-#define ACE_EXECUTE 0x00000020
-#define ACE_DELETE_CHILD 0x00000040
-#define ACE_READ_ATTRIBUTES 0x00000080
-#define ACE_WRITE_ATTRIBUTES 0x00000100
-#define ACE_DELETE 0x00010000
-#define ACE_READ_ACL 0x00020000
-#define ACE_WRITE_ACL 0x00040000
-#define ACE_WRITE_OWNER 0x00080000
-#define ACE_SYNCHRONIZE 0x00100000
-
-#define ACE_FILE_INHERIT_ACE 0x0001
-#define ACE_DIRECTORY_INHERIT_ACE 0x0002
-#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004
-#define ACE_INHERIT_ONLY_ACE 0x0008
-#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010
-#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020
-#define ACE_IDENTIFIER_GROUP 0x0040
-#define ACE_INHERITED_ACE 0x0080
-#define ACE_OWNER 0x1000
-#define ACE_GROUP 0x2000
-#define ACE_EVERYONE 0x4000
-
-#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000
-#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001
-#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002
-#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003
-
-#define ACL_AUTO_INHERIT 0x0001
-#define ACL_PROTECTED 0x0002
-#define ACL_DEFAULTED 0x0004
-#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \
- ACL_DEFAULTED)
-
-#if defined(_KERNEL) || defined(_FAKE_KERNEL)
-
-/*
- * These are only applicable in a CIFS context.
- */
-#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
-#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
-#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
-#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
-#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
-#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
-#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
-#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
-#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
-#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
-#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E
-#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
-#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10
-
-#define ACE_ALL_TYPES 0x001F
-
-typedef struct ace_object {
- uid_t a_who; /* uid or gid */
- uint32_t a_access_mask; /* read,write,... */
- uint16_t a_flags; /* see below */
- uint16_t a_type; /* allow or deny */
- uint8_t a_obj_type[16]; /* obj type */
- uint8_t a_inherit_obj_type[16]; /* inherit obj */
-} ace_object_t;
-
-#endif
-
-#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
- ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
- ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \
- ACE_WRITE_OWNER|ACE_SYNCHRONIZE)
-
-#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \
- ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
-
-#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \
- ACE_READ_NAMED_ATTRS)
-
-#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \
- ACE_WRITE_NAMED_ATTRS)
-
-#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
- ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
- ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE)
-/*
- * The following flags are supported by both NFSv4 ACLs and ace_t.
- */
-#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \
- ACE_DIRECTORY_INHERIT_ACE | \
- ACE_NO_PROPAGATE_INHERIT_ACE | \
- ACE_INHERIT_ONLY_ACE | \
- ACE_INHERITED_ACE | \
- ACE_IDENTIFIER_GROUP)
-
-#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \
- ACE_IDENTIFIER_GROUP)
-#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACL_INHERITED_ACE| \
- ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
-
-/* cmd args to acl(2) for aclent_t */
-#define GETACL 1
-#define SETACL 2
-#define GETACLCNT 3
-
-/* cmd's to manipulate ace acls. */
-#define ACE_GETACL 4
-#define ACE_SETACL 5
-#define ACE_GETACLCNT 6
-
-/* minimal acl entries from GETACLCNT */
-#define MIN_ACL_ENTRIES 4
-
-#if !defined(_KERNEL)
-
-/* acl check errors */
-#define GRP_ERROR 1
-#define USER_ERROR 2
-#define OTHER_ERROR 3
-#define CLASS_ERROR 4
-#define DUPLICATE_ERROR 5
-#define MISS_ERROR 6
-#define MEM_ERROR 7
-#define ENTRY_ERROR 8
-
-
-/*
- * similar to ufs_acl.h: changed to char type for user commands (tar, cpio)
- * Attribute types
- */
-#define UFSD_FREE ('0') /* Free entry */
-#define UFSD_ACL ('1') /* Access Control Lists */
-#define UFSD_DFACL ('2') /* reserved for future use */
-#define ACE_ACL ('3') /* ace_t style acls */
-
-/*
- * flag to [f]acl_get()
- * controls whether a trivial acl should be returned.
- */
-#define ACL_NO_TRIVIAL 0x2
-
-
-/*
- * Flags to control acl_totext()
- */
-
-#define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */
-#define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */
-#define ACL_NORESOLVE 0x4 /* don't do name service lookups */
-#define ACL_SID_FMT 0x8 /* use usersid/groupsid when appropriate */
-
-/*
- * Legacy aclcheck errors for aclent_t ACLs
- */
-#define EACL_GRP_ERROR GRP_ERROR
-#define EACL_USER_ERROR USER_ERROR
-#define EACL_OTHER_ERROR OTHER_ERROR
-#define EACL_CLASS_ERROR CLASS_ERROR
-#define EACL_DUPLICATE_ERROR DUPLICATE_ERROR
-#define EACL_MISS_ERROR MISS_ERROR
-#define EACL_MEM_ERROR MEM_ERROR
-#define EACL_ENTRY_ERROR ENTRY_ERROR
-
-#define EACL_INHERIT_ERROR 9 /* invalid inherit flags */
-#define EACL_FLAGS_ERROR 10 /* unknown flag value */
-#define EACL_PERM_MASK_ERROR 11 /* unknown permission */
-#define EACL_COUNT_ERROR 12 /* invalid acl count */
-
-#define EACL_INVALID_SLOT 13 /* invalid acl slot */
-#define EACL_NO_ACL_ENTRY 14 /* Entry doesn't exist */
-#define EACL_DIFF_TYPE 15 /* acls aren't same type */
-
-#define EACL_INVALID_USER_GROUP 16 /* need user/group name */
-#define EACL_INVALID_STR 17 /* invalid acl string */
-#define EACL_FIELD_NOT_BLANK 18 /* can't have blank field */
-#define EACL_INVALID_ACCESS_TYPE 19 /* invalid access type */
-#define EACL_UNKNOWN_DATA 20 /* Unrecognized data in ACL */
-#define EACL_MISSING_FIELDS 21 /* missing fields in acl */
-
-#define EACL_INHERIT_NOTDIR 22 /* Need dir for inheritance */
-
-extern int aclcheck(aclent_t *, int, int *);
-extern int acltomode(aclent_t *, int, mode_t *);
-extern int aclfrommode(aclent_t *, int, mode_t *);
-extern int aclsort(int, int, aclent_t *);
-extern char *acltotext(aclent_t *, int);
-extern aclent_t *aclfromtext(char *, int *);
-extern void acl_free(acl_t *);
-extern int acl_get(const char *, int, acl_t **);
-extern int facl_get(int, int, acl_t **);
-extern int acl_set(const char *, acl_t *acl);
-extern int facl_set(int, acl_t *acl);
-extern int acl_strip(const char *, uid_t, gid_t, mode_t);
-extern int acl_trivial(const char *);
-extern char *acl_totext(acl_t *, int);
-extern int acl_fromtext(const char *, acl_t **);
-extern int acl_check(acl_t *, int);
-
-#else /* !defined(_KERNEL) */
-
-extern void ksort(caddr_t, int, int, int (*)(void *, void *));
-extern int cmp2acls(void *, void *);
-
-#endif /* !defined(_KERNEL) */
-
-extern int acl(const char *path, int cmd, int cnt, void *buf);
-extern int facl(int fd, int cmd, int cnt, void *buf);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ACL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
@@ -115,7 +115,6 @@
#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n)))
#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n)))
#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n)))
-#define __NORETURN __sun_attr__((__noreturn__))
#define __CONST __sun_attr__((__const__))
#define __PURE __sun_attr__((__pure__))
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
@@ -1,830 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2017 RackTop Systems.
- */
-
-#ifndef _SYS_CPUVAR_H
-#define _SYS_CPUVAR_H
-
-#include <sys/thread.h>
-#include <sys/sysinfo.h> /* has cpu_stat_t definition */
-#include <sys/disp.h>
-#include <sys/processor.h>
-#include <sys/kcpc.h> /* has kcpc_ctx_t definition */
-
-#include <sys/loadavg.h>
-#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
-#include <sys/machcpuvar.h>
-#endif
-
-#include <sys/types.h>
-#include <sys/file.h>
-#include <sys/bitmap.h>
-#include <sys/rwlock.h>
-#include <sys/msacct.h>
-#if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL) && \
- (defined(__i386) || defined(__amd64))
-#include <asm/cpuvar.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct squeue_set_s;
-
-#define CPU_CACHE_COHERENCE_SIZE 64
-
-/*
- * For fast event tracing.
- */
-struct ftrace_record;
-typedef struct ftrace_data {
- int ftd_state; /* ftrace flags */
- kmutex_t ftd_unused; /* ftrace buffer lock, unused */
- struct ftrace_record *ftd_cur; /* current record */
- struct ftrace_record *ftd_first; /* first record */
- struct ftrace_record *ftd_last; /* last record */
-} ftrace_data_t;
-
-struct cyc_cpu;
-struct nvlist;
-
-/*
- * Per-CPU data.
- *
- * Be careful adding new members: if they are not the same in all modules (e.g.
- * change size depending on a #define), CTF uniquification can fail to work
- * properly. Furthermore, this is transitive in that it applies recursively to
- * all types pointed to by cpu_t.
- */
-typedef struct cpu {
- processorid_t cpu_id; /* CPU number */
- processorid_t cpu_seqid; /* sequential CPU id (0..ncpus-1) */
- volatile cpu_flag_t cpu_flags; /* flags indicating CPU state */
- struct cpu *cpu_self; /* pointer to itself */
- kthread_t *cpu_thread; /* current thread */
- kthread_t *cpu_idle_thread; /* idle thread for this CPU */
- kthread_t *cpu_pause_thread; /* pause thread for this CPU */
- klwp_id_t cpu_lwp; /* current lwp (if any) */
- klwp_id_t cpu_fpowner; /* currently loaded fpu owner */
- struct cpupart *cpu_part; /* partition with this CPU */
- struct lgrp_ld *cpu_lpl; /* pointer to this cpu's load */
- int cpu_cache_offset; /* see kmem.c for details */
-
- /*
- * Links to other CPUs. It is safe to walk these lists if
- * one of the following is true:
- * - cpu_lock held
- * - preemption disabled via kpreempt_disable
- * - PIL >= DISP_LEVEL
- * - acting thread is an interrupt thread
- * - all other CPUs are paused
- */
- struct cpu *cpu_next; /* next existing CPU */
- struct cpu *cpu_prev; /* prev existing CPU */
- struct cpu *cpu_next_onln; /* next online (enabled) CPU */
- struct cpu *cpu_prev_onln; /* prev online (enabled) CPU */
- struct cpu *cpu_next_part; /* next CPU in partition */
- struct cpu *cpu_prev_part; /* prev CPU in partition */
- struct cpu *cpu_next_lgrp; /* next CPU in latency group */
- struct cpu *cpu_prev_lgrp; /* prev CPU in latency group */
- struct cpu *cpu_next_lpl; /* next CPU in lgrp partition */
- struct cpu *cpu_prev_lpl;
-
- struct cpu_pg *cpu_pg; /* cpu's processor groups */
-
- void *cpu_reserved[4]; /* reserved for future use */
-
- /*
- * Scheduling variables.
- */
- disp_t *cpu_disp; /* dispatch queue data */
- /*
- * Note that cpu_disp is set before the CPU is added to the system
- * and is never modified. Hence, no additional locking is needed
- * beyond what's necessary to access the cpu_t structure.
- */
- char cpu_runrun; /* scheduling flag - set to preempt */
- char cpu_kprunrun; /* force kernel preemption */
- pri_t cpu_chosen_level; /* priority at which cpu */
- /* was chosen for scheduling */
- kthread_t *cpu_dispthread; /* thread selected for dispatch */
- disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */
- uint8_t cpu_disp_flags; /* flags used by dispatcher */
- /*
- * The following field is updated when ever the cpu_dispthread
- * changes. Also in places, where the current thread(cpu_dispthread)
- * priority changes. This is used in disp_lowpri_cpu()
- */
- pri_t cpu_dispatch_pri; /* priority of cpu_dispthread */
- clock_t cpu_last_swtch; /* last time switched to new thread */
-
- /*
- * Interrupt data.
- */
- caddr_t cpu_intr_stack; /* interrupt stack */
- kthread_t *cpu_intr_thread; /* interrupt thread list */
- uint_t cpu_intr_actv; /* interrupt levels active (bitmask) */
- int cpu_base_spl; /* priority for highest rupt active */
-
- /*
- * Statistics.
- */
- cpu_stats_t cpu_stats; /* per-CPU statistics */
- struct kstat *cpu_info_kstat; /* kstat for cpu info */
-
- uintptr_t cpu_profile_pc; /* kernel PC in profile interrupt */
- uintptr_t cpu_profile_upc; /* user PC in profile interrupt */
- uintptr_t cpu_profile_pil; /* PIL when profile interrupted */
-
- ftrace_data_t cpu_ftrace; /* per cpu ftrace data */
-
- clock_t cpu_deadman_counter; /* used by deadman() */
- uint_t cpu_deadman_countdown; /* used by deadman() */
-
- kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */
- kcpc_ctx_t *cpu_cpc_ctx; /* performance counter context */
-
- /*
- * Configuration information for the processor_info system call.
- */
- processor_info_t cpu_type_info; /* config info */
- time_t cpu_state_begin; /* when CPU entered current state */
- char cpu_cpr_flags; /* CPR related info */
- struct cyc_cpu *cpu_cyclic; /* per cpu cyclic subsystem data */
- struct squeue_set_s *cpu_squeue_set; /* per cpu squeue set */
- struct nvlist *cpu_props; /* pool-related properties */
-
- krwlock_t cpu_ft_lock; /* DTrace: fasttrap lock */
- uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */
- hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */
- hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */
- volatile uint16_t cpu_mstate; /* cpu microstate */
- volatile uint16_t cpu_mstate_gen; /* generation counter */
- volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */
- volatile hrtime_t cpu_acct[NCMSTATES]; /* cpu microstate data */
- hrtime_t cpu_intracct[NCMSTATES]; /* interrupt mstate data */
- hrtime_t cpu_waitrq; /* cpu run-queue wait time */
- struct loadavg_s cpu_loadavg; /* loadavg info for this cpu */
-
- char *cpu_idstr; /* for printing and debugging */
- char *cpu_brandstr; /* for printing */
-
- /*
- * Sum of all device interrupt weights that are currently directed at
- * this cpu. Cleared at start of interrupt redistribution.
- */
- int32_t cpu_intr_weight;
- void *cpu_vm_data;
-
- struct cpu_physid *cpu_physid; /* physical associations */
-
- uint64_t cpu_curr_clock; /* current clock freq in Hz */
- char *cpu_supp_freqs; /* supported freqs in Hz */
-
- uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */
- uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */
-
- /*
- * Interrupt load factor used by dispatcher & softcall
- */
- hrtime_t cpu_intrlast; /* total interrupt time (nsec) */
- int cpu_intrload; /* interrupt load factor (0-99%) */
-
- uint_t cpu_rotor; /* for cheap pseudo-random numbers */
-
- struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */
-
- /*
- * cpu_generation is updated whenever CPU goes on-line or off-line.
- * Updates to cpu_generation are protected by cpu_lock.
- *
- * See CPU_NEW_GENERATION() macro below.
- */
- volatile uint_t cpu_generation; /* tracking on/off-line */
-
- /*
- * New members must be added /before/ this member, as the CTF tools
- * rely on this being the last field before cpu_m, so they can
- * correctly calculate the offset when synthetically adding the cpu_m
- * member in objects that do not have it. This fixup is required for
- * uniquification to work correctly.
- */
- uintptr_t cpu_m_pad;
-
-#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
- struct machcpu cpu_m; /* per architecture info */
-#endif
-} cpu_t;
-
-/*
- * The cpu_core structure consists of per-CPU state available in any context.
- * On some architectures, this may mean that the page(s) containing the
- * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
- * is up to the platform to assure that this is performed properly. Note that
- * the structure is sized to avoid false sharing.
- */
-#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \
- sizeof (uintptr_t) + sizeof (kmutex_t))
-#define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE
-
-typedef struct cpu_core {
- uint16_t cpuc_dtrace_flags; /* DTrace flags */
- uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */
- uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */
- uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */
- kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */
-} cpu_core_t;
-
-#ifdef _KERNEL
-extern cpu_core_t cpu_core[];
-#endif /* _KERNEL */
-
-/*
- * CPU_ON_INTR() macro. Returns non-zero if currently on interrupt stack.
- * Note that this isn't a test for a high PIL. For example, cpu_intr_actv
- * does not get updated when we go through sys_trap from TL>0 at high PIL.
- * getpil() should be used instead to check for PIL levels.
- */
-#define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1))
-
-/*
- * Check to see if an interrupt thread might be active at a given ipl.
- * If so return true.
- * We must be conservative--it is ok to give a false yes, but a false no
- * will cause disaster. (But if the situation changes after we check it is
- * ok--the caller is trying to ensure that an interrupt routine has been
- * exited).
- * This is used when trying to remove an interrupt handler from an autovector
- * list in avintr.c.
- */
-#define INTR_ACTIVE(cpup, level) \
- ((level) <= LOCK_LEVEL ? \
- ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup)))
-
-/*
- * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one
- * looks at it. It's meant as a cheap mechanism to be incorporated in routines
- * wanting to avoid biasing, but where true randomness isn't needed (just
- * something that changes).
- */
-#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++)
-
-#if defined(_KERNEL) || defined(_KMEMUSER)
-
-#define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE)
-
-/* MEMBERS PROTECTED BY "atomicity": cpu_flags */
-
-/*
- * Flags in the CPU structure.
- *
- * These are protected by cpu_lock (except during creation).
- *
- * Offlined-CPUs have three stages of being offline:
- *
- * CPU_ENABLE indicates that the CPU is participating in I/O interrupts
- * that can be directed at a number of different CPUs. If CPU_ENABLE
- * is off, the CPU will not be given interrupts that can be sent elsewhere,
- * but will still get interrupts from devices associated with that CPU only,
- * and from other CPUs.
- *
- * CPU_OFFLINE indicates that the dispatcher should not allow any threads
- * other than interrupt threads to run on that CPU. A CPU will not have
- * CPU_OFFLINE set if there are any bound threads (besides interrupts).
- *
- * CPU_QUIESCED is set if p_offline was able to completely turn idle the
- * CPU and it will not have to run interrupt threads. In this case it'll
- * stay in the idle loop until CPU_QUIESCED is turned off.
- *
- * CPU_FROZEN is used only by CPR to mark CPUs that have been successfully
- * suspended (in the suspend path), or have yet to be resumed (in the resume
- * case).
- *
- * On some platforms CPUs can be individually powered off.
- * The following flags are set for powered off CPUs: CPU_QUIESCED,
- * CPU_OFFLINE, and CPU_POWEROFF. The following flags are cleared:
- * CPU_RUNNING, CPU_READY, CPU_EXISTS, and CPU_ENABLE.
- */
-#define CPU_RUNNING 0x001 /* CPU running */
-#define CPU_READY 0x002 /* CPU ready for cross-calls */
-#define CPU_QUIESCED 0x004 /* CPU will stay in idle */
-#define CPU_EXISTS 0x008 /* CPU is configured */
-#define CPU_ENABLE 0x010 /* CPU enabled for interrupts */
-#define CPU_OFFLINE 0x020 /* CPU offline via p_online */
-#define CPU_POWEROFF 0x040 /* CPU is powered off */
-#define CPU_FROZEN 0x080 /* CPU is frozen via CPR suspend */
-#define CPU_SPARE 0x100 /* CPU offline available for use */
-#define CPU_FAULTED 0x200 /* CPU offline diagnosed faulty */
-
-#define FMT_CPU_FLAGS \
- "\20\12fault\11spare\10frozen" \
- "\7poweroff\6offline\5enable\4exist\3quiesced\2ready\1run"
-
-#define CPU_ACTIVE(cpu) (((cpu)->cpu_flags & CPU_OFFLINE) == 0)
-
-/*
- * Flags for cpu_offline(), cpu_faulted(), and cpu_spare().
- */
-#define CPU_FORCED 0x0001 /* Force CPU offline */
-
-/*
- * DTrace flags.
- */
-#define CPU_DTRACE_NOFAULT 0x0001 /* Don't fault */
-#define CPU_DTRACE_DROP 0x0002 /* Drop this ECB */
-#define CPU_DTRACE_BADADDR 0x0004 /* DTrace fault: bad address */
-#define CPU_DTRACE_BADALIGN 0x0008 /* DTrace fault: bad alignment */
-#define CPU_DTRACE_DIVZERO 0x0010 /* DTrace fault: divide by zero */
-#define CPU_DTRACE_ILLOP 0x0020 /* DTrace fault: illegal operation */
-#define CPU_DTRACE_NOSCRATCH 0x0040 /* DTrace fault: out of scratch */
-#define CPU_DTRACE_KPRIV 0x0080 /* DTrace fault: bad kernel access */
-#define CPU_DTRACE_UPRIV 0x0100 /* DTrace fault: bad user access */
-#define CPU_DTRACE_TUPOFLOW 0x0200 /* DTrace fault: tuple stack overflow */
-#if defined(__sparc)
-#define CPU_DTRACE_FAKERESTORE 0x0400 /* pid provider hint to getreg */
-#endif
-#define CPU_DTRACE_ENTRY 0x0800 /* pid provider hint to ustack() */
-#define CPU_DTRACE_BADSTACK 0x1000 /* DTrace fault: bad stack */
-
-#define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \
- CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \
- CPU_DTRACE_NOSCRATCH | CPU_DTRACE_KPRIV | \
- CPU_DTRACE_UPRIV | CPU_DTRACE_TUPOFLOW | \
- CPU_DTRACE_BADSTACK)
-#define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP)
-
-/*
- * Dispatcher flags
- * These flags must be changed only by the current CPU.
- */
-#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */
-#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */
-
-#endif /* _KERNEL || _KMEMUSER */
-
-#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
-
-/*
- * Macros for manipulating sets of CPUs as a bitmap. Note that this
- * bitmap may vary in size depending on the maximum CPU id a specific
- * platform supports. This may be different than the number of CPUs
- * the platform supports, since CPU ids can be sparse. We define two
- * sets of macros; one for platforms where the maximum CPU id is less
- * than the number of bits in a single word (32 in a 32-bit kernel,
- * 64 in a 64-bit kernel), and one for platforms that require bitmaps
- * of more than one word.
- */
-
-#define CPUSET_WORDS BT_BITOUL(NCPU)
-#define CPUSET_NOTINSET ((uint_t)-1)
-
-#if CPUSET_WORDS > 1
-
-typedef struct cpuset {
- ulong_t cpub[CPUSET_WORDS];
-} cpuset_t;
-
-/*
- * Private functions for manipulating cpusets that do not fit in a
- * single word. These should not be used directly; instead the
- * CPUSET_* macros should be used so the code will be portable
- * across different definitions of NCPU.
- */
-extern void cpuset_all(cpuset_t *);
-extern void cpuset_all_but(cpuset_t *, uint_t);
-extern int cpuset_isnull(cpuset_t *);
-extern int cpuset_cmp(cpuset_t *, cpuset_t *);
-extern void cpuset_only(cpuset_t *, uint_t);
-extern uint_t cpuset_find(cpuset_t *);
-extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *);
-
-#define CPUSET_ALL(set) cpuset_all(&(set))
-#define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu)
-#define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu)
-#define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu)
-#define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu)
-#define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu)
-#define CPUSET_ISNULL(set) cpuset_isnull(&(set))
-#define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2))
-
-/*
- * Find one CPU in the cpuset.
- * Sets "cpu" to the id of the found CPU, or CPUSET_NOTINSET if no cpu
- * could be found. (i.e. empty set)
- */
-#define CPUSET_FIND(set, cpu) { \
- cpu = cpuset_find(&(set)); \
-}
-
-/*
- * Determine the smallest and largest CPU id in the set. Returns
- * CPUSET_NOTINSET in smallest and largest when set is empty.
- */
-#define CPUSET_BOUNDS(set, smallest, largest) { \
- cpuset_bounds(&(set), &(smallest), &(largest)); \
-}
-
-/*
- * Atomic cpuset operations
- * These are safe to use for concurrent cpuset manipulations.
- * "xdel" and "xadd" are exclusive operations, that set "result" to "0"
- * if the add or del was successful, or "-1" if not successful.
- * (e.g. attempting to add a cpu to a cpuset that's already there, or
- * deleting a cpu that's not in the cpuset)
- */
-
-#define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu))
-#define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu))
-
-#define CPUSET_ATOMIC_XADD(set, cpu, result) \
- BT_ATOMIC_SET_EXCL((set).cpub, cpu, result)
-
-#define CPUSET_ATOMIC_XDEL(set, cpu, result) \
- BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result)
-
-
-#define CPUSET_OR(set1, set2) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set1).cpub[_i] |= (set2).cpub[_i]; \
-}
-
-#define CPUSET_XOR(set1, set2) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set1).cpub[_i] ^= (set2).cpub[_i]; \
-}
-
-#define CPUSET_AND(set1, set2) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set1).cpub[_i] &= (set2).cpub[_i]; \
-}
-
-#define CPUSET_ZERO(set) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set).cpub[_i] = 0; \
-}
-
-#elif CPUSET_WORDS == 1
-
-typedef ulong_t cpuset_t; /* a set of CPUs */
-
-#define CPUSET(cpu) (1UL << (cpu))
-
-#define CPUSET_ALL(set) ((void)((set) = ~0UL))
-#define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu)))
-#define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu)))
-#define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu))
-#define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu)))
-#define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu)))
-#define CPUSET_ISNULL(set) ((set) == 0)
-#define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2))
-#define CPUSET_OR(set1, set2) ((void)((set1) |= (set2)))
-#define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2)))
-#define CPUSET_AND(set1, set2) ((void)((set1) &= (set2)))
-#define CPUSET_ZERO(set) ((void)((set) = 0))
-
-#define CPUSET_FIND(set, cpu) { \
- cpu = (uint_t)(lowbit(set) - 1); \
-}
-
-#define CPUSET_BOUNDS(set, smallest, largest) { \
- smallest = (uint_t)(lowbit(set) - 1); \
- largest = (uint_t)(highbit(set) - 1); \
-}
-
-#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu))
-#define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu))
-
-#define CPUSET_ATOMIC_XADD(set, cpu, result) \
- { result = atomic_set_long_excl(&(set), (cpu)); }
-
-#define CPUSET_ATOMIC_XDEL(set, cpu, result) \
- { result = atomic_clear_long_excl(&(set), (cpu)); }
-
-#else /* CPUSET_WORDS <= 0 */
-
-#error NCPU is undefined or invalid
-
-#endif /* CPUSET_WORDS */
-
-extern cpuset_t cpu_seqid_inuse;
-
-#endif /* (_KERNEL || _KMEMUSER) && _MACHDEP */
-
-#define CPU_CPR_OFFLINE 0x0
-#define CPU_CPR_ONLINE 0x1
-#define CPU_CPR_IS_OFFLINE(cpu) (((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) == 0)
-#define CPU_CPR_IS_ONLINE(cpu) ((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE)
-#define CPU_SET_CPR_FLAGS(cpu, flag) ((cpu)->cpu_cpr_flags |= flag)
-
-#if defined(_KERNEL) || defined(_KMEMUSER)
-
-extern struct cpu *cpu[]; /* indexed by CPU number */
-extern struct cpu **cpu_seq; /* indexed by sequential CPU id */
-extern cpu_t *cpu_list; /* list of CPUs */
-extern cpu_t *cpu_active; /* list of active CPUs */
-extern int ncpus; /* number of CPUs present */
-extern int ncpus_online; /* number of CPUs not quiesced */
-extern int max_ncpus; /* max present before ncpus is known */
-extern int boot_max_ncpus; /* like max_ncpus but for real */
-extern int boot_ncpus; /* # cpus present @ boot */
-extern processorid_t max_cpuid; /* maximum CPU number */
-extern struct cpu *cpu_inmotion; /* offline or partition move target */
-extern cpu_t *clock_cpu_list;
-extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */
-
-#if defined(__i386) || defined(__amd64)
-extern struct cpu *curcpup(void);
-#define CPU (curcpup()) /* Pointer to current CPU */
-#else
-#define CPU (curthread->t_cpu) /* Pointer to current CPU */
-#endif
-
-/*
- * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id
- * as the target and to grab cpu_lock instead of requiring the caller
- * to grab it.
- */
-#define CPU_CURRENT -3
-
-/*
- * Per-CPU statistics
- *
- * cpu_stats_t contains numerous system and VM-related statistics, in the form
- * of gauges or monotonically-increasing event occurrence counts.
- */
-
-#define CPU_STATS_ENTER_K() kpreempt_disable()
-#define CPU_STATS_EXIT_K() kpreempt_enable()
-
-#define CPU_STATS_ADD_K(class, stat, amount) \
- { kpreempt_disable(); /* keep from switching CPUs */\
- CPU_STATS_ADDQ(CPU, class, stat, amount); \
- kpreempt_enable(); \
- }
-
-#define CPU_STATS_ADDQ(cp, class, stat, amount) { \
- extern void __dtrace_probe___cpu_##class##info_##stat(uint_t, \
- uint64_t *, cpu_t *); \
- uint64_t *stataddr = &((cp)->cpu_stats.class.stat); \
- __dtrace_probe___cpu_##class##info_##stat((amount), \
- stataddr, cp); \
- *(stataddr) += (amount); \
-}
-
-#define CPU_STATS(cp, stat) \
- ((cp)->cpu_stats.stat)
-
-/*
- * Increment CPU generation value.
- * This macro should be called whenever CPU goes on-line or off-line.
- * Updates to cpu_generation should be protected by cpu_lock.
- */
-#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++)
-
-#endif /* _KERNEL || _KMEMUSER */
-
-/*
- * CPU support routines (not for genassym.c)
- */
-#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && defined(__STDC__)
-
-struct zone;
-
-void cpu_list_init(cpu_t *);
-void cpu_add_unit(cpu_t *);
-void cpu_del_unit(int cpuid);
-void cpu_add_active(cpu_t *);
-void cpu_kstat_init(cpu_t *);
-void cpu_visibility_add(cpu_t *, struct zone *);
-void cpu_visibility_remove(cpu_t *, struct zone *);
-void cpu_visibility_configure(cpu_t *, struct zone *);
-void cpu_visibility_unconfigure(cpu_t *, struct zone *);
-void cpu_visibility_online(cpu_t *, struct zone *);
-void cpu_visibility_offline(cpu_t *, struct zone *);
-void cpu_create_intrstat(cpu_t *);
-void cpu_delete_intrstat(cpu_t *);
-int cpu_kstat_intrstat_update(kstat_t *, int);
-void cpu_intr_swtch_enter(kthread_t *);
-void cpu_intr_swtch_exit(kthread_t *);
-
-void mbox_lock_init(void); /* initialize cross-call locks */
-void mbox_init(int cpun); /* initialize cross-calls */
-void poke_cpu(int cpun); /* interrupt another CPU (to preempt) */
-
-/*
- * values for safe_list. Pause state that CPUs are in.
- */
-#define PAUSE_IDLE 0 /* normal state */
-#define PAUSE_READY 1 /* paused thread ready to spl */
-#define PAUSE_WAIT 2 /* paused thread is spl-ed high */
-#define PAUSE_DIE 3 /* tell pause thread to leave */
-#define PAUSE_DEAD 4 /* pause thread has left */
-
-void mach_cpu_pause(volatile char *);
-
-void pause_cpus(cpu_t *off_cp, void *(*func)(void *));
-void start_cpus(void);
-int cpus_paused(void);
-
-void cpu_pause_init(void);
-cpu_t *cpu_get(processorid_t cpun); /* get the CPU struct associated */
-
-int cpu_online(cpu_t *cp); /* take cpu online */
-int cpu_offline(cpu_t *cp, int flags); /* take cpu offline */
-int cpu_spare(cpu_t *cp, int flags); /* take cpu to spare */
-int cpu_faulted(cpu_t *cp, int flags); /* take cpu to faulted */
-int cpu_poweron(cpu_t *cp); /* take powered-off cpu to offline */
-int cpu_poweroff(cpu_t *cp); /* take offline cpu to powered-off */
-
-cpu_t *cpu_intr_next(cpu_t *cp); /* get next online CPU taking intrs */
-int cpu_intr_count(cpu_t *cp); /* count # of CPUs handling intrs */
-int cpu_intr_on(cpu_t *cp); /* CPU taking I/O interrupts? */
-void cpu_intr_enable(cpu_t *cp); /* enable I/O interrupts */
-int cpu_intr_disable(cpu_t *cp); /* disable I/O interrupts */
-void cpu_intr_alloc(cpu_t *cp, int n); /* allocate interrupt threads */
-
-/*
- * Routines for checking CPU states.
- */
-int cpu_is_online(cpu_t *); /* check if CPU is online */
-int cpu_is_nointr(cpu_t *); /* check if CPU can service intrs */
-int cpu_is_active(cpu_t *); /* check if CPU can run threads */
-int cpu_is_offline(cpu_t *); /* check if CPU is offline */
-int cpu_is_poweredoff(cpu_t *); /* check if CPU is powered off */
-
-int cpu_flagged_online(cpu_flag_t); /* flags show CPU is online */
-int cpu_flagged_nointr(cpu_flag_t); /* flags show CPU not handling intrs */
-int cpu_flagged_active(cpu_flag_t); /* flags show CPU scheduling threads */
-int cpu_flagged_offline(cpu_flag_t); /* flags show CPU is offline */
-int cpu_flagged_poweredoff(cpu_flag_t); /* flags show CPU is powered off */
-
-/*
- * The processor_info(2) state of a CPU is a simplified representation suitable
- * for use by an application program. Kernel subsystems should utilize the
- * internal per-CPU state as given by the cpu_flags member of the cpu structure,
- * as this information may include platform- or architecture-specific state
- * critical to a subsystem's disposition of a particular CPU.
- */
-void cpu_set_state(cpu_t *); /* record/timestamp current state */
-int cpu_get_state(cpu_t *); /* get current cpu state */
-const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */
-
-
-void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */
-void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */
- /* frequencies */
-
-int cpu_configure(int);
-int cpu_unconfigure(int);
-void cpu_destroy_bound_threads(cpu_t *cp);
-
-extern int cpu_bind_thread(kthread_t *tp, processorid_t bind,
- processorid_t *obind, int *error);
-extern int cpu_unbind(processorid_t cpu_id, boolean_t force);
-extern void thread_affinity_set(kthread_t *t, int cpu_id);
-extern void thread_affinity_clear(kthread_t *t);
-extern void affinity_set(int cpu_id);
-extern void affinity_clear(void);
-extern void init_cpu_mstate(struct cpu *, int);
-extern void term_cpu_mstate(struct cpu *);
-extern void new_cpu_mstate(int, hrtime_t);
-extern void get_cpu_mstate(struct cpu *, hrtime_t *);
-extern void thread_nomigrate(void);
-extern void thread_allowmigrate(void);
-extern void weakbinding_stop(void);
-extern void weakbinding_start(void);
-
-/*
- * The following routines affect the CPUs participation in interrupt processing,
- * if that is applicable on the architecture. This only affects interrupts
- * which aren't directed at the processor (not cross calls).
- *
- * cpu_disable_intr returns non-zero if interrupts were previously enabled.
- */
-int cpu_disable_intr(struct cpu *cp); /* stop issuing interrupts to cpu */
-void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */
-
-/*
- * The mutex cpu_lock protects cpu_flags for all CPUs, as well as the ncpus
- * and ncpus_online counts.
- */
-extern kmutex_t cpu_lock; /* lock protecting CPU data */
-
-/*
- * CPU state change events
- *
- * Various subsystems need to know when CPUs change their state. They get this
- * information by registering CPU state change callbacks using
- * register_cpu_setup_func(). Whenever any CPU changes its state, the callback
- * function is called. The callback function is passed three arguments:
- *
- * Event, described by cpu_setup_t
- * CPU ID
- * Transparent pointer passed when registering the callback
- *
- * The callback function is called with cpu_lock held. The return value from the
- * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG
- * events. For these two events, non-zero return value indicates a failure and
- * prevents successful completion of the operation.
- *
- * New events may be added in the future. Callback functions should ignore any
- * events that they do not understand.
- *
- * The following events provide notification callbacks:
- *
- * CPU_INIT A new CPU is started and added to the list of active CPUs
- * This event is only used during boot
- *
- * CPU_CONFIG A newly inserted CPU is prepared for starting running code
- * This event is called by DR code
- *
- * CPU_UNCONFIG CPU has been powered off and needs cleanup
- * This event is called by DR code
- *
- * CPU_ON CPU is enabled but does not run anything yet
- *
- * CPU_INTR_ON CPU is enabled and has interrupts enabled
- *
- * CPU_OFF CPU is going offline but can still run threads
- *
- * CPU_CPUPART_OUT CPU is going to move out of its partition
- *
- * CPU_CPUPART_IN CPU is going to move to a new partition
- *
- * CPU_SETUP CPU is set up during boot and can run threads
- */
-typedef enum {
- CPU_INIT,
- CPU_CONFIG,
- CPU_UNCONFIG,
- CPU_ON,
- CPU_OFF,
- CPU_CPUPART_IN,
- CPU_CPUPART_OUT,
- CPU_SETUP,
- CPU_INTR_ON
-} cpu_setup_t;
-
-typedef int cpu_setup_func_t(cpu_setup_t, int, void *);
-
-/*
- * Routines used to register interest in cpu's being added to or removed
- * from the system.
- */
-extern void register_cpu_setup_func(cpu_setup_func_t *, void *);
-extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *);
-extern void cpu_state_change_notify(int, cpu_setup_t);
-
-/*
- * Call specified function on the given CPU
- */
-typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t);
-extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t);
-
-
-/*
- * Create various strings that describe the given CPU for the
- * processor_info system call and configuration-related kstats.
- */
-#define CPU_IDSTRLEN 100
-
-extern void init_cpu_info(struct cpu *);
-extern void populate_idstr(struct cpu *);
-extern void cpu_vm_data_init(struct cpu *);
-extern void cpu_vm_data_destroy(struct cpu *);
-
-#endif /* _KERNEL || _FAKE_KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_CPUVAR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
@@ -50,25 +50,22 @@
#ifndef _ASM
#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/endian.h>
+#endif
#include <sys/modctl.h>
#include <sys/processor.h>
-#ifdef illumos
-#include <sys/systm.h>
-#else
#include <sys/cpuvar.h>
#include <sys/param.h>
#include <sys/linker.h>
#include <sys/ioccom.h>
+#include <sys/cred.h>
+#include <sys/proc.h>
+#include <sys/types.h>
#include <sys/ucred.h>
typedef int model_t;
-#endif
#include <sys/ctf_api.h>
-#ifdef illumos
-#include <sys/cyclic.h>
-#include <sys/int_limits.h>
-#else
#include <sys/stdint.h>
-#endif
/*
* DTrace Universal Constants and Typedefs
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
@@ -50,6 +50,7 @@
*/
#include <sys/dtrace.h>
+#include <sys/file.h>
#ifndef illumos
#ifdef __sparcv9
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
@@ -1,97 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FM_FS_ZFS_H
-#define _SYS_FM_FS_ZFS_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZFS_ERROR_CLASS "fs.zfs"
-
-#define FM_EREPORT_ZFS_CHECKSUM "checksum"
-#define FM_EREPORT_ZFS_IO "io"
-#define FM_EREPORT_ZFS_DATA "data"
-#define FM_EREPORT_ZFS_POOL "zpool"
-#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
-#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
-#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data"
-#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas"
-#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum"
-#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small"
-#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
-#define FM_EREPORT_ZFS_IO_FAILURE "io_failure"
-#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
-#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
-#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
-
-#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
-#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"
-#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid"
-#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
-#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
-#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
-#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"
-#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm"
-#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
-#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
-
-#define FM_EREPORT_FAILMODE_WAIT "wait"
-#define FM_EREPORT_FAILMODE_CONTINUE "continue"
-#define FM_EREPORT_FAILMODE_PANIC "panic"
-
-#define FM_RESOURCE_REMOVED "removed"
-#define FM_RESOURCE_AUTOREPLACE "autoreplace"
-#define FM_RESOURCE_STATECHANGE "statechange"
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FM_FS_ZFS_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
@@ -1,369 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#ifndef _SYS_FM_PROTOCOL_H
-#define _SYS_FM_PROTOCOL_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-#include <sys/varargs.h>
-#include <sys/nvpair.h>
-#else
-#include <libnvpair.h>
-#include <stdarg.h>
-#endif
-
-/* FM common member names */
-#define FM_CLASS "class"
-#define FM_VERSION "version"
-
-/* FM protocol category 1 class names */
-#define FM_EREPORT_CLASS "ereport"
-#define FM_FAULT_CLASS "fault"
-#define FM_DEFECT_CLASS "defect"
-#define FM_RSRC_CLASS "resource"
-#define FM_LIST_EVENT "list"
-#define FM_IREPORT_CLASS "ireport"
-
-/* FM list.* event class values */
-#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect"
-#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated"
-#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired"
-#define FM_LIST_UPDATED_CLASS FM_LIST_EVENT ".updated"
-#define FM_LIST_RESOLVED_CLASS FM_LIST_EVENT ".resolved"
-
-/* ereport class subcategory values */
-#define FM_ERROR_CPU "cpu"
-#define FM_ERROR_IO "io"
-
-/* ereport version and payload member names */
-#define FM_EREPORT_VERS0 0
-#define FM_EREPORT_VERSION FM_EREPORT_VERS0
-
-/* ereport payload member names */
-#define FM_EREPORT_DETECTOR "detector"
-#define FM_EREPORT_ENA "ena"
-
-/* list.* event payload member names */
-#define FM_LIST_EVENT_SIZE "list-sz"
-
-/* ireport.* event payload member names */
-#define FM_IREPORT_DETECTOR "detector"
-#define FM_IREPORT_UUID "uuid"
-#define FM_IREPORT_PRIORITY "pri"
-#define FM_IREPORT_ATTRIBUTES "attr"
-
-/*
- * list.suspect, isolated, updated, repaired and resolved
- * versions/payload member names.
- */
-#define FM_SUSPECT_UUID "uuid"
-#define FM_SUSPECT_DIAG_CODE "code"
-#define FM_SUSPECT_DIAG_TIME "diag-time"
-#define FM_SUSPECT_DE "de"
-#define FM_SUSPECT_FAULT_LIST "fault-list"
-#define FM_SUSPECT_FAULT_SZ "fault-list-sz"
-#define FM_SUSPECT_FAULT_STATUS "fault-status"
-#define FM_SUSPECT_INJECTED "__injected"
-#define FM_SUSPECT_MESSAGE "message"
-#define FM_SUSPECT_RETIRE "retire"
-#define FM_SUSPECT_RESPONSE "response"
-#define FM_SUSPECT_SEVERITY "severity"
-
-#define FM_SUSPECT_VERS0 0
-#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0
-
-#define FM_SUSPECT_FAULTY 0x1
-#define FM_SUSPECT_UNUSABLE 0x2
-#define FM_SUSPECT_NOT_PRESENT 0x4
-#define FM_SUSPECT_DEGRADED 0x8
-#define FM_SUSPECT_REPAIRED 0x10
-#define FM_SUSPECT_REPLACED 0x20
-#define FM_SUSPECT_ACQUITTED 0x40
-
-/* fault event versions and payload member names */
-#define FM_FAULT_VERS0 0
-#define FM_FAULT_VERSION FM_FAULT_VERS0
-
-#define FM_FAULT_ASRU "asru"
-#define FM_FAULT_FRU "fru"
-#define FM_FAULT_FRU_LABEL "fru-label"
-#define FM_FAULT_CERTAINTY "certainty"
-#define FM_FAULT_RESOURCE "resource"
-#define FM_FAULT_LOCATION "location"
-
-/* resource event versions and payload member names */
-#define FM_RSRC_VERS0 0
-#define FM_RSRC_VERSION FM_RSRC_VERS0
-#define FM_RSRC_RESOURCE "resource"
-
-/* resource.fm.asru.* payload member names */
-#define FM_RSRC_ASRU_UUID "uuid"
-#define FM_RSRC_ASRU_CODE "code"
-#define FM_RSRC_ASRU_FAULTY "faulty"
-#define FM_RSRC_ASRU_REPAIRED "repaired"
-#define FM_RSRC_ASRU_REPLACED "replaced"
-#define FM_RSRC_ASRU_ACQUITTED "acquitted"
-#define FM_RSRC_ASRU_RESOLVED "resolved"
-#define FM_RSRC_ASRU_UNUSABLE "unusable"
-#define FM_RSRC_ASRU_EVENT "event"
-
-/* resource.fm.xprt.* versions and payload member names */
-#define FM_RSRC_XPRT_VERS0 0
-#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0
-#define FM_RSRC_XPRT_UUID "uuid"
-#define FM_RSRC_XPRT_SUBCLASS "subclass"
-#define FM_RSRC_XPRT_FAULT_STATUS "fault-status"
-#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru"
-
-/*
- * FM ENA Format Macros
- */
-#define ENA_FORMAT_MASK 0x3
-#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK)
-
-/* ENA format types */
-#define FM_ENA_FMT0 0
-#define FM_ENA_FMT1 1
-#define FM_ENA_FMT2 2
-
-/* Format 1 */
-#define ENA_FMT1_GEN_MASK 0x00000000000003FCull
-#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull
-#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull
-#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull
-#define ENA_FMT1_GEN_SHFT 2
-#define ENA_FMT1_ID_SHFT 10
-#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT
-#define ENA_FMT1_TIME_SHFT 20
-
-/* Format 2 */
-#define ENA_FMT2_GEN_MASK 0x00000000000003FCull
-#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull
-#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK
-#define ENA_FMT2_GEN_SHFT 2
-#define ENA_FMT2_ID_SHFT 10
-#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT
-
-/* Common FMRI type names */
-#define FM_FMRI_AUTHORITY "authority"
-#define FM_FMRI_SCHEME "scheme"
-#define FM_FMRI_SVC_AUTHORITY "svc-authority"
-#define FM_FMRI_FACILITY "facility"
-
-/* FMRI authority-type member names */
-#define FM_FMRI_AUTH_CHASSIS "chassis-id"
-#define FM_FMRI_AUTH_PRODUCT_SN "product-sn"
-#define FM_FMRI_AUTH_PRODUCT "product-id"
-#define FM_FMRI_AUTH_DOMAIN "domain-id"
-#define FM_FMRI_AUTH_SERVER "server-id"
-#define FM_FMRI_AUTH_HOST "host-id"
-
-#define FM_AUTH_VERS0 0
-#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0
-
-/* scheme name values */
-#define FM_FMRI_SCHEME_FMD "fmd"
-#define FM_FMRI_SCHEME_DEV "dev"
-#define FM_FMRI_SCHEME_HC "hc"
-#define FM_FMRI_SCHEME_SVC "svc"
-#define FM_FMRI_SCHEME_CPU "cpu"
-#define FM_FMRI_SCHEME_MEM "mem"
-#define FM_FMRI_SCHEME_MOD "mod"
-#define FM_FMRI_SCHEME_PKG "pkg"
-#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
-#define FM_FMRI_SCHEME_ZFS "zfs"
-#define FM_FMRI_SCHEME_SW "sw"
-
-/* Scheme versions */
-#define FMD_SCHEME_VERSION0 0
-#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0
-#define DEV_SCHEME_VERSION0 0
-#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0
-#define FM_HC_VERS0 0
-#define FM_HC_SCHEME_VERSION FM_HC_VERS0
-#define CPU_SCHEME_VERSION0 0
-#define CPU_SCHEME_VERSION1 1
-#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1
-#define MEM_SCHEME_VERSION0 0
-#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0
-#define MOD_SCHEME_VERSION0 0
-#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0
-#define PKG_SCHEME_VERSION0 0
-#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
-#define LEGACY_SCHEME_VERSION0 0
-#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
-#define SVC_SCHEME_VERSION0 0
-#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0
-#define ZFS_SCHEME_VERSION0 0
-#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
-#define SW_SCHEME_VERSION0 0
-#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0
-
-/* hc scheme member names */
-#define FM_FMRI_HC_SERIAL_ID "serial"
-#define FM_FMRI_HC_PART "part"
-#define FM_FMRI_HC_REVISION "revision"
-#define FM_FMRI_HC_ROOT "hc-root"
-#define FM_FMRI_HC_LIST_SZ "hc-list-sz"
-#define FM_FMRI_HC_LIST "hc-list"
-#define FM_FMRI_HC_SPECIFIC "hc-specific"
-
-/* facility member names */
-#define FM_FMRI_FACILITY_NAME "facility-name"
-#define FM_FMRI_FACILITY_TYPE "facility-type"
-
-/* hc-list version and member names */
-#define FM_FMRI_HC_NAME "hc-name"
-#define FM_FMRI_HC_ID "hc-id"
-
-#define HC_LIST_VERSION0 0
-#define FM_HC_LIST_VERSION HC_LIST_VERSION0
-
-/* hc-specific member names */
-#define FM_FMRI_HC_SPECIFIC_OFFSET "offset"
-#define FM_FMRI_HC_SPECIFIC_PHYSADDR "physaddr"
-
-/* fmd module scheme member names */
-#define FM_FMRI_FMD_NAME "mod-name"
-#define FM_FMRI_FMD_VERSION "mod-version"
-
-/* dev scheme member names */
-#define FM_FMRI_DEV_ID "devid"
-#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id"
-#define FM_FMRI_DEV_PATH "device-path"
-
-/* pkg scheme member names */
-#define FM_FMRI_PKG_BASEDIR "pkg-basedir"
-#define FM_FMRI_PKG_INST "pkg-inst"
-#define FM_FMRI_PKG_VERSION "pkg-version"
-
-/* svc scheme member names */
-#define FM_FMRI_SVC_NAME "svc-name"
-#define FM_FMRI_SVC_INSTANCE "svc-instance"
-#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id"
-
-/* svc-authority member names */
-#define FM_FMRI_SVC_AUTH_SCOPE "scope"
-#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn"
-
-/* cpu scheme member names */
-#define FM_FMRI_CPU_ID "cpuid"
-#define FM_FMRI_CPU_SERIAL_ID "serial"
-#define FM_FMRI_CPU_MASK "cpumask"
-#define FM_FMRI_CPU_VID "cpuvid"
-#define FM_FMRI_CPU_CPUFRU "cpufru"
-#define FM_FMRI_CPU_CACHE_INDEX "cacheindex"
-#define FM_FMRI_CPU_CACHE_WAY "cacheway"
-#define FM_FMRI_CPU_CACHE_BIT "cachebit"
-#define FM_FMRI_CPU_CACHE_TYPE "cachetype"
-
-#define FM_FMRI_CPU_CACHE_TYPE_L2 0
-#define FM_FMRI_CPU_CACHE_TYPE_L3 1
-
-/* legacy-hc scheme member names */
-#define FM_FMRI_LEGACY_HC "component"
-#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \
- FM_FMRI_LEGACY_HC"="
-
-/* mem scheme member names */
-#define FM_FMRI_MEM_UNUM "unum"
-#define FM_FMRI_MEM_SERIAL_ID "serial"
-#define FM_FMRI_MEM_PHYSADDR "physaddr"
-#define FM_FMRI_MEM_MEMCONFIG "memconfig"
-#define FM_FMRI_MEM_OFFSET "offset"
-
-/* mod scheme member names */
-#define FM_FMRI_MOD_PKG "mod-pkg"
-#define FM_FMRI_MOD_NAME "mod-name"
-#define FM_FMRI_MOD_ID "mod-id"
-#define FM_FMRI_MOD_DESC "mod-desc"
-
-/* zfs scheme member names */
-#define FM_FMRI_ZFS_POOL "pool"
-#define FM_FMRI_ZFS_VDEV "vdev"
-
-/* sw scheme member names - extra indentation for members of an nvlist */
-#define FM_FMRI_SW_OBJ "object"
-#define FM_FMRI_SW_OBJ_PATH "path"
-#define FM_FMRI_SW_OBJ_ROOT "root"
-#define FM_FMRI_SW_OBJ_PKG "pkg"
-#define FM_FMRI_SW_SITE "site"
-#define FM_FMRI_SW_SITE_TOKEN "token"
-#define FM_FMRI_SW_SITE_MODULE "module"
-#define FM_FMRI_SW_SITE_FILE "file"
-#define FM_FMRI_SW_SITE_LINE "line"
-#define FM_FMRI_SW_SITE_FUNC "func"
-#define FM_FMRI_SW_CTXT "context"
-#define FM_FMRI_SW_CTXT_ORIGIN "origin"
-#define FM_FMRI_SW_CTXT_EXECNAME "execname"
-#define FM_FMRI_SW_CTXT_PID "pid"
-#define FM_FMRI_SW_CTXT_ZONE "zone"
-#define FM_FMRI_SW_CTXT_CTID "ctid"
-#define FM_FMRI_SW_CTXT_STACK "stack"
-
-extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
-extern void fm_nva_xdestroy(nv_alloc_t *);
-
-extern nvlist_t *fm_nvlist_create(nv_alloc_t *);
-extern void fm_nvlist_destroy(nvlist_t *, int);
-
-#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */
-#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */
-
-extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t,
- const nvlist_t *, ...);
-extern void fm_payload_set(nvlist_t *, ...);
-extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
-extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
- int, ...);
-extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
- const char *, const char *);
-extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
-extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
- uint8_t *, const char *);
-extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
- const char *, uint64_t);
-extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
- const char *, const char *);
-extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
-extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *,
- nvlist_t *, int, ...);
-
-extern uint64_t fm_ena_increment(uint64_t);
-extern uint64_t fm_ena_generate(uint64_t, uchar_t);
-extern uint64_t fm_ena_generation_get(uint64_t);
-extern uchar_t fm_ena_format_get(uint64_t);
-extern uint64_t fm_ena_id_get(uint64_t);
-extern uint64_t fm_ena_time_get(uint64_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FM_PROTOCOL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
@@ -1,102 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 RackTop Systems.
- */
-
-#ifndef _SYS_FM_UTIL_H
-#define _SYS_FM_UTIL_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/nvpair.h>
-#include <sys/errorq.h>
-
-/*
- * Shared user/kernel definitions for class length, error channel name,
- * and kernel event publisher string.
- */
-#define FM_MAX_CLASS 100
-#define FM_ERROR_CHAN "com.sun:fm:error"
-#define FM_PUB "fm"
-
-/*
- * ereport dump device transport support
- *
- * Ereports are written out to the dump device at a proscribed offset from the
- * end, similar to in-transit log messages. The ereports are represented as a
- * erpt_dump_t header followed by ed_size bytes of packed native nvlist data.
- *
- * NOTE: All of these constants and the header must be defined so they have the
- * same representation for *both* 32-bit and 64-bit producers and consumers.
- */
-#define ERPT_MAGIC 0xf00d4eddU
-#define ERPT_MAX_ERRS 16
-#define ERPT_DATA_SZ (6 * 1024)
-#define ERPT_EVCH_MAX 256
-#define ERPT_HIWAT 64
-
-typedef struct erpt_dump {
- uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */
- uint32_t ed_chksum; /* checksum32() of packed nvlist data */
- uint32_t ed_size; /* ereport (nvl) fixed buf size */
- uint32_t ed_pad; /* reserved for future use */
- hrtime_t ed_hrt_nsec; /* hrtime of this ereport */
- hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */
- struct {
- uint64_t sec; /* seconds since gettimeofday() Epoch */
- uint64_t nsec; /* nanoseconds past ed_tod_base.sec */
- } ed_tod_base;
-} erpt_dump_t;
-
-#if defined(_KERNEL) || defined(_FAKE_KERNEL)
-#include <sys/systm.h>
-
-#define FM_STK_DEPTH 20 /* maximum stack depth */
-#define FM_SYM_SZ 64 /* maximum symbol size */
-#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */
-
-#define FM_EREPORT_PAYLOAD_NAME_STACK "stack"
-
-extern errorq_t *ereport_errorq;
-extern void *ereport_dumpbuf;
-extern size_t ereport_dumplen;
-
-extern void fm_init(void);
-extern void fm_nvprint(nvlist_t *);
-#define fm_panic panic
-extern void fm_banner(void);
-
-extern void fm_ereport_dump(void);
-extern void fm_ereport_post(nvlist_t *, int);
-
-extern int is_fm_panic();
-#endif /* _KERNEL || _FAKE_KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FM_UTIL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -1,1248 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Joyent, Inc.
- * Copyright (c) 2019 Datto Inc.
- * Copyright (c) 2017, Intel Corporation.
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#ifndef _SYS_FS_ZFS_H
-#define _SYS_FS_ZFS_H
-
-#include <sys/types.h>
-#include <sys/ioccom.h>
-#include <sys/time.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Types and constants shared between userland and the kernel.
- */
-
-/*
- * Each dataset can be one of the following types. These constants can be
- * combined into masks that can be passed to various functions.
- */
-typedef enum {
- ZFS_TYPE_FILESYSTEM = (1 << 0),
- ZFS_TYPE_SNAPSHOT = (1 << 1),
- ZFS_TYPE_VOLUME = (1 << 2),
- ZFS_TYPE_POOL = (1 << 3),
- ZFS_TYPE_BOOKMARK = (1 << 4)
-} zfs_type_t;
-
-/*
- * NB: lzc_dataset_type should be updated whenever a new objset type is added,
- * if it represents a real type of a dataset that can be created from userland.
- */
-typedef enum dmu_objset_type {
- DMU_OST_NONE,
- DMU_OST_META,
- DMU_OST_ZFS,
- DMU_OST_ZVOL,
- DMU_OST_OTHER, /* For testing only! */
- DMU_OST_ANY, /* Be careful! */
- DMU_OST_NUMTYPES
-} dmu_objset_type_t;
-
-#define ZFS_TYPE_DATASET \
- (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
-
-/*
- * All of these include the terminating NUL byte.
- */
-#define ZAP_MAXNAMELEN 256
-#define ZAP_MAXVALUELEN (1024 * 8)
-#define ZAP_OLDMAXVALUELEN 1024
-#define ZFS_MAX_DATASET_NAME_LEN 256
-
-/*
- * Dataset properties are identified by these constants and must be added to
- * the end of this list to ensure that external consumers are not affected
- * by the change. If you make any changes to this list, be sure to update
- * the property table in usr/src/common/zfs/zfs_prop.c.
- */
-typedef enum {
- ZPROP_CONT = -2,
- ZPROP_INVAL = -1,
- ZFS_PROP_TYPE = 0,
- ZFS_PROP_CREATION,
- ZFS_PROP_USED,
- ZFS_PROP_AVAILABLE,
- ZFS_PROP_REFERENCED,
- ZFS_PROP_COMPRESSRATIO,
- ZFS_PROP_MOUNTED,
- ZFS_PROP_ORIGIN,
- ZFS_PROP_QUOTA,
- ZFS_PROP_RESERVATION,
- ZFS_PROP_VOLSIZE,
- ZFS_PROP_VOLBLOCKSIZE,
- ZFS_PROP_RECORDSIZE,
- ZFS_PROP_MOUNTPOINT,
- ZFS_PROP_SHARENFS,
- ZFS_PROP_CHECKSUM,
- ZFS_PROP_COMPRESSION,
- ZFS_PROP_ATIME,
- ZFS_PROP_DEVICES,
- ZFS_PROP_EXEC,
- ZFS_PROP_SETUID,
- ZFS_PROP_READONLY,
- ZFS_PROP_ZONED,
- ZFS_PROP_SNAPDIR,
- ZFS_PROP_ACLMODE,
- ZFS_PROP_ACLINHERIT,
- ZFS_PROP_CREATETXG,
- ZFS_PROP_NAME, /* not exposed to the user */
- ZFS_PROP_CANMOUNT,
- ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
- ZFS_PROP_XATTR,
- ZFS_PROP_NUMCLONES, /* not exposed to the user */
- ZFS_PROP_COPIES,
- ZFS_PROP_VERSION,
- ZFS_PROP_UTF8ONLY,
- ZFS_PROP_NORMALIZE,
- ZFS_PROP_CASE,
- ZFS_PROP_VSCAN,
- ZFS_PROP_NBMAND,
- ZFS_PROP_SHARESMB,
- ZFS_PROP_REFQUOTA,
- ZFS_PROP_REFRESERVATION,
- ZFS_PROP_GUID,
- ZFS_PROP_PRIMARYCACHE,
- ZFS_PROP_SECONDARYCACHE,
- ZFS_PROP_USEDSNAP,
- ZFS_PROP_USEDDS,
- ZFS_PROP_USEDCHILD,
- ZFS_PROP_USEDREFRESERV,
- ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
- ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
- ZFS_PROP_DEFER_DESTROY,
- ZFS_PROP_USERREFS,
- ZFS_PROP_LOGBIAS,
- ZFS_PROP_UNIQUE, /* not exposed to the user */
- ZFS_PROP_OBJSETID, /* not exposed to the user */
- ZFS_PROP_DEDUP,
- ZFS_PROP_MLSLABEL,
- ZFS_PROP_SYNC,
- ZFS_PROP_DNODESIZE,
- ZFS_PROP_REFRATIO,
- ZFS_PROP_WRITTEN,
- ZFS_PROP_CLONES,
- ZFS_PROP_LOGICALUSED,
- ZFS_PROP_LOGICALREFERENCED,
- ZFS_PROP_INCONSISTENT, /* not exposed to the user */
- ZFS_PROP_VOLMODE,
- ZFS_PROP_FILESYSTEM_LIMIT,
- ZFS_PROP_SNAPSHOT_LIMIT,
- ZFS_PROP_FILESYSTEM_COUNT,
- ZFS_PROP_SNAPSHOT_COUNT,
- ZFS_PROP_REDUNDANT_METADATA,
- ZFS_PROP_PREV_SNAP,
- ZFS_PROP_RECEIVE_RESUME_TOKEN,
- ZFS_PROP_REMAPTXG, /* not exposed to the user */
- ZFS_PROP_SPECIAL_SMALL_BLOCKS,
- ZFS_NUM_PROPS
-} zfs_prop_t;
-
-typedef enum {
- ZFS_PROP_USERUSED,
- ZFS_PROP_USERQUOTA,
- ZFS_PROP_GROUPUSED,
- ZFS_PROP_GROUPQUOTA,
- ZFS_NUM_USERQUOTA_PROPS
-} zfs_userquota_prop_t;
-
-extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
-
-/*
- * Pool properties are identified by these constants and must be added to the
- * end of this list to ensure that external consumers are not affected
- * by the change. If you make any changes to this list, be sure to update
- * the property table in usr/src/common/zfs/zpool_prop.c.
- */
-typedef enum {
- ZPOOL_PROP_INVAL = -1,
- ZPOOL_PROP_NAME,
- ZPOOL_PROP_SIZE,
- ZPOOL_PROP_CAPACITY,
- ZPOOL_PROP_ALTROOT,
- ZPOOL_PROP_HEALTH,
- ZPOOL_PROP_GUID,
- ZPOOL_PROP_VERSION,
- ZPOOL_PROP_BOOTFS,
- ZPOOL_PROP_DELEGATION,
- ZPOOL_PROP_AUTOREPLACE,
- ZPOOL_PROP_CACHEFILE,
- ZPOOL_PROP_FAILUREMODE,
- ZPOOL_PROP_LISTSNAPS,
- ZPOOL_PROP_AUTOEXPAND,
- ZPOOL_PROP_DEDUPDITTO,
- ZPOOL_PROP_DEDUPRATIO,
- ZPOOL_PROP_FREE,
- ZPOOL_PROP_ALLOCATED,
- ZPOOL_PROP_READONLY,
- ZPOOL_PROP_COMMENT,
- ZPOOL_PROP_EXPANDSZ,
- ZPOOL_PROP_FREEING,
- ZPOOL_PROP_FRAGMENTATION,
- ZPOOL_PROP_LEAKED,
- ZPOOL_PROP_MAXBLOCKSIZE,
- ZPOOL_PROP_BOOTSIZE,
- ZPOOL_PROP_CHECKPOINT,
- ZPOOL_PROP_TNAME,
- ZPOOL_PROP_MAXDNODESIZE,
- ZPOOL_PROP_MULTIHOST,
- ZPOOL_NUM_PROPS
-} zpool_prop_t;
-
-/* Small enough to not hog a whole line of printout in zpool(1M). */
-#define ZPROP_MAX_COMMENT 32
-
-#define ZPROP_VALUE "value"
-#define ZPROP_SOURCE "source"
-
-typedef enum {
- ZPROP_SRC_NONE = 0x1,
- ZPROP_SRC_DEFAULT = 0x2,
- ZPROP_SRC_TEMPORARY = 0x4,
- ZPROP_SRC_LOCAL = 0x8,
- ZPROP_SRC_INHERITED = 0x10,
- ZPROP_SRC_RECEIVED = 0x20
-} zprop_source_t;
-
-#define ZPROP_SRC_ALL 0x3f
-
-#define ZPROP_SOURCE_VAL_RECVD "$recvd"
-#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
-/*
- * Dataset flag implemented as a special entry in the props zap object
- * indicating that the dataset has received properties on or after
- * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
- * just as it did in earlier versions, and thereafter, local properties are
- * preserved.
- */
-#define ZPROP_HAS_RECVD "$hasrecvd"
-
-typedef enum {
- ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
- ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
-} zprop_errflags_t;
-
-typedef int (*zprop_func)(int, void *);
-
-/*
- * Properties to be set on the root file system of a new pool
- * are stuffed into their own nvlist, which is then included in
- * the properties nvlist with the pool properties.
- */
-#define ZPOOL_ROOTFS_PROPS "root-props-nvl"
-
-/*
- * Length of 'written@' and 'written#'
- */
-#define ZFS_WRITTEN_PROP_PREFIX_LEN 8
-
-/*
- * Dataset property functions shared between libzfs and kernel.
- */
-const char *zfs_prop_default_string(zfs_prop_t);
-uint64_t zfs_prop_default_numeric(zfs_prop_t);
-boolean_t zfs_prop_readonly(zfs_prop_t);
-boolean_t zfs_prop_visible(zfs_prop_t prop);
-boolean_t zfs_prop_inheritable(zfs_prop_t);
-boolean_t zfs_prop_setonce(zfs_prop_t);
-const char *zfs_prop_to_name(zfs_prop_t);
-zfs_prop_t zfs_name_to_prop(const char *);
-boolean_t zfs_prop_user(const char *);
-boolean_t zfs_prop_userquota(const char *);
-int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
-int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
-uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
-boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
-
-/*
- * Pool property functions shared between libzfs and kernel.
- */
-zpool_prop_t zpool_name_to_prop(const char *);
-const char *zpool_prop_to_name(zpool_prop_t);
-const char *zpool_prop_default_string(zpool_prop_t);
-uint64_t zpool_prop_default_numeric(zpool_prop_t);
-boolean_t zpool_prop_readonly(zpool_prop_t);
-boolean_t zpool_prop_feature(const char *);
-boolean_t zpool_prop_unsupported(const char *name);
-int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
-int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
-uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
-
-/*
- * Definitions for the Delegation.
- */
-typedef enum {
- ZFS_DELEG_WHO_UNKNOWN = 0,
- ZFS_DELEG_USER = 'u',
- ZFS_DELEG_USER_SETS = 'U',
- ZFS_DELEG_GROUP = 'g',
- ZFS_DELEG_GROUP_SETS = 'G',
- ZFS_DELEG_EVERYONE = 'e',
- ZFS_DELEG_EVERYONE_SETS = 'E',
- ZFS_DELEG_CREATE = 'c',
- ZFS_DELEG_CREATE_SETS = 'C',
- ZFS_DELEG_NAMED_SET = 's',
- ZFS_DELEG_NAMED_SET_SETS = 'S'
-} zfs_deleg_who_type_t;
-
-typedef enum {
- ZFS_DELEG_NONE = 0,
- ZFS_DELEG_PERM_LOCAL = 1,
- ZFS_DELEG_PERM_DESCENDENT = 2,
- ZFS_DELEG_PERM_LOCALDESCENDENT = 3,
- ZFS_DELEG_PERM_CREATE = 4
-} zfs_deleg_inherit_t;
-
-#define ZFS_DELEG_PERM_UID "uid"
-#define ZFS_DELEG_PERM_GID "gid"
-#define ZFS_DELEG_PERM_GROUPS "groups"
-
-#define ZFS_MLSLABEL_DEFAULT "none"
-
-#define ZFS_SMB_ACL_SRC "src"
-#define ZFS_SMB_ACL_TARGET "target"
-
-typedef enum {
- ZFS_CANMOUNT_OFF = 0,
- ZFS_CANMOUNT_ON = 1,
- ZFS_CANMOUNT_NOAUTO = 2
-} zfs_canmount_type_t;
-
-typedef enum {
- ZFS_LOGBIAS_LATENCY = 0,
- ZFS_LOGBIAS_THROUGHPUT = 1
-} zfs_logbias_op_t;
-
-typedef enum zfs_share_op {
- ZFS_SHARE_NFS = 0,
- ZFS_UNSHARE_NFS = 1,
- ZFS_SHARE_SMB = 2,
- ZFS_UNSHARE_SMB = 3
-} zfs_share_op_t;
-
-typedef enum zfs_smb_acl_op {
- ZFS_SMB_ACL_ADD,
- ZFS_SMB_ACL_REMOVE,
- ZFS_SMB_ACL_RENAME,
- ZFS_SMB_ACL_PURGE
-} zfs_smb_acl_op_t;
-
-typedef enum zfs_cache_type {
- ZFS_CACHE_NONE = 0,
- ZFS_CACHE_METADATA = 1,
- ZFS_CACHE_ALL = 2
-} zfs_cache_type_t;
-
-typedef enum {
- ZFS_SYNC_STANDARD = 0,
- ZFS_SYNC_ALWAYS = 1,
- ZFS_SYNC_DISABLED = 2
-} zfs_sync_type_t;
-
-typedef enum {
- ZFS_VOLMODE_DEFAULT = 0,
- ZFS_VOLMODE_GEOM = 1,
- ZFS_VOLMODE_DEV = 2,
- ZFS_VOLMODE_NONE = 3
-} zfs_volmode_t;
-
-typedef enum {
- ZFS_DNSIZE_LEGACY = 0,
- ZFS_DNSIZE_AUTO = 1,
- ZFS_DNSIZE_1K = 1024,
- ZFS_DNSIZE_2K = 2048,
- ZFS_DNSIZE_4K = 4096,
- ZFS_DNSIZE_8K = 8192,
- ZFS_DNSIZE_16K = 16384
-} zfs_dnsize_type_t;
-
-typedef enum {
- ZFS_REDUNDANT_METADATA_ALL,
- ZFS_REDUNDANT_METADATA_MOST
-} zfs_redundant_metadata_type_t;
-
-/*
- * On-disk version number.
- */
-#define SPA_VERSION_1 1ULL
-#define SPA_VERSION_2 2ULL
-#define SPA_VERSION_3 3ULL
-#define SPA_VERSION_4 4ULL
-#define SPA_VERSION_5 5ULL
-#define SPA_VERSION_6 6ULL
-#define SPA_VERSION_7 7ULL
-#define SPA_VERSION_8 8ULL
-#define SPA_VERSION_9 9ULL
-#define SPA_VERSION_10 10ULL
-#define SPA_VERSION_11 11ULL
-#define SPA_VERSION_12 12ULL
-#define SPA_VERSION_13 13ULL
-#define SPA_VERSION_14 14ULL
-#define SPA_VERSION_15 15ULL
-#define SPA_VERSION_16 16ULL
-#define SPA_VERSION_17 17ULL
-#define SPA_VERSION_18 18ULL
-#define SPA_VERSION_19 19ULL
-#define SPA_VERSION_20 20ULL
-#define SPA_VERSION_21 21ULL
-#define SPA_VERSION_22 22ULL
-#define SPA_VERSION_23 23ULL
-#define SPA_VERSION_24 24ULL
-#define SPA_VERSION_25 25ULL
-#define SPA_VERSION_26 26ULL
-#define SPA_VERSION_27 27ULL
-#define SPA_VERSION_28 28ULL
-#define SPA_VERSION_5000 5000ULL
-
-/*
- * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
- * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
- * and do the appropriate changes. Also bump the version number in
- * usr/src/grub/capability.
- */
-#define SPA_VERSION SPA_VERSION_5000
-#define SPA_VERSION_STRING "5000"
-
-/*
- * Symbolic names for the changes that caused a SPA_VERSION switch.
- * Used in the code when checking for presence or absence of a feature.
- * Feel free to define multiple symbolic names for each version if there
- * were multiple changes to on-disk structures during that version.
- *
- * NOTE: When checking the current SPA_VERSION in your code, be sure
- * to use spa_version() since it reports the version of the
- * last synced uberblock. Checking the in-flight version can
- * be dangerous in some cases.
- */
-#define SPA_VERSION_INITIAL SPA_VERSION_1
-#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
-#define SPA_VERSION_SPARES SPA_VERSION_3
-#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
-#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
-#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
-#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
-#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
-#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5
-#define SPA_VERSION_BOOTFS SPA_VERSION_6
-#define SPA_VERSION_SLOGS SPA_VERSION_7
-#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8
-#define SPA_VERSION_FUID SPA_VERSION_9
-#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
-#define SPA_VERSION_REFQUOTA SPA_VERSION_9
-#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
-#define SPA_VERSION_L2CACHE SPA_VERSION_10
-#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11
-#define SPA_VERSION_ORIGIN SPA_VERSION_11
-#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11
-#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
-#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
-#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
-#define SPA_VERSION_USERSPACE SPA_VERSION_15
-#define SPA_VERSION_STMF_PROP SPA_VERSION_16
-#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
-#define SPA_VERSION_USERREFS SPA_VERSION_18
-#define SPA_VERSION_HOLES SPA_VERSION_19
-#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
-#define SPA_VERSION_DEDUP SPA_VERSION_21
-#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
-#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
-#define SPA_VERSION_SA SPA_VERSION_24
-#define SPA_VERSION_SCAN SPA_VERSION_25
-#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
-#define SPA_VERSION_DEADLISTS SPA_VERSION_26
-#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
-#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
-#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28
-#define SPA_VERSION_FEATURES SPA_VERSION_5000
-
-#define SPA_VERSION_IS_SUPPORTED(v) \
- (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
- ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
-
-/*
- * ZPL version - rev'd whenever an incompatible on-disk format change
- * occurs. This is independent of SPA/DMU/ZAP versioning. You must
- * also update the version_table[] and help message in zfs_prop.c.
- *
- * When changing, be sure to teach GRUB how to read the new format!
- * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
- */
-#define ZPL_VERSION_1 1ULL
-#define ZPL_VERSION_2 2ULL
-#define ZPL_VERSION_3 3ULL
-#define ZPL_VERSION_4 4ULL
-#define ZPL_VERSION_5 5ULL
-#define ZPL_VERSION ZPL_VERSION_5
-#define ZPL_VERSION_STRING "5"
-
-#define ZPL_VERSION_INITIAL ZPL_VERSION_1
-#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
-#define ZPL_VERSION_FUID ZPL_VERSION_3
-#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
-#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
-#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
-#define ZPL_VERSION_SA ZPL_VERSION_5
-
-/* Rewind policy information */
-#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
-#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
-#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */
-#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */
-#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */
-#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */
-#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */
-
-typedef struct zpool_load_policy {
- uint32_t zlp_rewind; /* rewind policy requested */
- uint64_t zlp_maxmeta; /* max acceptable meta-data errors */
- uint64_t zlp_maxdata; /* max acceptable data errors */
- uint64_t zlp_txg; /* specific txg to load */
-} zpool_load_policy_t;
-
-/*
- * The following are configuration names used in the nvlist describing a pool's
- * configuration. New on-disk names should be prefixed with "<reverse-DNS>:"
- * (e.g. "org.open-zfs:") to avoid conflicting names being developed
- * independently.
- */
-#define ZPOOL_CONFIG_VERSION "version"
-#define ZPOOL_CONFIG_POOL_NAME "name"
-#define ZPOOL_CONFIG_POOL_STATE "state"
-#define ZPOOL_CONFIG_POOL_TXG "txg"
-#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
-#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
-#define ZPOOL_CONFIG_TOP_GUID "top_guid"
-#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
-#define ZPOOL_CONFIG_TYPE "type"
-#define ZPOOL_CONFIG_CHILDREN "children"
-#define ZPOOL_CONFIG_ID "id"
-#define ZPOOL_CONFIG_GUID "guid"
-#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
-#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
-#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
-#define ZPOOL_CONFIG_PATH "path"
-#define ZPOOL_CONFIG_DEVID "devid"
-#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
-#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
-#define ZPOOL_CONFIG_ASHIFT "ashift"
-#define ZPOOL_CONFIG_ASIZE "asize"
-#define ZPOOL_CONFIG_DTL "DTL"
-#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
-#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
-#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */
-#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
-#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
-#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
-#define ZPOOL_CONFIG_ERRCOUNT "error_count"
-#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
-#define ZPOOL_CONFIG_SPARES "spares"
-#define ZPOOL_CONFIG_IS_SPARE "is_spare"
-#define ZPOOL_CONFIG_NPARITY "nparity"
-#define ZPOOL_CONFIG_HOSTID "hostid"
-#define ZPOOL_CONFIG_HOSTNAME "hostname"
-#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
-#define ZPOOL_CONFIG_UNSPARE "unspare"
-#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
-#define ZPOOL_CONFIG_IS_LOG "is_log"
-#define ZPOOL_CONFIG_L2CACHE "l2cache"
-#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
-#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
-#define ZPOOL_CONFIG_IS_HOLE "is_hole"
-#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
-#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
-#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
-#define ZPOOL_CONFIG_SPLIT "splitcfg"
-#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
-#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
-#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
-#define ZPOOL_CONFIG_REMOVING "removing"
-#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
-#define ZPOOL_CONFIG_COMMENT "comment"
-#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
-#define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */
-#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
-#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
-#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
-#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
-#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */
-#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */
-#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */
-#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */
-#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
-#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */
-#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
-#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
-#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
-#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */
-#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */
-#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
-#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */
-#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
-#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
-#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
-
-/*
- * The persistent vdev state is stored as separate values rather than a single
- * 'vdev_state' entry. This is because a device can be in multiple states, such
- * as offline and degraded.
- */
-#define ZPOOL_CONFIG_OFFLINE "offline"
-#define ZPOOL_CONFIG_FAULTED "faulted"
-#define ZPOOL_CONFIG_DEGRADED "degraded"
-#define ZPOOL_CONFIG_REMOVED "removed"
-#define ZPOOL_CONFIG_FRU "fru"
-#define ZPOOL_CONFIG_AUX_STATE "aux_state"
-
-/* Pool load policy parameters */
-#define ZPOOL_LOAD_POLICY "load-policy"
-#define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy"
-#define ZPOOL_LOAD_REQUEST_TXG "load-request-txg"
-#define ZPOOL_LOAD_META_THRESH "load-meta-thresh"
-#define ZPOOL_LOAD_DATA_THRESH "load-data-thresh"
-
-/* Rewind data discovered */
-#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
-#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
-#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
-
-#define VDEV_TYPE_ROOT "root"
-#define VDEV_TYPE_MIRROR "mirror"
-#define VDEV_TYPE_REPLACING "replacing"
-#define VDEV_TYPE_RAIDZ "raidz"
-#define VDEV_TYPE_DISK "disk"
-#define VDEV_TYPE_FILE "file"
-#define VDEV_TYPE_MISSING "missing"
-#define VDEV_TYPE_HOLE "hole"
-#define VDEV_TYPE_SPARE "spare"
-#define VDEV_TYPE_LOG "log"
-#define VDEV_TYPE_L2CACHE "l2cache"
-#define VDEV_TYPE_INDIRECT "indirect"
-
-/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */
-#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \
- "com.delphix:indirect_obsolete_sm"
-#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
- "com.delphix:obsolete_counts_are_precise"
-#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
- "com.delphix:pool_checkpoint_sm"
-
-#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
- "org.zfsonlinux:allocation_bias"
-
-/* vdev metaslab allocation bias */
-#define VDEV_ALLOC_BIAS_LOG "log"
-#define VDEV_ALLOC_BIAS_SPECIAL "special"
-#define VDEV_ALLOC_BIAS_DEDUP "dedup"
-
-#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
- "com.delphix:next_offset_to_initialize"
-#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
- "com.delphix:vdev_initialize_state"
-#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
- "com.delphix:vdev_initialize_action_time"
-
-/*
- * This is needed in userland to report the minimum necessary device size.
- *
- * Note that the zfs test suite uses 64MB vdevs.
- */
-#define SPA_MINDEVSIZE (64ULL << 20)
-
-/*
- * Set if the fragmentation has not yet been calculated. This can happen
- * because the space maps have not been upgraded or the histogram feature
- * is not enabled.
- */
-#define ZFS_FRAG_INVALID UINT64_MAX
-
-/*
- * The location of the pool configuration repository, shared between kernel and
- * userland.
- */
-#define ZPOOL_CACHE "/boot/zfs/zpool.cache"
-
-/*
- * vdev states are ordered from least to most healthy.
- * A vdev that's CANT_OPEN or below is considered unusable.
- */
-typedef enum vdev_state {
- VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
- VDEV_STATE_CLOSED, /* Not currently open */
- VDEV_STATE_OFFLINE, /* Not allowed to open */
- VDEV_STATE_REMOVED, /* Explicitly removed from system */
- VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
- VDEV_STATE_FAULTED, /* External request to fault device */
- VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
- VDEV_STATE_HEALTHY /* Presumed good */
-} vdev_state_t;
-
-#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY
-
-/*
- * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
- * of the vdev stats structure uses these constants to distinguish why.
- */
-typedef enum vdev_aux {
- VDEV_AUX_NONE, /* no error */
- VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
- VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
- VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
- VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
- VDEV_AUX_TOO_SMALL, /* vdev size is too small */
- VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
- VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
- VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
- VDEV_AUX_UNSUP_FEAT, /* unsupported features */
- VDEV_AUX_SPARED, /* hot spare used in another pool */
- VDEV_AUX_ERR_EXCEEDED, /* too many errors */
- VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
- VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
- VDEV_AUX_EXTERNAL, /* external diagnosis */
- VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
- VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */
- VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */
- VDEV_AUX_ACTIVE /* vdev active on a different host */
-} vdev_aux_t;
-
-/*
- * pool state. The following states are written to disk as part of the normal
- * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
- * states are software abstractions used at various levels to communicate
- * pool state.
- */
-typedef enum pool_state {
- POOL_STATE_ACTIVE = 0, /* In active use */
- POOL_STATE_EXPORTED, /* Explicitly exported */
- POOL_STATE_DESTROYED, /* Explicitly destroyed */
- POOL_STATE_SPARE, /* Reserved for hot spare use */
- POOL_STATE_L2CACHE, /* Level 2 ARC device */
- POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
- POOL_STATE_UNAVAIL, /* Internal libzfs state */
- POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
-} pool_state_t;
-
-/*
- * mmp state. The following states provide additional detail describing
- * why a pool couldn't be safely imported.
- */
-typedef enum mmp_state {
- MMP_STATE_ACTIVE = 0, /* In active use */
- MMP_STATE_INACTIVE, /* Inactive and safe to import */
- MMP_STATE_NO_HOSTID /* System hostid is not set */
-} mmp_state_t;
-
-/*
- * Scan Functions.
- */
-typedef enum pool_scan_func {
- POOL_SCAN_NONE,
- POOL_SCAN_SCRUB,
- POOL_SCAN_RESILVER,
- POOL_SCAN_FUNCS
-} pool_scan_func_t;
-
-/*
- * Used to control scrub pause and resume.
- */
-typedef enum pool_scrub_cmd {
- POOL_SCRUB_NORMAL = 0,
- POOL_SCRUB_PAUSE,
- POOL_SCRUB_FLAGS_END
-} pool_scrub_cmd_t;
-
-/*
- * Initialize functions.
- */
-typedef enum pool_initialize_func {
- POOL_INITIALIZE_DO,
- POOL_INITIALIZE_CANCEL,
- POOL_INITIALIZE_SUSPEND,
- POOL_INITIALIZE_FUNCS
-} pool_initialize_func_t;
-
-/*
- * ZIO types. Needed to interpret vdev statistics below.
- */
-typedef enum zio_type {
- ZIO_TYPE_NULL = 0,
- ZIO_TYPE_READ,
- ZIO_TYPE_WRITE,
- ZIO_TYPE_FREE,
- ZIO_TYPE_CLAIM,
- ZIO_TYPE_IOCTL,
- ZIO_TYPES
-} zio_type_t;
-
-/*
- * Pool statistics. Note: all fields should be 64-bit because this
- * is passed between kernel and userland as an nvlist uint64 array.
- */
-typedef struct pool_scan_stat {
- /* values stored on disk */
- uint64_t pss_func; /* pool_scan_func_t */
- uint64_t pss_state; /* dsl_scan_state_t */
- uint64_t pss_start_time; /* scan start time */
- uint64_t pss_end_time; /* scan end time */
- uint64_t pss_to_examine; /* total bytes to scan */
- uint64_t pss_examined; /* total bytes located by scanner */
- uint64_t pss_to_process; /* total bytes to process */
- uint64_t pss_processed; /* total processed bytes */
- uint64_t pss_errors; /* scan errors */
-
- /* values not stored on disk */
- uint64_t pss_pass_exam; /* examined bytes per scan pass */
- uint64_t pss_pass_start; /* start time of a scan pass */
- uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */
- /* cumulative time scrub spent paused, needed for rate calculation */
- uint64_t pss_pass_scrub_spent_paused;
-
- /* Sorted scrubbing new fields */
- /* Stored on disk */
- uint64_t pss_issued; /* total bytes checked by scanner */
- /* Not stored on disk */
- uint64_t pss_pass_issued; /* issued bytes per scan pass */
-} pool_scan_stat_t;
-
-typedef struct pool_removal_stat {
- uint64_t prs_state; /* dsl_scan_state_t */
- uint64_t prs_removing_vdev;
- uint64_t prs_start_time;
- uint64_t prs_end_time;
- uint64_t prs_to_copy; /* bytes that need to be copied */
- uint64_t prs_copied; /* bytes copied so far */
- /*
- * bytes of memory used for indirect mappings.
- * This includes all removed vdevs.
- */
- uint64_t prs_mapping_memory;
-} pool_removal_stat_t;
-
-typedef enum dsl_scan_state {
- DSS_NONE,
- DSS_SCANNING,
- DSS_FINISHED,
- DSS_CANCELED,
- DSS_NUM_STATES
-} dsl_scan_state_t;
-
-typedef enum {
- CS_NONE,
- CS_CHECKPOINT_EXISTS,
- CS_CHECKPOINT_DISCARDING,
- CS_NUM_STATES
-} checkpoint_state_t;
-
-typedef struct pool_checkpoint_stat {
- uint64_t pcs_state; /* checkpoint_state_t */
- uint64_t pcs_start_time; /* time checkpoint/discard started */
- uint64_t pcs_space; /* checkpointed space */
-} pool_checkpoint_stat_t;
-
-typedef enum {
- VDEV_INITIALIZE_NONE,
- VDEV_INITIALIZE_ACTIVE,
- VDEV_INITIALIZE_CANCELED,
- VDEV_INITIALIZE_SUSPENDED,
- VDEV_INITIALIZE_COMPLETE
-} vdev_initializing_state_t;
-
-/*
- * Vdev statistics. Note: all fields should be 64-bit because this
- * is passed between kernel and userland as an nvlist uint64 array.
- */
-typedef struct vdev_stat {
- hrtime_t vs_timestamp; /* time since vdev load */
- uint64_t vs_state; /* vdev state */
- uint64_t vs_aux; /* see vdev_aux_t */
- uint64_t vs_alloc; /* space allocated */
- uint64_t vs_space; /* total capacity */
- uint64_t vs_dspace; /* deflated capacity */
- uint64_t vs_rsize; /* replaceable dev size */
- uint64_t vs_esize; /* expandable dev size */
- uint64_t vs_ops[ZIO_TYPES]; /* operation count */
- uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
- uint64_t vs_read_errors; /* read errors */
- uint64_t vs_write_errors; /* write errors */
- uint64_t vs_checksum_errors; /* checksum errors */
- uint64_t vs_self_healed; /* self-healed bytes */
- uint64_t vs_scan_removing; /* removing? */
- uint64_t vs_scan_processed; /* scan processed bytes */
- uint64_t vs_configured_ashift; /* TLV vdev_ashift */
- uint64_t vs_logical_ashift; /* vdev_logical_ashift */
- uint64_t vs_physical_ashift; /* vdev_physical_ashift */
- uint64_t vs_fragmentation; /* device fragmentation */
- uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
- uint64_t vs_initialize_errors; /* initializing errors */
- uint64_t vs_initialize_bytes_done; /* bytes initialized */
- uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
- uint64_t vs_initialize_state; /* vdev_initialzing_state_t */
- uint64_t vs_initialize_action_time; /* time_t */
-} vdev_stat_t;
-#define VDEV_STAT_VALID(field, uint64_t_field_count) \
- ((uint64_t_field_count * sizeof(uint64_t)) >= \
- (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
-
-/*
- * DDT statistics. Note: all fields should be 64-bit because this
- * is passed between kernel and userland as an nvlist uint64 array.
- */
-typedef struct ddt_object {
- uint64_t ddo_count; /* number of elments in ddt */
- uint64_t ddo_dspace; /* size of ddt on disk */
- uint64_t ddo_mspace; /* size of ddt in-core */
-} ddt_object_t;
-
-typedef struct ddt_stat {
- uint64_t dds_blocks; /* blocks */
- uint64_t dds_lsize; /* logical size */
- uint64_t dds_psize; /* physical size */
- uint64_t dds_dsize; /* deflated allocated size */
- uint64_t dds_ref_blocks; /* referenced blocks */
- uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
- uint64_t dds_ref_psize; /* referenced psize * refcnt */
- uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
-} ddt_stat_t;
-
-typedef struct ddt_histogram {
- ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
-} ddt_histogram_t;
-
-#define ZVOL_DRIVER "zvol"
-#define ZFS_DRIVER "zfs"
-#define ZFS_DEV_NAME "zfs"
-#define ZFS_DEV "/dev/" ZFS_DEV_NAME
-#define ZFS_DISK_ROOT "/dev/dsk"
-#define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/"
-#define ZFS_RDISK_ROOT "/dev/rdsk"
-#define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/"
-
-/* general zvol path */
-#define ZVOL_DIR "/dev/zvol"
-/* expansion */
-#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:"
-/* for dump and swap */
-#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/"
-#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/"
-
-#define ZVOL_PROP_NAME "name"
-#define ZVOL_DEFAULT_BLOCKSIZE 8192
-
-/*
- * /dev/zfs ioctl numbers.
- *
- * These numbers cannot change over time. New ioctl numbers must be appended.
- */
-typedef enum zfs_ioc {
- /*
- * Core features - 81/128 numbers reserved.
- */
-#ifdef __FreeBSD__
- ZFS_IOC_FIRST = 0,
-#else
- ZFS_IOC_FIRST = ('Z' << 8),
-#endif
- ZFS_IOC = ZFS_IOC_FIRST,
- ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
- ZFS_IOC_POOL_DESTROY,
- ZFS_IOC_POOL_IMPORT,
- ZFS_IOC_POOL_EXPORT,
- ZFS_IOC_POOL_CONFIGS,
- ZFS_IOC_POOL_STATS,
- ZFS_IOC_POOL_TRYIMPORT,
- ZFS_IOC_POOL_SCAN,
- ZFS_IOC_POOL_FREEZE,
- ZFS_IOC_POOL_UPGRADE,
- ZFS_IOC_POOL_GET_HISTORY,
- ZFS_IOC_VDEV_ADD,
- ZFS_IOC_VDEV_REMOVE,
- ZFS_IOC_VDEV_SET_STATE,
- ZFS_IOC_VDEV_ATTACH,
- ZFS_IOC_VDEV_DETACH,
- ZFS_IOC_VDEV_SETPATH,
- ZFS_IOC_VDEV_SETFRU,
- ZFS_IOC_OBJSET_STATS,
- ZFS_IOC_OBJSET_ZPLPROPS,
- ZFS_IOC_DATASET_LIST_NEXT,
- ZFS_IOC_SNAPSHOT_LIST_NEXT,
- ZFS_IOC_SET_PROP,
- ZFS_IOC_CREATE,
- ZFS_IOC_DESTROY,
- ZFS_IOC_ROLLBACK,
- ZFS_IOC_RENAME,
- ZFS_IOC_RECV,
- ZFS_IOC_SEND,
- ZFS_IOC_INJECT_FAULT,
- ZFS_IOC_CLEAR_FAULT,
- ZFS_IOC_INJECT_LIST_NEXT,
- ZFS_IOC_ERROR_LOG,
- ZFS_IOC_CLEAR,
- ZFS_IOC_PROMOTE,
- ZFS_IOC_DESTROY_SNAPS,
- ZFS_IOC_SNAPSHOT,
- ZFS_IOC_DSOBJ_TO_DSNAME,
- ZFS_IOC_OBJ_TO_PATH,
- ZFS_IOC_POOL_SET_PROPS,
- ZFS_IOC_POOL_GET_PROPS,
- ZFS_IOC_SET_FSACL,
- ZFS_IOC_GET_FSACL,
- ZFS_IOC_SHARE,
- ZFS_IOC_INHERIT_PROP,
- ZFS_IOC_SMB_ACL,
- ZFS_IOC_USERSPACE_ONE,
- ZFS_IOC_USERSPACE_MANY,
- ZFS_IOC_USERSPACE_UPGRADE,
- ZFS_IOC_HOLD,
- ZFS_IOC_RELEASE,
- ZFS_IOC_GET_HOLDS,
- ZFS_IOC_OBJSET_RECVD_PROPS,
- ZFS_IOC_VDEV_SPLIT,
- ZFS_IOC_NEXT_OBJ,
- ZFS_IOC_DIFF,
- ZFS_IOC_TMP_SNAPSHOT,
- ZFS_IOC_OBJ_TO_STATS,
- ZFS_IOC_JAIL,
- ZFS_IOC_UNJAIL,
- ZFS_IOC_POOL_REGUID,
- ZFS_IOC_SPACE_WRITTEN,
- ZFS_IOC_SPACE_SNAPS,
- ZFS_IOC_SEND_PROGRESS,
- ZFS_IOC_POOL_REOPEN,
- ZFS_IOC_LOG_HISTORY,
- ZFS_IOC_SEND_NEW,
- ZFS_IOC_SEND_SPACE,
- ZFS_IOC_CLONE,
- ZFS_IOC_BOOKMARK,
- ZFS_IOC_GET_BOOKMARKS,
- ZFS_IOC_DESTROY_BOOKMARKS,
-#ifdef __FreeBSD__
- ZFS_IOC_NEXTBOOT,
-#endif
- ZFS_IOC_CHANNEL_PROGRAM,
- ZFS_IOC_REMAP,
- ZFS_IOC_POOL_CHECKPOINT,
- ZFS_IOC_POOL_DISCARD_CHECKPOINT,
- ZFS_IOC_POOL_INITIALIZE,
- ZFS_IOC_POOL_SYNC,
- ZFS_IOC_SET_BOOTENV,
- ZFS_IOC_GET_BOOTENV,
- ZFS_IOC_LAST
-} zfs_ioc_t;
-
-/*
- * ZFS-specific error codes used for returning descriptive errors
- * to the userland through zfs ioctls.
- *
- * The enum implicitly includes all the error codes from errno.h.
- * New code should use and extend this enum for errors that are
- * not described precisely by generic errno codes.
- *
- * These numbers should not change over time. New entries should be appended.
- */
-typedef enum {
- ZFS_ERR_CHECKPOINT_EXISTS = 1024,
- ZFS_ERR_DISCARDING_CHECKPOINT,
- ZFS_ERR_NO_CHECKPOINT,
- ZFS_ERR_DEVRM_IN_PROGRESS,
- ZFS_ERR_VDEV_TOO_BIG,
- ZFS_ERR_IOC_CMD_UNAVAIL,
- ZFS_ERR_IOC_ARG_UNAVAIL,
- ZFS_ERR_IOC_ARG_REQUIRED,
- ZFS_ERR_IOC_ARG_BADTYPE,
- ZFS_ERR_WRONG_PARENT,
-} zfs_errno_t;
-
-/*
- * Internal SPA load state. Used by FMA diagnosis engine.
- */
-typedef enum {
- SPA_LOAD_NONE, /* no load in progress */
- SPA_LOAD_OPEN, /* normal open */
- SPA_LOAD_IMPORT, /* import in progress */
- SPA_LOAD_TRYIMPORT, /* tryimport in progress */
- SPA_LOAD_RECOVER, /* recovery requested */
- SPA_LOAD_ERROR, /* load failed */
- SPA_LOAD_CREATE /* creation in progress */
-} spa_load_state_t;
-
-/*
- * Bookmark name values.
- */
-#define ZPOOL_ERR_LIST "error list"
-#define ZPOOL_ERR_DATASET "dataset"
-#define ZPOOL_ERR_OBJECT "object"
-
-#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)
-
-/*
- * The following are names used in the nvlist describing
- * the pool's history log.
- */
-#define ZPOOL_HIST_RECORD "history record"
-#define ZPOOL_HIST_TIME "history time"
-#define ZPOOL_HIST_CMD "history command"
-#define ZPOOL_HIST_WHO "history who"
-#define ZPOOL_HIST_ZONE "history zone"
-#define ZPOOL_HIST_HOST "history hostname"
-#define ZPOOL_HIST_TXG "history txg"
-#define ZPOOL_HIST_INT_EVENT "history internal event"
-#define ZPOOL_HIST_INT_STR "history internal str"
-#define ZPOOL_HIST_INT_NAME "internal_name"
-#define ZPOOL_HIST_IOCTL "ioctl"
-#define ZPOOL_HIST_INPUT_NVL "in_nvl"
-#define ZPOOL_HIST_OUTPUT_NVL "out_nvl"
-#define ZPOOL_HIST_DSNAME "dsname"
-#define ZPOOL_HIST_DSID "dsid"
-#define ZPOOL_HIST_ERRNO "errno"
-
-/*
- * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE.
- */
-#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
-#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
-
-/*
- * Flags for ZFS_IOC_VDEV_SET_STATE
- */
-#define ZFS_ONLINE_CHECKREMOVE 0x1
-#define ZFS_ONLINE_UNSPARE 0x2
-#define ZFS_ONLINE_FORCEFAULT 0x4
-#define ZFS_ONLINE_EXPAND 0x8
-#define ZFS_OFFLINE_TEMPORARY 0x1
-
-/*
- * Flags for ZFS_IOC_POOL_IMPORT
- */
-#define ZFS_IMPORT_NORMAL 0x0
-#define ZFS_IMPORT_VERBATIM 0x1
-#define ZFS_IMPORT_ANY_HOST 0x2
-#define ZFS_IMPORT_MISSING_LOG 0x4
-#define ZFS_IMPORT_ONLY 0x8
-#define ZFS_IMPORT_CHECKPOINT 0x10
-#define ZFS_IMPORT_TEMP_NAME 0x20
-#define ZFS_IMPORT_SKIP_MMP 0x40
-
-/*
- * Channel program argument/return nvlist keys and defaults.
- */
-#define ZCP_ARG_PROGRAM "program"
-#define ZCP_ARG_ARGLIST "arg"
-#define ZCP_ARG_SYNC "sync"
-#define ZCP_ARG_INSTRLIMIT "instrlimit"
-#define ZCP_ARG_MEMLIMIT "memlimit"
-
-#define ZCP_ARG_CLIARGV "argv"
-
-#define ZCP_RET_ERROR "error"
-#define ZCP_RET_RETURN "return"
-
-#define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000)
-#define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT)
-#define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024)
-#define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT)
-
-/*
- * nvlist name constants. Facilitate restricting snapshot iteration range for
- * the "list next snapshot" ioctl
- */
-#define SNAP_ITER_MIN_TXG "snap_iter_min_txg"
-#define SNAP_ITER_MAX_TXG "snap_iter_max_txg"
-
-/*
- * Sysevent payload members. ZFS will generate the following sysevents with the
- * given payloads:
- *
- * ESC_ZFS_RESILVER_START
- * ESC_ZFS_RESILVER_END
- * ESC_ZFS_POOL_DESTROY
- * ESC_ZFS_POOL_REGUID
- *
- * ZFS_EV_POOL_NAME DATA_TYPE_STRING
- * ZFS_EV_POOL_GUID DATA_TYPE_UINT64
- *
- * ESC_ZFS_VDEV_REMOVE
- * ESC_ZFS_VDEV_CLEAR
- * ESC_ZFS_VDEV_CHECK
- *
- * ZFS_EV_POOL_NAME DATA_TYPE_STRING
- * ZFS_EV_POOL_GUID DATA_TYPE_UINT64
- * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional)
- * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64
- *
- * ESC_ZFS_HISTORY_EVENT
- *
- * ZFS_EV_POOL_NAME DATA_TYPE_STRING
- * ZFS_EV_POOL_GUID DATA_TYPE_UINT64
- * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional)
- * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional)
- * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional)
- * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional)
- * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional)
- * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional)
- * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional)
- * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional)
- * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional)
- * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional)
- * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional)
- * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional)
- *
- * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the
- * history log nvlist. The keynames will be free of any spaces or other
- * characters that could be potentially unexpected to consumers of the
- * sysevents.
- */
-#define ZFS_EV_POOL_NAME "pool_name"
-#define ZFS_EV_POOL_GUID "pool_guid"
-#define ZFS_EV_VDEV_PATH "vdev_path"
-#define ZFS_EV_VDEV_GUID "vdev_guid"
-#define ZFS_EV_HIST_TIME "history_time"
-#define ZFS_EV_HIST_CMD "history_command"
-#define ZFS_EV_HIST_WHO "history_who"
-#define ZFS_EV_HIST_ZONE "history_zone"
-#define ZFS_EV_HIST_HOST "history_hostname"
-#define ZFS_EV_HIST_TXG "history_txg"
-#define ZFS_EV_HIST_INT_EVENT "history_internal_event"
-#define ZFS_EV_HIST_INT_STR "history_internal_str"
-#define ZFS_EV_HIST_INT_NAME "history_internal_name"
-#define ZFS_EV_HIST_IOCTL "history_ioctl"
-#define ZFS_EV_HIST_DSNAME "history_dsname"
-#define ZFS_EV_HIST_DSID "history_dsid"
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
@@ -1,93 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZUT_H
-#define _ZUT_H
-
-/*
- * IOCTLs for the zfs unit test driver
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#define ZUT_DRIVER "zut"
-#define ZUT_DEV "/dev/zut"
-
-#define ZUT_VERSION_STRING "1"
-
-/*
- * /dev/zut ioctl numbers.
- */
-#define ZUT_IOC ('U' << 8)
-
-/* Request flags */
-#define ZUT_IGNORECASE 0x01
-#define ZUT_ACCFILTER 0x02
-#define ZUT_XATTR 0x04
-#define ZUT_EXTRDDIR 0x08
-#define ZUT_GETSTAT 0x10
-
-typedef struct zut_lookup {
- int zl_reqflags;
- int zl_deflags; /* output */
- int zl_retcode; /* output */
- char zl_dir[MAXPATHLEN];
- char zl_file[MAXNAMELEN];
- char zl_xfile[MAXNAMELEN];
- char zl_real[MAXPATHLEN]; /* output */
- uint64_t zl_xvattrs; /* output */
- struct stat64 zl_statbuf; /* output */
-} zut_lookup_t;
-
-typedef struct zut_readdir {
- uint64_t zr_buf; /* pointer to output buffer */
- uint64_t zr_loffset; /* output */
- char zr_dir[MAXPATHLEN];
- char zr_file[MAXNAMELEN];
- int zr_reqflags;
- int zr_retcode; /* output */
- int zr_eof; /* output */
- uint_t zr_bytes; /* output */
- uint_t zr_buflen;
-} zut_readdir_t;
-
-typedef enum zut_ioc {
- ZUT_IOC_MIN_CMD = ZUT_IOC - 1,
- ZUT_IOC_LOOKUP = ZUT_IOC,
- ZUT_IOC_READDIR,
- ZUT_IOC_MAX_CMD
-} zut_ioc_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZUT_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
@@ -1,351 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _SYS_NVPAIR_H
-#define _SYS_NVPAIR_H
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/errno.h>
-
-#if defined(_KERNEL) && !defined(_BOOT)
-#include <sys/kmem.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
- DATA_TYPE_DONTCARE = -1,
- DATA_TYPE_UNKNOWN = 0,
- DATA_TYPE_BOOLEAN,
- DATA_TYPE_BYTE,
- DATA_TYPE_INT16,
- DATA_TYPE_UINT16,
- DATA_TYPE_INT32,
- DATA_TYPE_UINT32,
- DATA_TYPE_INT64,
- DATA_TYPE_UINT64,
- DATA_TYPE_STRING,
- DATA_TYPE_BYTE_ARRAY,
- DATA_TYPE_INT16_ARRAY,
- DATA_TYPE_UINT16_ARRAY,
- DATA_TYPE_INT32_ARRAY,
- DATA_TYPE_UINT32_ARRAY,
- DATA_TYPE_INT64_ARRAY,
- DATA_TYPE_UINT64_ARRAY,
- DATA_TYPE_STRING_ARRAY,
- DATA_TYPE_HRTIME,
- DATA_TYPE_NVLIST,
- DATA_TYPE_NVLIST_ARRAY,
- DATA_TYPE_BOOLEAN_VALUE,
- DATA_TYPE_INT8,
- DATA_TYPE_UINT8,
- DATA_TYPE_BOOLEAN_ARRAY,
- DATA_TYPE_INT8_ARRAY,
-#if !defined(_KERNEL)
- DATA_TYPE_UINT8_ARRAY,
- DATA_TYPE_DOUBLE
-#else
- DATA_TYPE_UINT8_ARRAY
-#endif
-} data_type_t;
-
-typedef struct nvpair {
- int32_t nvp_size; /* size of this nvpair */
- int16_t nvp_name_sz; /* length of name string */
- int16_t nvp_reserve; /* not used */
- int32_t nvp_value_elem; /* number of elements for array types */
- data_type_t nvp_type; /* type of value */
- /* name string */
- /* aligned ptr array for string arrays */
- /* aligned array of data for value */
-} nvpair_t;
-
-/* nvlist header */
-typedef struct nvlist {
- int32_t nvl_version;
- uint32_t nvl_nvflag; /* persistent flags */
- uint64_t nvl_priv; /* ptr to private data if not packed */
- uint32_t nvl_flag;
- int32_t nvl_pad; /* currently not used, for alignment */
-} nvlist_t;
-
-/* nvp implementation version */
-#define NV_VERSION 0
-
-/* nvlist pack encoding */
-#define NV_ENCODE_NATIVE 0
-#define NV_ENCODE_XDR 1
-
-/* nvlist persistent unique name flags, stored in nvl_nvflags */
-#define NV_UNIQUE_NAME 0x1
-#define NV_UNIQUE_NAME_TYPE 0x2
-
-/* nvlist lookup pairs related flags */
-#define NV_FLAG_NOENTOK 0x1
-
-/* convenience macros */
-#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul)
-#define NV_ALIGN4(x) (((x) + 3) & ~3)
-
-#define NVP_SIZE(nvp) ((nvp)->nvp_size)
-#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t))
-#define NVP_TYPE(nvp) ((nvp)->nvp_type)
-#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem)
-#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \
- + (nvp)->nvp_name_sz))
-
-#define NVL_VERSION(nvl) ((nvl)->nvl_version)
-#define NVL_SIZE(nvl) ((nvl)->nvl_size)
-#define NVL_FLAG(nvl) ((nvl)->nvl_flag)
-
-/* NV allocator framework */
-typedef struct nv_alloc_ops nv_alloc_ops_t;
-
-typedef struct nv_alloc {
- const nv_alloc_ops_t *nva_ops;
- void *nva_arg;
-} nv_alloc_t;
-
-struct nv_alloc_ops {
- int (*nv_ao_init)(nv_alloc_t *, __va_list);
- void (*nv_ao_fini)(nv_alloc_t *);
- void *(*nv_ao_alloc)(nv_alloc_t *, size_t);
- void (*nv_ao_free)(nv_alloc_t *, void *, size_t);
- void (*nv_ao_reset)(nv_alloc_t *);
-};
-
-extern const nv_alloc_ops_t *nv_fixed_ops;
-extern nv_alloc_t *nv_alloc_nosleep;
-
-#if defined(_KERNEL) && !defined(_BOOT)
-extern nv_alloc_t *nv_alloc_sleep;
-#endif
-
-int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...);
-void nv_alloc_reset(nv_alloc_t *);
-void nv_alloc_fini(nv_alloc_t *);
-
-/* list management */
-int nvlist_alloc(nvlist_t **, uint_t, int);
-void nvlist_free(nvlist_t *);
-int nvlist_size(nvlist_t *, size_t *, int);
-int nvlist_pack(nvlist_t *, char **, size_t *, int, int);
-int nvlist_unpack(char *, size_t, nvlist_t **, int);
-int nvlist_dup(nvlist_t *, nvlist_t **, int);
-int nvlist_merge(nvlist_t *, nvlist_t *, int);
-
-uint_t nvlist_nvflag(nvlist_t *);
-
-int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
-int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
-int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
-int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *);
-nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *);
-
-int nvlist_add_nvpair(nvlist_t *, nvpair_t *);
-int nvlist_add_boolean(nvlist_t *, const char *);
-int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
-int nvlist_add_byte(nvlist_t *, const char *, uchar_t);
-int nvlist_add_int8(nvlist_t *, const char *, int8_t);
-int nvlist_add_uint8(nvlist_t *, const char *, uint8_t);
-int nvlist_add_int16(nvlist_t *, const char *, int16_t);
-int nvlist_add_uint16(nvlist_t *, const char *, uint16_t);
-int nvlist_add_int32(nvlist_t *, const char *, int32_t);
-int nvlist_add_uint32(nvlist_t *, const char *, uint32_t);
-int nvlist_add_int64(nvlist_t *, const char *, int64_t);
-int nvlist_add_uint64(nvlist_t *, const char *, uint64_t);
-int nvlist_add_string(nvlist_t *, const char *, const char *);
-int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
-int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
-int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
-int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
-int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
-int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
-int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
-int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
-int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
-int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
-int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
-int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t);
-int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
-int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t);
-#if !defined(_KERNEL)
-int nvlist_add_double(nvlist_t *, const char *, double);
-#endif
-
-int nvlist_remove(nvlist_t *, const char *, data_type_t);
-int nvlist_remove_all(nvlist_t *, const char *);
-int nvlist_remove_nvpair(nvlist_t *, nvpair_t *);
-
-int nvlist_lookup_boolean(nvlist_t *, const char *);
-int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
-int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *);
-int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *);
-int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *);
-int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *);
-int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *);
-int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *);
-int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *);
-int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *);
-int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *);
-int nvlist_lookup_string(nvlist_t *, const char *, char **);
-int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **);
-int nvlist_lookup_boolean_array(nvlist_t *, const char *,
- boolean_t **, uint_t *);
-int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *);
-int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *);
-int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *);
-int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *);
-int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *);
-int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *);
-int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *);
-int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *);
-int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *);
-int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *);
-int nvlist_lookup_nvlist_array(nvlist_t *, const char *,
- nvlist_t ***, uint_t *);
-int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *);
-int nvlist_lookup_pairs(nvlist_t *, int, ...);
-#if !defined(_KERNEL)
-int nvlist_lookup_double(nvlist_t *, const char *, double *);
-#endif
-
-int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **);
-int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **,
- int *, char **);
-boolean_t nvlist_exists(nvlist_t *, const char *);
-boolean_t nvlist_empty(nvlist_t *);
-
-/* processing nvpair */
-nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *);
-nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *);
-char *nvpair_name(nvpair_t *);
-data_type_t nvpair_type(nvpair_t *);
-int nvpair_type_is_array(nvpair_t *);
-int nvpair_value_boolean_value(nvpair_t *, boolean_t *);
-int nvpair_value_byte(nvpair_t *, uchar_t *);
-int nvpair_value_int8(nvpair_t *, int8_t *);
-int nvpair_value_uint8(nvpair_t *, uint8_t *);
-int nvpair_value_int16(nvpair_t *, int16_t *);
-int nvpair_value_uint16(nvpair_t *, uint16_t *);
-int nvpair_value_int32(nvpair_t *, int32_t *);
-int nvpair_value_uint32(nvpair_t *, uint32_t *);
-int nvpair_value_int64(nvpair_t *, int64_t *);
-int nvpair_value_uint64(nvpair_t *, uint64_t *);
-int nvpair_value_string(nvpair_t *, char **);
-int nvpair_value_nvlist(nvpair_t *, nvlist_t **);
-int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *);
-int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *);
-int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *);
-int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *);
-int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *);
-int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *);
-int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *);
-int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *);
-int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *);
-int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *);
-int nvpair_value_string_array(nvpair_t *, char ***, uint_t *);
-int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *);
-int nvpair_value_hrtime(nvpair_t *, hrtime_t *);
-#if !defined(_KERNEL)
-int nvpair_value_double(nvpair_t *, double *);
-#endif
-
-nvlist_t *fnvlist_alloc(void);
-void fnvlist_free(nvlist_t *);
-size_t fnvlist_size(nvlist_t *);
-char *fnvlist_pack(nvlist_t *, size_t *);
-void fnvlist_pack_free(char *, size_t);
-nvlist_t *fnvlist_unpack(char *, size_t);
-nvlist_t *fnvlist_dup(nvlist_t *);
-void fnvlist_merge(nvlist_t *, nvlist_t *);
-size_t fnvlist_num_pairs(nvlist_t *);
-
-void fnvlist_add_boolean(nvlist_t *, const char *);
-void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
-void fnvlist_add_byte(nvlist_t *, const char *, uchar_t);
-void fnvlist_add_int8(nvlist_t *, const char *, int8_t);
-void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t);
-void fnvlist_add_int16(nvlist_t *, const char *, int16_t);
-void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t);
-void fnvlist_add_int32(nvlist_t *, const char *, int32_t);
-void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t);
-void fnvlist_add_int64(nvlist_t *, const char *, int64_t);
-void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t);
-void fnvlist_add_string(nvlist_t *, const char *, const char *);
-void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
-void fnvlist_add_nvpair(nvlist_t *, nvpair_t *);
-void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
-void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
-void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
-void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
-void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
-void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
-void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
-void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
-void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
-void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
-void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t);
-void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
-
-void fnvlist_remove(nvlist_t *, const char *);
-void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *);
-
-nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name);
-boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name);
-boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name);
-uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name);
-int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name);
-int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name);
-int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name);
-int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name);
-uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name);
-uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name);
-uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name);
-uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name);
-char *fnvlist_lookup_string(nvlist_t *nvl, const char *name);
-nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name);
-
-boolean_t fnvpair_value_boolean_value(nvpair_t *nvp);
-uchar_t fnvpair_value_byte(nvpair_t *nvp);
-int8_t fnvpair_value_int8(nvpair_t *nvp);
-int16_t fnvpair_value_int16(nvpair_t *nvp);
-int32_t fnvpair_value_int32(nvpair_t *nvp);
-int64_t fnvpair_value_int64(nvpair_t *nvp);
-uint8_t fnvpair_value_uint8_t(nvpair_t *nvp);
-uint16_t fnvpair_value_uint16(nvpair_t *nvp);
-uint32_t fnvpair_value_uint32(nvpair_t *nvp);
-uint64_t fnvpair_value_uint64(nvpair_t *nvp);
-char *fnvpair_value_string(nvpair_t *nvp);
-nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_NVPAIR_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
@@ -1,90 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-#ifndef _NVPAIR_IMPL_H
-#define _NVPAIR_IMPL_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/nvpair.h>
-
-/*
- * The structures here provided for information and debugging purposes only
- * may be changed in the future.
- */
-
-/*
- * implementation linked list for pre-packed data
- */
-typedef struct i_nvp i_nvp_t;
-
-struct i_nvp {
- union {
- /* ensure alignment */
- uint64_t _nvi_align;
-
- struct {
- /* pointer to next nvpair */
- i_nvp_t *_nvi_next;
-
- /* pointer to prev nvpair */
- i_nvp_t *_nvi_prev;
-
- /* next pair in table bucket */
- i_nvp_t *_nvi_hashtable_next;
- } _nvi;
- } _nvi_un;
-
- /* nvpair */
- nvpair_t nvi_nvp;
-};
-#define nvi_next _nvi_un._nvi._nvi_next
-#define nvi_prev _nvi_un._nvi._nvi_prev
-#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next
-
-typedef struct {
- i_nvp_t *nvp_list; /* linked list of nvpairs */
- i_nvp_t *nvp_last; /* last nvpair */
- i_nvp_t *nvp_curr; /* current walker nvpair */
- nv_alloc_t *nvp_nva; /* pluggable allocator */
- uint32_t nvp_stat; /* internal state */
-
- i_nvp_t **nvp_hashtable; /* table of entries used for lookup */
- uint32_t nvp_nbuckets; /* # of buckets in hash table */
- uint32_t nvp_nentries; /* # of entries in hash table */
-} nvpriv_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _NVPAIR_IMPL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
@@ -1,427 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 RackTop Systems.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * University Copyright- Copyright (c) 1982, 1986, 1988
- * The Regents of the University of California
- * All Rights Reserved
- *
- * University Acknowledgment- Portions of this document are derived from
- * software developed by the University of California, Berkeley, and its
- * contributors.
- */
-
-#ifndef _SYS_VNODE_H
-#define _SYS_VNODE_H
-
-#include_next <sys/vnode.h>
-
-#define IS_DEVVP(vp) \
- ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
-
-#define V_XATTRDIR 0x0000 /* attribute unnamed directory */
-
-#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */
-
-/*
- * Structure of all optional attributes.
- */
-typedef struct xoptattr {
- timestruc_t xoa_createtime; /* Create time of file */
- uint8_t xoa_archive;
- uint8_t xoa_system;
- uint8_t xoa_readonly;
- uint8_t xoa_hidden;
- uint8_t xoa_nounlink;
- uint8_t xoa_immutable;
- uint8_t xoa_appendonly;
- uint8_t xoa_nodump;
- uint8_t xoa_opaque;
- uint8_t xoa_av_quarantined;
- uint8_t xoa_av_modified;
- uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ];
- uint8_t xoa_reparse;
- uint64_t xoa_generation;
- uint8_t xoa_offline;
- uint8_t xoa_sparse;
-} xoptattr_t;
-
-/*
- * The xvattr structure is really a variable length structure that
- * is made up of:
- * - The classic vattr_t (xva_vattr)
- * - a 32 bit quantity (xva_mapsize) that specifies the size of the
- * attribute bitmaps in 32 bit words.
- * - A pointer to the returned attribute bitmap (needed because the
- * previous element, the requested attribute bitmap) is variable lenth.
- * - The requested attribute bitmap, which is an array of 32 bit words.
- * Callers use the XVA_SET_REQ() macro to set the bits corresponding to
- * the attributes that are being requested.
- * - The returned attribute bitmap, which is an array of 32 bit words.
- * File systems that support optional attributes use the XVA_SET_RTN()
- * macro to set the bits corresponding to the attributes that are being
- * returned.
- * - The xoptattr_t structure which contains the attribute values
- *
- * xva_mapsize determines how many words in the attribute bitmaps.
- * Immediately following the attribute bitmaps is the xoptattr_t.
- * xva_getxoptattr() is used to get the pointer to the xoptattr_t
- * section.
- */
-
-#define XVA_MAPSIZE 3 /* Size of attr bitmaps */
-#define XVA_MAGIC 0x78766174 /* Magic # for verification */
-
-/*
- * The xvattr structure is an extensible structure which permits optional
- * attributes to be requested/returned. File systems may or may not support
- * optional attributes. They do so at their own discretion but if they do
- * support optional attributes, they must register the VFSFT_XVATTR feature
- * so that the optional attributes can be set/retrived.
- *
- * The fields of the xvattr structure are:
- *
- * xva_vattr - The first element of an xvattr is a legacy vattr structure
- * which includes the common attributes. If AT_XVATTR is set in the va_mask
- * then the entire structure is treated as an xvattr. If AT_XVATTR is not
- * set, then only the xva_vattr structure can be used.
- *
- * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification.
- *
- * xva_mapsize - Size of requested and returned attribute bitmaps.
- *
- * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the
- * size of the array before it, xva_reqattrmap[], could change which means
- * the location of xva_rtnattrmap[] could change. This will allow unbundled
- * file systems to find the location of xva_rtnattrmap[] when the sizes change.
- *
- * xva_reqattrmap[] - Array of requested attributes. Attributes are
- * represented by a specific bit in a specific element of the attribute
- * map array. Callers set the bits corresponding to the attributes
- * that the caller wants to get/set.
- *
- * xva_rtnattrmap[] - Array of attributes that the file system was able to
- * process. Not all file systems support all optional attributes. This map
- * informs the caller which attributes the underlying file system was able
- * to set/get. (Same structure as the requested attributes array in terms
- * of each attribute corresponding to specific bits and array elements.)
- *
- * xva_xoptattrs - Structure containing values of optional attributes.
- * These values are only valid if the corresponding bits in xva_reqattrmap
- * are set and the underlying file system supports those attributes.
- */
-typedef struct xvattr {
- vattr_t xva_vattr; /* Embedded vattr structure */
- uint32_t xva_magic; /* Magic Number */
- uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */
- uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */
- uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */
- uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */
- xoptattr_t xva_xoptattrs; /* Optional attributes */
-} xvattr_t;
-
-/*
- * Attributes of interest to the caller of setattr or getattr.
- */
-#define AT_TYPE 0x00001
-#define AT_MODE 0x00002
-#define AT_UID 0x00004
-#define AT_GID 0x00008
-#define AT_FSID 0x00010
-#define AT_NODEID 0x00020
-#define AT_NLINK 0x00040
-#define AT_SIZE 0x00080
-#define AT_ATIME 0x00100
-#define AT_MTIME 0x00200
-#define AT_CTIME 0x00400
-#define AT_RDEV 0x00800
-#define AT_BLKSIZE 0x01000
-#define AT_NBLOCKS 0x02000
-/* 0x04000 */ /* unused */
-#define AT_SEQ 0x08000
-/*
- * If AT_XVATTR is set then there are additional bits to process in
- * the xvattr_t's attribute bitmap. If this is not set then the bitmap
- * MUST be ignored. Note that this bit must be set/cleared explicitly.
- * That is, setting AT_ALL will NOT set AT_XVATTR.
- */
-#define AT_XVATTR 0x10000
-
-#define AT_ALL (AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
- AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\
- AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
-
-#define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
- AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE)
-
-#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME)
-
-#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
- AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
-
-/*
- * Attribute bits used in the extensible attribute's (xva's) attribute
- * bitmaps. Note that the bitmaps are made up of a variable length number
- * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n"
- * is the element in the bitmap (starting at 1). This convention is for
- * the convenience of the maintainer to keep track of which element each
- * attribute belongs to.
- *
- * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS
- * MUST USE THE XAT_* DEFINES.
- */
-#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */
-#define XAT0_CREATETIME 0x00000001 /* Create time of file */
-#define XAT0_ARCHIVE 0x00000002 /* Archive */
-#define XAT0_SYSTEM 0x00000004 /* System */
-#define XAT0_READONLY 0x00000008 /* Readonly */
-#define XAT0_HIDDEN 0x00000010 /* Hidden */
-#define XAT0_NOUNLINK 0x00000020 /* Nounlink */
-#define XAT0_IMMUTABLE 0x00000040 /* immutable */
-#define XAT0_APPENDONLY 0x00000080 /* appendonly */
-#define XAT0_NODUMP 0x00000100 /* nodump */
-#define XAT0_OPAQUE 0x00000200 /* opaque */
-#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */
-#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */
-#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */
-#define XAT0_REPARSE 0x00002000 /* FS reparse point */
-#define XAT0_GEN 0x00004000 /* object generation number */
-#define XAT0_OFFLINE 0x00008000 /* offline */
-#define XAT0_SPARSE 0x00010000 /* sparse */
-
-#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \
- XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \
- XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| XAT0_AV_MODIFIED| \
- XAT0_AV_SCANSTAMP|XAT0_REPARSE|XATO_GEN|XAT0_OFFLINE|XAT0_SPARSE)
-
-/* Support for XAT_* optional attributes */
-#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */
-#define XVA_SHFT 32 /* Used to shift index */
-
-/*
- * Used to pry out the index and attribute bits from the XAT_* attributes
- * defined below. Note that we're masking things down to 32 bits then
- * casting to uint32_t.
- */
-#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK))
-#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK))
-
-/*
- * The following defines present a "flat namespace" so that consumers don't
- * need to keep track of which element belongs to which bitmap entry.
- *
- * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER
- */
-#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME)
-#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE)
-#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM)
-#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY)
-#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN)
-#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK)
-#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE)
-#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY)
-#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP)
-#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE)
-#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED)
-#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED)
-#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP)
-#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE)
-#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN)
-#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE)
-#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE)
-
-/*
- * The returned attribute map array (xva_rtnattrmap[]) is located past the
- * requested attribute map array (xva_reqattrmap[]). Its location changes
- * when the array sizes change. We use a separate pointer in a known location
- * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is
- * set in xva_init()
- */
-#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp)
-
-/*
- * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap
- * of requested attributes (xva_reqattrmap[]).
- */
-#define XVA_SET_REQ(xvap, attr) { \
- ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \
- ASSERT((xvap)->xva_magic == XVA_MAGIC); \
- (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
-}
-/*
- * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap
- * of requested attributes (xva_reqattrmap[]).
- */
-#define XVA_CLR_REQ(xvap, attr) { \
- ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \
- ASSERT((xvap)->xva_magic == XVA_MAGIC); \
- (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr); \
-}
-
-/*
- * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap
- * of returned attributes (xva_rtnattrmap[]).
- */
-#define XVA_SET_RTN(xvap, attr) { \
- ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \
- ASSERT((xvap)->xva_magic == XVA_MAGIC); \
- (XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
-}
-
-/*
- * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[])
- * to see of the corresponding attribute bit is set. If so, returns non-zero.
- */
-#define XVA_ISSET_REQ(xvap, attr) \
- ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \
- ((xvap)->xva_magic == XVA_MAGIC) && \
- ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \
- ((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
-
-/*
- * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[])
- * to see of the corresponding attribute bit is set. If so, returns non-zero.
- */
-#define XVA_ISSET_RTN(xvap, attr) \
- ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \
- ((xvap)->xva_magic == XVA_MAGIC) && \
- ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \
- ((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
-
-#define MODEMASK 07777 /* mode bits plus permission bits */
-#define PERMMASK 00777 /* permission bits */
-
-/*
- * VOP_ACCESS flags
- */
-#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */
-
-/*
- * Flags for vnode operations.
- */
-enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */
-enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */
-
-/*
- * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations
- */
-
-typedef struct vsecattr {
- uint_t vsa_mask; /* See below */
- int vsa_aclcnt; /* ACL entry count */
- void *vsa_aclentp; /* pointer to ACL entries */
- int vsa_dfaclcnt; /* default ACL entry count */
- void *vsa_dfaclentp; /* pointer to default ACL entries */
- size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */
- uint_t vsa_aclflags; /* ACE ACL flags */
-} vsecattr_t;
-
-/* vsa_mask values */
-#define VSA_ACL 0x0001
-#define VSA_ACLCNT 0x0002
-#define VSA_DFACL 0x0004
-#define VSA_DFACLCNT 0x0008
-#define VSA_ACE 0x0010
-#define VSA_ACECNT 0x0020
-#define VSA_ACE_ALLTYPES 0x0040
-#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */
-
-/*
- * Structure used by various vnode operations to determine
- * the context (pid, host, identity) of a caller.
- *
- * The cc_caller_id is used to identify one or more callers who invoke
- * operations, possibly on behalf of others. For example, the NFS
- * server could have it's own cc_caller_id which can be detected by
- * vnode/vfs operations or (FEM) monitors on those operations. New
- * caller IDs are generated by fs_new_caller_id().
- */
-typedef struct caller_context {
- pid_t cc_pid; /* Process ID of the caller */
- int cc_sysid; /* System ID, used for remote calls */
- u_longlong_t cc_caller_id; /* Identifier for (set of) caller(s) */
- ulong_t cc_flags;
-} caller_context_t;
-
-struct taskq;
-
-/*
- * Flags for VOP_LOOKUP
- *
- * Defined in file.h, but also possible, FIGNORECASE and FSEARCH
- *
- */
-#define LOOKUP_DIR 0x01 /* want parent dir vp */
-#define LOOKUP_XATTR 0x02 /* lookup up extended attr dir */
-#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */
-#define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */
-
-/*
- * Flags for VOP_READDIR
- */
-#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */
-#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */
-
-/*
- * Public vnode manipulation functions.
- */
-#ifdef _KERNEL
-
-void vn_rele_async(struct vnode *vp, struct taskq *taskq);
-
-/*
- * Extensible vnode attribute (xva) routines:
- * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XATTR)
- * xva_getxoptattr() returns a ponter to the xoptattr_t section of xvattr_t
- */
-void xva_init(xvattr_t *);
-xoptattr_t *xva_getxoptattr(xvattr_t *); /* Get ptr to xoptattr_t */
-
-#define VN_RELE_ASYNC(vp, taskq) { \
- vn_rele_async(vp, taskq); \
-}
-
-#endif /* _KERNEL */
-
-/*
- * Flags to VOP_SETATTR/VOP_GETATTR.
- */
-#define ATTR_UTIME 0x01 /* non-default utime(2) request */
-#define ATTR_EXEC 0x02 /* invocation from exec(2) */
-#define ATTR_COMM 0x04 /* yield common vp attributes */
-#define ATTR_HINT 0x08 /* information returned will be `hint' */
-#define ATTR_REAL 0x10 /* yield attributes of the real vp */
-#define ATTR_NOACLCHECK 0x20 /* Don't check ACL when checking permissions */
-#define ATTR_TRIGGER 0x40 /* Mount first if vnode is a trigger mount */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VNODE_H */
Index: head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
===================================================================
--- head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
+++ head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
@@ -36,7 +36,6 @@
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
-#include <sys/kmem.h>
#include <sys/smp.h>
#include <sys/dtrace_impl.h>
#include <sys/dtrace_bsd.h>
Index: head/sys/cddl/dev/fbt/fbt.c
===================================================================
--- head/sys/cddl/dev/fbt/fbt.c
+++ head/sys/cddl/dev/fbt/fbt.c
@@ -34,6 +34,7 @@
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
+#include <sys/endian.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kdb.h>
Index: head/sys/cddl/dev/profile/profile.c
===================================================================
--- head/sys/cddl/dev/profile/profile.c
+++ head/sys/cddl/dev/profile/profile.c
@@ -34,6 +34,7 @@
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
+#include <sys/endian.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kdb.h>
Index: head/sys/cddl/dev/sdt/sdt.c
===================================================================
--- head/sys/cddl/dev/sdt/sdt.c
+++ head/sys/cddl/dev/sdt/sdt.c
@@ -44,6 +44,7 @@
#include <sys/systm.h>
#include <sys/conf.h>
+#include <sys/endian.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/limits.h>
Index: head/sys/cddl/dev/systrace/systrace.c
===================================================================
--- head/sys/cddl/dev/systrace/systrace.c
+++ head/sys/cddl/dev/systrace/systrace.c
@@ -290,7 +290,7 @@
static void
systrace_destroy(void *arg, dtrace_id_t id, void *parg)
{
-#ifdef DEBUG
+#ifdef SYSTRACE_DEBUG
int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
/*
Index: head/sys/conf/files
===================================================================
--- head/sys/conf/files
+++ head/sys/conf/files
@@ -133,184 +133,231 @@
cam/scsi/scsi_targ_bh.c optional targbh
cam/scsi/scsi_target.c optional targ
cam/scsi/smp_all.c optional scbus
+
# shared between zfs and dtrace
-cddl/compat/opensolaris/kern/opensolaris.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_cmn_err.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_kmem.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_misc.c optional zfs | dtrace compile-with "${CDDL_C}"
+cddl/compat/opensolaris/kern/opensolaris.c optional dtrace compile-with "${CDDL_C}"
cddl/compat/opensolaris/kern/opensolaris_proc.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_sunddi.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_taskq.c optional zfs | dtrace compile-with "${CDDL_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_misc.c optional zfs | dtrace compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c optional zfs | dtrace compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_taskq.c optional zfs | dtrace compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_kmem.c optional zfs | dtrace compile-with "${ZFS_C}"
+
+#zfs solaris portability layer
+contrib/openzfs/module/os/freebsd/spl/acl_common.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/callb.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/list.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_acl.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_kstat.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_policy.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_string.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_uio.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_vfs.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_vm.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_zone.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_zlib.c optional zfs compile-with "${ZFS_C}"
+
+
# zfs specific
-cddl/compat/opensolaris/kern/opensolaris_acl.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_dtrace.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_kobj.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_kstat.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_lookup.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_policy.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_string.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_sysevent.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_uio.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_vfs.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_vm.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_zone.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/acl/acl_common.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/avl/avl.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/lz4/lz4.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/unicode/u8_textprep.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfeature_common.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_comutil.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_deleg.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_prop.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zpool_prop.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \
- warning "kernel contains CDDL licensed ZFS filesystem"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/os/callb.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/os/fm.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/os/list.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/zmod/zmod.c optional zfs compile-with "${ZFS_C}"
+
+#zfs avl
+contrib/openzfs/module/avl/avl.c optional zfs compile-with "${ZFS_C}"
+
# zfs lua support
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lapi.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lauxlib.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lbaselib.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lcode.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lcompat.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lcorolib.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lctype.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/ldebug.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/ldo.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lfunc.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lgc.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/llex.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lmem.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lobject.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lopcodes.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lparser.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lstate.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lstring.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lstrlib.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/ltable.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/ltablib.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/ltm.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lvm.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/lua/lzio.c optional zfs compile-with "${ZFS_C}"
+
+# zfs nvpair support
+contrib/openzfs/module/nvpair/fnvpair.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/nvpair/nvpair.c optional zfs compile-with "${ZFS_RPC_C}"
+contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/nvpair/nvpair_alloc_spl.c optional zfs compile-with "${ZFS_C}"
+
+#zfs platform compatibility code
+contrib/openzfs/module/os/freebsd/zfs/abd_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/arc_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/crypto_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/dmu_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/hkdf.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/kmod_core.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/spa_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/spa_stats.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c optional zfs compile-with "${ZFS_C} -include $S/modules/zfs/zfs_config.h"
+contrib/openzfs/module/os/freebsd/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/zfs/zvol_os.c optional zfs compile-with "${ZFS_C}"
+
+#zfs unicode support
+contrib/openzfs/module/unicode/uconv.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/unicode/u8_textprep.c optional zfs compile-with "${ZFS_C}"
+
+#zfs checksums / zcommon
+contrib/openzfs/module/zcommon/cityhash.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfeature_common.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_comutil.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_deleg.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_fletcher.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_namecheck.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_prop.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zpool_prop.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zprop_common.c optional zfs compile-with "${ZFS_C}"
+
+#zfs core common code
+contrib/openzfs/module/zfs/abd.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/aggsum.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/arc.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/blkptr.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/bplist.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/bpobj.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/bptree.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/btree.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/bqueue.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dataset_kstats.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_recv.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_redact.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \
+ warning "kernel contains CDDL licensed ZFS filesystem"
+contrib/openzfs/module/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_crypt.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_dir.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/fm.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/gzip.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/lzjb.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/lz4.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/metaslab.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/mmp.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/multilist.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/objlist.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/pathname.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/range_tree.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/refcount.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/sa.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/sha256.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa_boot.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa_checkpoint.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa_config.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa_history.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa_log_spacemap.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/space_map.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/txg.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/uberblock.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/unique.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_initialize.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_rebuild.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_trim.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zap.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zcp.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zcp_set.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfeature.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_quota.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_ratelimit.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_replay.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zstd/zfs_zstd.c optional zfs zstdio compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zil.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zio.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zle.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zrlock.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zthr.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/zvol.c optional zfs compile-with "${ZFS_C}"
+
# dtrace specific
cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c optional dtrace compile-with "${DTRACE_C}" \
warning "kernel contains CDDL licensed DTRACE"
Index: head/sys/conf/files.amd64
===================================================================
--- head/sys/conf/files.amd64
+++ head/sys/conf/files.amd64
@@ -463,3 +463,13 @@
x86/xen/pv.c optional xenhvm
x86/xen/pvcpu_enum.c optional xenhvm
x86/xen/xen_pci_bus.c optional xenhvm
+
+contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_fletcher_intel.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zcommon/zfs_fletcher_sse.c optional zfs compile-with "${ZFS_C}"
+
+contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c optional zfs compile-with "${ZFS_C}"
Index: head/sys/conf/kern.pre.mk
===================================================================
--- head/sys/conf/kern.pre.mk
+++ head/sys/conf/kern.pre.mk
@@ -208,34 +208,82 @@
ZSTD_DECOMPRESS_BLOCK_FLAGS= -fno-tree-vectorize
.endif
+ZINCDIR=$S/contrib/openzfs/include
# Common for dtrace / zfs
-CDDL_CFLAGS= -DFREEBSD_NAMECACHE -nostdinc -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S -I$S/cddl/contrib/opensolaris/common ${CFLAGS} -Wno-unknown-pragmas -Wno-missing-prototypes -Wno-undef -Wno-strict-prototypes -Wno-cast-qual -Wno-parentheses -Wno-redundant-decls -Wno-missing-braces -Wno-uninitialized -Wno-unused -Wno-inline -Wno-switch -Wno-pointer-arith -Wno-unknown-pragmas
-CDDL_CFLAGS+= -include $S/cddl/compat/opensolaris/sys/debug_compat.h
+CDDL_CFLAGS= \
+ -DFREEBSD_NAMECACHE \
+ -D_SYS_VMEM_H_ \
+ -D__KERNEL \
+ -D__KERNEL__ \
+ -nostdinc \
+ -include $S/modules/zfs/static_ccompile.h \
+ -I${ZINCDIR} \
+ -I${ZINCDIR}/spl \
+ -I${ZINCDIR}/os/freebsd \
+ -I${ZINCDIR}/os/freebsd/spl \
+ -I${ZINCDIR}/os/freebsd/zfs \
+ -I$S/modules/zfs \
+ -I$S/contrib/openzfs/module/zstd/include \
+ -I$S/contrib/openzfs/module/zstd/lib/freebsd/ \
+ ${CFLAGS} \
+ -Wno-unknown-pragmas \
+ -Wno-missing-prototypes \
+ -Wno-undef \
+ -Wno-strict-prototypes \
+ -Wno-cast-qual \
+ -Wno-parentheses \
+ -Wno-redundant-decls \
+ -Wno-missing-braces \
+ -Wno-uninitialized \
+ -Wno-unused \
+ -Wno-inline \
+ -Wno-switch \
+ -Wno-pointer-arith \
+ -Wno-unknown-pragmas \
+ -Wno-duplicate-decl-specifier \
+ -include ${ZINCDIR}/os/freebsd/spl/sys/ccompile.h \
+ -I$S/cddl/contrib/opensolaris/uts/common \
+ -I$S -I$S/cddl/compat/opensolaris
CDDL_C= ${CC} -c ${CDDL_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
# Special flags for managing the compat compiles for ZFS
-ZFS_CFLAGS= -DBUILDING_ZFS -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs/lua
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/zmod
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/lz4
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/zfs
-ZFS_CFLAGS+= ${CDDL_CFLAGS}
+ZFS_CFLAGS+= ${CDDL_CFLAGS} -DBUILDING_ZFS -DHAVE_UIO_ZEROCOPY \
+ -DWITH_NETDUMP -D__KERNEL__ -D_SYS_CONDVAR_H_ -DSMP \
+ -DIN_FREEBSD_BASE -DHAVE_KSID
+
+.if ${MACHINE_ARCH} == "amd64"
+ZFS_CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F \
+ -DHAVE_SSSE3 -DHAVE_AVX512BW
+.endif
+
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+ ${MACHINE_ARCH} == "arm"
+ZFS_CFLAGS+= -DBITS_PER_LONG=32
+.else
+ZFS_CFLAGS+= -DBITS_PER_LONG=64
+.endif
+
+
ZFS_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${ZFS_CFLAGS}
ZFS_C= ${CC} -c ${ZFS_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
+ZFS_RPC_C= ${CC} -c ${ZFS_CFLAGS} -DHAVE_RPC_TYPES ${WERROR} ${PROF} ${.IMPSRC}
ZFS_S= ${CC} -c ${ZFS_ASM_CFLAGS} ${WERROR} ${.IMPSRC}
+
+
# Special flags for managing the compat compiles for DTrace
DTRACE_CFLAGS= -DBUILDING_DTRACE ${CDDL_CFLAGS} -I$S/cddl/dev/dtrace -I$S/cddl/dev/dtrace/${MACHINE_CPUARCH}
.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/intel -I$S/cddl/dev/dtrace/x86
.endif
-DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP
+DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP -I$S/cddl/compat/opensolaris
+DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common
DTRACE_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${DTRACE_CFLAGS}
DTRACE_C= ${CC} -c ${DTRACE_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
DTRACE_S= ${CC} -c ${DTRACE_ASM_CFLAGS} ${WERROR} ${.IMPSRC}
# Special flags for managing the compat compiles for DTrace/FBT
-FBT_CFLAGS= -DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S ${CDDL_CFLAGS}
+FBT_CFLAGS= -DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt ${CDDL_CFLAGS} -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common
.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
FBT_CFLAGS+= -I$S/cddl/dev/fbt/x86
.endif
Index: head/sys/conf/kmod.mk
===================================================================
--- head/sys/conf/kmod.mk
+++ head/sys/conf/kmod.mk
@@ -532,6 +532,22 @@
OBJS_DEPEND_GUESS+= opt_global.h
.endif
+ZINCDIR=${SYSDIR}/contrib/openzfs/include
+OPENZFS_CFLAGS= \
+ -D_SYS_VMEM_H_ \
+ -D__KERNEL__ \
+ -nostdinc \
+ -DSMP \
+ -I${ZINCDIR} \
+ -I${ZINCDIR}/spl \
+ -I${ZINCDIR}/os/freebsd \
+ -I${ZINCDIR}/os/freebsd/spl \
+ -I${ZINCDIR}/os/freebsd/zfs \
+ -I${SYSDIR}/cddl/compat/opensolaris \
+ -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
+ -include ${ZINCDIR}/os/freebsd/spl/sys/ccompile.h
+
+
.include <bsd.dep.mk>
.include <bsd.clang-analyze.mk>
.include <bsd.obj.mk>
Index: head/sys/modules/Makefile
===================================================================
--- head/sys/modules/Makefile
+++ head/sys/modules/Makefile
@@ -563,7 +563,7 @@
SUBDIR+= tests
.endif
-.if ${MK_ZFS} != "no" || defined(ALL_MODULES)
+.if ${MK_ZFS} != "no" || (defined(ALL_MODULES) && ${MACHINE_CPUARCH} != "powerpc")
SUBDIR+= zfs
.endif
Index: head/sys/modules/dtrace/dtaudit/Makefile
===================================================================
--- head/sys/modules/dtrace/dtaudit/Makefile
+++ head/sys/modules/dtrace/dtaudit/Makefile
@@ -8,9 +8,7 @@
SRCS= audit_dtrace.c \
vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/dtmalloc/Makefile
===================================================================
--- head/sys/modules/dtrace/dtmalloc/Makefile
+++ head/sys/modules/dtrace/dtmalloc/Makefile
@@ -8,9 +8,7 @@
SRCS= dtmalloc.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/dtnfscl/Makefile
===================================================================
--- head/sys/modules/dtrace/dtnfscl/Makefile
+++ head/sys/modules/dtrace/dtnfscl/Makefile
@@ -8,9 +8,7 @@
SRCS= nfs_clkdtrace.c \
vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/dtrace/Makefile
===================================================================
--- head/sys/modules/dtrace/dtrace/Makefile
+++ head/sys/modules/dtrace/dtrace/Makefile
@@ -20,9 +20,11 @@
.PATH: ${SYSDIR}/cddl/dev/dtrace/x86
SRCS+= dis_tables.c \
instr_size.c
-CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel \
- -I${SYSDIR}/cddl/dev/dtrace/x86
+CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel \
+ -I${SYSDIR}/cddl/dev/dtrace/x86
+
.endif
+CFLAGS+= ${OPENZFS_CFLAGS}
SRCS+= bus_if.h device_if.h vnode_if.h
@@ -56,7 +58,7 @@
.include <bsd.kmod.mk>
CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h
-
+CFLAGS.dtrace_asm.S+= -D_SYS_ERRNO_H_ -D_SYS_PARAM_H_ -DLOCORE
CWARNFLAGS+= -Wno-parentheses
CWARNFLAGS+= -Wno-uninitialized
CWARNFLAGS+= -Wno-cast-qual
Index: head/sys/modules/dtrace/fasttrap/Makefile
===================================================================
--- head/sys/modules/dtrace/fasttrap/Makefile
+++ head/sys/modules/dtrace/fasttrap/Makefile
@@ -6,12 +6,10 @@
KMOD= fasttrap
SRCS= fasttrap.c fasttrap_isa.c
-SRCS+= vnode_if.h
+SRCS+= vnode_if.h opt_global.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common/dtrace \
- -I${SYSDIR}
+CFLAGS+= -include ${.OBJDIR}/opt_global.h
+CFLAGS+= ${OPENZFS_CFLAGS}
.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel
Index: head/sys/modules/dtrace/fbt/Makefile
===================================================================
--- head/sys/modules/dtrace/fbt/Makefile
+++ head/sys/modules/dtrace/fbt/Makefile
@@ -8,6 +8,7 @@
SRCS= fbt.c fbt_isa.c
SRCS+= vnode_if.h
+
.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
CFLAGS+= -I${SYSDIR}/cddl/dev/fbt/x86
.PATH: ${SYSDIR}/cddl/dev/fbt/x86
@@ -16,10 +17,8 @@
.PATH: ${SYSDIR}/cddl/dev/fbt/${MACHINE_CPUARCH}
.endif
-CFLAGS+= -I${SYSDIR}/cddl/dev/fbt \
- -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
+CFLAGS+= -I${SYSDIR}/cddl/dev/fbt
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/profile/Makefile
===================================================================
--- head/sys/modules/dtrace/profile/Makefile
+++ head/sys/modules/dtrace/profile/Makefile
@@ -8,9 +8,7 @@
SRCS= profile.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/prototype/Makefile
===================================================================
--- head/sys/modules/dtrace/prototype/Makefile
+++ head/sys/modules/dtrace/prototype/Makefile
@@ -8,9 +8,7 @@
SRCS= prototype.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/sdt/Makefile
===================================================================
--- head/sys/modules/dtrace/sdt/Makefile
+++ head/sys/modules/dtrace/sdt/Makefile
@@ -8,10 +8,7 @@
SRCS= sdt.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
.include <bsd.kmod.mk>
-
CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h
Index: head/sys/modules/dtrace/systrace/Makefile
===================================================================
--- head/sys/modules/dtrace/systrace/Makefile
+++ head/sys/modules/dtrace/systrace/Makefile
@@ -8,10 +8,7 @@
SRCS= systrace.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common/dtrace \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/systrace_freebsd32/Makefile
===================================================================
--- head/sys/modules/dtrace/systrace_freebsd32/Makefile
+++ head/sys/modules/dtrace/systrace_freebsd32/Makefile
@@ -8,9 +8,8 @@
SRCS= systrace.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR} -DFREEBSD32_SYSTRACE
+CFLAGS+= ${OPENZFS_CFLAGS}
+CFLAGS+= -DFREEBSD32_SYSTRACE
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/systrace_linux/Makefile
===================================================================
--- head/sys/modules/dtrace/systrace_linux/Makefile
+++ head/sys/modules/dtrace/systrace_linux/Makefile
@@ -9,9 +9,8 @@
SRCS= systrace.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR} -DLINUX_SYSTRACE
+CFLAGS+= ${OPENZFS_CFLAGS}
+CFLAGS+= -DLINUX_SYSTRACE
.include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/systrace_linux32/Makefile
===================================================================
--- head/sys/modules/dtrace/systrace_linux32/Makefile
+++ head/sys/modules/dtrace/systrace_linux32/Makefile
@@ -9,9 +9,8 @@
SRCS= systrace.c
SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR} -DLINUX32_SYSTRACE
+CFLAGS+= ${OPENZFS_CFLAGS}
+CFLAGS+= -DLINUX32_SYSTRACE
.include <bsd.kmod.mk>
Index: head/sys/modules/opensolaris/Makefile
===================================================================
--- head/sys/modules/opensolaris/Makefile
+++ head/sys/modules/opensolaris/Makefile
@@ -3,14 +3,16 @@
SYSDIR?= ${SRCTOP}/sys
.PATH: ${SYSDIR}/cddl/compat/opensolaris/kern
+.PATH: ${SYSDIR}/contrib/openzfs/module/os/freebsd/spl
KMOD= opensolaris
-SRCS= opensolaris.c \
- opensolaris_cmn_err.c \
- opensolaris_kmem.c \
- opensolaris_misc.c \
+SRCS= vnode_if.h \
+ opensolaris.c \
opensolaris_proc.c \
- opensolaris_sunddi.c
+ spl_cmn_err.c \
+ spl_kmem.c \
+ spl_misc.c \
+ spl_sunddi.c
_A=${SYSDIR}/cddl/contrib/opensolaris/common/atomic
.if exists(${_A}/${MACHINE_CPUARCH}/opensolaris_atomic.S)
@@ -23,9 +25,7 @@
SRCS+= opensolaris_atomic.c
.endif
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
- -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
- -I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
EXPORT_SYMS= cpu_core
Index: head/sys/modules/zfs/Makefile
===================================================================
--- head/sys/modules/zfs/Makefile
+++ head/sys/modules/zfs/Makefile
@@ -1,118 +1,344 @@
# $FreeBSD$
-SYSDIR?=${SRCTOP}/sys
+SRCDIR=${SRCTOP}/sys/contrib/openzfs/module
+INCDIR=${SRCTOP}/sys/contrib/openzfs/include
KMOD= zfs
-SRCS= bus_if.h device_if.h vnode_if.h opt_kstack_pages.h
+.PATH: ${SRCDIR}/avl \
+ ${SRCDIR}/lua \
+ ${SRCDIR}/nvpair \
+ ${SRCDIR}/os/freebsd/spl \
+ ${SRCDIR}/os/freebsd/zfs \
+ ${SRCDIR}/unicode \
+ ${SRCDIR}/zcommon \
+ ${SRCDIR}/zfs \
+ ${SRCDIR}/zstd \
+ ${SRCDIR}/zstd/lib
-SUNW= ${SYSDIR}/cddl/contrib/opensolaris
-.PATH: ${SUNW}/common/acl
-SRCS+= acl_common.c
-.PATH: ${SUNW}/common/avl
-SRCS+= avl.c
-.PATH: ${SUNW}/common/nvpair
-SRCS+= opensolaris_nvpair.c
-SRCS+= opensolaris_nvpair_alloc_fixed.c
-SRCS+= opensolaris_fnvpair.c
-.PATH: ${SYSDIR}/cddl/contrib/opensolaris/common/unicode
-SRCS+= u8_textprep.c
-.PATH: ${SUNW}/common/lz4
-SRCS+= lz4.c
+CFLAGS+= -I${INCDIR}
+CFLAGS+= -I${INCDIR}/spl
+CFLAGS+= -I${INCDIR}/os/freebsd
+CFLAGS+= -I${INCDIR}/os/freebsd/spl
+CFLAGS+= -I${INCDIR}/os/freebsd/zfs
+CFLAGS+= -I${SRCDIR}/zstd/include
+CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/static_ccompile.h
+CFLAGS+= -I${.CURDIR}
-.PATH: ${SYSDIR}/cddl/compat/opensolaris/kern
-SRCS+= opensolaris_acl.c
-SRCS+= opensolaris_dtrace.c
-SRCS+= opensolaris_kobj.c
-SRCS+= opensolaris_kstat.c
-SRCS+= opensolaris_lookup.c
-SRCS+= opensolaris_policy.c
-SRCS+= opensolaris_string.c
-SRCS+= opensolaris_sysevent.c
-SRCS+= opensolaris_taskq.c
-SRCS+= opensolaris_uio.c
-SRCS+= opensolaris_vfs.c
-SRCS+= opensolaris_vm.c
-SRCS+= opensolaris_zone.c
+CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \
+ -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \
+ -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DIN_FREEBSD_BASE -DHAVE_KSID
-_A=${SYSDIR}/cddl/contrib/opensolaris/common/atomic
-.if exists(${_A}/${MACHINE_CPUARCH}/opensolaris_atomic.S)
-.PATH: ${_A}/${MACHINE_CPUARCH}
-SRCS+= opensolaris_atomic.S
-.elif exists(${_A}/${MACHINE_ARCH}/opensolaris_atomic.S)
-.PATH: ${_A}/${MACHINE_ARCH}
-SRCS+= opensolaris_atomic.S
-.else
-SRCS+= opensolaris_atomic.c
+.if ${MACHINE_ARCH} == "amd64"
+CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_AVX512BW -DHAVE_SSSE3
.endif
-.PATH: ${SUNW}/uts/common/fs
-SRCS+= vnode.c
+.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true"
+# kernel must also be built with this option for this to work
+CFLAGS+= -DDEBUG_VFS_LOCKS
+.endif
-.PATH: ${SUNW}/uts/common/os
-SRCS+= callb.c
-SRCS+= fm.c
-SRCS+= list.c
-SRCS+= nvpair_alloc_system.c
+.if defined(WITH_GCOV) && ${WITH_GCOV} == "true"
+CFLAGS+= -fprofile-arcs -ftest-coverage
+.endif
-.PATH: ${SUNW}/uts/common/zmod
-SRCS+= zmod.c
+DEBUG_FLAGS=-g
-.PATH: ${SYSDIR}/crypto/sha2
-SRCS+= sha256c.c sha512c.c
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+ ${MACHINE_ARCH} == "arm"
+CFLAGS+= -DBITS_PER_LONG=32
+.else
+CFLAGS+= -DBITS_PER_LONG=64
+.endif
-.PATH: ${SYSDIR}/crypto/skein
-SRCS+= skein.c skein_block.c
+SRCS= vnode_if.h device_if.h bus_if.h
-.PATH: ${SUNW}/common/zfs
-.include "${SUNW}/uts/common/Makefile.files"
-.PATH: ${SUNW}/uts/common/fs/zfs
-ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/}
-SRCS+= ${ZFS_SRCS}
-SRCS+= vdev_geom.c
-SRCS+= trim_map.c
-.PATH: ${SUNW}/uts/common/fs/zfs/lua
-LUA_SRCS= ${LUA_OBJS:C/.o$/.c/}
-SRCS+= ${LUA_SRCS}
+# avl
+SRCS+= avl.c
-# Use FreeBSD's namecache.
-CFLAGS+=-DFREEBSD_NAMECACHE
+#lua
+SRCS+= lapi.c \
+ lauxlib.c \
+ lbaselib.c \
+ lcode.c \
+ lcompat.c \
+ lcorolib.c \
+ lctype.c \
+ ldebug.c \
+ ldo.c \
+ lfunc.c \
+ lgc.c \
+ llex.c \
+ lmem.c \
+ lobject.c \
+ lopcodes.c \
+ lparser.c \
+ lstate.c \
+ lstring.c \
+ lstrlib.c \
+ ltable.c \
+ ltablib.c \
+ ltm.c \
+ lvm.c \
+ lzio.c
-CFLAGS+=-I${SYSDIR}/cddl/compat/opensolaris
-CFLAGS+=-I${SUNW}/uts/common/fs/zfs
-CFLAGS+=-I${SUNW}/uts/common/fs/zfs/lua
-CFLAGS+=-I${SUNW}/uts/common/zmod
-CFLAGS+=-I${SUNW}/uts/common
-CFLAGS+=-I${SYSDIR}
-CFLAGS+=-I${SUNW}/common/zfs
-CFLAGS+=-I${SUNW}/common/lz4
-CFLAGS+=-I${SUNW}/common
-CFLAGS+=-DBUILDING_ZFS
-CFLAGS.gcc+=-fms-extensions
+#nvpair
+SRCS+= nvpair.c \
+ fnvpair.c \
+ nvpair_alloc_spl.c \
+ nvpair_alloc_fixed.c
-.if ${MACHINE_ARCH} == "powerpc64"
-CFLAGS.gcc+=-mminimal-toc
-.endif
+#os/freebsd/spl
+SRCS+= acl_common.c \
+ btree.c \
+ callb.c \
+ list.c \
+ spl_acl.c \
+ spl_cmn_err.c \
+ spl_dtrace.c \
+ spl_kmem.c \
+ spl_kstat.c \
+ spl_misc.c \
+ spl_policy.c \
+ spl_string.c \
+ spl_sunddi.c \
+ spl_sysevent.c \
+ spl_taskq.c \
+ spl_uio.c \
+ spl_vfs.c \
+ spl_vm.c \
+ spl_zone.c \
+ sha256c.c \
+ sha512c.c \
+ spl_procfs_list.c \
+ spl_zlib.c
-.ifdef ZFS_DEBUG
-CFLAGS+=-DDEBUG=1
-DEBUG_FLAGS=-g
+
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+ ${MACHINE_ARCH} == "arm"
+SRCS+= spl_atomic.c
.endif
+#os/freebsd/zfs
+SRCS+= abd_os.c \
+ crypto_os.c \
+ dmu_os.c \
+ hkdf.c \
+ kmod_core.c \
+ spa_os.c \
+ sysctl_os.c \
+ vdev_file.c \
+ vdev_label_os.c \
+ vdev_geom.c \
+ zfs_acl.c \
+ zfs_ctldir.c \
+ zfs_dir.c \
+ zfs_ioctl_compat.c \
+ zfs_ioctl_os.c \
+ zfs_log.c \
+ zfs_replay.c \
+ zfs_vfsops.c \
+ zfs_vnops.c \
+ zfs_znode.c \
+ zio_crypt.c \
+ zvol_os.c
+
+#unicode
+SRCS+= uconv.c \
+ u8_textprep.c
+
+#zcommon
+SRCS+= zfeature_common.c \
+ zfs_comutil.c \
+ zfs_deleg.c \
+ zfs_fletcher.c \
+ zfs_fletcher_avx512.c \
+ zfs_fletcher_intel.c \
+ zfs_fletcher_sse.c \
+ zfs_fletcher_superscalar.c \
+ zfs_fletcher_superscalar4.c \
+ zfs_namecheck.c \
+ zfs_prop.c \
+ zpool_prop.c \
+ zprop_common.c
+
+#zfs
+SRCS+= abd.c \
+ aggsum.c \
+ arc.c \
+ arc_os.c \
+ blkptr.c \
+ bplist.c \
+ bpobj.c \
+ cityhash.c \
+ dbuf.c \
+ dbuf_stats.c \
+ bptree.c \
+ bqueue.c \
+ dataset_kstats.c \
+ ddt.c \
+ ddt_zap.c \
+ dmu.c \
+ dmu_diff.c \
+ dmu_object.c \
+ dmu_objset.c \
+ dmu_recv.c \
+ dmu_redact.c \
+ dmu_send.c \
+ dmu_traverse.c \
+ dmu_tx.c \
+ dmu_zfetch.c \
+ dnode.c \
+ dnode_sync.c \
+ dsl_dataset.c \
+ dsl_deadlist.c \
+ dsl_deleg.c \
+ dsl_bookmark.c \
+ dsl_dir.c \
+ dsl_crypt.c \
+ dsl_destroy.c \
+ dsl_pool.c \
+ dsl_prop.c \
+ dsl_scan.c \
+ dsl_synctask.c \
+ dsl_userhold.c \
+ fm.c \
+ gzip.c \
+ lzjb.c \
+ lz4.c \
+ metaslab.c \
+ mmp.c \
+ multilist.c \
+ objlist.c \
+ pathname.c \
+ range_tree.c \
+ refcount.c \
+ rrwlock.c \
+ sa.c \
+ sha256.c \
+ skein_zfs.c \
+ spa.c \
+ spa_boot.c \
+ spa_checkpoint.c \
+ spa_config.c \
+ spa_errlog.c \
+ spa_history.c \
+ spa_log_spacemap.c \
+ spa_misc.c \
+ spa_stats.c \
+ space_map.c \
+ space_reftree.c \
+ txg.c \
+ uberblock.c \
+ unique.c \
+ vdev.c \
+ vdev_cache.c \
+ vdev_indirect.c \
+ vdev_indirect_births.c \
+ vdev_indirect_mapping.c \
+ vdev_initialize.c \
+ vdev_label.c \
+ vdev_mirror.c \
+ vdev_missing.c \
+ vdev_queue.c \
+ vdev_raidz.c \
+ vdev_raidz_math.c \
+ vdev_raidz_math_scalar.c \
+ vdev_raidz_math_avx2.c \
+ vdev_raidz_math_avx512bw.c \
+ vdev_raidz_math_avx512f.c \
+ vdev_raidz_math_sse2.c \
+ vdev_raidz_math_ssse3.c \
+ vdev_rebuild.c \
+ vdev_removal.c \
+ vdev_root.c \
+ vdev_trim.c \
+ zap.c \
+ zap_leaf.c \
+ zap_micro.c \
+ zcp.c \
+ zcp_get.c \
+ zcp_global.c \
+ zcp_iter.c \
+ zcp_set.c \
+ zcp_synctask.c \
+ zfeature.c \
+ zfs_byteswap.c \
+ zfs_debug.c \
+ zfs_file_os.c \
+ zfs_fm.c \
+ zfs_fuid.c \
+ zfs_ioctl.c \
+ zfs_onexit.c \
+ zfs_quota.c \
+ zfs_ratelimit.c \
+ zfs_rlock.c \
+ zfs_sa.c \
+ zil.c \
+ zio.c \
+ zio_checksum.c \
+ zio_compress.c \
+ zio_inject.c \
+ zle.c \
+ zrlock.c \
+ zthr.c \
+ zvol.c
+
+SRCS+= zfs_zstd.c \
+ zstd.c
+
.include <bsd.kmod.mk>
-CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h
-CWARNFLAGS+=-Wno-missing-prototypes
-CWARNFLAGS+=-Wno-undef
-CWARNFLAGS+=-Wno-strict-prototypes
-CWARNFLAGS+=-Wno-cast-qual
-CWARNFLAGS+=-Wno-parentheses
-CWARNFLAGS+=-Wno-redundant-decls
-CWARNFLAGS+=-Wno-missing-braces
-CWARNFLAGS+=-Wno-uninitialized
-CWARNFLAGS+=-Wno-unused
-CWARNFLAGS+=-Wno-inline
-CWARNFLAGS+=-Wno-switch
-CWARNFLAGS+=-Wno-pointer-arith
+CFLAGS.gcc+= -Wno-pointer-to-int-cast
+
+CFLAGS.lapi.c= -Wno-cast-qual
+CFLAGS.lcompat.c= -Wno-cast-qual
+CFLAGS.lobject.c= -Wno-cast-qual
+CFLAGS.ltable.c= -Wno-cast-qual
+CFLAGS.lvm.c= -Wno-cast-qual
+CFLAGS.nvpair.c= -Wno-cast-qual -DHAVE_RPC_TYPES
+CFLAGS.spl_string.c= -Wno-cast-qual
+CFLAGS.spl_vm.c= -Wno-cast-qual
+CFLAGS.spl_zlib.c= -Wno-cast-qual
+CFLAGS.abd.c= -Wno-cast-qual
+CFLAGS.zfs_log.c= -Wno-cast-qual
+CFLAGS.zfs_vnops.c= -Wno-pointer-arith
+CFLAGS.u8_textprep.c= -Wno-cast-qual
+CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zprop_common.c= -Wno-cast-qual
+CFLAGS.ddt.c= -Wno-cast-qual
+CFLAGS.dmu.c= -Wno-cast-qual
+CFLAGS.dmu_traverse.c= -Wno-cast-qual
+CFLAGS.dsl_dir.c= -Wno-cast-qual
+CFLAGS.dsl_deadlist.c= -Wno-cast-qual
+CFLAGS.dsl_prop.c= -Wno-cast-qual
+CFLAGS.fm.c= -Wno-cast-qual
+CFLAGS.lz4.c= -Wno-cast-qual
+CFLAGS.spa.c= -Wno-cast-qual
+CFLAGS.spa_misc.c= -Wno-cast-qual
+CFLAGS.sysctl_os.c= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
+CFLAGS.vdev_raidz.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.zap_leaf.c= -Wno-cast-qual
+CFLAGS.zap_micro.c= -Wno-cast-qual
+CFLAGS.zcp.c= -Wno-cast-qual
+CFLAGS.zfs_fm.c= -Wno-cast-qual
+CFLAGS.zfs_ioctl.c= -Wno-cast-qual
+CFLAGS.zil.c= -Wno-cast-qual
+CFLAGS.zio.c= -Wno-cast-qual
+CFLAGS.zrlock.c= -Wno-cast-qual
+CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zstd.c= -fno-tree-vectorize
+.if ${MACHINE_CPUARCH} == "aarch64"
+CFLAGS.zstd.c+= -include ${SRCDIR}/zstd/include/aarch64_compat.h
+.endif
Index: head/sys/modules/zfs/static_ccompile.h
===================================================================
--- head/sys/modules/zfs/static_ccompile.h
+++ head/sys/modules/zfs/static_ccompile.h
@@ -0,0 +1,29 @@
+/*
+ * $FreeBSD$
+ */
+
+#ifndef _SPL_NVLIST_H_
+#define _SPL_NVLIST_H_
+
+#ifdef INVARIANTS
+#define ZFS_DEBUG
+#endif
+
+#define nvlist_add_nvlist spl_nvlist_add_nvlist
+#define nvlist_add_nvlist_array spl_nvlist_add_nvlist_array
+#define nvlist_add_nvpair spl_nvlist_add_nvpair
+#define nvlist_add_string spl_nvlist_add_string
+#define nvlist_add_string_array spl_nvlist_add_string_array
+#define nvlist_empty spl_nvlist_empty
+#define nvlist_exists spl_nvlist_exists
+#define nvlist_free spl_nvlist_free
+#define nvlist_next_nvpair spl_nvlist_next_nvpair
+#define nvlist_pack spl_nvlist_pack
+#define nvlist_prev_nvpair spl_nvlist_prev_nvpair
+#define nvlist_remove_nvpair spl_nvlist_remove_nvpair
+#define nvlist_size spl_nvlist_size
+#define nvlist_unpack spl_nvlist_unpack
+
+#define nvpair_type spl_nvpair_type
+#define nvpair_name spl_nvpair_name
+#endif
Index: head/sys/modules/zfs/zfs_config.h
===================================================================
--- head/sys/modules/zfs/zfs_config.h
+++ head/sys/modules/zfs/zfs_config.h
@@ -0,0 +1,711 @@
+/*
+ * $FreeBSD$
+ */
+
+/* zfs_config.h. Generated from zfs_config.h.in by configure. */
+/* zfs_config.h.in. Generated from configure.ac by autoheader. */
+
+/* Define to 1 if translation of program messages to the user's native
+ language is requested. */
+/* #undef ENABLE_NLS */
+
+/* bio_end_io_t wants 1 arg */
+/* #undef HAVE_1ARG_BIO_END_IO_T */
+
+/* lookup_bdev() wants 1 arg */
+/* #undef HAVE_1ARG_LOOKUP_BDEV */
+
+/* submit_bio() wants 1 arg */
+/* #undef HAVE_1ARG_SUBMIT_BIO */
+
+/* bdi_setup_and_register() wants 2 args */
+/* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */
+
+/* lookup_bdev() wants 2 args */
+/* #undef HAVE_2ARGS_LOOKUP_BDEV */
+
+/* vfs_getattr wants 2 args */
+/* #undef HAVE_2ARGS_VFS_GETATTR */
+
+/* zlib_deflate_workspacesize() wants 2 args */
+/* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */
+
+/* bdi_setup_and_register() wants 3 args */
+/* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */
+
+/* vfs_getattr wants 3 args */
+/* #undef HAVE_3ARGS_VFS_GETATTR */
+
+/* vfs_getattr wants 4 args */
+/* #undef HAVE_4ARGS_VFS_GETATTR */
+
+/* kernel has access_ok with 'type' parameter */
+/* #undef HAVE_ACCESS_OK_TYPE */
+
+/* posix_acl has refcount_t */
+/* #undef HAVE_ACL_REFCOUNT */
+
+/* Define if host toolchain supports AES */
+#define HAVE_AES 1
+
+#ifdef __amd64__
+#ifndef RESCUE
+/* Define if host toolchain supports AVX */
+#define HAVE_AVX 1
+#endif
+
+/* Define if host toolchain supports AVX2 */
+#define HAVE_AVX2 1
+
+/* Define if host toolchain supports AVX512BW */
+#define HAVE_AVX512BW 1
+
+/* Define if host toolchain supports AVX512CD */
+#define HAVE_AVX512CD 1
+
+/* Define if host toolchain supports AVX512DQ */
+#define HAVE_AVX512DQ 1
+
+/* Define if host toolchain supports AVX512ER */
+#define HAVE_AVX512ER 1
+
+/* Define if host toolchain supports AVX512F */
+#define HAVE_AVX512F 1
+
+/* Define if host toolchain supports AVX512IFMA */
+#define HAVE_AVX512IFMA 1
+
+/* Define if host toolchain supports AVX512PF */
+#define HAVE_AVX512PF 1
+
+/* Define if host toolchain supports AVX512VBMI */
+#define HAVE_AVX512VBMI 1
+
+/* Define if host toolchain supports AVX512VL */
+#define HAVE_AVX512VL 1
+#endif
+
+/* bio->bi_opf is defined */
+/* #undef HAVE_BIO_BI_OPF */
+
+/* bio->bi_status exists */
+/* #undef HAVE_BIO_BI_STATUS */
+
+/* bio has bi_iter */
+/* #undef HAVE_BIO_BVEC_ITER */
+
+/* bio_set_dev() is available */
+/* #undef HAVE_BIO_SET_DEV */
+
+/* bio_set_dev() GPL-only */
+/* #undef HAVE_BIO_SET_DEV_GPL_ONLY */
+
+/* bio_set_op_attrs is available */
+/* #undef HAVE_BIO_SET_OP_ATTRS */
+
+/* blkdev_reread_part() exists */
+/* #undef HAVE_BLKDEV_REREAD_PART */
+
+/* blkg_tryget() is available */
+/* #undef HAVE_BLKG_TRYGET */
+
+/* blkg_tryget() GPL-only */
+/* #undef HAVE_BLKG_TRYGET_GPL_ONLY */
+
+/* blk_alloc_queue() expects request function */
+/* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */
+
+/* blk queue backing_dev_info is dynamic */
+/* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */
+
+/* blk_queue_flag_clear() exists */
+/* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */
+
+/* blk_queue_flag_set() exists */
+/* #undef HAVE_BLK_QUEUE_FLAG_SET */
+
+/* blk_queue_flush() is available */
+/* #undef HAVE_BLK_QUEUE_FLUSH */
+
+/* blk_queue_flush() is GPL-only */
+/* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */
+
+/* blk_queue_secdiscard() is available */
+/* #undef HAVE_BLK_QUEUE_SECDISCARD */
+
+/* blk_queue_secure_erase() is available */
+/* #undef HAVE_BLK_QUEUE_SECURE_ERASE */
+
+/* blk_queue_write_cache() exists */
+/* #undef HAVE_BLK_QUEUE_WRITE_CACHE */
+
+/* blk_queue_write_cache() is GPL-only */
+/* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */
+
+/* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the
+ CoreFoundation framework. */
+/* #undef HAVE_CFLOCALECOPYCURRENT */
+
+/* Define to 1 if you have the Mac OS X function
+ CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */
+/* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */
+
+/* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in
+ the CoreFoundation framework. */
+/* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */
+
+/* clear_inode() is available */
+/* #undef HAVE_CLEAR_INODE */
+
+/* dentry uses const struct dentry_operations */
+/* #undef HAVE_CONST_DENTRY_OPERATIONS */
+
+/* current_time() exists */
+/* #undef HAVE_CURRENT_TIME */
+
+/* Define if the GNU dcgettext() function is already present or preinstalled.
+ */
+/* #undef HAVE_DCGETTEXT */
+
+/* DECLARE_EVENT_CLASS() is available */
+/* #undef HAVE_DECLARE_EVENT_CLASS */
+
+/* sops->dirty_inode() wants flags */
+/* #undef HAVE_DIRTY_INODE_WITH_FLAGS */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* d_make_root() is available */
+/* #undef HAVE_D_MAKE_ROOT */
+
+/* d_prune_aliases() is available */
+/* #undef HAVE_D_PRUNE_ALIASES */
+
+/* dops->d_revalidate() operation takes nameidata */
+/* #undef HAVE_D_REVALIDATE_NAMEIDATA */
+
+/* eops->encode_fh() wants child and parent inodes */
+/* #undef HAVE_ENCODE_FH_WITH_INODE */
+
+/* sops->evict_inode() exists */
+/* #undef HAVE_EVICT_INODE */
+
+/* fops->aio_fsync() exists */
+/* #undef HAVE_FILE_AIO_FSYNC */
+
+/* file_dentry() is available */
+/* #undef HAVE_FILE_DENTRY */
+
+/* file_inode() is available */
+/* #undef HAVE_FILE_INODE */
+
+/* iops->follow_link() cookie */
+/* #undef HAVE_FOLLOW_LINK_COOKIE */
+
+/* iops->follow_link() nameidata */
+/* #undef HAVE_FOLLOW_LINK_NAMEIDATA */
+
+/* fops->fsync() with range */
+/* #undef HAVE_FSYNC_RANGE */
+
+/* fops->fsync() without dentry */
+/* #undef HAVE_FSYNC_WITHOUT_DENTRY */
+
+/* generic_start_io_acct()/generic_end_io_acct() available */
+/* #undef HAVE_GENERIC_IO_ACCT_3ARG */
+
+/* generic_start_io_acct()/generic_end_io_acct() 4 arg available */
+/* #undef HAVE_GENERIC_IO_ACCT_4ARG */
+
+/* generic_readlink is global */
+/* #undef HAVE_GENERIC_READLINK */
+
+/* generic_setxattr() exists */
+/* #undef HAVE_GENERIC_SETXATTR */
+
+/* generic_write_checks() takes kiocb */
+/* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */
+
+/* Define if the GNU gettext() function is already present or preinstalled. */
+/* #undef HAVE_GETTEXT */
+
+/* get_disk_and_module() is available */
+/* #undef HAVE_GET_DISK_AND_MODULE */
+
+/* iops->get_link() cookie */
+/* #undef HAVE_GET_LINK_COOKIE */
+
+/* iops->get_link() delayed */
+/* #undef HAVE_GET_LINK_DELAYED */
+
+/* group_info->gid exists */
+/* #undef HAVE_GROUP_INFO_GID */
+
+/* Define if you have the iconv() function and it works. */
+#define HAVE_ICONV 1
+
+/* yes */
+/* #undef HAVE_INODE_LOCK_SHARED */
+
+/* inode_set_flags() exists */
+/* #undef HAVE_INODE_SET_FLAGS */
+
+/* inode_set_iversion() exists */
+/* #undef HAVE_INODE_SET_IVERSION */
+
+/* inode->i_*time's are timespec64 */
+/* #undef HAVE_INODE_TIMESPEC64_TIMES */
+
+/* timestamp_truncate() exists */
+/* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* in_compat_syscall() is available */
+/* #undef HAVE_IN_COMPAT_SYSCALL */
+
+/* yes */
+/* #undef HAVE_IO_SCHEDULE_TIMEOUT */
+
+/* Define to 1 if you have the `issetugid' function. */
+#define HAVE_ISSETUGID 1
+
+/* kernel has kernel_fpu_* functions */
+/* #undef HAVE_KERNEL_FPU */
+
+/* kernel has asm/fpu/api.h */
+/* #undef HAVE_KERNEL_FPU_API_HEADER */
+
+/* kernel fpu internal */
+/* #undef HAVE_KERNEL_FPU_INTERNAL */
+
+/* uncached_acl_sentinel() exists */
+/* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */
+
+/* kernel does stack verification */
+/* #undef HAVE_KERNEL_OBJTOOL */
+
+/* kernel_read() take loff_t pointer */
+/* #undef HAVE_KERNEL_READ_PPOS */
+
+/* timer_list.function gets a timer_list */
+/* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */
+
+/* struct timer_list has a flags member */
+/* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */
+
+/* timer_setup() is available */
+/* #undef HAVE_KERNEL_TIMER_SETUP */
+
+/* kernel_write() take loff_t pointer */
+/* #undef HAVE_KERNEL_WRITE_PPOS */
+
+/* kmem_cache_create_usercopy() exists */
+/* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */
+
+/* kstrtoul() exists */
+/* #undef HAVE_KSTRTOUL */
+
+/* ktime_get_coarse_real_ts64() exists */
+/* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */
+
+/* ktime_get_raw_ts64() exists */
+/* #undef HAVE_KTIME_GET_RAW_TS64 */
+
+/* kvmalloc exists */
+/* #undef HAVE_KVMALLOC */
+
+/* kernel has large stacks */
+/* #undef HAVE_LARGE_STACKS */
+
+/* Define if you have libaio */
+/* #undef HAVE_LIBAIO */
+
+/* Define if you have libblkid */
+/* #undef HAVE_LIBBLKID */
+
+/* Define if you have libssl */
+#define HAVE_LIBSSL 1
+
+/* Define to 1 if you have the `tirpc' library (-ltirpc). */
+/* #undef HAVE_LIBTIRPC */
+
+/* Define if you have libudev */
+/* #undef HAVE_LIBUDEV */
+
+/* Define if udev_device_get_is_initialized is available */
+/* #undef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED */
+
+/* Define if you have libuuid */
+/* #undef HAVE_LIBUUID */
+
+/* lseek_execute() is available */
+/* #undef HAVE_LSEEK_EXECUTE */
+
+/* makedev() is declared in sys/mkdev.h */
+/* #undef HAVE_MAKEDEV_IN_MKDEV */
+
+/* makedev() is declared in sys/sysmacros.h */
+/* #undef HAVE_MAKEDEV_IN_SYSMACROS */
+
+/* Noting that make_request_fn() returns blk_qc_t */
+/* #undef HAVE_MAKE_REQUEST_FN_RET_QC */
+
+/* Noting that make_request_fn() returns void */
+/* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* iops->create()/mkdir()/mknod() take umode_t */
+/* #undef HAVE_MKDIR_UMODE_T */
+
+/* Define to 1 if you have the `mlockall' function. */
+#define HAVE_MLOCKALL 1
+
+/* Define if host toolchain supports MOVBE */
+#define HAVE_MOVBE 1
+
+/* new_sync_read()/new_sync_write() are available */
+/* #undef HAVE_NEW_SYNC_READ */
+
+/* iops->getattr() takes a path */
+/* #undef HAVE_PATH_IOPS_GETATTR */
+
+/* Define if host toolchain supports PCLMULQDQ */
+#define HAVE_PCLMULQDQ 1
+
+/* posix_acl_chmod() exists */
+/* #undef HAVE_POSIX_ACL_CHMOD */
+
+/* posix_acl_from_xattr() needs user_ns */
+/* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */
+
+/* posix_acl_release() is available */
+/* #undef HAVE_POSIX_ACL_RELEASE */
+
+/* posix_acl_release() is GPL-only */
+/* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */
+
+/* posix_acl_valid() wants user namespace */
+/* #undef HAVE_POSIX_ACL_VALID_WITH_NS */
+
+/* proc_ops structure exists */
+/* #undef HAVE_PROC_OPS_STRUCT */
+
+/* iops->put_link() cookie */
+/* #undef HAVE_PUT_LINK_COOKIE */
+
+/* iops->put_link() delayed */
+/* #undef HAVE_PUT_LINK_DELAYED */
+
+/* iops->put_link() nameidata */
+/* #undef HAVE_PUT_LINK_NAMEIDATA */
+
+/* If available, contains the Python version number currently in use. */
+#define HAVE_PYTHON "3.7"
+
+/* qat is enabled and existed */
+/* #undef HAVE_QAT */
+
+/* iops->rename() wants flags */
+/* #undef HAVE_RENAME_WANTS_FLAGS */
+
+/* REQ_DISCARD is defined */
+/* #undef HAVE_REQ_DISCARD */
+
+/* REQ_FLUSH is defined */
+/* #undef HAVE_REQ_FLUSH */
+
+/* REQ_OP_DISCARD is defined */
+/* #undef HAVE_REQ_OP_DISCARD */
+
+/* REQ_OP_FLUSH is defined */
+/* #undef HAVE_REQ_OP_FLUSH */
+
+/* REQ_OP_SECURE_ERASE is defined */
+/* #undef HAVE_REQ_OP_SECURE_ERASE */
+
+/* REQ_PREFLUSH is defined */
+/* #undef HAVE_REQ_PREFLUSH */
+
+/* struct rw_semaphore has member activity */
+/* #undef HAVE_RWSEM_ACTIVITY */
+
+/* struct rw_semaphore has atomic_long_t member count */
+/* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */
+
+/* linux/sched/signal.h exists */
+/* #undef HAVE_SCHED_SIGNAL_HEADER */
+
+/* setattr_prepare() is available */
+/* #undef HAVE_SETATTR_PREPARE */
+
+/* iops->set_acl() exists */
+/* #undef HAVE_SET_ACL */
+
+/* set_cached_acl() is usable */
+/* #undef HAVE_SET_CACHED_ACL_USABLE */
+
+/* struct shrink_control exists */
+/* #undef HAVE_SHRINK_CONTROL_STRUCT */
+
+/* new shrinker callback wants 2 args */
+/* #undef HAVE_SINGLE_SHRINKER_CALLBACK */
+
+/* ->count_objects exists */
+/* #undef HAVE_SPLIT_SHRINKER_CALLBACK */
+
+#if defined(__amd64__) || defined(__i386__)
+/* Define if host toolchain supports SSE */
+#define HAVE_SSE 1
+
+/* Define if host toolchain supports SSE2 */
+#define HAVE_SSE2 1
+
+/* Define if host toolchain supports SSE3 */
+#define HAVE_SSE3 1
+
+/* Define if host toolchain supports SSE4.1 */
+#define HAVE_SSE4_1 1
+
+/* Define if host toolchain supports SSE4.2 */
+#define HAVE_SSE4_2 1
+
+/* Define if host toolchain supports SSSE3 */
+#define HAVE_SSSE3 1
+#endif
+
+/* STACK_FRAME_NON_STANDARD is defined */
+/* #undef HAVE_STACK_FRAME_NON_STANDARD */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strlcat' function. */
+#define HAVE_STRLCAT 1
+
+/* Define to 1 if you have the `strlcpy' function. */
+#define HAVE_STRLCPY 1
+
+/* super_setup_bdi_name() exists */
+/* #undef HAVE_SUPER_SETUP_BDI_NAME */
+
+/* super_block->s_user_ns exists */
+/* #undef HAVE_SUPER_USER_NS */
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* i_op->tmpfile() exists */
+/* #undef HAVE_TMPFILE */
+
+/* totalhigh_pages() exists */
+/* #undef HAVE_TOTALHIGH_PAGES */
+
+/* kernel has totalram_pages() */
+/* #undef HAVE_TOTALRAM_PAGES_FUNC */
+
+/* kernel has __kernel_fpu_* functions */
+/* #undef HAVE_UNDERSCORE_KERNEL_FPU */
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* iops->getattr() takes a vfsmount */
+/* #undef HAVE_VFSMOUNT_IOPS_GETATTR */
+
+/* aops->direct_IO() uses iovec */
+/* #undef HAVE_VFS_DIRECT_IO_IOVEC */
+
+/* aops->direct_IO() uses iov_iter without rw */
+/* #undef HAVE_VFS_DIRECT_IO_ITER */
+
+/* aops->direct_IO() uses iov_iter with offset */
+/* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */
+
+/* aops->direct_IO() uses iov_iter with rw and offset */
+/* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */
+
+/* fops->iterate() is available */
+/* #undef HAVE_VFS_ITERATE */
+
+/* fops->iterate_shared() is available */
+/* #undef HAVE_VFS_ITERATE_SHARED */
+
+/* fops->readdir() is available */
+/* #undef HAVE_VFS_READDIR */
+
+/* fops->read/write_iter() are available */
+/* #undef HAVE_VFS_RW_ITERATE */
+
+/* __vmalloc page flags exists */
+/* #undef HAVE_VMALLOC_PAGE_KERNEL */
+
+/* yes */
+/* #undef HAVE_WAIT_ON_BIT_ACTION */
+
+/* wait_queue_entry_t exists */
+/* #undef HAVE_WAIT_QUEUE_ENTRY_T */
+
+/* wq_head->head and wq_entry->entry exist */
+/* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */
+
+/* xattr_handler->get() wants dentry */
+/* #undef HAVE_XATTR_GET_DENTRY */
+
+/* xattr_handler->get() wants both dentry and inode */
+/* #undef HAVE_XATTR_GET_DENTRY_INODE */
+
+/* xattr_handler->get() wants xattr_handler */
+/* #undef HAVE_XATTR_GET_HANDLER */
+
+/* xattr_handler has name */
+/* #undef HAVE_XATTR_HANDLER_NAME */
+
+/* xattr_handler->list() wants dentry */
+/* #undef HAVE_XATTR_LIST_DENTRY */
+
+/* xattr_handler->list() wants xattr_handler */
+/* #undef HAVE_XATTR_LIST_HANDLER */
+
+/* xattr_handler->list() wants simple */
+/* #undef HAVE_XATTR_LIST_SIMPLE */
+
+/* xattr_handler->set() wants dentry */
+/* #undef HAVE_XATTR_SET_DENTRY */
+
+/* xattr_handler->set() wants both dentry and inode */
+/* #undef HAVE_XATTR_SET_DENTRY_INODE */
+
+/* xattr_handler->set() wants xattr_handler */
+/* #undef HAVE_XATTR_SET_HANDLER */
+
+/* Define if you have zlib */
+#define HAVE_ZLIB 1
+
+/* __posix_acl_chmod() exists */
+/* #undef HAVE___POSIX_ACL_CHMOD */
+
+/* Define as const if the declaration of iconv() needs const. */
+#define ICONV_CONST
+
+/* kernel exports FPU functions */
+/* #undef KERNEL_EXPORTS_X86_FPU */
+
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* make_request_fn() return type */
+/* #undef MAKE_REQUEST_FN_RET */
+
+/* hardened module_param_call */
+/* #undef MODULE_PARAM_CALL_CONST */
+
+/* struct shrink_control has nid */
+/* #undef SHRINK_CONTROL_HAS_NID */
+
+/* Defined for legacy compatibility. */
+#define SPL_META_ALIAS ZFS_META_ALIAS
+
+/* Defined for legacy compatibility. */
+#define SPL_META_RELEASE ZFS_META_RELEASE
+
+/* Defined for legacy compatibility. */
+#define SPL_META_VERSION ZFS_META_VERSION
+
+/* True if ZFS is to be compiled for a FreeBSD system */
+#define SYSTEM_FREEBSD 1
+
+/* True if ZFS is to be compiled for a Linux system */
+/* #undef SYSTEM_LINUX */
+
+/* zfs debugging enabled */
+/* #define ZFS_DEBUG 1 */
+
+/* /dev/zfs minor */
+/* #undef ZFS_DEVICE_MINOR */
+
+/* enum node_stat_item contains NR_FILE_PAGES */
+/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */
+
+/* enum node_stat_item contains NR_INACTIVE_ANON */
+/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */
+
+/* enum node_stat_item contains NR_INACTIVE_FILE */
+/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */
+
+/* enum node_stat_item contains NR_SLAB_RECLAIMABLE */
+/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE */
+
+/* enum zone_stat_item contains NR_FILE_PAGES */
+/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */
+
+/* enum zone_stat_item contains NR_INACTIVE_ANON */
+/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */
+
+/* enum zone_stat_item contains NR_INACTIVE_FILE */
+/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */
+
+/* enum zone_stat_item contains NR_SLAB_RECLAIMABLE */
+/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_SLAB_RECLAIMABLE */
+
+/* global_node_page_state() exists */
+/* #undef ZFS_GLOBAL_NODE_PAGE_STATE */
+
+/* global_zone_page_state() exists */
+/* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */
+
+/* Define to 1 if GPL-only symbols can be used */
+/* #undef ZFS_IS_GPL_COMPATIBLE */
+
+/* Define the project alias string. */
+#define ZFS_META_ALIAS "zfs-0.8.0-1"
+
+/* Define the project author. */
+#define ZFS_META_AUTHOR "OpenZFS on Linux"
+
+/* Define the project release date. */
+/* #undef ZFS_META_DATA */
+
+/* Define the maximum compatible kernel version. */
+#define ZFS_META_KVER_MAX "5.6"
+
+/* Define the minimum compatible kernel version. */
+#define ZFS_META_KVER_MIN "3.10"
+
+/* Define the project license. */
+#define ZFS_META_LICENSE "CDDL"
+
+/* Define the libtool library 'age' version information. */
+/* #undef ZFS_META_LT_AGE */
+
+/* Define the libtool library 'current' version information. */
+/* #undef ZFS_META_LT_CURRENT */
+
+/* Define the libtool library 'revision' version information. */
+/* #undef ZFS_META_LT_REVISION */
+
+/* Define the project name. */
+#define ZFS_META_NAME "zfs"
+
+/* Define the project release. */
+#define ZFS_META_RELEASE "1"
+
+/* Define the project version. */
+#define ZFS_META_VERSION "0.8.0"
+
Index: head/sys/modules/zfs/zfs_gitrev.h
===================================================================
--- head/sys/modules/zfs/zfs_gitrev.h
+++ head/sys/modules/zfs/zfs_gitrev.h
@@ -0,0 +1,5 @@
+/*
+ * $FreeBSD$
+ */
+
+#define ZFS_META_GITREV "zfs-0.7.0-3175-g184df27ee"
Index: head/sys/vm/vm.h
===================================================================
--- head/sys/vm/vm.h
+++ head/sys/vm/vm.h
@@ -112,7 +112,9 @@
* Define it here for "applications" that include vm headers (e.g.,
* genassym).
*/
+#ifndef HAVE_BOOLEAN
typedef int boolean_t;
+#endif
/*
* The exact set of memory attributes is machine dependent. However,
Index: head/tests/sys/cddl/zfs/bin/file_write.c
===================================================================
--- head/tests/sys/cddl/zfs/bin/file_write.c
+++ head/tests/sys/cddl/zfs/bin/file_write.c
@@ -28,6 +28,7 @@
#pragma ident "@(#)file_write.c 1.4 07/10/09 SMI"
#include "file_common.h"
+#include <inttypes.h>
#include <libgen.h>
static unsigned char bigbuffer[BIGBUFFERSIZE];
@@ -180,14 +181,15 @@
}
noffset = lseek(bigfd, offset, SEEK_SET);
if (noffset != offset) {
- (void) printf("lseek %s (%lld/%lld) failed [%s]%d.Aborting!\n",
+ (void) printf("lseek %s (%"PRId64"/%"PRId64") "
+ "failed [%s]%d. Aborting!\n",
filename, offset, noffset, strerror(errno), errno);
exit(errno);
}
if (verbose) {
(void) printf("%s: block_size = %d, write_count = %d, "
- "offset = %lld, data = %s%d\n", filename, block_size,
+ "offset = %"PRId64", data = %s%d\n", filename, block_size,
write_count, offset,
(fillchar == 0) ? "0->" : "",
(fillchar == 0) ? DATA_RANGE : fillchar);
@@ -197,17 +199,17 @@
ssize_t n;
if ((n = write(bigfd, &bigbuffer, block_size)) == -1) {
- (void) printf("write failed (%ld), good_writes = %lld, "
+ (void) printf("write failed (%ld), "
+ "good_writes = %"PRId64", "
"error: %s[%d]\n", (long)n, good_writes,
- strerror(errno),
- errno);
+ strerror(errno), errno);
exit(errno);
}
good_writes++;
}
if (verbose) {
- (void) printf("Success: good_writes = %lld (%lld)\n",
+ (void) printf("Success: good_writes = %"PRId64" (%"PRId64")\n",
good_writes, (good_writes * block_size));
}
Index: head/tools/boot/rootgen.sh
===================================================================
--- head/tools/boot/rootgen.sh
+++ head/tools/boot/rootgen.sh
@@ -107,12 +107,12 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
zfs set mountpoint=none ${pool}/ROOT/default
@@ -146,12 +146,12 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
zfs set mountpoint=none ${pool}/ROOT/default
@@ -186,12 +186,12 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
zfs set mountpoint=none ${pool}/ROOT/default
@@ -268,12 +268,12 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
zfs set mountpoint=none ${pool}/ROOT/default
@@ -310,12 +310,12 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
zfs set mountpoint=none ${pool}/ROOT/default
@@ -352,12 +352,12 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
zfs set mountpoint=none ${pool}/ROOT/default
@@ -507,13 +507,13 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
geom_eli_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
cp /boot/kernel/geom_eli.ko ${mntpt}/boot/kernel/geom_eli.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
@@ -556,13 +556,13 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat >> ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
geom_eli_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
cp /boot/kernel/geom_eli.ko ${mntpt}/boot/kernel/geom_eli.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
@@ -601,13 +601,13 @@
cpsys ${src} ${mntpt}
# need to make a couple of tweaks
cat > ${mntpt}/boot/loader.conf <<EOF
+cryptodev_load=YES
zfs_load=YES
-opensolaris_load=YES
geom_eli_load=YES
EOF
cp /boot/kernel/acl_nfs4.ko ${mntpt}/boot/kernel/acl_nfs4.ko
+ cp /boot/kernel/cryptodev.ko ${mntpt}/boot/kernel/cryptodev.ko
cp /boot/kernel/zfs.ko ${mntpt}/boot/kernel/zfs.ko
- cp /boot/kernel/opensolaris.ko ${mntpt}/boot/kernel/opensolaris.ko
cp /boot/kernel/geom_eli.ko ${mntpt}/boot/kernel/geom_eli.ko
# end tweaks
zfs umount -f ${pool}/ROOT/default
Index: head/usr.sbin/fstyp/Makefile
===================================================================
--- head/usr.sbin/fstyp/Makefile
+++ head/usr.sbin/fstyp/Makefile
@@ -28,23 +28,28 @@
.if ${MK_ZFS} != "no"
IGNORE_PRAGMA= YES
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN -DHAVE_ZFS
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+WARNS?= 0
+CFLAGS.zfs.c+= -DIN_BASE
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS.zfs.c+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS.zfs.c+= -DHAVE_ISSETUGID
+CFLAGS.zfs.c+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
.endif
-CFLAGS+=-I${SRCTOP}/sys
+.for src in ${SRCS}
+.if ${src} != "zfs.c"
+CFLAGS.${src}+=-I${SRCTOP}/sys
+.endif
+.endfor
+
LIBADD= geom md ufs
.if ${MK_ZFS} != "no"
-LIBADD+=nvpair zfs
+LIBADD+=nvpair zfs spl
.endif
.include <bsd.prog.mk>
Index: head/usr.sbin/fstyp/zfs.c
===================================================================
--- head/usr.sbin/fstyp/zfs.c
+++ head/usr.sbin/fstyp/zfs.c
@@ -28,9 +28,7 @@
__FBSDID("$FreeBSD$");
#include <sys/types.h>
-#include <cddl/compat/opensolaris/sys/types.h>
#include <sys/time.h>
-#include <cddl/compat/opensolaris/sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

File Metadata

Mime Type
text/plain
Expires
Sat, Nov 16, 6:38 PM (22 h, 6 m)
Storage Engine
chunks
Storage Format
Chunks
Storage Handle
nEuG_FLBIZPt
Default Alt Text
D25872.diff (7 MB)

Event Timeline